{ "epoch": 0.9991671471586905, "global_step": 1114, "max_steps": 1114, "logging_steps": 1, "eval_steps": 50, "save_steps": 50, "train_batch_size": 8, "num_train_epochs": 1, "num_input_tokens_seen": 0, "total_flos": 6.811715592467251e+17, "log_history": [ { "loss": 440.6308, "grad_norm": 98.61355590820312, "learning_rate": 0.0004999990058793643, "epoch": 0.0008969184444871549, "step": 1 }, { "loss": 515.3978, "grad_norm": 1112.0234375, "learning_rate": 0.0004999960235253631, "epoch": 0.0017938368889743098, "step": 2 }, { "loss": 477.4767, "grad_norm": 392.8102722167969, "learning_rate": 0.0004999910529617153, "epoch": 0.0026907553334614646, "step": 3 }, { "loss": 457.2771, "grad_norm": 292.9400939941406, "learning_rate": 0.0004999840942279514, "epoch": 0.0035876737779486196, "step": 4 }, { "loss": 444.411, "grad_norm": 166.66598510742188, "learning_rate": 0.000499975147379414, "epoch": 0.004484592222435775, "step": 5 }, { "loss": 438.7729, "grad_norm": 132.8984375, "learning_rate": 0.000499964212487257, "epoch": 0.005381510666922929, "step": 6 }, { "loss": 434.4058, "grad_norm": 102.88407135009766, "learning_rate": 0.0004999512896384454, "epoch": 0.006278429111410084, "step": 7 }, { "loss": 431.3428, "grad_norm": 109.61495971679688, "learning_rate": 0.0004999363789357541, "epoch": 0.007175347555897239, "step": 8 }, { "loss": 430.6904, "grad_norm": 94.442626953125, "learning_rate": 0.0004999194804977674, "epoch": 0.008072266000384394, "step": 9 }, { "loss": 427.7128, "grad_norm": 79.10123443603516, "learning_rate": 0.0004999005944588778, "epoch": 0.00896918444487155, "step": 10 }, { "loss": 430.6295, "grad_norm": 81.77398681640625, "learning_rate": 0.0004998797209692856, "epoch": 0.009866102889358703, "step": 11 }, { "loss": 422.6068, "grad_norm": 67.85909271240234, "learning_rate": 0.0004998568601949967, "epoch": 0.010763021333845858, "step": 12 }, { "loss": 422.3798, "grad_norm": 81.51007843017578, "learning_rate": 0.0004998320123178223, "epoch": 0.011659939778333014, "step": 13 }, { "loss": 423.3609, "grad_norm": 70.7045669555664, "learning_rate": 0.0004998051775353763, "epoch": 0.012556858222820167, "step": 14 }, { "loss": 423.232, "grad_norm": 75.2995834350586, "learning_rate": 0.0004997763560610752, "epoch": 0.013453776667307323, "step": 15 }, { "loss": 414.7621, "grad_norm": 63.627197265625, "learning_rate": 0.000499745548124135, "epoch": 0.014350695111794478, "step": 16 }, { "loss": 419.0351, "grad_norm": 73.96087646484375, "learning_rate": 0.0004997127539695701, "epoch": 0.015247613556281632, "step": 17 }, { "loss": 418.1977, "grad_norm": 70.3633804321289, "learning_rate": 0.0004996779738581913, "epoch": 0.016144532000768787, "step": 18 }, { "loss": 416.1606, "grad_norm": 74.2279052734375, "learning_rate": 0.0004996412080666037, "epoch": 0.017041450445255943, "step": 19 }, { "loss": 417.2284, "grad_norm": 63.311676025390625, "learning_rate": 0.0004996024568872042, "epoch": 0.0179383688897431, "step": 20 }, { "loss": 409.5278, "grad_norm": 63.21588897705078, "learning_rate": 0.0004995617206281797, "epoch": 0.01883528733423025, "step": 21 }, { "loss": 414.1958, "grad_norm": 61.4863395690918, "learning_rate": 0.0004995189996135042, "epoch": 0.019732205778717406, "step": 22 }, { "loss": 419.7891, "grad_norm": 61.297481536865234, "learning_rate": 0.0004994742941829364, "epoch": 0.02062912422320456, "step": 23 }, { "loss": 414.3831, "grad_norm": 68.20845031738281, "learning_rate": 0.0004994276046920171, "epoch": 0.021526042667691717, "step": 24 }, { "loss": 415.8848, "grad_norm": 59.016239166259766, "learning_rate": 0.0004993789315120662, "epoch": 0.022422961112178872, "step": 25 }, { "loss": 417.4357, "grad_norm": 55.90328598022461, "learning_rate": 0.0004993282750301799, "epoch": 0.023319879556666027, "step": 26 }, { "loss": 411.6564, "grad_norm": 59.52859115600586, "learning_rate": 0.000499275635649227, "epoch": 0.02421679800115318, "step": 27 }, { "loss": 412.2451, "grad_norm": 59.61384963989258, "learning_rate": 0.0004992210137878472, "epoch": 0.025113716445640335, "step": 28 }, { "loss": 416.412, "grad_norm": 60.00177001953125, "learning_rate": 0.000499164409880446, "epoch": 0.02601063489012749, "step": 29 }, { "loss": 405.7923, "grad_norm": 59.08831024169922, "learning_rate": 0.0004991058243771922, "epoch": 0.026907553334614646, "step": 30 }, { "loss": 411.6278, "grad_norm": 58.00886154174805, "learning_rate": 0.0004990452577440143, "epoch": 0.0278044717791018, "step": 31 }, { "loss": 406.3222, "grad_norm": 57.3386116027832, "learning_rate": 0.0004989827104625969, "epoch": 0.028701390223588957, "step": 32 }, { "loss": 404.9872, "grad_norm": 56.013816833496094, "learning_rate": 0.000498918183030376, "epoch": 0.02959830866807611, "step": 33 }, { "loss": 406.4626, "grad_norm": 57.787132263183594, "learning_rate": 0.0004988516759605363, "epoch": 0.030495227112563264, "step": 34 }, { "loss": 405.2309, "grad_norm": 54.9903678894043, "learning_rate": 0.0004987831897820059, "epoch": 0.03139214555705042, "step": 35 }, { "loss": 415.0021, "grad_norm": 55.86436462402344, "learning_rate": 0.0004987127250394532, "epoch": 0.032289064001537575, "step": 36 }, { "loss": 402.1766, "grad_norm": 53.72284698486328, "learning_rate": 0.0004986402822932818, "epoch": 0.03318598244602473, "step": 37 }, { "loss": 409.7162, "grad_norm": 56.52421569824219, "learning_rate": 0.0004985658621196263, "epoch": 0.034082900890511886, "step": 38 }, { "loss": 406.8592, "grad_norm": 63.26171875, "learning_rate": 0.0004984894651103478, "epoch": 0.03497981933499904, "step": 39 }, { "loss": 401.9672, "grad_norm": 52.98197937011719, "learning_rate": 0.0004984110918730289, "epoch": 0.0358767377794862, "step": 40 }, { "loss": 402.0731, "grad_norm": 61.255733489990234, "learning_rate": 0.0004983307430309695, "epoch": 0.03677365622397335, "step": 41 }, { "loss": 405.9777, "grad_norm": 62.212188720703125, "learning_rate": 0.0004982484192231808, "epoch": 0.0376705746684605, "step": 42 }, { "loss": 409.4884, "grad_norm": 60.04124450683594, "learning_rate": 0.0004981641211043813, "epoch": 0.03856749311294766, "step": 43 }, { "loss": 402.7691, "grad_norm": 58.80691909790039, "learning_rate": 0.0004980778493449912, "epoch": 0.03946441155743481, "step": 44 }, { "loss": 406.07, "grad_norm": 58.074493408203125, "learning_rate": 0.0004979896046311265, "epoch": 0.04036133000192197, "step": 45 }, { "loss": 406.7423, "grad_norm": 62.749534606933594, "learning_rate": 0.0004978993876645944, "epoch": 0.04125824844640912, "step": 46 }, { "loss": 403.2931, "grad_norm": 58.47712707519531, "learning_rate": 0.0004978071991628874, "epoch": 0.04215516689089628, "step": 47 }, { "loss": 402.5574, "grad_norm": 64.82901000976562, "learning_rate": 0.0004977130398591775, "epoch": 0.04305208533538343, "step": 48 }, { "loss": 405.5097, "grad_norm": 56.95109939575195, "learning_rate": 0.00049761691050231, "epoch": 0.043949003779870585, "step": 49 }, { "loss": 408.4274, "grad_norm": 60.67522048950195, "learning_rate": 0.0004975188118567987, "epoch": 0.044845922224357744, "step": 50 }, { "eval_loss": 1.7932980060577393, "eval_runtime": 41.7475, "eval_samples_per_second": 49.057, "eval_steps_per_second": 3.066, "epoch": 0.044845922224357744, "step": 50 }, { "loss": 405.2191, "grad_norm": 61.441951751708984, "learning_rate": 0.0004974187447028184, "epoch": 0.045742840668844896, "step": 51 }, { "loss": 402.9874, "grad_norm": 56.64131546020508, "learning_rate": 0.0004973167098361999, "epoch": 0.046639759113332055, "step": 52 }, { "loss": 403.7462, "grad_norm": 58.905479431152344, "learning_rate": 0.0004972127080684228, "epoch": 0.04753667755781921, "step": 53 }, { "loss": 402.2606, "grad_norm": 60.9106559753418, "learning_rate": 0.0004971067402266096, "epoch": 0.04843359600230636, "step": 54 }, { "loss": 397.4493, "grad_norm": 55.347869873046875, "learning_rate": 0.0004969988071535188, "epoch": 0.04933051444679352, "step": 55 }, { "loss": 398.7716, "grad_norm": 56.816104888916016, "learning_rate": 0.0004968889097075385, "epoch": 0.05022743289128067, "step": 56 }, { "loss": 399.2036, "grad_norm": 63.388851165771484, "learning_rate": 0.0004967770487626791, "epoch": 0.05112435133576783, "step": 57 }, { "loss": 402.6399, "grad_norm": 58.803466796875, "learning_rate": 0.0004966632252085668, "epoch": 0.05202126978025498, "step": 58 }, { "loss": 401.2329, "grad_norm": 61.42218780517578, "learning_rate": 0.0004965474399504364, "epoch": 0.05291818822474213, "step": 59 }, { "loss": 394.491, "grad_norm": 54.581748962402344, "learning_rate": 0.000496429693909124, "epoch": 0.05381510666922929, "step": 60 }, { "loss": 402.2176, "grad_norm": 60.348812103271484, "learning_rate": 0.0004963099880210597, "epoch": 0.05471202511371644, "step": 61 }, { "loss": 401.5288, "grad_norm": 58.51568603515625, "learning_rate": 0.0004961883232382603, "epoch": 0.0556089435582036, "step": 62 }, { "loss": 402.1975, "grad_norm": 53.891822814941406, "learning_rate": 0.0004960647005283217, "epoch": 0.056505862002690754, "step": 63 }, { "loss": 402.8554, "grad_norm": 54.66781234741211, "learning_rate": 0.0004959391208744108, "epoch": 0.05740278044717791, "step": 64 }, { "loss": 397.2245, "grad_norm": 57.83986282348633, "learning_rate": 0.0004958115852752582, "epoch": 0.058299698891665065, "step": 65 }, { "loss": 398.295, "grad_norm": 56.6056022644043, "learning_rate": 0.0004956820947451502, "epoch": 0.05919661733615222, "step": 66 }, { "loss": 398.1401, "grad_norm": 58.830711364746094, "learning_rate": 0.0004955506503139204, "epoch": 0.060093535780639376, "step": 67 }, { "loss": 401.4149, "grad_norm": 54.770755767822266, "learning_rate": 0.0004954172530269418, "epoch": 0.06099045422512653, "step": 68 }, { "loss": 399.5218, "grad_norm": 59.45661926269531, "learning_rate": 0.0004952819039451183, "epoch": 0.06188737266961369, "step": 69 }, { "loss": 396.4537, "grad_norm": 53.4246826171875, "learning_rate": 0.0004951446041448765, "epoch": 0.06278429111410085, "step": 70 }, { "loss": 401.2764, "grad_norm": 55.125919342041016, "learning_rate": 0.0004950053547181568, "epoch": 0.063681209558588, "step": 71 }, { "loss": 400.9092, "grad_norm": 63.59549331665039, "learning_rate": 0.0004948641567724053, "epoch": 0.06457812800307515, "step": 72 }, { "loss": 397.1968, "grad_norm": 58.40228271484375, "learning_rate": 0.0004947210114305639, "epoch": 0.0654750464475623, "step": 73 }, { "loss": 398.0598, "grad_norm": 62.7151985168457, "learning_rate": 0.0004945759198310629, "epoch": 0.06637196489204945, "step": 74 }, { "loss": 398.7396, "grad_norm": 59.287742614746094, "learning_rate": 0.0004944288831278106, "epoch": 0.06726888333653662, "step": 75 }, { "loss": 391.3397, "grad_norm": 59.052059173583984, "learning_rate": 0.0004942799024901846, "epoch": 0.06816580178102377, "step": 76 }, { "loss": 394.1899, "grad_norm": 54.65058135986328, "learning_rate": 0.0004941289791030229, "epoch": 0.06906272022551092, "step": 77 }, { "loss": 393.8536, "grad_norm": 51.59941101074219, "learning_rate": 0.0004939761141666139, "epoch": 0.06995963866999808, "step": 78 }, { "loss": 396.7059, "grad_norm": 55.84555435180664, "learning_rate": 0.0004938213088966872, "epoch": 0.07085655711448523, "step": 79 }, { "loss": 392.0196, "grad_norm": 55.808250427246094, "learning_rate": 0.0004936645645244033, "epoch": 0.0717534755589724, "step": 80 }, { "loss": 395.5785, "grad_norm": 53.83452224731445, "learning_rate": 0.0004935058822963453, "epoch": 0.07265039400345955, "step": 81 }, { "loss": 398.3966, "grad_norm": 61.950626373291016, "learning_rate": 0.000493345263474507, "epoch": 0.0735473124479467, "step": 82 }, { "loss": 399.4866, "grad_norm": 65.6949462890625, "learning_rate": 0.0004931827093362844, "epoch": 0.07444423089243385, "step": 83 }, { "loss": 393.8017, "grad_norm": 54.928836822509766, "learning_rate": 0.0004930182211744649, "epoch": 0.075341149336921, "step": 84 }, { "loss": 398.1347, "grad_norm": 59.81849670410156, "learning_rate": 0.0004928518002972172, "epoch": 0.07623806778140817, "step": 85 }, { "loss": 392.8837, "grad_norm": 57.970462799072266, "learning_rate": 0.0004926834480280805, "epoch": 0.07713498622589532, "step": 86 }, { "loss": 394.3792, "grad_norm": 57.43026351928711, "learning_rate": 0.0004925131657059547, "epoch": 0.07803190467038247, "step": 87 }, { "loss": 395.7612, "grad_norm": 57.73651123046875, "learning_rate": 0.0004923409546850891, "epoch": 0.07892882311486962, "step": 88 }, { "loss": 396.5627, "grad_norm": 58.27775573730469, "learning_rate": 0.000492166816335072, "epoch": 0.07982574155935677, "step": 89 }, { "loss": 398.5615, "grad_norm": 53.49543762207031, "learning_rate": 0.0004919907520408196, "epoch": 0.08072266000384394, "step": 90 }, { "loss": 398.6497, "grad_norm": 57.175514221191406, "learning_rate": 0.000491812763202565, "epoch": 0.08161957844833109, "step": 91 }, { "loss": 392.5616, "grad_norm": 58.206119537353516, "learning_rate": 0.0004916328512358472, "epoch": 0.08251649689281824, "step": 92 }, { "loss": 390.17, "grad_norm": 56.978179931640625, "learning_rate": 0.0004914510175714999, "epoch": 0.0834134153373054, "step": 93 }, { "loss": 391.477, "grad_norm": 59.842369079589844, "learning_rate": 0.0004912672636556397, "epoch": 0.08431033378179256, "step": 94 }, { "loss": 394.4383, "grad_norm": 52.20112609863281, "learning_rate": 0.0004910815909496555, "epoch": 0.08520725222627971, "step": 95 }, { "loss": 390.8443, "grad_norm": 61.12334060668945, "learning_rate": 0.0004908940009301954, "epoch": 0.08610417067076687, "step": 96 }, { "loss": 395.9276, "grad_norm": 55.49872589111328, "learning_rate": 0.0004907044950891565, "epoch": 0.08700108911525402, "step": 97 }, { "loss": 394.7866, "grad_norm": 59.71890640258789, "learning_rate": 0.000490513074933672, "epoch": 0.08789800755974117, "step": 98 }, { "loss": 388.5464, "grad_norm": 55.72919845581055, "learning_rate": 0.0004903197419860999, "epoch": 0.08879492600422834, "step": 99 }, { "loss": 392.9969, "grad_norm": 61.6799430847168, "learning_rate": 0.0004901244977840103, "epoch": 0.08969184444871549, "step": 100 }, { "eval_loss": 1.7485355138778687, "eval_runtime": 49.5113, "eval_samples_per_second": 41.364, "eval_steps_per_second": 2.585, "epoch": 0.08969184444871549, "step": 100 }, { "loss": 393.0805, "grad_norm": 58.71113204956055, "learning_rate": 0.0004899273438801734, "epoch": 0.09058876289320264, "step": 101 }, { "loss": 391.5116, "grad_norm": 54.11758804321289, "learning_rate": 0.0004897282818425474, "epoch": 0.09148568133768979, "step": 102 }, { "loss": 394.4952, "grad_norm": 53.54176712036133, "learning_rate": 0.0004895273132542658, "epoch": 0.09238259978217694, "step": 103 }, { "loss": 392.5484, "grad_norm": 51.26163101196289, "learning_rate": 0.0004893244397136246, "epoch": 0.09327951822666411, "step": 104 }, { "loss": 392.7574, "grad_norm": 57.158973693847656, "learning_rate": 0.0004891196628340703, "epoch": 0.09417643667115126, "step": 105 }, { "loss": 392.1094, "grad_norm": 51.87057113647461, "learning_rate": 0.0004889129842441859, "epoch": 0.09507335511563841, "step": 106 }, { "loss": 391.9873, "grad_norm": 62.71110534667969, "learning_rate": 0.0004887044055876793, "epoch": 0.09597027356012557, "step": 107 }, { "loss": 393.0227, "grad_norm": 61.41956329345703, "learning_rate": 0.0004884939285233691, "epoch": 0.09686719200461272, "step": 108 }, { "loss": 389.2371, "grad_norm": 59.030765533447266, "learning_rate": 0.0004882815547251721, "epoch": 0.09776411044909988, "step": 109 }, { "loss": 394.932, "grad_norm": 60.926448822021484, "learning_rate": 0.00048806728588208966, "epoch": 0.09866102889358704, "step": 110 }, { "loss": 389.2965, "grad_norm": 59.546268463134766, "learning_rate": 0.0004878511236981945, "epoch": 0.09955794733807419, "step": 111 }, { "loss": 389.0897, "grad_norm": 56.25603103637695, "learning_rate": 0.0004876330698926169, "epoch": 0.10045486578256134, "step": 112 }, { "loss": 391.7546, "grad_norm": 63.1163444519043, "learning_rate": 0.00048741312619953104, "epoch": 0.10135178422704849, "step": 113 }, { "loss": 392.0137, "grad_norm": 70.23162078857422, "learning_rate": 0.00048719129436814156, "epoch": 0.10224870267153566, "step": 114 }, { "loss": 390.5738, "grad_norm": 60.9749755859375, "learning_rate": 0.00048696757616266927, "epoch": 0.10314562111602281, "step": 115 }, { "loss": 387.7592, "grad_norm": 60.2146110534668, "learning_rate": 0.0004867419733623372, "epoch": 0.10404253956050996, "step": 116 }, { "loss": 390.6403, "grad_norm": 59.26010513305664, "learning_rate": 0.00048651448776135654, "epoch": 0.10493945800499711, "step": 117 }, { "loss": 391.4545, "grad_norm": 55.02613067626953, "learning_rate": 0.00048628512116891234, "epoch": 0.10583637644948427, "step": 118 }, { "loss": 388.2937, "grad_norm": 56.28743362426758, "learning_rate": 0.00048605387540914916, "epoch": 0.10673329489397143, "step": 119 }, { "loss": 389.2755, "grad_norm": 55.22878646850586, "learning_rate": 0.0004858207523211563, "epoch": 0.10763021333845858, "step": 120 }, { "loss": 392.9062, "grad_norm": 55.45512771606445, "learning_rate": 0.00048558575375895377, "epoch": 0.10852713178294573, "step": 121 }, { "loss": 388.4548, "grad_norm": 58.8115119934082, "learning_rate": 0.0004853488815914767, "epoch": 0.10942405022743289, "step": 122 }, { "loss": 390.1011, "grad_norm": 55.49444580078125, "learning_rate": 0.00048511013770256134, "epoch": 0.11032096867192005, "step": 123 }, { "loss": 388.7439, "grad_norm": 54.36104202270508, "learning_rate": 0.00048486952399092945, "epoch": 0.1112178871164072, "step": 124 }, { "loss": 391.1307, "grad_norm": 52.75822067260742, "learning_rate": 0.0004846270423701734, "epoch": 0.11211480556089436, "step": 125 }, { "loss": 388.8095, "grad_norm": 55.67084884643555, "learning_rate": 0.0004843826947687411, "epoch": 0.11301172400538151, "step": 126 }, { "loss": 388.7104, "grad_norm": 58.483211517333984, "learning_rate": 0.0004841364831299206, "epoch": 0.11390864244986866, "step": 127 }, { "loss": 392.5351, "grad_norm": 54.69878387451172, "learning_rate": 0.00048388840941182435, "epoch": 0.11480556089435583, "step": 128 }, { "loss": 389.9329, "grad_norm": 56.85935974121094, "learning_rate": 0.00048363847558737395, "epoch": 0.11570247933884298, "step": 129 }, { "loss": 389.8976, "grad_norm": 55.818260192871094, "learning_rate": 0.0004833866836442844, "epoch": 0.11659939778333013, "step": 130 }, { "loss": 389.0714, "grad_norm": 69.33192443847656, "learning_rate": 0.0004831330355850483, "epoch": 0.11749631622781728, "step": 131 }, { "loss": 387.675, "grad_norm": 59.69966506958008, "learning_rate": 0.0004828775334269198, "epoch": 0.11839323467230443, "step": 132 }, { "loss": 389.1474, "grad_norm": 63.28241729736328, "learning_rate": 0.0004826201792018986, "epoch": 0.1192901531167916, "step": 133 }, { "loss": 386.0185, "grad_norm": 60.13338851928711, "learning_rate": 0.0004823609749567138, "epoch": 0.12018707156127875, "step": 134 }, { "loss": 393.0312, "grad_norm": 50.345890045166016, "learning_rate": 0.0004820999227528079, "epoch": 0.1210839900057659, "step": 135 }, { "loss": 388.9017, "grad_norm": 54.398582458496094, "learning_rate": 0.00048183702466631986, "epoch": 0.12198090845025306, "step": 136 }, { "loss": 390.3952, "grad_norm": 58.791343688964844, "learning_rate": 0.0004815722827880689, "epoch": 0.12287782689474021, "step": 137 }, { "loss": 391.5972, "grad_norm": 56.27891540527344, "learning_rate": 0.000481305699223538, "epoch": 0.12377474533922737, "step": 138 }, { "loss": 390.4619, "grad_norm": 57.29872512817383, "learning_rate": 0.000481037276092857, "epoch": 0.12467166378371453, "step": 139 }, { "loss": 386.5269, "grad_norm": 56.40953826904297, "learning_rate": 0.0004807670155307856, "epoch": 0.1255685822282017, "step": 140 }, { "loss": 386.9588, "grad_norm": 56.36626434326172, "learning_rate": 0.0004804949196866967, "epoch": 0.12646550067268883, "step": 141 }, { "loss": 390.6064, "grad_norm": 59.941890716552734, "learning_rate": 0.00048022099072455893, "epoch": 0.127362419117176, "step": 142 }, { "loss": 389.5639, "grad_norm": 55.42548370361328, "learning_rate": 0.0004799452308229199, "epoch": 0.12825933756166313, "step": 143 }, { "loss": 389.1144, "grad_norm": 59.46462631225586, "learning_rate": 0.0004796676421748883, "epoch": 0.1291562560061503, "step": 144 }, { "loss": 387.238, "grad_norm": 61.307960510253906, "learning_rate": 0.0004793882269881172, "epoch": 0.13005317445063747, "step": 145 }, { "loss": 385.9282, "grad_norm": 53.019859313964844, "learning_rate": 0.00047910698748478565, "epoch": 0.1309500928951246, "step": 146 }, { "loss": 388.6133, "grad_norm": 59.57033920288086, "learning_rate": 0.00047882392590158166, "epoch": 0.13184701133961177, "step": 147 }, { "loss": 385.2765, "grad_norm": 55.921993255615234, "learning_rate": 0.000478539044489684, "epoch": 0.1327439297840989, "step": 148 }, { "loss": 387.315, "grad_norm": 53.27146911621094, "learning_rate": 0.0004782523455147448, "epoch": 0.13364084822858607, "step": 149 }, { "loss": 384.9127, "grad_norm": 61.21531295776367, "learning_rate": 0.0004779638312568708, "epoch": 0.13453776667307324, "step": 150 }, { "eval_loss": 1.7258449792861938, "eval_runtime": 36.7008, "eval_samples_per_second": 55.803, "eval_steps_per_second": 3.488, "epoch": 0.13453776667307324, "step": 150 }, { "loss": 385.8539, "grad_norm": 60.04133605957031, "learning_rate": 0.00047767350401060606, "epoch": 0.13543468511756038, "step": 151 }, { "loss": 384.8003, "grad_norm": 59.11763000488281, "learning_rate": 0.0004773813660849128, "epoch": 0.13633160356204754, "step": 152 }, { "loss": 387.7485, "grad_norm": 56.51465606689453, "learning_rate": 0.0004770874198031538, "epoch": 0.13722852200653468, "step": 153 }, { "loss": 383.2278, "grad_norm": 56.18191146850586, "learning_rate": 0.0004767916675030736, "epoch": 0.13812544045102185, "step": 154 }, { "loss": 383.6736, "grad_norm": 57.308799743652344, "learning_rate": 0.00047649411153678, "epoch": 0.139022358895509, "step": 155 }, { "loss": 383.3135, "grad_norm": 56.1787109375, "learning_rate": 0.0004761947542707251, "epoch": 0.13991927733999615, "step": 156 }, { "loss": 380.7021, "grad_norm": 59.29663848876953, "learning_rate": 0.0004758935980856868, "epoch": 0.14081619578448332, "step": 157 }, { "loss": 388.3537, "grad_norm": 56.997901916503906, "learning_rate": 0.00047559064537674973, "epoch": 0.14171311422897045, "step": 158 }, { "loss": 382.6107, "grad_norm": 54.997398376464844, "learning_rate": 0.0004752858985532862, "epoch": 0.14261003267345762, "step": 159 }, { "loss": 390.4788, "grad_norm": 61.30497360229492, "learning_rate": 0.00047497936003893713, "epoch": 0.1435069511179448, "step": 160 }, { "loss": 383.9597, "grad_norm": 56.59492492675781, "learning_rate": 0.0004746710322715926, "epoch": 0.14440386956243192, "step": 161 }, { "loss": 392.4949, "grad_norm": 63.977073669433594, "learning_rate": 0.0004743609177033725, "epoch": 0.1453007880069191, "step": 162 }, { "loss": 385.7721, "grad_norm": 63.132537841796875, "learning_rate": 0.0004740490188006072, "epoch": 0.14619770645140623, "step": 163 }, { "loss": 385.057, "grad_norm": 61.54987716674805, "learning_rate": 0.0004737353380438178, "epoch": 0.1470946248958934, "step": 164 }, { "loss": 384.8288, "grad_norm": 64.65653228759766, "learning_rate": 0.00047341987792769635, "epoch": 0.14799154334038056, "step": 165 }, { "loss": 385.061, "grad_norm": 52.979087829589844, "learning_rate": 0.0004731026409610863, "epoch": 0.1488884617848677, "step": 166 }, { "loss": 385.9828, "grad_norm": 66.97553253173828, "learning_rate": 0.00047278362966696197, "epoch": 0.14978538022935486, "step": 167 }, { "loss": 381.6645, "grad_norm": 49.72977066040039, "learning_rate": 0.00047246284658240925, "epoch": 0.150682298673842, "step": 168 }, { "loss": 387.0713, "grad_norm": 59.0352668762207, "learning_rate": 0.0004721402942586046, "epoch": 0.15157921711832917, "step": 169 }, { "loss": 388.6861, "grad_norm": 56.49056625366211, "learning_rate": 0.0004718159752607955, "epoch": 0.15247613556281633, "step": 170 }, { "loss": 386.6622, "grad_norm": 61.9783935546875, "learning_rate": 0.00047148989216827964, "epoch": 0.15337305400730347, "step": 171 }, { "loss": 385.3264, "grad_norm": 60.84406280517578, "learning_rate": 0.0004711620475743844, "epoch": 0.15426997245179064, "step": 172 }, { "loss": 383.2025, "grad_norm": 55.59370803833008, "learning_rate": 0.00047083244408644646, "epoch": 0.15516689089627778, "step": 173 }, { "loss": 383.7802, "grad_norm": 59.102760314941406, "learning_rate": 0.0004705010843257908, "epoch": 0.15606380934076494, "step": 174 }, { "loss": 387.181, "grad_norm": 63.97918701171875, "learning_rate": 0.00047016797092771004, "epoch": 0.1569607277852521, "step": 175 }, { "loss": 382.4706, "grad_norm": 58.40498733520508, "learning_rate": 0.0004698331065414434, "epoch": 0.15785764622973925, "step": 176 }, { "loss": 374.7974, "grad_norm": 57.276405334472656, "learning_rate": 0.0004694964938301556, "epoch": 0.1587545646742264, "step": 177 }, { "loss": 383.6686, "grad_norm": 65.17239379882812, "learning_rate": 0.0004691581354709159, "epoch": 0.15965148311871355, "step": 178 }, { "loss": 382.2492, "grad_norm": 54.67914962768555, "learning_rate": 0.0004688180341546765, "epoch": 0.16054840156320072, "step": 179 }, { "loss": 379.0845, "grad_norm": 61.17100524902344, "learning_rate": 0.0004684761925862512, "epoch": 0.16144532000768788, "step": 180 }, { "loss": 380.5147, "grad_norm": 53.48952102661133, "learning_rate": 0.00046813261348429403, "epoch": 0.16234223845217502, "step": 181 }, { "loss": 388.3456, "grad_norm": 62.524898529052734, "learning_rate": 0.0004677872995812778, "epoch": 0.16323915689666219, "step": 182 }, { "loss": 384.9105, "grad_norm": 55.23896026611328, "learning_rate": 0.00046744025362347174, "epoch": 0.16413607534114932, "step": 183 }, { "loss": 388.0769, "grad_norm": 58.2794075012207, "learning_rate": 0.0004670914783709203, "epoch": 0.1650329937856365, "step": 184 }, { "loss": 375.4843, "grad_norm": 57.62440872192383, "learning_rate": 0.00046674097659742087, "epoch": 0.16592991223012366, "step": 185 }, { "loss": 388.4005, "grad_norm": 54.49860763549805, "learning_rate": 0.00046638875109050184, "epoch": 0.1668268306746108, "step": 186 }, { "loss": 379.2246, "grad_norm": 56.57727813720703, "learning_rate": 0.00046603480465140035, "epoch": 0.16772374911909796, "step": 187 }, { "loss": 390.5371, "grad_norm": 53.35488510131836, "learning_rate": 0.0004656791400950401, "epoch": 0.16862066756358512, "step": 188 }, { "loss": 376.5087, "grad_norm": 57.38853454589844, "learning_rate": 0.0004653217602500088, "epoch": 0.16951758600807226, "step": 189 }, { "loss": 383.3448, "grad_norm": 53.162269592285156, "learning_rate": 0.00046496266795853606, "epoch": 0.17041450445255943, "step": 190 }, { "loss": 385.954, "grad_norm": 56.76969528198242, "learning_rate": 0.0004646018660764701, "epoch": 0.17131142289704657, "step": 191 }, { "loss": 380.8749, "grad_norm": 55.99345016479492, "learning_rate": 0.0004642393574732559, "epoch": 0.17220834134153373, "step": 192 }, { "loss": 379.5312, "grad_norm": 49.73320770263672, "learning_rate": 0.0004638751450319116, "epoch": 0.1731052597860209, "step": 193 }, { "loss": 385.7988, "grad_norm": 56.80336380004883, "learning_rate": 0.00046350923164900604, "epoch": 0.17400217823050804, "step": 194 }, { "loss": 380.8796, "grad_norm": 57.32421875, "learning_rate": 0.0004631416202346357, "epoch": 0.1748990966749952, "step": 195 }, { "loss": 382.128, "grad_norm": 62.81551742553711, "learning_rate": 0.00046277231371240113, "epoch": 0.17579601511948234, "step": 196 }, { "loss": 383.9042, "grad_norm": 60.5498046875, "learning_rate": 0.00046240131501938436, "epoch": 0.1766929335639695, "step": 197 }, { "loss": 380.0457, "grad_norm": 54.78828811645508, "learning_rate": 0.000462028627106125, "epoch": 0.17758985200845667, "step": 198 }, { "loss": 383.6067, "grad_norm": 60.62177276611328, "learning_rate": 0.00046165425293659694, "epoch": 0.1784867704529438, "step": 199 }, { "loss": 385.004, "grad_norm": 53.65549850463867, "learning_rate": 0.00046127819548818507, "epoch": 0.17938368889743098, "step": 200 }, { "eval_loss": 1.6973483562469482, "eval_runtime": 57.4311, "eval_samples_per_second": 35.66, "eval_steps_per_second": 2.229, "epoch": 0.17938368889743098, "step": 200 }, { "loss": 381.3797, "grad_norm": 60.24985885620117, "learning_rate": 0.0004609004577516609, "epoch": 0.18028060734191811, "step": 201 }, { "loss": 384.8868, "grad_norm": 55.66313552856445, "learning_rate": 0.00046052104273115957, "epoch": 0.18117752578640528, "step": 202 }, { "loss": 381.8181, "grad_norm": 58.7210807800293, "learning_rate": 0.0004601399534441556, "epoch": 0.18207444423089245, "step": 203 }, { "loss": 381.6777, "grad_norm": 51.48910903930664, "learning_rate": 0.0004597571929214386, "epoch": 0.18297136267537958, "step": 204 }, { "loss": 389.5296, "grad_norm": 55.63520050048828, "learning_rate": 0.00045937276420708985, "epoch": 0.18386828111986675, "step": 205 }, { "loss": 379.7319, "grad_norm": 56.91200637817383, "learning_rate": 0.00045898667035845726, "epoch": 0.1847651995643539, "step": 206 }, { "loss": 383.4648, "grad_norm": 60.174800872802734, "learning_rate": 0.0004585989144461319, "epoch": 0.18566211800884105, "step": 207 }, { "loss": 381.6614, "grad_norm": 46.41486740112305, "learning_rate": 0.00045820949955392286, "epoch": 0.18655903645332822, "step": 208 }, { "loss": 388.843, "grad_norm": 66.20514678955078, "learning_rate": 0.0004578184287788333, "epoch": 0.18745595489781536, "step": 209 }, { "loss": 382.3195, "grad_norm": 52.08879470825195, "learning_rate": 0.0004574257052310355, "epoch": 0.18835287334230252, "step": 210 }, { "loss": 376.9011, "grad_norm": 59.04060363769531, "learning_rate": 0.00045703133203384594, "epoch": 0.18924979178678966, "step": 211 }, { "loss": 382.9858, "grad_norm": 57.139583587646484, "learning_rate": 0.000456635312323701, "epoch": 0.19014671023127683, "step": 212 }, { "loss": 386.4098, "grad_norm": 56.69694137573242, "learning_rate": 0.00045623764925013154, "epoch": 0.191043628675764, "step": 213 }, { "loss": 381.0145, "grad_norm": 54.969146728515625, "learning_rate": 0.00045583834597573826, "epoch": 0.19194054712025113, "step": 214 }, { "loss": 386.2006, "grad_norm": 55.187095642089844, "learning_rate": 0.000455437405676166, "epoch": 0.1928374655647383, "step": 215 }, { "loss": 385.4291, "grad_norm": 56.27381896972656, "learning_rate": 0.000455034831540079, "epoch": 0.19373438400922544, "step": 216 }, { "loss": 382.2878, "grad_norm": 55.81896209716797, "learning_rate": 0.00045463062676913527, "epoch": 0.1946313024537126, "step": 217 }, { "loss": 381.0126, "grad_norm": 60.54517364501953, "learning_rate": 0.0004542247945779613, "epoch": 0.19552822089819977, "step": 218 }, { "loss": 382.4228, "grad_norm": 51.44652557373047, "learning_rate": 0.0004538173381941264, "epoch": 0.1964251393426869, "step": 219 }, { "loss": 374.3478, "grad_norm": 57.77920150756836, "learning_rate": 0.0004534082608581168, "epoch": 0.19732205778717407, "step": 220 }, { "loss": 379.4279, "grad_norm": 52.3509635925293, "learning_rate": 0.0004529975658233104, "epoch": 0.1982189762316612, "step": 221 }, { "loss": 380.0542, "grad_norm": 53.75742721557617, "learning_rate": 0.0004525852563559505, "epoch": 0.19911589467614838, "step": 222 }, { "loss": 387.0319, "grad_norm": 59.18511199951172, "learning_rate": 0.0004521713357351198, "epoch": 0.20001281312063554, "step": 223 }, { "loss": 375.638, "grad_norm": 53.67622375488281, "learning_rate": 0.00045175580725271457, "epoch": 0.20090973156512268, "step": 224 }, { "loss": 383.951, "grad_norm": 67.28981018066406, "learning_rate": 0.00045133867421341835, "epoch": 0.20180665000960984, "step": 225 }, { "loss": 380.0722, "grad_norm": 62.926700592041016, "learning_rate": 0.00045091993993467554, "epoch": 0.20270356845409698, "step": 226 }, { "loss": 377.9981, "grad_norm": 53.50834274291992, "learning_rate": 0.0004504996077466654, "epoch": 0.20360048689858415, "step": 227 }, { "loss": 380.4308, "grad_norm": 61.55268096923828, "learning_rate": 0.0004500776809922751, "epoch": 0.20449740534307131, "step": 228 }, { "loss": 375.9146, "grad_norm": 55.11613845825195, "learning_rate": 0.0004496541630270733, "epoch": 0.20539432378755845, "step": 229 }, { "loss": 381.8729, "grad_norm": 61.67683410644531, "learning_rate": 0.00044922905721928366, "epoch": 0.20629124223204562, "step": 230 }, { "loss": 377.6188, "grad_norm": 55.07930374145508, "learning_rate": 0.00044880236694975773, "epoch": 0.20718816067653276, "step": 231 }, { "loss": 383.7285, "grad_norm": 56.17093276977539, "learning_rate": 0.0004483740956119485, "epoch": 0.20808507912101992, "step": 232 }, { "loss": 379.3219, "grad_norm": 57.20262908935547, "learning_rate": 0.0004479442466118828, "epoch": 0.2089819975655071, "step": 233 }, { "loss": 378.996, "grad_norm": 52.91606521606445, "learning_rate": 0.0004475128233681349, "epoch": 0.20987891600999423, "step": 234 }, { "loss": 376.5712, "grad_norm": 53.59124755859375, "learning_rate": 0.00044707982931179856, "epoch": 0.2107758344544814, "step": 235 }, { "loss": 385.7614, "grad_norm": 57.6840705871582, "learning_rate": 0.00044664526788646064, "epoch": 0.21167275289896853, "step": 236 }, { "loss": 381.0049, "grad_norm": 54.7835578918457, "learning_rate": 0.0004462091425481728, "epoch": 0.2125696713434557, "step": 237 }, { "loss": 380.4299, "grad_norm": 56.61455535888672, "learning_rate": 0.0004457714567654247, "epoch": 0.21346658978794286, "step": 238 }, { "loss": 377.3007, "grad_norm": 54.04520797729492, "learning_rate": 0.0004453322140191162, "epoch": 0.21436350823243, "step": 239 }, { "loss": 376.2494, "grad_norm": 61.18534469604492, "learning_rate": 0.0004448914178025293, "epoch": 0.21526042667691717, "step": 240 }, { "loss": 379.0678, "grad_norm": 58.791934967041016, "learning_rate": 0.000444449071621301, "epoch": 0.21615734512140433, "step": 241 }, { "loss": 383.8186, "grad_norm": 54.751407623291016, "learning_rate": 0.0004440051789933951, "epoch": 0.21705426356589147, "step": 242 }, { "loss": 374.9797, "grad_norm": 54.97734451293945, "learning_rate": 0.0004435597434490741, "epoch": 0.21795118201037864, "step": 243 }, { "loss": 381.2922, "grad_norm": 55.37065887451172, "learning_rate": 0.00044311276853087144, "epoch": 0.21884810045486577, "step": 244 }, { "loss": 378.8845, "grad_norm": 58.74147033691406, "learning_rate": 0.0004426642577935629, "epoch": 0.21974501889935294, "step": 245 }, { "loss": 386.1524, "grad_norm": 58.316097259521484, "learning_rate": 0.0004422142148041388, "epoch": 0.2206419373438401, "step": 246 }, { "loss": 378.2374, "grad_norm": 54.42732238769531, "learning_rate": 0.00044176264314177535, "epoch": 0.22153885578832724, "step": 247 }, { "loss": 378.246, "grad_norm": 56.714080810546875, "learning_rate": 0.00044130954639780615, "epoch": 0.2224357742328144, "step": 248 }, { "loss": 373.9691, "grad_norm": 51.52580642700195, "learning_rate": 0.0004408549281756937, "epoch": 0.22333269267730155, "step": 249 }, { "loss": 377.4944, "grad_norm": 61.44560241699219, "learning_rate": 0.0004403987920910011, "epoch": 0.2242296111217887, "step": 250 }, { "eval_loss": 1.6841200590133667, "eval_runtime": 35.8648, "eval_samples_per_second": 57.103, "eval_steps_per_second": 3.569, "epoch": 0.2242296111217887, "step": 250 }, { "loss": 372.7726, "grad_norm": 52.64440155029297, "learning_rate": 0.00043994114177136245, "epoch": 0.22512652956627588, "step": 251 }, { "loss": 374.3314, "grad_norm": 57.64458084106445, "learning_rate": 0.0004394819808564549, "epoch": 0.22602344801076302, "step": 252 }, { "loss": 380.1327, "grad_norm": 48.348487854003906, "learning_rate": 0.00043902131299796923, "epoch": 0.22692036645525018, "step": 253 }, { "loss": 376.8272, "grad_norm": 55.306766510009766, "learning_rate": 0.00043855914185958066, "epoch": 0.22781728489973732, "step": 254 }, { "loss": 373.5811, "grad_norm": 50.16413879394531, "learning_rate": 0.0004380954711169202, "epoch": 0.2287142033442245, "step": 255 }, { "loss": 380.8544, "grad_norm": 52.902305603027344, "learning_rate": 0.00043763030445754516, "epoch": 0.22961112178871165, "step": 256 }, { "loss": 380.7617, "grad_norm": 55.323490142822266, "learning_rate": 0.0004371636455809096, "epoch": 0.2305080402331988, "step": 257 }, { "loss": 378.9308, "grad_norm": 53.362361907958984, "learning_rate": 0.00043669549819833536, "epoch": 0.23140495867768596, "step": 258 }, { "loss": 378.0917, "grad_norm": 51.511932373046875, "learning_rate": 0.0004362258660329822, "epoch": 0.2323018771221731, "step": 259 }, { "loss": 374.3557, "grad_norm": 60.112728118896484, "learning_rate": 0.0004357547528198184, "epoch": 0.23319879556666026, "step": 260 }, { "loss": 382.0044, "grad_norm": 52.59751510620117, "learning_rate": 0.0004352821623055908, "epoch": 0.23409571401114743, "step": 261 }, { "loss": 379.4641, "grad_norm": 54.482444763183594, "learning_rate": 0.0004348080982487953, "epoch": 0.23499263245563456, "step": 262 }, { "loss": 376.0202, "grad_norm": 57.2796516418457, "learning_rate": 0.0004343325644196468, "epoch": 0.23588955090012173, "step": 263 }, { "loss": 380.4021, "grad_norm": 51.36527633666992, "learning_rate": 0.0004338555646000492, "epoch": 0.23678646934460887, "step": 264 }, { "loss": 382.1948, "grad_norm": 54.246639251708984, "learning_rate": 0.0004333771025835655, "epoch": 0.23768338778909603, "step": 265 }, { "loss": 376.0016, "grad_norm": 53.845367431640625, "learning_rate": 0.0004328971821753873, "epoch": 0.2385803062335832, "step": 266 }, { "loss": 378.0241, "grad_norm": 55.82734298706055, "learning_rate": 0.0004324158071923049, "epoch": 0.23947722467807034, "step": 267 }, { "loss": 376.6841, "grad_norm": 52.28315734863281, "learning_rate": 0.0004319329814626768, "epoch": 0.2403741431225575, "step": 268 }, { "loss": 376.4868, "grad_norm": 59.60106658935547, "learning_rate": 0.00043144870882639907, "epoch": 0.24127106156704464, "step": 269 }, { "loss": 376.3779, "grad_norm": 58.55453109741211, "learning_rate": 0.0004309629931348752, "epoch": 0.2421679800115318, "step": 270 }, { "loss": 379.1783, "grad_norm": 52.10798263549805, "learning_rate": 0.0004304758382509849, "epoch": 0.24306489845601897, "step": 271 }, { "loss": 379.3161, "grad_norm": 53.941673278808594, "learning_rate": 0.0004299872480490542, "epoch": 0.2439618169005061, "step": 272 }, { "loss": 379.5319, "grad_norm": 53.70753860473633, "learning_rate": 0.00042949722641482383, "epoch": 0.24485873534499328, "step": 273 }, { "loss": 379.6953, "grad_norm": 61.60326385498047, "learning_rate": 0.0004290057772454187, "epoch": 0.24575565378948042, "step": 274 }, { "loss": 379.7555, "grad_norm": 57.09893798828125, "learning_rate": 0.0004285129044493169, "epoch": 0.24665257223396758, "step": 275 }, { "loss": 381.1754, "grad_norm": 60.31880187988281, "learning_rate": 0.0004280186119463186, "epoch": 0.24754949067845475, "step": 276 }, { "loss": 379.8077, "grad_norm": 57.53593826293945, "learning_rate": 0.0004275229036675148, "epoch": 0.24844640912294189, "step": 277 }, { "loss": 381.0815, "grad_norm": 56.55409240722656, "learning_rate": 0.00042702578355525615, "epoch": 0.24934332756742905, "step": 278 }, { "loss": 378.2445, "grad_norm": 50.37730026245117, "learning_rate": 0.00042652725556312156, "epoch": 0.2502402460119162, "step": 279 }, { "loss": 376.4951, "grad_norm": 50.24005889892578, "learning_rate": 0.0004260273236558867, "epoch": 0.2511371644564034, "step": 280 }, { "loss": 379.3927, "grad_norm": 52.99737548828125, "learning_rate": 0.0004255259918094926, "epoch": 0.2520340829008905, "step": 281 }, { "loss": 379.7873, "grad_norm": 53.95462417602539, "learning_rate": 0.00042502326401101386, "epoch": 0.25293100134537766, "step": 282 }, { "loss": 370.9284, "grad_norm": 51.21118927001953, "learning_rate": 0.0004245191442586273, "epoch": 0.2538279197898648, "step": 283 }, { "loss": 374.7379, "grad_norm": 53.918975830078125, "learning_rate": 0.00042401363656157954, "epoch": 0.254724838234352, "step": 284 }, { "loss": 373.7905, "grad_norm": 51.7956428527832, "learning_rate": 0.00042350674494015566, "epoch": 0.25562175667883913, "step": 285 }, { "loss": 376.9342, "grad_norm": 51.80348205566406, "learning_rate": 0.0004229984734256471, "epoch": 0.25651867512332627, "step": 286 }, { "loss": 378.537, "grad_norm": 53.50684356689453, "learning_rate": 0.0004224888260603195, "epoch": 0.25741559356781346, "step": 287 }, { "loss": 374.9467, "grad_norm": 52.037200927734375, "learning_rate": 0.0004219778068973804, "epoch": 0.2583125120123006, "step": 288 }, { "loss": 382.1371, "grad_norm": 48.98027420043945, "learning_rate": 0.0004214654200009475, "epoch": 0.25920943045678774, "step": 289 }, { "loss": 378.7361, "grad_norm": 51.1038818359375, "learning_rate": 0.0004209516694460157, "epoch": 0.26010634890127493, "step": 290 }, { "loss": 379.9825, "grad_norm": 53.03129577636719, "learning_rate": 0.0004204365593184255, "epoch": 0.26100326734576207, "step": 291 }, { "loss": 376.35, "grad_norm": 54.52887725830078, "learning_rate": 0.0004199200937148297, "epoch": 0.2619001857902492, "step": 292 }, { "loss": 376.654, "grad_norm": 51.10536575317383, "learning_rate": 0.00041940227674266105, "epoch": 0.26279710423473635, "step": 293 }, { "loss": 372.8873, "grad_norm": 57.231117248535156, "learning_rate": 0.0004188831125201, "epoch": 0.26369402267922354, "step": 294 }, { "loss": 372.2591, "grad_norm": 54.170921325683594, "learning_rate": 0.0004183626051760415, "epoch": 0.2645909411237107, "step": 295 }, { "loss": 376.232, "grad_norm": 48.81595230102539, "learning_rate": 0.0004178407588500621, "epoch": 0.2654878595681978, "step": 296 }, { "loss": 377.493, "grad_norm": 51.22395324707031, "learning_rate": 0.00041731757769238764, "epoch": 0.266384778012685, "step": 297 }, { "loss": 373.4135, "grad_norm": 50.80076217651367, "learning_rate": 0.00041679306586385944, "epoch": 0.26728169645717215, "step": 298 }, { "loss": 373.3929, "grad_norm": 52.78483581542969, "learning_rate": 0.00041626722753590185, "epoch": 0.2681786149016593, "step": 299 }, { "loss": 374.4973, "grad_norm": 59.0179328918457, "learning_rate": 0.0004157400668904887, "epoch": 0.2690755333461465, "step": 300 }, { "eval_loss": 1.6736700534820557, "eval_runtime": 48.4303, "eval_samples_per_second": 42.288, "eval_steps_per_second": 2.643, "epoch": 0.2690755333461465, "step": 300 }, { "loss": 370.586, "grad_norm": 51.39365005493164, "learning_rate": 0.0004152115881201102, "epoch": 0.2699724517906336, "step": 301 }, { "loss": 371.1306, "grad_norm": 53.13943862915039, "learning_rate": 0.0004146817954277395, "epoch": 0.27086937023512075, "step": 302 }, { "loss": 375.8091, "grad_norm": 46.9393310546875, "learning_rate": 0.0004141506930267995, "epoch": 0.2717662886796079, "step": 303 }, { "loss": 378.5063, "grad_norm": 56.166954040527344, "learning_rate": 0.00041361828514112884, "epoch": 0.2726632071240951, "step": 304 }, { "loss": 372.5772, "grad_norm": 52.24879455566406, "learning_rate": 0.00041308457600494917, "epoch": 0.2735601255685822, "step": 305 }, { "loss": 371.29, "grad_norm": 53.966949462890625, "learning_rate": 0.00041254956986283044, "epoch": 0.27445704401306936, "step": 306 }, { "loss": 376.5358, "grad_norm": 51.999046325683594, "learning_rate": 0.0004120132709696578, "epoch": 0.27535396245755656, "step": 307 }, { "loss": 377.9629, "grad_norm": 53.83307647705078, "learning_rate": 0.0004114756835905976, "epoch": 0.2762508809020437, "step": 308 }, { "loss": 372.8809, "grad_norm": 55.104217529296875, "learning_rate": 0.0004109368120010636, "epoch": 0.27714779934653083, "step": 309 }, { "loss": 377.9377, "grad_norm": 51.1360969543457, "learning_rate": 0.00041039666048668265, "epoch": 0.278044717791018, "step": 310 }, { "loss": 377.1788, "grad_norm": 50.87997817993164, "learning_rate": 0.00040985523334326093, "epoch": 0.27894163623550516, "step": 311 }, { "loss": 375.3121, "grad_norm": 49.86625289916992, "learning_rate": 0.00040931253487674955, "epoch": 0.2798385546799923, "step": 312 }, { "loss": 373.2664, "grad_norm": 51.52640151977539, "learning_rate": 0.00040876856940321056, "epoch": 0.28073547312447944, "step": 313 }, { "loss": 373.2856, "grad_norm": 49.00104904174805, "learning_rate": 0.00040822334124878236, "epoch": 0.28163239156896663, "step": 314 }, { "loss": 377.6501, "grad_norm": 52.83418655395508, "learning_rate": 0.00040767685474964535, "epoch": 0.28252931001345377, "step": 315 }, { "loss": 370.6684, "grad_norm": 49.96600341796875, "learning_rate": 0.00040712911425198764, "epoch": 0.2834262284579409, "step": 316 }, { "loss": 376.3713, "grad_norm": 50.470123291015625, "learning_rate": 0.0004065801241119702, "epoch": 0.2843231469024281, "step": 317 }, { "loss": 374.6679, "grad_norm": 47.91783142089844, "learning_rate": 0.0004060298886956926, "epoch": 0.28522006534691524, "step": 318 }, { "loss": 376.8799, "grad_norm": 52.6668586730957, "learning_rate": 0.0004054784123791577, "epoch": 0.2861169837914024, "step": 319 }, { "loss": 371.9651, "grad_norm": 50.082279205322266, "learning_rate": 0.00040492569954823763, "epoch": 0.2870139022358896, "step": 320 }, { "loss": 373.8972, "grad_norm": 56.001190185546875, "learning_rate": 0.0004043717545986381, "epoch": 0.2879108206803767, "step": 321 }, { "loss": 370.1523, "grad_norm": 53.00112533569336, "learning_rate": 0.0004038165819358639, "epoch": 0.28880773912486385, "step": 322 }, { "loss": 377.1375, "grad_norm": 52.706729888916016, "learning_rate": 0.0004032601859751839, "epoch": 0.28970465756935104, "step": 323 }, { "loss": 375.1089, "grad_norm": 51.362571716308594, "learning_rate": 0.00040270257114159583, "epoch": 0.2906015760138382, "step": 324 }, { "loss": 370.7276, "grad_norm": 54.43815994262695, "learning_rate": 0.00040214374186979074, "epoch": 0.2914984944583253, "step": 325 }, { "loss": 375.119, "grad_norm": 51.00381851196289, "learning_rate": 0.0004015837026041186, "epoch": 0.29239541290281246, "step": 326 }, { "loss": 371.2367, "grad_norm": 57.776222229003906, "learning_rate": 0.000401022457798552, "epoch": 0.29329233134729965, "step": 327 }, { "loss": 380.1667, "grad_norm": 53.284149169921875, "learning_rate": 0.0004004600119166513, "epoch": 0.2941892497917868, "step": 328 }, { "loss": 369.6853, "grad_norm": 56.30731964111328, "learning_rate": 0.000399896369431529, "epoch": 0.2950861682362739, "step": 329 }, { "loss": 374.0436, "grad_norm": 54.28211975097656, "learning_rate": 0.00039933153482581406, "epoch": 0.2959830866807611, "step": 330 }, { "loss": 372.2117, "grad_norm": 50.88725280761719, "learning_rate": 0.00039876551259161643, "epoch": 0.29688000512524826, "step": 331 }, { "loss": 374.7655, "grad_norm": 54.17941665649414, "learning_rate": 0.00039819830723049105, "epoch": 0.2977769235697354, "step": 332 }, { "loss": 376.0198, "grad_norm": 52.40755081176758, "learning_rate": 0.0003976299232534024, "epoch": 0.2986738420142226, "step": 333 }, { "loss": 371.5096, "grad_norm": 50.74897384643555, "learning_rate": 0.0003970603651806886, "epoch": 0.29957076045870973, "step": 334 }, { "loss": 375.5447, "grad_norm": 47.52690124511719, "learning_rate": 0.00039648963754202496, "epoch": 0.30046767890319687, "step": 335 }, { "loss": 376.1951, "grad_norm": 52.93135070800781, "learning_rate": 0.0003959177448763883, "epoch": 0.301364597347684, "step": 336 }, { "loss": 371.1348, "grad_norm": 50.335418701171875, "learning_rate": 0.0003953446917320214, "epoch": 0.3022615157921712, "step": 337 }, { "loss": 375.4595, "grad_norm": 51.26169204711914, "learning_rate": 0.0003947704826663955, "epoch": 0.30315843423665834, "step": 338 }, { "loss": 372.898, "grad_norm": 54.89933776855469, "learning_rate": 0.0003941951222461756, "epoch": 0.3040553526811455, "step": 339 }, { "loss": 370.8462, "grad_norm": 54.09654235839844, "learning_rate": 0.00039361861504718276, "epoch": 0.30495227112563267, "step": 340 }, { "loss": 373.6092, "grad_norm": 52.41168975830078, "learning_rate": 0.0003930409656543588, "epoch": 0.3058491895701198, "step": 341 }, { "loss": 374.9025, "grad_norm": 45.53563690185547, "learning_rate": 0.00039246217866172907, "epoch": 0.30674610801460694, "step": 342 }, { "loss": 376.0628, "grad_norm": 51.11941146850586, "learning_rate": 0.00039188225867236643, "epoch": 0.30764302645909414, "step": 343 }, { "loss": 374.4197, "grad_norm": 50.10179901123047, "learning_rate": 0.0003913012102983542, "epoch": 0.3085399449035813, "step": 344 }, { "loss": 370.0171, "grad_norm": 50.524696350097656, "learning_rate": 0.00039071903816074977, "epoch": 0.3094368633480684, "step": 345 }, { "loss": 371.2375, "grad_norm": 51.18245315551758, "learning_rate": 0.00039013574688954793, "epoch": 0.31033378179255555, "step": 346 }, { "loss": 374.7748, "grad_norm": 64.64472198486328, "learning_rate": 0.0003895513411236438, "epoch": 0.31123070023704275, "step": 347 }, { "loss": 377.3275, "grad_norm": 56.01545715332031, "learning_rate": 0.0003889658255107959, "epoch": 0.3121276186815299, "step": 348 }, { "loss": 369.5843, "grad_norm": 56.439754486083984, "learning_rate": 0.0003883792047075896, "epoch": 0.313024537126017, "step": 349 }, { "loss": 368.456, "grad_norm": 58.23375701904297, "learning_rate": 0.0003877914833793996, "epoch": 0.3139214555705042, "step": 350 }, { "eval_loss": 1.661989450454712, "eval_runtime": 36.2255, "eval_samples_per_second": 56.535, "eval_steps_per_second": 3.533, "epoch": 0.3139214555705042, "step": 350 }, { "loss": 374.9042, "grad_norm": 52.63510513305664, "learning_rate": 0.00038720266620035314, "epoch": 0.31481837401499135, "step": 351 }, { "loss": 367.9091, "grad_norm": 55.49558639526367, "learning_rate": 0.0003866127578532927, "epoch": 0.3157152924594785, "step": 352 }, { "loss": 374.5601, "grad_norm": 52.941497802734375, "learning_rate": 0.0003860217630297387, "epoch": 0.3166122109039657, "step": 353 }, { "loss": 371.4058, "grad_norm": 44.237648010253906, "learning_rate": 0.0003854296864298523, "epoch": 0.3175091293484528, "step": 354 }, { "loss": 376.094, "grad_norm": 52.86402893066406, "learning_rate": 0.00038483653276239816, "epoch": 0.31840604779293996, "step": 355 }, { "loss": 374.3872, "grad_norm": 49.61796569824219, "learning_rate": 0.0003842423067447066, "epoch": 0.3193029662374271, "step": 356 }, { "loss": 371.5387, "grad_norm": 49.825504302978516, "learning_rate": 0.0003836470131026365, "epoch": 0.3201998846819143, "step": 357 }, { "loss": 371.4422, "grad_norm": 53.598228454589844, "learning_rate": 0.0003830506565705372, "epoch": 0.32109680312640143, "step": 358 }, { "loss": 371.03, "grad_norm": 48.73537063598633, "learning_rate": 0.00038245324189121153, "epoch": 0.32199372157088857, "step": 359 }, { "loss": 377.8967, "grad_norm": 48.377281188964844, "learning_rate": 0.00038185477381587763, "epoch": 0.32289064001537576, "step": 360 }, { "loss": 374.9411, "grad_norm": 53.932228088378906, "learning_rate": 0.0003812552571041311, "epoch": 0.3237875584598629, "step": 361 }, { "loss": 374.6432, "grad_norm": 52.54889678955078, "learning_rate": 0.00038065469652390736, "epoch": 0.32468447690435004, "step": 362 }, { "loss": 371.9634, "grad_norm": 53.84141159057617, "learning_rate": 0.000380053096851444, "epoch": 0.32558139534883723, "step": 363 }, { "loss": 371.487, "grad_norm": 49.041019439697266, "learning_rate": 0.00037945046287124197, "epoch": 0.32647831379332437, "step": 364 }, { "loss": 370.3628, "grad_norm": 51.356388092041016, "learning_rate": 0.00037884679937602827, "epoch": 0.3273752322378115, "step": 365 }, { "loss": 371.4878, "grad_norm": 49.55571746826172, "learning_rate": 0.0003782421111667178, "epoch": 0.32827215068229865, "step": 366 }, { "loss": 373.209, "grad_norm": 51.30101013183594, "learning_rate": 0.00037763640305237456, "epoch": 0.32916906912678584, "step": 367 }, { "loss": 369.0127, "grad_norm": 51.14597702026367, "learning_rate": 0.000377029679850174, "epoch": 0.330065987571273, "step": 368 }, { "loss": 374.4203, "grad_norm": 51.925132751464844, "learning_rate": 0.00037642194638536487, "epoch": 0.3309629060157601, "step": 369 }, { "loss": 370.4622, "grad_norm": 53.620052337646484, "learning_rate": 0.00037581320749123, "epoch": 0.3318598244602473, "step": 370 }, { "loss": 369.0265, "grad_norm": 47.18992233276367, "learning_rate": 0.0003752034680090485, "epoch": 0.33275674290473445, "step": 371 }, { "loss": 372.8077, "grad_norm": 56.7562141418457, "learning_rate": 0.0003745927327880574, "epoch": 0.3336536613492216, "step": 372 }, { "loss": 368.2184, "grad_norm": 56.05765914916992, "learning_rate": 0.00037398100668541227, "epoch": 0.3345505797937088, "step": 373 }, { "loss": 376.1522, "grad_norm": 50.888771057128906, "learning_rate": 0.00037336829456614975, "epoch": 0.3354474982381959, "step": 374 }, { "loss": 371.1161, "grad_norm": 49.758975982666016, "learning_rate": 0.0003727546013031478, "epoch": 0.33634441668268306, "step": 375 }, { "loss": 371.6988, "grad_norm": 53.891990661621094, "learning_rate": 0.00037213993177708746, "epoch": 0.33724133512717025, "step": 376 }, { "loss": 370.6019, "grad_norm": 50.557762145996094, "learning_rate": 0.000371524290876414, "epoch": 0.3381382535716574, "step": 377 }, { "loss": 373.2912, "grad_norm": 51.6466064453125, "learning_rate": 0.00037090768349729833, "epoch": 0.3390351720161445, "step": 378 }, { "loss": 372.9784, "grad_norm": 48.213077545166016, "learning_rate": 0.00037029011454359695, "epoch": 0.33993209046063166, "step": 379 }, { "loss": 368.0577, "grad_norm": 49.39459991455078, "learning_rate": 0.0003696715889268145, "epoch": 0.34082900890511886, "step": 380 }, { "loss": 371.9662, "grad_norm": 49.54859924316406, "learning_rate": 0.00036905211156606344, "epoch": 0.341725927349606, "step": 381 }, { "loss": 376.1466, "grad_norm": 54.29618835449219, "learning_rate": 0.00036843168738802574, "epoch": 0.34262284579409313, "step": 382 }, { "loss": 372.8206, "grad_norm": 47.55562210083008, "learning_rate": 0.00036781032132691304, "epoch": 0.3435197642385803, "step": 383 }, { "loss": 370.9735, "grad_norm": 49.289615631103516, "learning_rate": 0.00036718801832442814, "epoch": 0.34441668268306747, "step": 384 }, { "loss": 370.5686, "grad_norm": 50.339176177978516, "learning_rate": 0.000366564783329725, "epoch": 0.3453136011275546, "step": 385 }, { "loss": 371.3257, "grad_norm": 49.51339340209961, "learning_rate": 0.00036594062129936974, "epoch": 0.3462105195720418, "step": 386 }, { "loss": 366.3475, "grad_norm": 48.21767044067383, "learning_rate": 0.0003653155371973012, "epoch": 0.34710743801652894, "step": 387 }, { "loss": 369.8744, "grad_norm": 52.45291519165039, "learning_rate": 0.0003646895359947915, "epoch": 0.3480043564610161, "step": 388 }, { "loss": 372.5318, "grad_norm": 49.45993423461914, "learning_rate": 0.00036406262267040624, "epoch": 0.3489012749055032, "step": 389 }, { "loss": 369.184, "grad_norm": 48.8317756652832, "learning_rate": 0.0003634348022099652, "epoch": 0.3497981933499904, "step": 390 }, { "loss": 373.9739, "grad_norm": 50.6275634765625, "learning_rate": 0.0003628060796065027, "epoch": 0.35069511179447754, "step": 391 }, { "loss": 372.0473, "grad_norm": 48.547447204589844, "learning_rate": 0.00036217645986022756, "epoch": 0.3515920302389647, "step": 392 }, { "loss": 364.9705, "grad_norm": 48.18462371826172, "learning_rate": 0.0003615459479784837, "epoch": 0.3524889486834519, "step": 393 }, { "loss": 369.6471, "grad_norm": 46.10414123535156, "learning_rate": 0.0003609145489757101, "epoch": 0.353385867127939, "step": 394 }, { "loss": 371.7173, "grad_norm": 46.38992691040039, "learning_rate": 0.0003602822678734008, "epoch": 0.35428278557242615, "step": 395 }, { "loss": 367.3975, "grad_norm": 45.87107467651367, "learning_rate": 0.00035964910970006557, "epoch": 0.35517970401691334, "step": 396 }, { "loss": 371.2871, "grad_norm": 46.54446029663086, "learning_rate": 0.00035901507949118915, "epoch": 0.3560766224614005, "step": 397 }, { "loss": 368.7915, "grad_norm": 45.7996826171875, "learning_rate": 0.0003583801822891917, "epoch": 0.3569735409058876, "step": 398 }, { "loss": 371.0395, "grad_norm": 48.34632873535156, "learning_rate": 0.0003577444231433885, "epoch": 0.35787045935037476, "step": 399 }, { "loss": 374.4672, "grad_norm": 48.63014221191406, "learning_rate": 0.00035710780710994985, "epoch": 0.35876737779486195, "step": 400 }, { "eval_loss": 1.6527702808380127, "eval_runtime": 51.2432, "eval_samples_per_second": 39.966, "eval_steps_per_second": 2.498, "epoch": 0.35876737779486195, "step": 400 }, { "loss": 369.2286, "grad_norm": 50.575950622558594, "learning_rate": 0.00035647033925186066, "epoch": 0.3596642962393491, "step": 401 }, { "loss": 366.6179, "grad_norm": 50.074954986572266, "learning_rate": 0.0003558320246388808, "epoch": 0.36056121468383623, "step": 402 }, { "loss": 370.1017, "grad_norm": 51.92937088012695, "learning_rate": 0.00035519286834750403, "epoch": 0.3614581331283234, "step": 403 }, { "loss": 366.74, "grad_norm": 52.75185775756836, "learning_rate": 0.00035455287546091785, "epoch": 0.36235505157281056, "step": 404 }, { "loss": 369.307, "grad_norm": 50.451271057128906, "learning_rate": 0.0003539120510689636, "epoch": 0.3632519700172977, "step": 405 }, { "loss": 374.2456, "grad_norm": 56.06875228881836, "learning_rate": 0.0003532704002680951, "epoch": 0.3641488884617849, "step": 406 }, { "loss": 371.9364, "grad_norm": 49.18859100341797, "learning_rate": 0.0003526279281613388, "epoch": 0.36504580690627203, "step": 407 }, { "loss": 375.3452, "grad_norm": 60.49544143676758, "learning_rate": 0.00035198463985825303, "epoch": 0.36594272535075917, "step": 408 }, { "loss": 364.7332, "grad_norm": 55.390960693359375, "learning_rate": 0.0003513405404748872, "epoch": 0.3668396437952463, "step": 409 }, { "loss": 367.328, "grad_norm": 45.79146194458008, "learning_rate": 0.00035069563513374105, "epoch": 0.3677365622397335, "step": 410 }, { "loss": 372.7194, "grad_norm": 50.601531982421875, "learning_rate": 0.0003500499289637243, "epoch": 0.36863348068422064, "step": 411 }, { "loss": 373.3177, "grad_norm": 58.5416374206543, "learning_rate": 0.0003494034271001158, "epoch": 0.3695303991287078, "step": 412 }, { "loss": 367.5529, "grad_norm": 48.93236541748047, "learning_rate": 0.00034875613468452203, "epoch": 0.37042731757319497, "step": 413 }, { "loss": 368.6186, "grad_norm": 49.043251037597656, "learning_rate": 0.00034810805686483713, "epoch": 0.3713242360176821, "step": 414 }, { "loss": 363.3611, "grad_norm": 48.577144622802734, "learning_rate": 0.0003474591987952013, "epoch": 0.37222115446216925, "step": 415 }, { "loss": 368.0312, "grad_norm": 48.73127746582031, "learning_rate": 0.0003468095656359601, "epoch": 0.37311807290665644, "step": 416 }, { "loss": 367.3114, "grad_norm": 51.46812057495117, "learning_rate": 0.0003461591625536234, "epoch": 0.3740149913511436, "step": 417 }, { "loss": 375.6931, "grad_norm": 49.236141204833984, "learning_rate": 0.0003455079947208242, "epoch": 0.3749119097956307, "step": 418 }, { "loss": 365.6711, "grad_norm": 48.81379318237305, "learning_rate": 0.00034485606731627755, "epoch": 0.37580882824011785, "step": 419 }, { "loss": 364.9393, "grad_norm": 51.185340881347656, "learning_rate": 0.0003442033855247394, "epoch": 0.37670574668460505, "step": 420 }, { "loss": 369.8553, "grad_norm": 53.58812713623047, "learning_rate": 0.000343549954536965, "epoch": 0.3776026651290922, "step": 421 }, { "loss": 372.3922, "grad_norm": 51.472042083740234, "learning_rate": 0.0003428957795496685, "epoch": 0.3784995835735793, "step": 422 }, { "loss": 371.9807, "grad_norm": 54.97187805175781, "learning_rate": 0.0003422408657654805, "epoch": 0.3793965020180665, "step": 423 }, { "loss": 370.048, "grad_norm": 54.97746276855469, "learning_rate": 0.0003415852183929077, "epoch": 0.38029342046255366, "step": 424 }, { "loss": 370.0667, "grad_norm": 46.41242980957031, "learning_rate": 0.0003409288426462904, "epoch": 0.3811903389070408, "step": 425 }, { "loss": 366.4669, "grad_norm": 51.722904205322266, "learning_rate": 0.0003402717437457624, "epoch": 0.382087257351528, "step": 426 }, { "loss": 367.8651, "grad_norm": 51.60542678833008, "learning_rate": 0.00033961392691720803, "epoch": 0.3829841757960151, "step": 427 }, { "loss": 364.8575, "grad_norm": 46.896331787109375, "learning_rate": 0.0003389553973922217, "epoch": 0.38388109424050226, "step": 428 }, { "loss": 366.1106, "grad_norm": 47.48381042480469, "learning_rate": 0.00033829616040806566, "epoch": 0.38477801268498946, "step": 429 }, { "loss": 369.6983, "grad_norm": 47.15787124633789, "learning_rate": 0.0003376362212076287, "epoch": 0.3856749311294766, "step": 430 }, { "loss": 372.8012, "grad_norm": 49.67255401611328, "learning_rate": 0.0003369755850393841, "epoch": 0.38657184957396373, "step": 431 }, { "loss": 369.0824, "grad_norm": 50.87350082397461, "learning_rate": 0.0003363142571573484, "epoch": 0.38746876801845087, "step": 432 }, { "loss": 368.5385, "grad_norm": 52.32754135131836, "learning_rate": 0.0003356522428210391, "epoch": 0.38836568646293806, "step": 433 }, { "loss": 370.1974, "grad_norm": 46.638084411621094, "learning_rate": 0.0003349895472954331, "epoch": 0.3892626049074252, "step": 434 }, { "loss": 367.2549, "grad_norm": 51.39384460449219, "learning_rate": 0.00033432617585092467, "epoch": 0.39015952335191234, "step": 435 }, { "loss": 368.2899, "grad_norm": 49.1676139831543, "learning_rate": 0.00033366213376328396, "epoch": 0.39105644179639953, "step": 436 }, { "loss": 372.2977, "grad_norm": 51.6141242980957, "learning_rate": 0.0003329974263136144, "epoch": 0.3919533602408867, "step": 437 }, { "loss": 368.3735, "grad_norm": 49.94230270385742, "learning_rate": 0.0003323320587883111, "epoch": 0.3928502786853738, "step": 438 }, { "loss": 370.6481, "grad_norm": 49.947837829589844, "learning_rate": 0.0003316660364790188, "epoch": 0.393747197129861, "step": 439 }, { "loss": 369.6432, "grad_norm": 48.53517532348633, "learning_rate": 0.0003309993646825896, "epoch": 0.39464411557434814, "step": 440 }, { "loss": 366.7539, "grad_norm": 50.93443298339844, "learning_rate": 0.00033033204870104116, "epoch": 0.3955410340188353, "step": 441 }, { "loss": 367.3075, "grad_norm": 49.63651657104492, "learning_rate": 0.000329664093841514, "epoch": 0.3964379524633224, "step": 442 }, { "loss": 369.597, "grad_norm": 48.85470962524414, "learning_rate": 0.00032899550541623, "epoch": 0.3973348709078096, "step": 443 }, { "loss": 366.1455, "grad_norm": 49.675559997558594, "learning_rate": 0.0003283262887424494, "epoch": 0.39823178935229675, "step": 444 }, { "loss": 362.2254, "grad_norm": 48.583370208740234, "learning_rate": 0.0003276564491424292, "epoch": 0.3991287077967839, "step": 445 }, { "loss": 372.5689, "grad_norm": 50.507293701171875, "learning_rate": 0.0003269859919433802, "epoch": 0.4000256262412711, "step": 446 }, { "loss": 366.7801, "grad_norm": 50.75261688232422, "learning_rate": 0.0003263149224774251, "epoch": 0.4009225446857582, "step": 447 }, { "loss": 369.5224, "grad_norm": 49.42384719848633, "learning_rate": 0.00032564324608155604, "epoch": 0.40181946313024536, "step": 448 }, { "loss": 369.6519, "grad_norm": 49.12044143676758, "learning_rate": 0.00032497096809759184, "epoch": 0.40271638157473255, "step": 449 }, { "loss": 370.9763, "grad_norm": 53.04697036743164, "learning_rate": 0.0003242980938721359, "epoch": 0.4036133000192197, "step": 450 }, { "eval_loss": 1.6399173736572266, "eval_runtime": 36.1587, "eval_samples_per_second": 56.639, "eval_steps_per_second": 3.54, "epoch": 0.4036133000192197, "step": 450 }, { "loss": 367.9265, "grad_norm": 52.0450553894043, "learning_rate": 0.00032362462875653355, "epoch": 0.4045102184637068, "step": 451 }, { "loss": 372.4974, "grad_norm": 48.33359146118164, "learning_rate": 0.0003229505781068291, "epoch": 0.40540713690819397, "step": 452 }, { "loss": 366.6081, "grad_norm": 49.462974548339844, "learning_rate": 0.00032227594728372397, "epoch": 0.40630405535268116, "step": 453 }, { "loss": 366.3152, "grad_norm": 48.31398391723633, "learning_rate": 0.0003216007416525335, "epoch": 0.4072009737971683, "step": 454 }, { "loss": 369.983, "grad_norm": 47.523338317871094, "learning_rate": 0.0003209249665831445, "epoch": 0.40809789224165544, "step": 455 }, { "loss": 366.8036, "grad_norm": 45.295806884765625, "learning_rate": 0.00032024862744997265, "epoch": 0.40899481068614263, "step": 456 }, { "loss": 366.4848, "grad_norm": 49.89873504638672, "learning_rate": 0.0003195717296319193, "epoch": 0.40989172913062977, "step": 457 }, { "loss": 365.4414, "grad_norm": 46.948055267333984, "learning_rate": 0.00031889427851232915, "epoch": 0.4107886475751169, "step": 458 }, { "loss": 369.7285, "grad_norm": 48.40359115600586, "learning_rate": 0.0003182162794789474, "epoch": 0.4116855660196041, "step": 459 }, { "loss": 370.345, "grad_norm": 48.55045700073242, "learning_rate": 0.0003175377379238767, "epoch": 0.41258248446409124, "step": 460 }, { "loss": 366.95, "grad_norm": 47.37104415893555, "learning_rate": 0.0003168586592435341, "epoch": 0.4134794029085784, "step": 461 }, { "loss": 370.2368, "grad_norm": 51.285888671875, "learning_rate": 0.00031617904883860903, "epoch": 0.4143763213530655, "step": 462 }, { "loss": 365.4067, "grad_norm": 50.595340728759766, "learning_rate": 0.000315498912114019, "epoch": 0.4152732397975527, "step": 463 }, { "loss": 366.4186, "grad_norm": 45.943519592285156, "learning_rate": 0.0003148182544788678, "epoch": 0.41617015824203984, "step": 464 }, { "loss": 362.8856, "grad_norm": 52.45280075073242, "learning_rate": 0.0003141370813464018, "epoch": 0.417067076686527, "step": 465 }, { "loss": 366.827, "grad_norm": 47.95954132080078, "learning_rate": 0.0003134553981339672, "epoch": 0.4179639951310142, "step": 466 }, { "loss": 370.8824, "grad_norm": 51.57919692993164, "learning_rate": 0.00031277321026296657, "epoch": 0.4188609135755013, "step": 467 }, { "loss": 368.826, "grad_norm": 51.78611755371094, "learning_rate": 0.0003120905231588164, "epoch": 0.41975783201998845, "step": 468 }, { "loss": 369.1159, "grad_norm": 46.962074279785156, "learning_rate": 0.0003114073422509034, "epoch": 0.42065475046447565, "step": 469 }, { "loss": 361.8488, "grad_norm": 46.85802459716797, "learning_rate": 0.0003107236729725414, "epoch": 0.4215516689089628, "step": 470 }, { "loss": 367.4666, "grad_norm": 54.017906188964844, "learning_rate": 0.0003100395207609284, "epoch": 0.4224485873534499, "step": 471 }, { "loss": 366.9775, "grad_norm": 53.34091567993164, "learning_rate": 0.000309354891057103, "epoch": 0.42334550579793706, "step": 472 }, { "loss": 366.0834, "grad_norm": 47.76055908203125, "learning_rate": 0.00030866978930590126, "epoch": 0.42424242424242425, "step": 473 }, { "loss": 368.5773, "grad_norm": 49.945613861083984, "learning_rate": 0.00030798422095591364, "epoch": 0.4251393426869114, "step": 474 }, { "loss": 363.8445, "grad_norm": 48.995609283447266, "learning_rate": 0.00030729819145944114, "epoch": 0.42603626113139853, "step": 475 }, { "loss": 362.6448, "grad_norm": 45.06385040283203, "learning_rate": 0.00030661170627245256, "epoch": 0.4269331795758857, "step": 476 }, { "loss": 364.0858, "grad_norm": 49.73957061767578, "learning_rate": 0.00030592477085454047, "epoch": 0.42783009802037286, "step": 477 }, { "loss": 371.1085, "grad_norm": 49.45321273803711, "learning_rate": 0.00030523739066887836, "epoch": 0.42872701646486, "step": 478 }, { "loss": 363.6934, "grad_norm": 49.325355529785156, "learning_rate": 0.00030454957118217674, "epoch": 0.4296239349093472, "step": 479 }, { "loss": 368.4297, "grad_norm": 47.509742736816406, "learning_rate": 0.0003038613178646401, "epoch": 0.43052085335383433, "step": 480 }, { "loss": 366.2455, "grad_norm": 48.50214767456055, "learning_rate": 0.000303172636189923, "epoch": 0.43141777179832147, "step": 481 }, { "loss": 362.4247, "grad_norm": 46.59059143066406, "learning_rate": 0.00030248353163508674, "epoch": 0.43231469024280866, "step": 482 }, { "loss": 368.7481, "grad_norm": 47.74319839477539, "learning_rate": 0.0003017940096805557, "epoch": 0.4332116086872958, "step": 483 }, { "loss": 365.7433, "grad_norm": 53.59490203857422, "learning_rate": 0.0003011040758100741, "epoch": 0.43410852713178294, "step": 484 }, { "loss": 366.9239, "grad_norm": 49.87615966796875, "learning_rate": 0.00030041373551066173, "epoch": 0.4350054455762701, "step": 485 }, { "loss": 360.9555, "grad_norm": 44.795536041259766, "learning_rate": 0.0002997229942725711, "epoch": 0.43590236402075727, "step": 486 }, { "loss": 370.6934, "grad_norm": 56.454227447509766, "learning_rate": 0.000299031857589243, "epoch": 0.4367992824652444, "step": 487 }, { "loss": 369.9133, "grad_norm": 48.472312927246094, "learning_rate": 0.00029834033095726335, "epoch": 0.43769620090973155, "step": 488 }, { "loss": 361.5723, "grad_norm": 51.665260314941406, "learning_rate": 0.00029764841987631933, "epoch": 0.43859311935421874, "step": 489 }, { "loss": 366.223, "grad_norm": 51.25084686279297, "learning_rate": 0.0002969561298491557, "epoch": 0.4394900377987059, "step": 490 }, { "loss": 367.7071, "grad_norm": 50.52541732788086, "learning_rate": 0.00029626346638153073, "epoch": 0.440386956243193, "step": 491 }, { "loss": 367.0807, "grad_norm": 50.71653366088867, "learning_rate": 0.0002955704349821729, "epoch": 0.4412838746876802, "step": 492 }, { "loss": 366.5776, "grad_norm": 44.603485107421875, "learning_rate": 0.0002948770411627367, "epoch": 0.44218079313216735, "step": 493 }, { "loss": 367.2019, "grad_norm": 49.68048858642578, "learning_rate": 0.0002941832904377589, "epoch": 0.4430777115766545, "step": 494 }, { "loss": 367.4325, "grad_norm": 56.277896881103516, "learning_rate": 0.000293489188324615, "epoch": 0.4439746300211416, "step": 495 }, { "loss": 369.3215, "grad_norm": 46.4665412902832, "learning_rate": 0.00029279474034347465, "epoch": 0.4448715484656288, "step": 496 }, { "loss": 368.6407, "grad_norm": 51.84563446044922, "learning_rate": 0.00029209995201725836, "epoch": 0.44576846691011596, "step": 497 }, { "loss": 366.8856, "grad_norm": 55.93694305419922, "learning_rate": 0.0002914048288715937, "epoch": 0.4466653853546031, "step": 498 }, { "loss": 367.8516, "grad_norm": 50.97298812866211, "learning_rate": 0.00029070937643477056, "epoch": 0.4475623037990903, "step": 499 }, { "loss": 364.7996, "grad_norm": 53.179847717285156, "learning_rate": 0.000290013600237698, "epoch": 0.4484592222435774, "step": 500 }, { "eval_loss": 1.6293703317642212, "eval_runtime": 47.4683, "eval_samples_per_second": 43.145, "eval_steps_per_second": 2.697, "epoch": 0.4484592222435774, "step": 500 }, { "loss": 364.7999, "grad_norm": 53.32307434082031, "learning_rate": 0.00028931750581385975, "epoch": 0.44935614068806456, "step": 501 }, { "loss": 368.2321, "grad_norm": 48.1343994140625, "learning_rate": 0.00028862109869927057, "epoch": 0.45025305913255176, "step": 502 }, { "loss": 363.4522, "grad_norm": 48.97591781616211, "learning_rate": 0.00028792438443243175, "epoch": 0.4511499775770389, "step": 503 }, { "loss": 367.3519, "grad_norm": 48.5214729309082, "learning_rate": 0.00028722736855428755, "epoch": 0.45204689602152603, "step": 504 }, { "loss": 366.9135, "grad_norm": 48.30058288574219, "learning_rate": 0.00028653005660818115, "epoch": 0.4529438144660132, "step": 505 }, { "loss": 365.4208, "grad_norm": 48.56584548950195, "learning_rate": 0.00028583245413980993, "epoch": 0.45384073291050037, "step": 506 }, { "loss": 366.6342, "grad_norm": 44.84033203125, "learning_rate": 0.0002851345666971819, "epoch": 0.4547376513549875, "step": 507 }, { "loss": 366.2589, "grad_norm": 46.03631591796875, "learning_rate": 0.0002844363998305717, "epoch": 0.45563456979947464, "step": 508 }, { "loss": 368.2724, "grad_norm": 52.3626708984375, "learning_rate": 0.0002837379590924759, "epoch": 0.45653148824396184, "step": 509 }, { "loss": 366.9325, "grad_norm": 42.26225280761719, "learning_rate": 0.0002830392500375694, "epoch": 0.457428406688449, "step": 510 }, { "loss": 363.1102, "grad_norm": 47.719661712646484, "learning_rate": 0.0002823402782226608, "epoch": 0.4583253251329361, "step": 511 }, { "loss": 369.943, "grad_norm": 48.35748291015625, "learning_rate": 0.00028164104920664864, "epoch": 0.4592222435774233, "step": 512 }, { "loss": 366.7622, "grad_norm": 47.81887435913086, "learning_rate": 0.00028094156855047687, "epoch": 0.46011916202191044, "step": 513 }, { "loss": 369.4684, "grad_norm": 51.35517883300781, "learning_rate": 0.0002802418418170908, "epoch": 0.4610160804663976, "step": 514 }, { "loss": 367.9245, "grad_norm": 52.903011322021484, "learning_rate": 0.0002795418745713925, "epoch": 0.4619129989108847, "step": 515 }, { "loss": 363.503, "grad_norm": 50.455223083496094, "learning_rate": 0.00027884167238019714, "epoch": 0.4628099173553719, "step": 516 }, { "loss": 361.0208, "grad_norm": 48.27017593383789, "learning_rate": 0.0002781412408121884, "epoch": 0.46370683579985905, "step": 517 }, { "loss": 364.5886, "grad_norm": 49.851619720458984, "learning_rate": 0.0002774405854378739, "epoch": 0.4646037542443462, "step": 518 }, { "loss": 359.5211, "grad_norm": 49.12308120727539, "learning_rate": 0.00027673971182954157, "epoch": 0.4655006726888334, "step": 519 }, { "loss": 366.8299, "grad_norm": 47.60043716430664, "learning_rate": 0.00027603862556121463, "epoch": 0.4663975911333205, "step": 520 }, { "loss": 368.2267, "grad_norm": 41.944801330566406, "learning_rate": 0.0002753373322086077, "epoch": 0.46729450957780766, "step": 521 }, { "loss": 368.1608, "grad_norm": 45.84396743774414, "learning_rate": 0.00027463583734908234, "epoch": 0.46819142802229485, "step": 522 }, { "loss": 359.4468, "grad_norm": 44.122989654541016, "learning_rate": 0.0002739341465616026, "epoch": 0.469088346466782, "step": 523 }, { "loss": 367.6043, "grad_norm": 44.97038269042969, "learning_rate": 0.000273232265426691, "epoch": 0.46998526491126913, "step": 524 }, { "loss": 367.8859, "grad_norm": 49.4835319519043, "learning_rate": 0.0002725301995263835, "epoch": 0.47088218335575627, "step": 525 }, { "loss": 365.9901, "grad_norm": 46.08525466918945, "learning_rate": 0.00027182795444418583, "epoch": 0.47177910180024346, "step": 526 }, { "loss": 362.7762, "grad_norm": 45.26884841918945, "learning_rate": 0.0002711255357650286, "epoch": 0.4726760202447306, "step": 527 }, { "loss": 363.5254, "grad_norm": 52.6630973815918, "learning_rate": 0.0002704229490752229, "epoch": 0.47357293868921774, "step": 528 }, { "loss": 362.2083, "grad_norm": 49.639488220214844, "learning_rate": 0.00026972019996241635, "epoch": 0.47446985713370493, "step": 529 }, { "loss": 370.2541, "grad_norm": 51.361610412597656, "learning_rate": 0.00026901729401554805, "epoch": 0.47536677557819207, "step": 530 }, { "loss": 364.9506, "grad_norm": 45.84967803955078, "learning_rate": 0.00026831423682480425, "epoch": 0.4762636940226792, "step": 531 }, { "loss": 373.7259, "grad_norm": 48.99913024902344, "learning_rate": 0.00026761103398157456, "epoch": 0.4771606124671664, "step": 532 }, { "loss": 367.0407, "grad_norm": 53.0494270324707, "learning_rate": 0.00026690769107840634, "epoch": 0.47805753091165354, "step": 533 }, { "loss": 366.3498, "grad_norm": 46.16975784301758, "learning_rate": 0.00026620421370896136, "epoch": 0.4789544493561407, "step": 534 }, { "loss": 363.5735, "grad_norm": 45.147125244140625, "learning_rate": 0.00026550060746797057, "epoch": 0.47985136780062787, "step": 535 }, { "loss": 362.9278, "grad_norm": 47.262821197509766, "learning_rate": 0.0002647968779511897, "epoch": 0.480748286245115, "step": 536 }, { "loss": 366.6017, "grad_norm": 49.1768913269043, "learning_rate": 0.00026409303075535504, "epoch": 0.48164520468960215, "step": 537 }, { "loss": 363.7893, "grad_norm": 47.41939163208008, "learning_rate": 0.00026338907147813894, "epoch": 0.4825421231340893, "step": 538 }, { "loss": 362.325, "grad_norm": 45.2095947265625, "learning_rate": 0.0002626850057181048, "epoch": 0.4834390415785765, "step": 539 }, { "loss": 368.0108, "grad_norm": 44.87570571899414, "learning_rate": 0.000261980839074663, "epoch": 0.4843359600230636, "step": 540 }, { "loss": 363.8844, "grad_norm": 44.87836456298828, "learning_rate": 0.0002612765771480264, "epoch": 0.48523287846755075, "step": 541 }, { "loss": 366.2256, "grad_norm": 52.47968292236328, "learning_rate": 0.00026057222553916545, "epoch": 0.48612979691203795, "step": 542 }, { "loss": 364.6898, "grad_norm": 49.18819808959961, "learning_rate": 0.0002598677898497638, "epoch": 0.4870267153565251, "step": 543 }, { "loss": 364.0697, "grad_norm": 47.542850494384766, "learning_rate": 0.00025916327568217416, "epoch": 0.4879236338010122, "step": 544 }, { "loss": 362.7703, "grad_norm": 44.471256256103516, "learning_rate": 0.0002584586886393729, "epoch": 0.4888205522454994, "step": 545 }, { "loss": 370.4043, "grad_norm": 46.374263763427734, "learning_rate": 0.0002577540343249162, "epoch": 0.48971747068998656, "step": 546 }, { "loss": 362.8738, "grad_norm": 44.021278381347656, "learning_rate": 0.0002570493183428952, "epoch": 0.4906143891344737, "step": 547 }, { "loss": 365.418, "grad_norm": 47.044212341308594, "learning_rate": 0.00025634454629789156, "epoch": 0.49151130757896083, "step": 548 }, { "loss": 363.5009, "grad_norm": 48.60353469848633, "learning_rate": 0.00025563972379493273, "epoch": 0.492408226023448, "step": 549 }, { "loss": 365.955, "grad_norm": 47.8569221496582, "learning_rate": 0.00025493485643944753, "epoch": 0.49330514446793516, "step": 550 }, { "eval_loss": 1.6247297525405884, "eval_runtime": 36.2552, "eval_samples_per_second": 56.488, "eval_steps_per_second": 3.531, "epoch": 0.49330514446793516, "step": 550 }, { "loss": 361.769, "grad_norm": 52.47264099121094, "learning_rate": 0.00025422994983722127, "epoch": 0.4942020629124223, "step": 551 }, { "loss": 369.0356, "grad_norm": 51.903358459472656, "learning_rate": 0.0002535250095943517, "epoch": 0.4950989813569095, "step": 552 }, { "loss": 362.5946, "grad_norm": 55.91824722290039, "learning_rate": 0.0002528200413172039, "epoch": 0.49599589980139663, "step": 553 }, { "loss": 364.1907, "grad_norm": 49.117069244384766, "learning_rate": 0.00025211505061236583, "epoch": 0.49689281824588377, "step": 554 }, { "loss": 363.2774, "grad_norm": 44.69606018066406, "learning_rate": 0.00025141004308660414, "epoch": 0.49778973669037097, "step": 555 }, { "loss": 363.2139, "grad_norm": 52.18587112426758, "learning_rate": 0.00025070502434681915, "epoch": 0.4986866551348581, "step": 556 }, { "loss": 365.6665, "grad_norm": 57.393428802490234, "learning_rate": 0.00025, "epoch": 0.49958357357934524, "step": 557 }, { "loss": 363.4536, "grad_norm": 52.89313507080078, "learning_rate": 0.0002492949756531809, "epoch": 0.5004804920238324, "step": 558 }, { "loss": 363.2097, "grad_norm": 51.265533447265625, "learning_rate": 0.00024858995691339587, "epoch": 0.5013774104683195, "step": 559 }, { "loss": 366.4611, "grad_norm": 56.473567962646484, "learning_rate": 0.0002478849493876342, "epoch": 0.5022743289128068, "step": 560 }, { "loss": 361.8987, "grad_norm": 49.68058776855469, "learning_rate": 0.0002471799586827962, "epoch": 0.5031712473572939, "step": 561 }, { "loss": 360.8694, "grad_norm": 42.74179458618164, "learning_rate": 0.00024647499040564844, "epoch": 0.504068165801781, "step": 562 }, { "loss": 364.9089, "grad_norm": 45.61265563964844, "learning_rate": 0.00024577005016277885, "epoch": 0.5049650842462682, "step": 563 }, { "loss": 365.8124, "grad_norm": 46.97050857543945, "learning_rate": 0.0002450651435605526, "epoch": 0.5058620026907553, "step": 564 }, { "loss": 360.1623, "grad_norm": 46.26262664794922, "learning_rate": 0.0002443602762050673, "epoch": 0.5067589211352425, "step": 565 }, { "loss": 363.2248, "grad_norm": 44.43347930908203, "learning_rate": 0.00024365545370210842, "epoch": 0.5076558395797296, "step": 566 }, { "loss": 365.1527, "grad_norm": 46.19889831542969, "learning_rate": 0.00024295068165710478, "epoch": 0.5085527580242168, "step": 567 }, { "loss": 365.0658, "grad_norm": 49.645484924316406, "learning_rate": 0.00024224596567508385, "epoch": 0.509449676468704, "step": 568 }, { "loss": 362.5722, "grad_norm": 47.69388961791992, "learning_rate": 0.00024154131136062715, "epoch": 0.5103465949131911, "step": 569 }, { "loss": 361.0171, "grad_norm": 44.855857849121094, "learning_rate": 0.00024083672431782585, "epoch": 0.5112435133576783, "step": 570 }, { "loss": 361.5502, "grad_norm": 48.860435485839844, "learning_rate": 0.00024013221015023619, "epoch": 0.5121404318021654, "step": 571 }, { "loss": 360.8487, "grad_norm": 45.69166564941406, "learning_rate": 0.0002394277744608346, "epoch": 0.5130373502466525, "step": 572 }, { "loss": 361.6857, "grad_norm": 45.67158889770508, "learning_rate": 0.00023872342285197366, "epoch": 0.5139342686911397, "step": 573 }, { "loss": 364.0296, "grad_norm": 51.487369537353516, "learning_rate": 0.00023801916092533706, "epoch": 0.5148311871356269, "step": 574 }, { "loss": 366.4655, "grad_norm": 49.884727478027344, "learning_rate": 0.0002373149942818953, "epoch": 0.5157281055801141, "step": 575 }, { "loss": 360.9107, "grad_norm": 42.73551940917969, "learning_rate": 0.00023661092852186118, "epoch": 0.5166250240246012, "step": 576 }, { "loss": 364.7719, "grad_norm": 44.425777435302734, "learning_rate": 0.000235906969244645, "epoch": 0.5175219424690883, "step": 577 }, { "loss": 362.6983, "grad_norm": 52.82978057861328, "learning_rate": 0.00023520312204881045, "epoch": 0.5184188609135755, "step": 578 }, { "loss": 359.655, "grad_norm": 46.826904296875, "learning_rate": 0.0002344993925320295, "epoch": 0.5193157793580626, "step": 579 }, { "loss": 364.8085, "grad_norm": 42.24338150024414, "learning_rate": 0.00023379578629103865, "epoch": 0.5202126978025499, "step": 580 }, { "loss": 358.4188, "grad_norm": 49.714271545410156, "learning_rate": 0.00023309230892159364, "epoch": 0.521109616247037, "step": 581 }, { "loss": 364.1614, "grad_norm": 47.561073303222656, "learning_rate": 0.0002323889660184255, "epoch": 0.5220065346915241, "step": 582 }, { "loss": 361.0988, "grad_norm": 45.20221710205078, "learning_rate": 0.00023168576317519576, "epoch": 0.5229034531360113, "step": 583 }, { "loss": 367.0533, "grad_norm": 47.38787078857422, "learning_rate": 0.00023098270598445204, "epoch": 0.5238003715804984, "step": 584 }, { "loss": 366.2763, "grad_norm": 47.23054122924805, "learning_rate": 0.00023027980003758363, "epoch": 0.5246972900249856, "step": 585 }, { "loss": 365.6816, "grad_norm": 43.855403900146484, "learning_rate": 0.0002295770509247771, "epoch": 0.5255942084694727, "step": 586 }, { "loss": 365.6198, "grad_norm": 51.30084228515625, "learning_rate": 0.00022887446423497146, "epoch": 0.5264911269139599, "step": 587 }, { "loss": 362.4194, "grad_norm": 50.142330169677734, "learning_rate": 0.00022817204555581418, "epoch": 0.5273880453584471, "step": 588 }, { "loss": 364.2704, "grad_norm": 46.52515411376953, "learning_rate": 0.00022746980047361654, "epoch": 0.5282849638029342, "step": 589 }, { "loss": 362.0045, "grad_norm": 48.26958465576172, "learning_rate": 0.00022676773457330906, "epoch": 0.5291818822474214, "step": 590 }, { "loss": 364.3056, "grad_norm": 45.78593063354492, "learning_rate": 0.0002260658534383974, "epoch": 0.5300788006919085, "step": 591 }, { "loss": 364.2805, "grad_norm": 47.130184173583984, "learning_rate": 0.00022536416265091775, "epoch": 0.5309757191363956, "step": 592 }, { "loss": 362.9882, "grad_norm": 43.309181213378906, "learning_rate": 0.0002246626677913923, "epoch": 0.5318726375808829, "step": 593 }, { "loss": 362.9743, "grad_norm": 40.39152145385742, "learning_rate": 0.00022396137443878535, "epoch": 0.53276955602537, "step": 594 }, { "loss": 359.4163, "grad_norm": 47.722068786621094, "learning_rate": 0.00022326028817045844, "epoch": 0.5336664744698572, "step": 595 }, { "loss": 364.6919, "grad_norm": 42.61846160888672, "learning_rate": 0.00022255941456212605, "epoch": 0.5345633929143443, "step": 596 }, { "loss": 368.3342, "grad_norm": 44.96833038330078, "learning_rate": 0.00022185875918781162, "epoch": 0.5354603113588314, "step": 597 }, { "loss": 363.2259, "grad_norm": 43.944881439208984, "learning_rate": 0.00022115832761980287, "epoch": 0.5363572298033186, "step": 598 }, { "loss": 362.7245, "grad_norm": 47.073341369628906, "learning_rate": 0.00022045812542860756, "epoch": 0.5372541482478057, "step": 599 }, { "loss": 363.0497, "grad_norm": 44.11311721801758, "learning_rate": 0.00021975815818290928, "epoch": 0.538151066692293, "step": 600 }, { "eval_loss": 1.61993408203125, "eval_runtime": 65.3564, "eval_samples_per_second": 31.336, "eval_steps_per_second": 1.958, "epoch": 0.538151066692293, "step": 600 }, { "loss": 360.9368, "grad_norm": 45.97838592529297, "learning_rate": 0.00021905843144952316, "epoch": 0.5390479851367801, "step": 601 }, { "loss": 363.959, "grad_norm": 45.36203384399414, "learning_rate": 0.0002183589507933514, "epoch": 0.5399449035812672, "step": 602 }, { "loss": 363.9291, "grad_norm": 43.02581024169922, "learning_rate": 0.00021765972177733924, "epoch": 0.5408418220257544, "step": 603 }, { "loss": 363.5491, "grad_norm": 47.46310806274414, "learning_rate": 0.0002169607499624307, "epoch": 0.5417387404702415, "step": 604 }, { "loss": 367.6017, "grad_norm": 47.89605712890625, "learning_rate": 0.00021626204090752422, "epoch": 0.5426356589147286, "step": 605 }, { "loss": 364.9732, "grad_norm": 45.463443756103516, "learning_rate": 0.00021556360016942842, "epoch": 0.5435325773592158, "step": 606 }, { "loss": 364.4341, "grad_norm": 43.64617919921875, "learning_rate": 0.00021486543330281812, "epoch": 0.544429495803703, "step": 607 }, { "loss": 366.3894, "grad_norm": 41.575531005859375, "learning_rate": 0.0002141675458601901, "epoch": 0.5453264142481902, "step": 608 }, { "loss": 363.112, "grad_norm": 46.79388427734375, "learning_rate": 0.00021346994339181883, "epoch": 0.5462233326926773, "step": 609 }, { "loss": 361.5751, "grad_norm": 48.13455581665039, "learning_rate": 0.0002127726314457124, "epoch": 0.5471202511371644, "step": 610 }, { "loss": 361.1321, "grad_norm": 45.220550537109375, "learning_rate": 0.0002120756155675683, "epoch": 0.5480171695816516, "step": 611 }, { "loss": 365.0866, "grad_norm": 46.22264099121094, "learning_rate": 0.0002113789013007295, "epoch": 0.5489140880261387, "step": 612 }, { "loss": 360.2099, "grad_norm": 47.99028015136719, "learning_rate": 0.00021068249418614027, "epoch": 0.549811006470626, "step": 613 }, { "loss": 362.4004, "grad_norm": 45.35298538208008, "learning_rate": 0.00020998639976230202, "epoch": 0.5507079249151131, "step": 614 }, { "loss": 362.9482, "grad_norm": 45.84006118774414, "learning_rate": 0.00020929062356522942, "epoch": 0.5516048433596002, "step": 615 }, { "loss": 361.6893, "grad_norm": 46.06373977661133, "learning_rate": 0.00020859517112840637, "epoch": 0.5525017618040874, "step": 616 }, { "loss": 368.1667, "grad_norm": 43.56032180786133, "learning_rate": 0.00020790004798274165, "epoch": 0.5533986802485745, "step": 617 }, { "loss": 363.2073, "grad_norm": 43.215370178222656, "learning_rate": 0.00020720525965652544, "epoch": 0.5542955986930617, "step": 618 }, { "loss": 358.3785, "grad_norm": 47.84462356567383, "learning_rate": 0.00020651081167538508, "epoch": 0.5551925171375488, "step": 619 }, { "loss": 365.6581, "grad_norm": 49.96092987060547, "learning_rate": 0.00020581670956224113, "epoch": 0.556089435582036, "step": 620 }, { "loss": 363.1918, "grad_norm": 44.61714172363281, "learning_rate": 0.00020512295883726338, "epoch": 0.5569863540265232, "step": 621 }, { "loss": 363.2948, "grad_norm": 44.841495513916016, "learning_rate": 0.00020442956501782713, "epoch": 0.5578832724710103, "step": 622 }, { "loss": 358.7636, "grad_norm": 46.29624938964844, "learning_rate": 0.00020373653361846925, "epoch": 0.5587801909154975, "step": 623 }, { "loss": 362.0233, "grad_norm": 43.61477279663086, "learning_rate": 0.0002030438701508443, "epoch": 0.5596771093599846, "step": 624 }, { "loss": 366.3086, "grad_norm": 44.28224182128906, "learning_rate": 0.00020235158012368065, "epoch": 0.5605740278044717, "step": 625 }, { "loss": 357.9655, "grad_norm": 43.08799362182617, "learning_rate": 0.00020165966904273666, "epoch": 0.5614709462489589, "step": 626 }, { "loss": 364.1879, "grad_norm": 45.73900604248047, "learning_rate": 0.00020096814241075703, "epoch": 0.5623678646934461, "step": 627 }, { "loss": 359.9633, "grad_norm": 48.213985443115234, "learning_rate": 0.00020027700572742895, "epoch": 0.5632647831379333, "step": 628 }, { "loss": 365.9498, "grad_norm": 43.3817253112793, "learning_rate": 0.00019958626448933825, "epoch": 0.5641617015824204, "step": 629 }, { "loss": 362.1366, "grad_norm": 42.70503234863281, "learning_rate": 0.00019889592418992594, "epoch": 0.5650586200269075, "step": 630 }, { "loss": 361.433, "grad_norm": 46.60575485229492, "learning_rate": 0.00019820599031944436, "epoch": 0.5659555384713947, "step": 631 }, { "loss": 364.1061, "grad_norm": 42.36573791503906, "learning_rate": 0.00019751646836491338, "epoch": 0.5668524569158818, "step": 632 }, { "loss": 360.4161, "grad_norm": 43.14451599121094, "learning_rate": 0.00019682736381007707, "epoch": 0.5677493753603691, "step": 633 }, { "loss": 357.0567, "grad_norm": 44.19496154785156, "learning_rate": 0.00019613868213535997, "epoch": 0.5686462938048562, "step": 634 }, { "loss": 361.1339, "grad_norm": 42.32905960083008, "learning_rate": 0.00019545042881782333, "epoch": 0.5695432122493433, "step": 635 }, { "loss": 361.2873, "grad_norm": 47.53689956665039, "learning_rate": 0.00019476260933112163, "epoch": 0.5704401306938305, "step": 636 }, { "loss": 362.2348, "grad_norm": 47.5960578918457, "learning_rate": 0.00019407522914545957, "epoch": 0.5713370491383176, "step": 637 }, { "loss": 366.9183, "grad_norm": 43.92160415649414, "learning_rate": 0.00019338829372754745, "epoch": 0.5722339675828048, "step": 638 }, { "loss": 361.6643, "grad_norm": 46.373863220214844, "learning_rate": 0.0001927018085405588, "epoch": 0.5731308860272919, "step": 639 }, { "loss": 362.9005, "grad_norm": 45.955814361572266, "learning_rate": 0.0001920157790440864, "epoch": 0.5740278044717791, "step": 640 }, { "loss": 360.8845, "grad_norm": 46.01215362548828, "learning_rate": 0.00019133021069409872, "epoch": 0.5749247229162663, "step": 641 }, { "loss": 361.9622, "grad_norm": 46.09065628051758, "learning_rate": 0.00019064510894289705, "epoch": 0.5758216413607534, "step": 642 }, { "loss": 363.2684, "grad_norm": 45.370140075683594, "learning_rate": 0.00018996047923907166, "epoch": 0.5767185598052406, "step": 643 }, { "loss": 362.285, "grad_norm": 43.416664123535156, "learning_rate": 0.00018927632702745866, "epoch": 0.5776154782497277, "step": 644 }, { "loss": 360.188, "grad_norm": 44.63084030151367, "learning_rate": 0.00018859265774909668, "epoch": 0.5785123966942148, "step": 645 }, { "loss": 362.1082, "grad_norm": 43.95875930786133, "learning_rate": 0.00018790947684118364, "epoch": 0.5794093151387021, "step": 646 }, { "loss": 364.6595, "grad_norm": 46.196041107177734, "learning_rate": 0.00018722678973703355, "epoch": 0.5803062335831892, "step": 647 }, { "loss": 367.5318, "grad_norm": 52.50529479980469, "learning_rate": 0.00018654460186603295, "epoch": 0.5812031520276764, "step": 648 }, { "loss": 364.7477, "grad_norm": 44.10645294189453, "learning_rate": 0.00018586291865359822, "epoch": 0.5821000704721635, "step": 649 }, { "loss": 362.5089, "grad_norm": 42.808326721191406, "learning_rate": 0.00018518174552113216, "epoch": 0.5829969889166506, "step": 650 }, { "eval_loss": 1.6019372940063477, "eval_runtime": 17.6903, "eval_samples_per_second": 115.769, "eval_steps_per_second": 14.471, "epoch": 0.5829969889166506, "step": 650 }, { "loss": 361.447, "grad_norm": 45.0283088684082, "learning_rate": 0.0001845010878859809, "epoch": 0.5838939073611378, "step": 651 }, { "loss": 363.9907, "grad_norm": 45.77663040161133, "learning_rate": 0.00018382095116139098, "epoch": 0.5847908258056249, "step": 652 }, { "loss": 358.2193, "grad_norm": 47.19649124145508, "learning_rate": 0.00018314134075646582, "epoch": 0.5856877442501122, "step": 653 }, { "loss": 362.618, "grad_norm": 45.46641540527344, "learning_rate": 0.00018246226207612338, "epoch": 0.5865846626945993, "step": 654 }, { "loss": 364.6533, "grad_norm": 45.993873596191406, "learning_rate": 0.00018178372052105263, "epoch": 0.5874815811390864, "step": 655 }, { "loss": 359.9103, "grad_norm": 49.62721252441406, "learning_rate": 0.00018110572148767089, "epoch": 0.5883784995835736, "step": 656 }, { "loss": 362.929, "grad_norm": 47.14739227294922, "learning_rate": 0.00018042827036808074, "epoch": 0.5892754180280607, "step": 657 }, { "loss": 364.1747, "grad_norm": 46.9727897644043, "learning_rate": 0.00017975137255002744, "epoch": 0.5901723364725479, "step": 658 }, { "loss": 362.2029, "grad_norm": 45.876277923583984, "learning_rate": 0.0001790750334168555, "epoch": 0.591069254917035, "step": 659 }, { "loss": 359.2526, "grad_norm": 42.93642807006836, "learning_rate": 0.00017839925834746653, "epoch": 0.5919661733615222, "step": 660 }, { "loss": 363.6162, "grad_norm": 41.57487487792969, "learning_rate": 0.0001777240527162761, "epoch": 0.5928630918060094, "step": 661 }, { "loss": 361.9038, "grad_norm": 46.25205993652344, "learning_rate": 0.00017704942189317104, "epoch": 0.5937600102504965, "step": 662 }, { "loss": 358.8016, "grad_norm": 45.354007720947266, "learning_rate": 0.0001763753712434666, "epoch": 0.5946569286949837, "step": 663 }, { "loss": 361.5577, "grad_norm": 42.980037689208984, "learning_rate": 0.00017570190612786413, "epoch": 0.5955538471394708, "step": 664 }, { "loss": 361.3445, "grad_norm": 44.7468147277832, "learning_rate": 0.00017502903190240815, "epoch": 0.5964507655839579, "step": 665 }, { "loss": 360.489, "grad_norm": 43.96569061279297, "learning_rate": 0.00017435675391844397, "epoch": 0.5973476840284452, "step": 666 }, { "loss": 365.539, "grad_norm": 45.040103912353516, "learning_rate": 0.00017368507752257495, "epoch": 0.5982446024729323, "step": 667 }, { "loss": 363.3497, "grad_norm": 45.93570327758789, "learning_rate": 0.00017301400805661989, "epoch": 0.5991415209174195, "step": 668 }, { "loss": 356.2852, "grad_norm": 41.94508743286133, "learning_rate": 0.00017234355085757086, "epoch": 0.6000384393619066, "step": 669 }, { "loss": 364.3321, "grad_norm": 40.20936584472656, "learning_rate": 0.00017167371125755064, "epoch": 0.6009353578063937, "step": 670 }, { "loss": 365.0333, "grad_norm": 42.29598617553711, "learning_rate": 0.00017100449458377003, "epoch": 0.6018322762508809, "step": 671 }, { "loss": 356.7194, "grad_norm": 41.43622589111328, "learning_rate": 0.00017033590615848598, "epoch": 0.602729194695368, "step": 672 }, { "loss": 362.7276, "grad_norm": 44.03760528564453, "learning_rate": 0.0001696679512989589, "epoch": 0.6036261131398553, "step": 673 }, { "loss": 359.1711, "grad_norm": 39.68849182128906, "learning_rate": 0.00016900063531741048, "epoch": 0.6045230315843424, "step": 674 }, { "loss": 357.2, "grad_norm": 40.92485809326172, "learning_rate": 0.0001683339635209813, "epoch": 0.6054199500288295, "step": 675 }, { "loss": 362.3214, "grad_norm": 41.29072189331055, "learning_rate": 0.000167667941211689, "epoch": 0.6063168684733167, "step": 676 }, { "loss": 361.0124, "grad_norm": 41.026676177978516, "learning_rate": 0.00016700257368638572, "epoch": 0.6072137869178038, "step": 677 }, { "loss": 360.2582, "grad_norm": 43.93520736694336, "learning_rate": 0.0001663378662367161, "epoch": 0.608110705362291, "step": 678 }, { "loss": 358.0945, "grad_norm": 43.4892578125, "learning_rate": 0.00016567382414907532, "epoch": 0.6090076238067781, "step": 679 }, { "loss": 360.7998, "grad_norm": 43.67966842651367, "learning_rate": 0.00016501045270456694, "epoch": 0.6099045422512653, "step": 680 }, { "loss": 359.6815, "grad_norm": 42.92584991455078, "learning_rate": 0.0001643477571789609, "epoch": 0.6108014606957525, "step": 681 }, { "loss": 361.6625, "grad_norm": 42.53407287597656, "learning_rate": 0.00016368574284265165, "epoch": 0.6116983791402396, "step": 682 }, { "loss": 363.5579, "grad_norm": 41.2686767578125, "learning_rate": 0.00016302441496061592, "epoch": 0.6125952975847268, "step": 683 }, { "loss": 360.9108, "grad_norm": 42.09267044067383, "learning_rate": 0.00016236377879237136, "epoch": 0.6134922160292139, "step": 684 }, { "loss": 360.2266, "grad_norm": 42.135650634765625, "learning_rate": 0.0001617038395919344, "epoch": 0.614389134473701, "step": 685 }, { "loss": 355.2124, "grad_norm": 41.78007888793945, "learning_rate": 0.00016104460260777837, "epoch": 0.6152860529181883, "step": 686 }, { "loss": 357.8339, "grad_norm": 41.49577713012695, "learning_rate": 0.00016038607308279198, "epoch": 0.6161829713626754, "step": 687 }, { "loss": 361.7785, "grad_norm": 47.102848052978516, "learning_rate": 0.00015972825625423765, "epoch": 0.6170798898071626, "step": 688 }, { "loss": 357.3535, "grad_norm": 41.43706512451172, "learning_rate": 0.0001590711573537096, "epoch": 0.6179768082516497, "step": 689 }, { "loss": 359.8207, "grad_norm": 40.92182540893555, "learning_rate": 0.00015841478160709242, "epoch": 0.6188737266961368, "step": 690 }, { "loss": 358.1373, "grad_norm": 49.461273193359375, "learning_rate": 0.0001577591342345195, "epoch": 0.619770645140624, "step": 691 }, { "loss": 361.2856, "grad_norm": 50.03120040893555, "learning_rate": 0.00015710422045033158, "epoch": 0.6206675635851111, "step": 692 }, { "loss": 359.0531, "grad_norm": 43.81147003173828, "learning_rate": 0.00015645004546303493, "epoch": 0.6215644820295984, "step": 693 }, { "loss": 357.6739, "grad_norm": 44.85881042480469, "learning_rate": 0.00015579661447526067, "epoch": 0.6224614004740855, "step": 694 }, { "loss": 358.5413, "grad_norm": 45.34134292602539, "learning_rate": 0.00015514393268372247, "epoch": 0.6233583189185726, "step": 695 }, { "loss": 362.4291, "grad_norm": 44.94168472290039, "learning_rate": 0.00015449200527917578, "epoch": 0.6242552373630598, "step": 696 }, { "loss": 353.4212, "grad_norm": 43.28814697265625, "learning_rate": 0.00015384083744637663, "epoch": 0.6251521558075469, "step": 697 }, { "loss": 361.8906, "grad_norm": 42.88665008544922, "learning_rate": 0.00015319043436403992, "epoch": 0.626049074252034, "step": 698 }, { "loss": 357.3509, "grad_norm": 46.005001068115234, "learning_rate": 0.00015254080120479874, "epoch": 0.6269459926965213, "step": 699 }, { "loss": 356.4296, "grad_norm": 44.4104118347168, "learning_rate": 0.00015189194313516288, "epoch": 0.6278429111410084, "step": 700 }, { "eval_loss": 1.597915768623352, "eval_runtime": 17.571, "eval_samples_per_second": 116.555, "eval_steps_per_second": 14.569, "epoch": 0.6278429111410084, "step": 700 }, { "loss": 358.631, "grad_norm": 43.341407775878906, "learning_rate": 0.000151243865315478, "epoch": 0.6287398295854956, "step": 701 }, { "loss": 361.772, "grad_norm": 43.18885803222656, "learning_rate": 0.00015059657289988426, "epoch": 0.6296367480299827, "step": 702 }, { "loss": 359.0464, "grad_norm": 41.106483459472656, "learning_rate": 0.00014995007103627567, "epoch": 0.6305336664744698, "step": 703 }, { "loss": 358.0773, "grad_norm": 42.815834045410156, "learning_rate": 0.00014930436486625907, "epoch": 0.631430584918957, "step": 704 }, { "loss": 358.7279, "grad_norm": 39.7459602355957, "learning_rate": 0.00014865945952511296, "epoch": 0.6323275033634441, "step": 705 }, { "loss": 358.3263, "grad_norm": 42.54743576049805, "learning_rate": 0.00014801536014174706, "epoch": 0.6332244218079314, "step": 706 }, { "loss": 365.4639, "grad_norm": 45.69781494140625, "learning_rate": 0.00014737207183866118, "epoch": 0.6341213402524185, "step": 707 }, { "loss": 357.4766, "grad_norm": 44.834136962890625, "learning_rate": 0.0001467295997319049, "epoch": 0.6350182586969056, "step": 708 }, { "loss": 361.5132, "grad_norm": 40.79405975341797, "learning_rate": 0.00014608794893103646, "epoch": 0.6359151771413928, "step": 709 }, { "loss": 361.108, "grad_norm": 40.1624870300293, "learning_rate": 0.00014544712453908216, "epoch": 0.6368120955858799, "step": 710 }, { "loss": 357.4099, "grad_norm": 42.602073669433594, "learning_rate": 0.00014480713165249609, "epoch": 0.6377090140303671, "step": 711 }, { "loss": 360.979, "grad_norm": 43.97264099121094, "learning_rate": 0.00014416797536111919, "epoch": 0.6386059324748542, "step": 712 }, { "loss": 361.3081, "grad_norm": 40.94137191772461, "learning_rate": 0.00014352966074813932, "epoch": 0.6395028509193414, "step": 713 }, { "loss": 359.9567, "grad_norm": 40.18381881713867, "learning_rate": 0.00014289219289005027, "epoch": 0.6403997693638286, "step": 714 }, { "loss": 353.732, "grad_norm": 45.907203674316406, "learning_rate": 0.0001422555768566115, "epoch": 0.6412966878083157, "step": 715 }, { "loss": 358.1761, "grad_norm": 46.9672737121582, "learning_rate": 0.0001416198177108083, "epoch": 0.6421936062528029, "step": 716 }, { "loss": 358.2166, "grad_norm": 40.92546081542969, "learning_rate": 0.0001409849205088109, "epoch": 0.64309052469729, "step": 717 }, { "loss": 358.0281, "grad_norm": 39.04634475708008, "learning_rate": 0.00014035089029993444, "epoch": 0.6439874431417771, "step": 718 }, { "loss": 358.9151, "grad_norm": 41.55719757080078, "learning_rate": 0.00013971773212659929, "epoch": 0.6448843615862644, "step": 719 }, { "loss": 356.5345, "grad_norm": 41.81498336791992, "learning_rate": 0.00013908545102429, "epoch": 0.6457812800307515, "step": 720 }, { "loss": 358.3629, "grad_norm": 40.042484283447266, "learning_rate": 0.00013845405202151637, "epoch": 0.6466781984752387, "step": 721 }, { "loss": 360.9086, "grad_norm": 44.207122802734375, "learning_rate": 0.00013782354013977245, "epoch": 0.6475751169197258, "step": 722 }, { "loss": 357.7452, "grad_norm": 45.20026779174805, "learning_rate": 0.00013719392039349734, "epoch": 0.6484720353642129, "step": 723 }, { "loss": 358.4982, "grad_norm": 41.07488250732422, "learning_rate": 0.00013656519779003476, "epoch": 0.6493689538087001, "step": 724 }, { "loss": 361.3215, "grad_norm": 43.69713592529297, "learning_rate": 0.00013593737732959382, "epoch": 0.6502658722531872, "step": 725 }, { "loss": 356.6879, "grad_norm": 45.356109619140625, "learning_rate": 0.00013531046400520858, "epoch": 0.6511627906976745, "step": 726 }, { "loss": 363.6577, "grad_norm": 44.325103759765625, "learning_rate": 0.0001346844628026988, "epoch": 0.6520597091421616, "step": 727 }, { "loss": 358.3399, "grad_norm": 40.79582595825195, "learning_rate": 0.0001340593787006303, "epoch": 0.6529566275866487, "step": 728 }, { "loss": 360.8162, "grad_norm": 40.47697448730469, "learning_rate": 0.0001334352166702751, "epoch": 0.6538535460311359, "step": 729 }, { "loss": 356.254, "grad_norm": 43.549407958984375, "learning_rate": 0.00013281198167557185, "epoch": 0.654750464475623, "step": 730 }, { "loss": 356.3695, "grad_norm": 41.08717727661133, "learning_rate": 0.00013218967867308694, "epoch": 0.6556473829201102, "step": 731 }, { "loss": 359.2961, "grad_norm": 44.06740951538086, "learning_rate": 0.00013156831261197438, "epoch": 0.6565443013645973, "step": 732 }, { "loss": 354.8276, "grad_norm": 44.14928436279297, "learning_rate": 0.00013094788843393657, "epoch": 0.6574412198090845, "step": 733 }, { "loss": 356.655, "grad_norm": 41.25139236450195, "learning_rate": 0.0001303284110731856, "epoch": 0.6583381382535717, "step": 734 }, { "loss": 359.9945, "grad_norm": 43.141475677490234, "learning_rate": 0.00012970988545640307, "epoch": 0.6592350566980588, "step": 735 }, { "loss": 354.7369, "grad_norm": 45.27100372314453, "learning_rate": 0.0001290923165027017, "epoch": 0.660131975142546, "step": 736 }, { "loss": 357.4191, "grad_norm": 41.795658111572266, "learning_rate": 0.0001284757091235859, "epoch": 0.6610288935870331, "step": 737 }, { "loss": 353.508, "grad_norm": 43.1330680847168, "learning_rate": 0.0001278600682229126, "epoch": 0.6619258120315202, "step": 738 }, { "loss": 356.3365, "grad_norm": 43.488121032714844, "learning_rate": 0.00012724539869685226, "epoch": 0.6628227304760075, "step": 739 }, { "loss": 357.6046, "grad_norm": 42.182777404785156, "learning_rate": 0.0001266317054338503, "epoch": 0.6637196489204946, "step": 740 }, { "loss": 358.7371, "grad_norm": 43.06134796142578, "learning_rate": 0.00012601899331458777, "epoch": 0.6646165673649818, "step": 741 }, { "loss": 358.2452, "grad_norm": 40.01738357543945, "learning_rate": 0.00012540726721194266, "epoch": 0.6655134858094689, "step": 742 }, { "loss": 361.5233, "grad_norm": 40.66733169555664, "learning_rate": 0.0001247965319909515, "epoch": 0.666410404253956, "step": 743 }, { "loss": 354.1553, "grad_norm": 39.47666931152344, "learning_rate": 0.0001241867925087701, "epoch": 0.6673073226984432, "step": 744 }, { "loss": 358.3203, "grad_norm": 39.22403335571289, "learning_rate": 0.00012357805361463514, "epoch": 0.6682042411429303, "step": 745 }, { "loss": 357.0617, "grad_norm": 39.071529388427734, "learning_rate": 0.00012297032014982597, "epoch": 0.6691011595874176, "step": 746 }, { "loss": 362.905, "grad_norm": 40.75625228881836, "learning_rate": 0.0001223635969476255, "epoch": 0.6699980780319047, "step": 747 }, { "loss": 354.9351, "grad_norm": 42.89009094238281, "learning_rate": 0.00012175788883328232, "epoch": 0.6708949964763918, "step": 748 }, { "loss": 359.415, "grad_norm": 43.072513580322266, "learning_rate": 0.0001211532006239718, "epoch": 0.671791914920879, "step": 749 }, { "loss": 357.7546, "grad_norm": 40.25785446166992, "learning_rate": 0.00012054953712875807, "epoch": 0.6726888333653661, "step": 750 }, { "eval_loss": 1.609327793121338, "eval_runtime": 17.5285, "eval_samples_per_second": 116.839, "eval_steps_per_second": 14.605, "epoch": 0.6726888333653661, "step": 750 }, { "loss": 357.2794, "grad_norm": 41.602596282958984, "learning_rate": 0.00011994690314855598, "epoch": 0.6735857518098533, "step": 751 }, { "loss": 361.091, "grad_norm": 41.749717712402344, "learning_rate": 0.00011934530347609257, "epoch": 0.6744826702543405, "step": 752 }, { "loss": 362.0817, "grad_norm": 39.51606369018555, "learning_rate": 0.00011874474289586895, "epoch": 0.6753795886988276, "step": 753 }, { "loss": 356.8317, "grad_norm": 40.00758743286133, "learning_rate": 0.00011814522618412235, "epoch": 0.6762765071433148, "step": 754 }, { "loss": 359.7722, "grad_norm": 41.676292419433594, "learning_rate": 0.00011754675810878845, "epoch": 0.6771734255878019, "step": 755 }, { "loss": 359.641, "grad_norm": 41.25587463378906, "learning_rate": 0.00011694934342946287, "epoch": 0.678070344032289, "step": 756 }, { "loss": 352.955, "grad_norm": 40.348514556884766, "learning_rate": 0.00011635298689736357, "epoch": 0.6789672624767762, "step": 757 }, { "loss": 362.8987, "grad_norm": 43.387184143066406, "learning_rate": 0.00011575769325529342, "epoch": 0.6798641809212633, "step": 758 }, { "loss": 357.0482, "grad_norm": 40.06668472290039, "learning_rate": 0.00011516346723760193, "epoch": 0.6807610993657506, "step": 759 }, { "loss": 359.7377, "grad_norm": 39.39516830444336, "learning_rate": 0.00011457031357014772, "epoch": 0.6816580178102377, "step": 760 }, { "loss": 362.0869, "grad_norm": 39.07398223876953, "learning_rate": 0.0001139782369702614, "epoch": 0.6825549362547249, "step": 761 }, { "loss": 357.4482, "grad_norm": 42.54057312011719, "learning_rate": 0.00011338724214670734, "epoch": 0.683451854699212, "step": 762 }, { "loss": 360.6057, "grad_norm": 40.7839241027832, "learning_rate": 0.00011279733379964691, "epoch": 0.6843487731436991, "step": 763 }, { "loss": 362.9106, "grad_norm": 41.402889251708984, "learning_rate": 0.00011220851662060047, "epoch": 0.6852456915881863, "step": 764 }, { "loss": 357.1811, "grad_norm": 41.3732795715332, "learning_rate": 0.00011162079529241042, "epoch": 0.6861426100326734, "step": 765 }, { "loss": 358.0857, "grad_norm": 42.31522750854492, "learning_rate": 0.00011103417448920406, "epoch": 0.6870395284771607, "step": 766 }, { "loss": 357.946, "grad_norm": 38.36897277832031, "learning_rate": 0.00011044865887635625, "epoch": 0.6879364469216478, "step": 767 }, { "loss": 360.9647, "grad_norm": 43.01420974731445, "learning_rate": 0.00010986425311045212, "epoch": 0.6888333653661349, "step": 768 }, { "loss": 362.1032, "grad_norm": 40.731163024902344, "learning_rate": 0.00010928096183925024, "epoch": 0.6897302838106221, "step": 769 }, { "loss": 363.3222, "grad_norm": 41.69025421142578, "learning_rate": 0.00010869878970164587, "epoch": 0.6906272022551092, "step": 770 }, { "loss": 358.3542, "grad_norm": 37.463043212890625, "learning_rate": 0.00010811774132763366, "epoch": 0.6915241206995963, "step": 771 }, { "loss": 364.5648, "grad_norm": 38.481815338134766, "learning_rate": 0.00010753782133827093, "epoch": 0.6924210391440836, "step": 772 }, { "loss": 361.0055, "grad_norm": 39.70282745361328, "learning_rate": 0.00010695903434564124, "epoch": 0.6933179575885707, "step": 773 }, { "loss": 359.3154, "grad_norm": 38.182132720947266, "learning_rate": 0.00010638138495281725, "epoch": 0.6942148760330579, "step": 774 }, { "loss": 356.322, "grad_norm": 37.12331008911133, "learning_rate": 0.00010580487775382449, "epoch": 0.695111794477545, "step": 775 }, { "loss": 356.3972, "grad_norm": 40.065006256103516, "learning_rate": 0.00010522951733360456, "epoch": 0.6960087129220321, "step": 776 }, { "loss": 351.4366, "grad_norm": 40.21229553222656, "learning_rate": 0.0001046553082679787, "epoch": 0.6969056313665193, "step": 777 }, { "loss": 356.3872, "grad_norm": 39.17121124267578, "learning_rate": 0.00010408225512361171, "epoch": 0.6978025498110064, "step": 778 }, { "loss": 358.5863, "grad_norm": 38.62257766723633, "learning_rate": 0.0001035103624579751, "epoch": 0.6986994682554937, "step": 779 }, { "loss": 359.1902, "grad_norm": 39.73896408081055, "learning_rate": 0.00010293963481931143, "epoch": 0.6995963866999808, "step": 780 }, { "loss": 357.0757, "grad_norm": 38.72207260131836, "learning_rate": 0.00010237007674659752, "epoch": 0.700493305144468, "step": 781 }, { "loss": 359.07, "grad_norm": 39.15367126464844, "learning_rate": 0.00010180169276950899, "epoch": 0.7013902235889551, "step": 782 }, { "loss": 357.7226, "grad_norm": 39.2513542175293, "learning_rate": 0.00010123448740838367, "epoch": 0.7022871420334422, "step": 783 }, { "loss": 359.4571, "grad_norm": 41.660953521728516, "learning_rate": 0.00010066846517418596, "epoch": 0.7031840604779294, "step": 784 }, { "loss": 358.3033, "grad_norm": 40.074806213378906, "learning_rate": 0.00010010363056847103, "epoch": 0.7040809789224165, "step": 785 }, { "loss": 358.5859, "grad_norm": 40.53306198120117, "learning_rate": 9.953998808334874e-05, "epoch": 0.7049778973669037, "step": 786 }, { "loss": 353.3639, "grad_norm": 43.58430099487305, "learning_rate": 9.8977542201448e-05, "epoch": 0.7058748158113909, "step": 787 }, { "loss": 359.5676, "grad_norm": 39.986785888671875, "learning_rate": 9.841629739588145e-05, "epoch": 0.706771734255878, "step": 788 }, { "loss": 361.0522, "grad_norm": 41.356590270996094, "learning_rate": 9.785625813020923e-05, "epoch": 0.7076686527003652, "step": 789 }, { "loss": 355.244, "grad_norm": 40.596397399902344, "learning_rate": 9.729742885840429e-05, "epoch": 0.7085655711448523, "step": 790 }, { "loss": 358.6471, "grad_norm": 39.8510627746582, "learning_rate": 9.673981402481619e-05, "epoch": 0.7094624895893394, "step": 791 }, { "loss": 355.7997, "grad_norm": 37.443397521972656, "learning_rate": 9.618341806413614e-05, "epoch": 0.7103594080338267, "step": 792 }, { "loss": 358.5055, "grad_norm": 38.937034606933594, "learning_rate": 9.562824540136192e-05, "epoch": 0.7112563264783138, "step": 793 }, { "loss": 357.9367, "grad_norm": 39.378326416015625, "learning_rate": 9.507430045176238e-05, "epoch": 0.712153244922801, "step": 794 }, { "loss": 356.7012, "grad_norm": 40.44821548461914, "learning_rate": 9.452158762084228e-05, "epoch": 0.7130501633672881, "step": 795 }, { "loss": 361.7253, "grad_norm": 39.721378326416016, "learning_rate": 9.397011130430741e-05, "epoch": 0.7139470818117752, "step": 796 }, { "loss": 359.5762, "grad_norm": 40.48420333862305, "learning_rate": 9.341987588802984e-05, "epoch": 0.7148440002562624, "step": 797 }, { "loss": 355.1304, "grad_norm": 38.8956413269043, "learning_rate": 9.287088574801248e-05, "epoch": 0.7157409187007495, "step": 798 }, { "loss": 360.5678, "grad_norm": 41.26605987548828, "learning_rate": 9.23231452503547e-05, "epoch": 0.7166378371452368, "step": 799 }, { "loss": 359.8319, "grad_norm": 36.14881134033203, "learning_rate": 9.177665875121774e-05, "epoch": 0.7175347555897239, "step": 800 }, { "eval_loss": 1.5968618392944336, "eval_runtime": 17.8479, "eval_samples_per_second": 114.747, "eval_steps_per_second": 14.343, "epoch": 0.7175347555897239, "step": 800 }, { "loss": 361.1777, "grad_norm": 40.25320053100586, "learning_rate": 9.123143059678952e-05, "epoch": 0.718431674034211, "step": 801 }, { "loss": 355.5561, "grad_norm": 39.248783111572266, "learning_rate": 9.068746512325046e-05, "epoch": 0.7193285924786982, "step": 802 }, { "loss": 353.493, "grad_norm": 41.21136474609375, "learning_rate": 9.014476665673915e-05, "epoch": 0.7202255109231853, "step": 803 }, { "loss": 355.8681, "grad_norm": 38.923973083496094, "learning_rate": 8.960333951331739e-05, "epoch": 0.7211224293676725, "step": 804 }, { "loss": 355.0969, "grad_norm": 43.01164627075195, "learning_rate": 8.906318799893648e-05, "epoch": 0.7220193478121597, "step": 805 }, { "loss": 354.1833, "grad_norm": 39.02459716796875, "learning_rate": 8.852431640940247e-05, "epoch": 0.7229162662566468, "step": 806 }, { "loss": 359.125, "grad_norm": 37.63704299926758, "learning_rate": 8.798672903034225e-05, "epoch": 0.723813184701134, "step": 807 }, { "loss": 355.6418, "grad_norm": 38.401512145996094, "learning_rate": 8.745043013716955e-05, "epoch": 0.7247101031456211, "step": 808 }, { "loss": 358.6194, "grad_norm": 37.391685485839844, "learning_rate": 8.691542399505081e-05, "epoch": 0.7256070215901083, "step": 809 }, { "loss": 359.1611, "grad_norm": 40.48008728027344, "learning_rate": 8.638171485887111e-05, "epoch": 0.7265039400345954, "step": 810 }, { "loss": 359.4613, "grad_norm": 40.47174835205078, "learning_rate": 8.584930697320053e-05, "epoch": 0.7274008584790825, "step": 811 }, { "loss": 351.1801, "grad_norm": 39.59210968017578, "learning_rate": 8.531820457226055e-05, "epoch": 0.7282977769235698, "step": 812 }, { "loss": 355.662, "grad_norm": 36.89620590209961, "learning_rate": 8.478841187988992e-05, "epoch": 0.7291946953680569, "step": 813 }, { "loss": 361.7194, "grad_norm": 38.956214904785156, "learning_rate": 8.425993310951132e-05, "epoch": 0.7300916138125441, "step": 814 }, { "loss": 359.9547, "grad_norm": 36.15619659423828, "learning_rate": 8.373277246409818e-05, "epoch": 0.7309885322570312, "step": 815 }, { "loss": 353.2803, "grad_norm": 41.085899353027344, "learning_rate": 8.320693413614053e-05, "epoch": 0.7318854507015183, "step": 816 }, { "loss": 356.6743, "grad_norm": 40.31721878051758, "learning_rate": 8.268242230761239e-05, "epoch": 0.7327823691460055, "step": 817 }, { "loss": 356.205, "grad_norm": 41.351558685302734, "learning_rate": 8.215924114993792e-05, "epoch": 0.7336792875904926, "step": 818 }, { "loss": 360.4526, "grad_norm": 39.119476318359375, "learning_rate": 8.163739482395851e-05, "epoch": 0.7345762060349799, "step": 819 }, { "loss": 361.5057, "grad_norm": 38.80229949951172, "learning_rate": 8.111688747990001e-05, "epoch": 0.735473124479467, "step": 820 }, { "loss": 352.7518, "grad_norm": 40.22185134887695, "learning_rate": 8.059772325733899e-05, "epoch": 0.7363700429239541, "step": 821 }, { "loss": 356.2066, "grad_norm": 40.426979064941406, "learning_rate": 8.007990628517034e-05, "epoch": 0.7372669613684413, "step": 822 }, { "loss": 358.5974, "grad_norm": 39.50589370727539, "learning_rate": 7.956344068157443e-05, "epoch": 0.7381638798129284, "step": 823 }, { "loss": 360.1032, "grad_norm": 38.537113189697266, "learning_rate": 7.904833055398428e-05, "epoch": 0.7390607982574156, "step": 824 }, { "loss": 358.6521, "grad_norm": 38.09297180175781, "learning_rate": 7.853457999905264e-05, "epoch": 0.7399577167019028, "step": 825 }, { "loss": 358.724, "grad_norm": 38.27792739868164, "learning_rate": 7.802219310261965e-05, "epoch": 0.7408546351463899, "step": 826 }, { "loss": 361.0538, "grad_norm": 40.946353912353516, "learning_rate": 7.75111739396806e-05, "epoch": 0.7417515535908771, "step": 827 }, { "loss": 354.2574, "grad_norm": 37.80830764770508, "learning_rate": 7.700152657435297e-05, "epoch": 0.7426484720353642, "step": 828 }, { "loss": 356.4567, "grad_norm": 39.698429107666016, "learning_rate": 7.649325505984434e-05, "epoch": 0.7435453904798514, "step": 829 }, { "loss": 355.0162, "grad_norm": 38.21966552734375, "learning_rate": 7.598636343842053e-05, "epoch": 0.7444423089243385, "step": 830 }, { "loss": 356.4822, "grad_norm": 39.37642288208008, "learning_rate": 7.548085574137273e-05, "epoch": 0.7453392273688256, "step": 831 }, { "loss": 357.8192, "grad_norm": 37.3087158203125, "learning_rate": 7.497673598898613e-05, "epoch": 0.7462361458133129, "step": 832 }, { "loss": 363.7517, "grad_norm": 35.9515266418457, "learning_rate": 7.447400819050751e-05, "epoch": 0.7471330642578, "step": 833 }, { "loss": 355.3728, "grad_norm": 36.964534759521484, "learning_rate": 7.397267634411337e-05, "epoch": 0.7480299827022872, "step": 834 }, { "loss": 354.5074, "grad_norm": 39.167415618896484, "learning_rate": 7.347274443687855e-05, "epoch": 0.7489269011467743, "step": 835 }, { "loss": 361.1248, "grad_norm": 40.1679801940918, "learning_rate": 7.297421644474387e-05, "epoch": 0.7498238195912614, "step": 836 }, { "loss": 357.9431, "grad_norm": 38.67217254638672, "learning_rate": 7.247709633248526e-05, "epoch": 0.7507207380357486, "step": 837 }, { "loss": 360.9297, "grad_norm": 37.734153747558594, "learning_rate": 7.198138805368143e-05, "epoch": 0.7516176564802357, "step": 838 }, { "loss": 350.7899, "grad_norm": 36.58796691894531, "learning_rate": 7.148709555068314e-05, "epoch": 0.752514574924723, "step": 839 }, { "loss": 358.5099, "grad_norm": 37.6004753112793, "learning_rate": 7.09942227545814e-05, "epoch": 0.7534114933692101, "step": 840 }, { "loss": 350.2813, "grad_norm": 39.31602096557617, "learning_rate": 7.05027735851762e-05, "epoch": 0.7543084118136972, "step": 841 }, { "loss": 361.4473, "grad_norm": 37.72463607788086, "learning_rate": 7.001275195094581e-05, "epoch": 0.7552053302581844, "step": 842 }, { "loss": 356.7912, "grad_norm": 36.68344497680664, "learning_rate": 6.952416174901504e-05, "epoch": 0.7561022487026715, "step": 843 }, { "loss": 360.7002, "grad_norm": 39.82998275756836, "learning_rate": 6.903700686512485e-05, "epoch": 0.7569991671471586, "step": 844 }, { "loss": 357.1058, "grad_norm": 39.26710510253906, "learning_rate": 6.855129117360095e-05, "epoch": 0.7578960855916459, "step": 845 }, { "loss": 356.4349, "grad_norm": 37.95897674560547, "learning_rate": 6.806701853732319e-05, "epoch": 0.758793004036133, "step": 846 }, { "loss": 353.9336, "grad_norm": 36.72467041015625, "learning_rate": 6.75841928076951e-05, "epoch": 0.7596899224806202, "step": 847 }, { "loss": 355.9283, "grad_norm": 38.29819869995117, "learning_rate": 6.710281782461275e-05, "epoch": 0.7605868409251073, "step": 848 }, { "loss": 357.5876, "grad_norm": 39.196720123291016, "learning_rate": 6.662289741643454e-05, "epoch": 0.7614837593695944, "step": 849 }, { "loss": 359.8077, "grad_norm": 40.00128936767578, "learning_rate": 6.614443539995078e-05, "epoch": 0.7623806778140816, "step": 850 }, { "eval_loss": 1.582360863685608, "eval_runtime": 18.4592, "eval_samples_per_second": 110.947, "eval_steps_per_second": 13.868, "epoch": 0.7623806778140816, "step": 850 }, { "loss": 355.6048, "grad_norm": 38.59453582763672, "learning_rate": 6.56674355803532e-05, "epoch": 0.7632775962585687, "step": 851 }, { "loss": 360.1093, "grad_norm": 39.37229537963867, "learning_rate": 6.519190175120473e-05, "epoch": 0.764174514703056, "step": 852 }, { "loss": 357.6195, "grad_norm": 36.07246017456055, "learning_rate": 6.47178376944092e-05, "epoch": 0.7650714331475431, "step": 853 }, { "loss": 357.4596, "grad_norm": 36.77618408203125, "learning_rate": 6.424524718018163e-05, "epoch": 0.7659683515920302, "step": 854 }, { "loss": 359.593, "grad_norm": 36.766483306884766, "learning_rate": 6.377413396701781e-05, "epoch": 0.7668652700365174, "step": 855 }, { "loss": 356.4777, "grad_norm": 43.47877502441406, "learning_rate": 6.330450180166464e-05, "epoch": 0.7677621884810045, "step": 856 }, { "loss": 353.8591, "grad_norm": 39.65815353393555, "learning_rate": 6.283635441909044e-05, "epoch": 0.7686591069254917, "step": 857 }, { "loss": 358.9107, "grad_norm": 42.22090148925781, "learning_rate": 6.236969554245486e-05, "epoch": 0.7695560253699789, "step": 858 }, { "loss": 361.3808, "grad_norm": 37.009342193603516, "learning_rate": 6.19045288830798e-05, "epoch": 0.770452943814466, "step": 859 }, { "loss": 359.7101, "grad_norm": 36.62922668457031, "learning_rate": 6.144085814041941e-05, "epoch": 0.7713498622589532, "step": 860 }, { "loss": 360.3506, "grad_norm": 35.92998123168945, "learning_rate": 6.097868700203082e-05, "epoch": 0.7722467807034403, "step": 861 }, { "loss": 352.6364, "grad_norm": 40.08286666870117, "learning_rate": 6.05180191435451e-05, "epoch": 0.7731436991479275, "step": 862 }, { "loss": 356.8879, "grad_norm": 38.76757049560547, "learning_rate": 6.0058858228637605e-05, "epoch": 0.7740406175924146, "step": 863 }, { "loss": 355.7852, "grad_norm": 37.80318069458008, "learning_rate": 5.960120790899895e-05, "epoch": 0.7749375360369017, "step": 864 }, { "loss": 357.245, "grad_norm": 36.61247253417969, "learning_rate": 5.914507182430626e-05, "epoch": 0.775834454481389, "step": 865 }, { "loss": 355.3506, "grad_norm": 37.76987838745117, "learning_rate": 5.869045360219391e-05, "epoch": 0.7767313729258761, "step": 866 }, { "loss": 351.2185, "grad_norm": 37.881492614746094, "learning_rate": 5.8237356858224704e-05, "epoch": 0.7776282913703633, "step": 867 }, { "loss": 360.2768, "grad_norm": 39.45249557495117, "learning_rate": 5.7785785195861194e-05, "epoch": 0.7785252098148504, "step": 868 }, { "loss": 353.9251, "grad_norm": 39.94224548339844, "learning_rate": 5.733574220643712e-05, "epoch": 0.7794221282593375, "step": 869 }, { "loss": 355.1441, "grad_norm": 37.91038513183594, "learning_rate": 5.688723146912858e-05, "epoch": 0.7803190467038247, "step": 870 }, { "loss": 359.303, "grad_norm": 36.14017105102539, "learning_rate": 5.644025655092591e-05, "epoch": 0.7812159651483118, "step": 871 }, { "loss": 359.8912, "grad_norm": 37.15394592285156, "learning_rate": 5.5994821006604965e-05, "epoch": 0.7821128835927991, "step": 872 }, { "loss": 360.2237, "grad_norm": 35.74496078491211, "learning_rate": 5.555092837869902e-05, "epoch": 0.7830098020372862, "step": 873 }, { "loss": 352.0333, "grad_norm": 37.32427215576172, "learning_rate": 5.5108582197470784e-05, "epoch": 0.7839067204817733, "step": 874 }, { "loss": 359.9949, "grad_norm": 40.355411529541016, "learning_rate": 5.4667785980883897e-05, "epoch": 0.7848036389262605, "step": 875 }, { "loss": 351.2752, "grad_norm": 36.727745056152344, "learning_rate": 5.422854323457527e-05, "epoch": 0.7857005573707476, "step": 876 }, { "loss": 352.9948, "grad_norm": 37.40601348876953, "learning_rate": 5.379085745182721e-05, "epoch": 0.7865974758152348, "step": 877 }, { "loss": 357.7682, "grad_norm": 36.147159576416016, "learning_rate": 5.335473211353942e-05, "epoch": 0.787494394259722, "step": 878 }, { "loss": 360.3233, "grad_norm": 36.26030349731445, "learning_rate": 5.29201706882014e-05, "epoch": 0.7883913127042091, "step": 879 }, { "loss": 354.8234, "grad_norm": 34.958744049072266, "learning_rate": 5.2487176631865114e-05, "epoch": 0.7892882311486963, "step": 880 }, { "loss": 358.086, "grad_norm": 36.89348602294922, "learning_rate": 5.205575338811719e-05, "epoch": 0.7901851495931834, "step": 881 }, { "loss": 357.6668, "grad_norm": 39.996177673339844, "learning_rate": 5.1625904388051564e-05, "epoch": 0.7910820680376706, "step": 882 }, { "loss": 353.7882, "grad_norm": 36.440711975097656, "learning_rate": 5.119763305024225e-05, "epoch": 0.7919789864821577, "step": 883 }, { "loss": 356.1277, "grad_norm": 36.0537223815918, "learning_rate": 5.077094278071642e-05, "epoch": 0.7928759049266448, "step": 884 }, { "loss": 359.5157, "grad_norm": 35.76783752441406, "learning_rate": 5.034583697292674e-05, "epoch": 0.7937728233711321, "step": 885 }, { "loss": 353.6391, "grad_norm": 34.94169998168945, "learning_rate": 4.9922319007724954e-05, "epoch": 0.7946697418156192, "step": 886 }, { "loss": 361.0958, "grad_norm": 38.87442398071289, "learning_rate": 4.9500392253334635e-05, "epoch": 0.7955666602601064, "step": 887 }, { "loss": 357.8425, "grad_norm": 36.01359558105469, "learning_rate": 4.908006006532445e-05, "epoch": 0.7964635787045935, "step": 888 }, { "loss": 358.4057, "grad_norm": 39.11752700805664, "learning_rate": 4.866132578658172e-05, "epoch": 0.7973604971490806, "step": 889 }, { "loss": 355.1286, "grad_norm": 37.169158935546875, "learning_rate": 4.8244192747285507e-05, "epoch": 0.7982574155935678, "step": 890 }, { "loss": 356.0285, "grad_norm": 35.89703369140625, "learning_rate": 4.7828664264880254e-05, "epoch": 0.7991543340380549, "step": 891 }, { "loss": 353.9138, "grad_norm": 35.52785873413086, "learning_rate": 4.741474364404955e-05, "epoch": 0.8000512524825422, "step": 892 }, { "loss": 359.8646, "grad_norm": 35.992713928222656, "learning_rate": 4.7002434176689564e-05, "epoch": 0.8009481709270293, "step": 893 }, { "loss": 360.1763, "grad_norm": 36.50730514526367, "learning_rate": 4.659173914188319e-05, "epoch": 0.8018450893715164, "step": 894 }, { "loss": 356.7962, "grad_norm": 36.77907180786133, "learning_rate": 4.618266180587363e-05, "epoch": 0.8027420078160036, "step": 895 }, { "loss": 354.5534, "grad_norm": 36.69013214111328, "learning_rate": 4.5775205422038695e-05, "epoch": 0.8036389262604907, "step": 896 }, { "loss": 355.8555, "grad_norm": 36.079769134521484, "learning_rate": 4.536937323086479e-05, "epoch": 0.8045358447049779, "step": 897 }, { "loss": 352.4216, "grad_norm": 36.98958969116211, "learning_rate": 4.4965168459921076e-05, "epoch": 0.8054327631494651, "step": 898 }, { "loss": 354.3763, "grad_norm": 36.339656829833984, "learning_rate": 4.456259432383408e-05, "epoch": 0.8063296815939522, "step": 899 }, { "loss": 353.9048, "grad_norm": 35.602909088134766, "learning_rate": 4.4161654024261756e-05, "epoch": 0.8072266000384394, "step": 900 }, { "eval_loss": 1.581258773803711, "eval_runtime": 19.1453, "eval_samples_per_second": 106.971, "eval_steps_per_second": 13.371, "epoch": 0.8072266000384394, "step": 900 }, { "loss": 353.9864, "grad_norm": 37.425819396972656, "learning_rate": 4.3762350749868425e-05, "epoch": 0.8081235184829265, "step": 901 }, { "loss": 352.1746, "grad_norm": 36.96770095825195, "learning_rate": 4.336468767629906e-05, "epoch": 0.8090204369274137, "step": 902 }, { "loss": 362.0162, "grad_norm": 36.64163589477539, "learning_rate": 4.296866796615406e-05, "epoch": 0.8099173553719008, "step": 903 }, { "loss": 356.8323, "grad_norm": 37.755550384521484, "learning_rate": 4.257429476896454e-05, "epoch": 0.8108142738163879, "step": 904 }, { "loss": 355.0851, "grad_norm": 35.74870300292969, "learning_rate": 4.2181571221166696e-05, "epoch": 0.8117111922608752, "step": 905 }, { "loss": 354.1617, "grad_norm": 35.670047760009766, "learning_rate": 4.179050044607713e-05, "epoch": 0.8126081107053623, "step": 906 }, { "loss": 354.9214, "grad_norm": 36.92220687866211, "learning_rate": 4.140108555386812e-05, "epoch": 0.8135050291498495, "step": 907 }, { "loss": 351.6111, "grad_norm": 38.204166412353516, "learning_rate": 4.101332964154275e-05, "epoch": 0.8144019475943366, "step": 908 }, { "loss": 355.9622, "grad_norm": 35.54768753051758, "learning_rate": 4.0627235792910224e-05, "epoch": 0.8152988660388237, "step": 909 }, { "loss": 359.8922, "grad_norm": 37.4915771484375, "learning_rate": 4.024280707856134e-05, "epoch": 0.8161957844833109, "step": 910 }, { "loss": 356.2166, "grad_norm": 36.84100341796875, "learning_rate": 3.9860046555844406e-05, "epoch": 0.8170927029277981, "step": 911 }, { "loss": 355.0562, "grad_norm": 35.636878967285156, "learning_rate": 3.947895726884038e-05, "epoch": 0.8179896213722853, "step": 912 }, { "loss": 360.0903, "grad_norm": 36.50727081298828, "learning_rate": 3.909954224833911e-05, "epoch": 0.8188865398167724, "step": 913 }, { "loss": 359.0554, "grad_norm": 37.51554489135742, "learning_rate": 3.8721804511815007e-05, "epoch": 0.8197834582612595, "step": 914 }, { "loss": 356.6491, "grad_norm": 36.2037239074707, "learning_rate": 3.834574706340302e-05, "epoch": 0.8206803767057467, "step": 915 }, { "loss": 357.358, "grad_norm": 39.62883758544922, "learning_rate": 3.797137289387503e-05, "epoch": 0.8215772951502338, "step": 916 }, { "loss": 356.6225, "grad_norm": 35.792728424072266, "learning_rate": 3.7598684980615694e-05, "epoch": 0.822474213594721, "step": 917 }, { "loss": 351.0151, "grad_norm": 35.77069854736328, "learning_rate": 3.7227686287598874e-05, "epoch": 0.8233711320392082, "step": 918 }, { "loss": 356.1569, "grad_norm": 36.655330657958984, "learning_rate": 3.685837976536435e-05, "epoch": 0.8242680504836953, "step": 919 }, { "loss": 356.6186, "grad_norm": 35.82206726074219, "learning_rate": 3.649076835099399e-05, "epoch": 0.8251649689281825, "step": 920 }, { "loss": 352.9849, "grad_norm": 36.314361572265625, "learning_rate": 3.612485496808843e-05, "epoch": 0.8260618873726696, "step": 921 }, { "loss": 355.4819, "grad_norm": 37.96638870239258, "learning_rate": 3.57606425267441e-05, "epoch": 0.8269588058171568, "step": 922 }, { "loss": 358.6233, "grad_norm": 36.10899353027344, "learning_rate": 3.539813392352989e-05, "epoch": 0.8278557242616439, "step": 923 }, { "loss": 353.3172, "grad_norm": 34.54022216796875, "learning_rate": 3.5037332041464e-05, "epoch": 0.828752642706131, "step": 924 }, { "loss": 357.7184, "grad_norm": 36.95024108886719, "learning_rate": 3.467823974999115e-05, "epoch": 0.8296495611506183, "step": 925 }, { "loss": 352.9876, "grad_norm": 37.89804458618164, "learning_rate": 3.4320859904959924e-05, "epoch": 0.8305464795951054, "step": 926 }, { "loss": 354.4651, "grad_norm": 36.63965606689453, "learning_rate": 3.3965195348599626e-05, "epoch": 0.8314433980395926, "step": 927 }, { "loss": 356.9139, "grad_norm": 35.67973709106445, "learning_rate": 3.361124890949816e-05, "epoch": 0.8323403164840797, "step": 928 }, { "loss": 358.1943, "grad_norm": 35.843719482421875, "learning_rate": 3.325902340257914e-05, "epoch": 0.8332372349285668, "step": 929 }, { "loss": 352.4489, "grad_norm": 36.6231803894043, "learning_rate": 3.2908521629079704e-05, "epoch": 0.834134153373054, "step": 930 }, { "loss": 350.1209, "grad_norm": 34.934112548828125, "learning_rate": 3.255974637652828e-05, "epoch": 0.8350310718175412, "step": 931 }, { "loss": 356.8803, "grad_norm": 34.707252502441406, "learning_rate": 3.2212700418722265e-05, "epoch": 0.8359279902620284, "step": 932 }, { "loss": 356.7214, "grad_norm": 35.543949127197266, "learning_rate": 3.186738651570595e-05, "epoch": 0.8368249087065155, "step": 933 }, { "loss": 354.0534, "grad_norm": 35.74333572387695, "learning_rate": 3.1523807413748887e-05, "epoch": 0.8377218271510026, "step": 934 }, { "loss": 350.9949, "grad_norm": 36.81149673461914, "learning_rate": 3.118196584532359e-05, "epoch": 0.8386187455954898, "step": 935 }, { "loss": 355.0341, "grad_norm": 36.43380355834961, "learning_rate": 3.084186452908411e-05, "epoch": 0.8395156640399769, "step": 936 }, { "loss": 357.6827, "grad_norm": 35.787872314453125, "learning_rate": 3.0503506169844373e-05, "epoch": 0.840412582484464, "step": 937 }, { "loss": 353.5415, "grad_norm": 35.96485137939453, "learning_rate": 3.0166893458556666e-05, "epoch": 0.8413095009289513, "step": 938 }, { "loss": 357.3773, "grad_norm": 33.9022216796875, "learning_rate": 2.983202907228999e-05, "epoch": 0.8422064193734384, "step": 939 }, { "loss": 355.6847, "grad_norm": 36.94380187988281, "learning_rate": 2.949891567420923e-05, "epoch": 0.8431033378179256, "step": 940 }, { "loss": 352.4488, "grad_norm": 36.33073043823242, "learning_rate": 2.9167555913553577e-05, "epoch": 0.8440002562624127, "step": 941 }, { "loss": 355.2479, "grad_norm": 34.81533432006836, "learning_rate": 2.88379524256156e-05, "epoch": 0.8448971747068998, "step": 942 }, { "loss": 359.0098, "grad_norm": 34.85913848876953, "learning_rate": 2.8510107831720393e-05, "epoch": 0.845794093151387, "step": 943 }, { "loss": 355.3041, "grad_norm": 35.2500114440918, "learning_rate": 2.8184024739204534e-05, "epoch": 0.8466910115958741, "step": 944 }, { "loss": 357.6105, "grad_norm": 36.625144958496094, "learning_rate": 2.7859705741395403e-05, "epoch": 0.8475879300403614, "step": 945 }, { "loss": 355.7482, "grad_norm": 34.630428314208984, "learning_rate": 2.7537153417590803e-05, "epoch": 0.8484848484848485, "step": 946 }, { "loss": 358.0374, "grad_norm": 35.17256164550781, "learning_rate": 2.721637033303803e-05, "epoch": 0.8493817669293356, "step": 947 }, { "loss": 352.4902, "grad_norm": 36.90748596191406, "learning_rate": 2.6897359038913716e-05, "epoch": 0.8502786853738228, "step": 948 }, { "loss": 356.3272, "grad_norm": 35.69559097290039, "learning_rate": 2.6580122072303647e-05, "epoch": 0.8511756038183099, "step": 949 }, { "loss": 351.9118, "grad_norm": 34.44248580932617, "learning_rate": 2.6264661956182212e-05, "epoch": 0.8520725222627971, "step": 950 }, { "eval_loss": 1.5959553718566895, "eval_runtime": 18.4817, "eval_samples_per_second": 110.812, "eval_steps_per_second": 13.852, "epoch": 0.8520725222627971, "step": 950 }, { "loss": 356.2447, "grad_norm": 34.08928680419922, "learning_rate": 2.5950981199392847e-05, "epoch": 0.8529694407072843, "step": 951 }, { "loss": 357.2951, "grad_norm": 35.93143844604492, "learning_rate": 2.5639082296627537e-05, "epoch": 0.8538663591517714, "step": 952 }, { "loss": 357.1935, "grad_norm": 34.351898193359375, "learning_rate": 2.5328967728407454e-05, "epoch": 0.8547632775962586, "step": 953 }, { "loss": 352.3139, "grad_norm": 36.010223388671875, "learning_rate": 2.5020639961062853e-05, "epoch": 0.8556601960407457, "step": 954 }, { "loss": 356.4665, "grad_norm": 34.825042724609375, "learning_rate": 2.4714101446713793e-05, "epoch": 0.8565571144852329, "step": 955 }, { "loss": 354.6561, "grad_norm": 35.965755462646484, "learning_rate": 2.4409354623250307e-05, "epoch": 0.85745403292972, "step": 956 }, { "loss": 350.8446, "grad_norm": 34.73567199707031, "learning_rate": 2.4106401914313238e-05, "epoch": 0.8583509513742071, "step": 957 }, { "loss": 357.6875, "grad_norm": 34.63365936279297, "learning_rate": 2.3805245729274947e-05, "epoch": 0.8592478698186944, "step": 958 }, { "loss": 352.3867, "grad_norm": 37.33460235595703, "learning_rate": 2.3505888463220047e-05, "epoch": 0.8601447882631815, "step": 959 }, { "loss": 357.7318, "grad_norm": 35.54653549194336, "learning_rate": 2.3208332496926387e-05, "epoch": 0.8610417067076687, "step": 960 }, { "loss": 356.5225, "grad_norm": 34.780433654785156, "learning_rate": 2.2912580196846222e-05, "epoch": 0.8619386251521558, "step": 961 }, { "loss": 358.1692, "grad_norm": 37.751983642578125, "learning_rate": 2.2618633915087282e-05, "epoch": 0.8628355435966429, "step": 962 }, { "loss": 359.3351, "grad_norm": 35.848167419433594, "learning_rate": 2.2326495989393985e-05, "epoch": 0.8637324620411301, "step": 963 }, { "loss": 354.9636, "grad_norm": 34.292728424072266, "learning_rate": 2.203616874312919e-05, "epoch": 0.8646293804856173, "step": 964 }, { "loss": 350.5273, "grad_norm": 35.46641540527344, "learning_rate": 2.174765448525523e-05, "epoch": 0.8655262989301045, "step": 965 }, { "loss": 355.4344, "grad_norm": 34.72315979003906, "learning_rate": 2.1460955510315962e-05, "epoch": 0.8664232173745916, "step": 966 }, { "loss": 353.3275, "grad_norm": 36.16691589355469, "learning_rate": 2.1176074098418402e-05, "epoch": 0.8673201358190787, "step": 967 }, { "loss": 355.2486, "grad_norm": 36.415794372558594, "learning_rate": 2.0893012515214388e-05, "epoch": 0.8682170542635659, "step": 968 }, { "loss": 355.4182, "grad_norm": 35.465538024902344, "learning_rate": 2.06117730118828e-05, "epoch": 0.869113972708053, "step": 969 }, { "loss": 354.304, "grad_norm": 35.425926208496094, "learning_rate": 2.0332357825111668e-05, "epoch": 0.8700108911525402, "step": 970 }, { "loss": 351.7629, "grad_norm": 34.78888702392578, "learning_rate": 2.0054769177080185e-05, "epoch": 0.8709078095970274, "step": 971 }, { "loss": 358.8823, "grad_norm": 35.0769157409668, "learning_rate": 1.97790092754411e-05, "epoch": 0.8718047280415145, "step": 972 }, { "loss": 353.2525, "grad_norm": 35.73164749145508, "learning_rate": 1.9505080313303365e-05, "epoch": 0.8727016464860017, "step": 973 }, { "loss": 355.5436, "grad_norm": 35.51607894897461, "learning_rate": 1.9232984469214453e-05, "epoch": 0.8735985649304888, "step": 974 }, { "loss": 353.8528, "grad_norm": 35.09918975830078, "learning_rate": 1.8962723907143044e-05, "epoch": 0.874495483374976, "step": 975 }, { "loss": 358.7514, "grad_norm": 36.12480926513672, "learning_rate": 1.869430077646203e-05, "epoch": 0.8753924018194631, "step": 976 }, { "loss": 354.3459, "grad_norm": 34.32866287231445, "learning_rate": 1.8427717211931177e-05, "epoch": 0.8762893202639502, "step": 977 }, { "loss": 350.5236, "grad_norm": 35.1101188659668, "learning_rate": 1.816297533368022e-05, "epoch": 0.8771862387084375, "step": 978 }, { "loss": 353.4749, "grad_norm": 36.59587478637695, "learning_rate": 1.7900077247192087e-05, "epoch": 0.8780831571529246, "step": 979 }, { "loss": 353.3892, "grad_norm": 34.86069869995117, "learning_rate": 1.7639025043286155e-05, "epoch": 0.8789800755974118, "step": 980 }, { "loss": 354.1761, "grad_norm": 35.580291748046875, "learning_rate": 1.7379820798101383e-05, "epoch": 0.8798769940418989, "step": 981 }, { "loss": 355.6291, "grad_norm": 34.58673095703125, "learning_rate": 1.7122466573080196e-05, "epoch": 0.880773912486386, "step": 982 }, { "loss": 357.7327, "grad_norm": 33.76737976074219, "learning_rate": 1.6866964414951698e-05, "epoch": 0.8816708309308732, "step": 983 }, { "loss": 355.4995, "grad_norm": 34.57607650756836, "learning_rate": 1.6613316355715558e-05, "epoch": 0.8825677493753604, "step": 984 }, { "loss": 357.9588, "grad_norm": 34.49372100830078, "learning_rate": 1.6361524412626088e-05, "epoch": 0.8834646678198476, "step": 985 }, { "loss": 357.0802, "grad_norm": 34.17061996459961, "learning_rate": 1.611159058817571e-05, "epoch": 0.8843615862643347, "step": 986 }, { "loss": 354.1526, "grad_norm": 36.93791198730469, "learning_rate": 1.5863516870079418e-05, "epoch": 0.8852585047088218, "step": 987 }, { "loss": 358.1216, "grad_norm": 35.566646575927734, "learning_rate": 1.5617305231258898e-05, "epoch": 0.886155423153309, "step": 988 }, { "loss": 351.2595, "grad_norm": 35.77732467651367, "learning_rate": 1.5372957629826655e-05, "epoch": 0.8870523415977961, "step": 989 }, { "loss": 353.016, "grad_norm": 37.376441955566406, "learning_rate": 1.513047600907061e-05, "epoch": 0.8879492600422833, "step": 990 }, { "loss": 352.4042, "grad_norm": 34.55933380126953, "learning_rate": 1.4889862297438688e-05, "epoch": 0.8888461784867705, "step": 991 }, { "loss": 352.0331, "grad_norm": 34.30587387084961, "learning_rate": 1.4651118408523317e-05, "epoch": 0.8897430969312576, "step": 992 }, { "loss": 356.2885, "grad_norm": 34.28126525878906, "learning_rate": 1.4414246241046286e-05, "epoch": 0.8906400153757448, "step": 993 }, { "loss": 356.9485, "grad_norm": 35.106529235839844, "learning_rate": 1.4179247678843681e-05, "epoch": 0.8915369338202319, "step": 994 }, { "loss": 357.6618, "grad_norm": 33.811737060546875, "learning_rate": 1.3946124590850901e-05, "epoch": 0.892433852264719, "step": 995 }, { "loss": 361.4888, "grad_norm": 33.41731643676758, "learning_rate": 1.3714878831087657e-05, "epoch": 0.8933307707092062, "step": 996 }, { "loss": 358.7178, "grad_norm": 34.46256637573242, "learning_rate": 1.3485512238643499e-05, "epoch": 0.8942276891536933, "step": 997 }, { "loss": 357.5736, "grad_norm": 35.067893981933594, "learning_rate": 1.3258026637662846e-05, "epoch": 0.8951246075981806, "step": 998 }, { "loss": 353.149, "grad_norm": 34.04292678833008, "learning_rate": 1.3032423837330748e-05, "epoch": 0.8960215260426677, "step": 999 }, { "loss": 356.1142, "grad_norm": 34.39286422729492, "learning_rate": 1.2808705631858459e-05, "epoch": 0.8969184444871549, "step": 1000 }, { "eval_loss": 1.586561918258667, "eval_runtime": 20.2668, "eval_samples_per_second": 101.052, "eval_steps_per_second": 12.631, "epoch": 0.8969184444871549, "step": 1000 }, { "loss": 354.0248, "grad_norm": 36.2171516418457, "learning_rate": 1.2586873800468996e-05, "epoch": 0.897815362931642, "step": 1001 }, { "loss": 362.0434, "grad_norm": 34.42704391479492, "learning_rate": 1.2366930107383156e-05, "epoch": 0.8987122813761291, "step": 1002 }, { "loss": 354.9637, "grad_norm": 34.4918212890625, "learning_rate": 1.2148876301805528e-05, "epoch": 0.8996091998206163, "step": 1003 }, { "loss": 348.8729, "grad_norm": 34.57630157470703, "learning_rate": 1.1932714117910386e-05, "epoch": 0.9005061182651035, "step": 1004 }, { "loss": 352.9299, "grad_norm": 35.46476745605469, "learning_rate": 1.171844527482796e-05, "epoch": 0.9014030367095907, "step": 1005 }, { "loss": 355.247, "grad_norm": 34.4285888671875, "learning_rate": 1.1506071476630964e-05, "epoch": 0.9022999551540778, "step": 1006 }, { "loss": 352.168, "grad_norm": 34.935569763183594, "learning_rate": 1.1295594412320754e-05, "epoch": 0.9031968735985649, "step": 1007 }, { "loss": 357.9673, "grad_norm": 33.162166595458984, "learning_rate": 1.1087015755814084e-05, "epoch": 0.9040937920430521, "step": 1008 }, { "loss": 350.8712, "grad_norm": 34.0540657043457, "learning_rate": 1.088033716592976e-05, "epoch": 0.9049907104875392, "step": 1009 }, { "loss": 356.8466, "grad_norm": 33.83312225341797, "learning_rate": 1.0675560286375369e-05, "epoch": 0.9058876289320263, "step": 1010 }, { "loss": 353.7512, "grad_norm": 34.7866096496582, "learning_rate": 1.0472686745734233e-05, "epoch": 0.9067845473765136, "step": 1011 }, { "loss": 354.8209, "grad_norm": 34.10197067260742, "learning_rate": 1.027171815745262e-05, "epoch": 0.9076814658210007, "step": 1012 }, { "loss": 354.7816, "grad_norm": 34.292598724365234, "learning_rate": 1.0072656119826662e-05, "epoch": 0.9085783842654879, "step": 1013 }, { "loss": 356.8245, "grad_norm": 34.5960693359375, "learning_rate": 9.875502215989791e-06, "epoch": 0.909475302709975, "step": 1014 }, { "loss": 353.8681, "grad_norm": 33.786537170410156, "learning_rate": 9.680258013900129e-06, "epoch": 0.9103722211544621, "step": 1015 }, { "loss": 355.527, "grad_norm": 35.2137565612793, "learning_rate": 9.486925066327978e-06, "epoch": 0.9112691395989493, "step": 1016 }, { "loss": 352.3827, "grad_norm": 34.659767150878906, "learning_rate": 9.295504910843522e-06, "epoch": 0.9121660580434365, "step": 1017 }, { "loss": 355.3458, "grad_norm": 33.41202926635742, "learning_rate": 9.10599906980461e-06, "epoch": 0.9130629764879237, "step": 1018 }, { "loss": 357.3716, "grad_norm": 32.52941131591797, "learning_rate": 8.91840905034455e-06, "epoch": 0.9139598949324108, "step": 1019 }, { "loss": 354.1408, "grad_norm": 33.926963806152344, "learning_rate": 8.732736344360198e-06, "epoch": 0.914856813376898, "step": 1020 }, { "loss": 357.4122, "grad_norm": 33.29584503173828, "learning_rate": 8.548982428500163e-06, "epoch": 0.9157537318213851, "step": 1021 }, { "loss": 356.5175, "grad_norm": 35.51197814941406, "learning_rate": 8.367148764152843e-06, "epoch": 0.9166506502658722, "step": 1022 }, { "loss": 361.666, "grad_norm": 35.082054138183594, "learning_rate": 8.187236797435077e-06, "epoch": 0.9175475687103594, "step": 1023 }, { "loss": 350.1344, "grad_norm": 34.95941925048828, "learning_rate": 8.009247959180482e-06, "epoch": 0.9184444871548466, "step": 1024 }, { "loss": 359.1797, "grad_norm": 34.81248474121094, "learning_rate": 7.833183664928023e-06, "epoch": 0.9193414055993337, "step": 1025 }, { "loss": 352.5403, "grad_norm": 34.408485412597656, "learning_rate": 7.659045314910879e-06, "epoch": 0.9202383240438209, "step": 1026 }, { "loss": 353.7971, "grad_norm": 34.32902526855469, "learning_rate": 7.486834294045286e-06, "epoch": 0.921135242488308, "step": 1027 }, { "loss": 352.8156, "grad_norm": 33.39252471923828, "learning_rate": 7.316551971919522e-06, "epoch": 0.9220321609327952, "step": 1028 }, { "loss": 355.1404, "grad_norm": 35.65606689453125, "learning_rate": 7.148199702782854e-06, "epoch": 0.9229290793772823, "step": 1029 }, { "loss": 358.3244, "grad_norm": 35.14055252075195, "learning_rate": 6.981778825535079e-06, "epoch": 0.9238259978217694, "step": 1030 }, { "loss": 356.6115, "grad_norm": 32.90983581542969, "learning_rate": 6.817290663715614e-06, "epoch": 0.9247229162662567, "step": 1031 }, { "loss": 354.6003, "grad_norm": 33.653778076171875, "learning_rate": 6.654736525493033e-06, "epoch": 0.9256198347107438, "step": 1032 }, { "loss": 356.817, "grad_norm": 35.58637619018555, "learning_rate": 6.494117703654739e-06, "epoch": 0.926516753155231, "step": 1033 }, { "loss": 355.3286, "grad_norm": 33.73952102661133, "learning_rate": 6.335435475596646e-06, "epoch": 0.9274136715997181, "step": 1034 }, { "loss": 355.2651, "grad_norm": 33.62116241455078, "learning_rate": 6.1786911033129e-06, "epoch": 0.9283105900442052, "step": 1035 }, { "loss": 357.9323, "grad_norm": 33.39925003051758, "learning_rate": 6.023885833386061e-06, "epoch": 0.9292075084886924, "step": 1036 }, { "loss": 351.2944, "grad_norm": 34.47417068481445, "learning_rate": 5.87102089697708e-06, "epoch": 0.9301044269331796, "step": 1037 }, { "loss": 355.5925, "grad_norm": 33.980857849121094, "learning_rate": 5.720097509815392e-06, "epoch": 0.9310013453776668, "step": 1038 }, { "loss": 355.6397, "grad_norm": 32.85739517211914, "learning_rate": 5.571116872189475e-06, "epoch": 0.9318982638221539, "step": 1039 }, { "loss": 355.7616, "grad_norm": 33.64262390136719, "learning_rate": 5.424080168937112e-06, "epoch": 0.932795182266641, "step": 1040 }, { "loss": 357.7719, "grad_norm": 34.275169372558594, "learning_rate": 5.278988569436066e-06, "epoch": 0.9336921007111282, "step": 1041 }, { "loss": 357.6499, "grad_norm": 34.75218963623047, "learning_rate": 5.1358432275947775e-06, "epoch": 0.9345890191556153, "step": 1042 }, { "loss": 353.3368, "grad_norm": 34.046241760253906, "learning_rate": 4.994645281843152e-06, "epoch": 0.9354859376001025, "step": 1043 }, { "loss": 354.6295, "grad_norm": 34.62663269042969, "learning_rate": 4.855395855123512e-06, "epoch": 0.9363828560445897, "step": 1044 }, { "loss": 352.3897, "grad_norm": 35.12565231323242, "learning_rate": 4.718096054881688e-06, "epoch": 0.9372797744890768, "step": 1045 }, { "loss": 352.5993, "grad_norm": 33.51365661621094, "learning_rate": 4.582746973058216e-06, "epoch": 0.938176692933564, "step": 1046 }, { "loss": 354.0611, "grad_norm": 33.32587814331055, "learning_rate": 4.449349686079574e-06, "epoch": 0.9390736113780511, "step": 1047 }, { "loss": 361.4709, "grad_norm": 35.336490631103516, "learning_rate": 4.317905254849791e-06, "epoch": 0.9399705298225383, "step": 1048 }, { "loss": 360.2202, "grad_norm": 34.51678466796875, "learning_rate": 4.188414724741768e-06, "epoch": 0.9408674482670254, "step": 1049 }, { "loss": 354.1904, "grad_norm": 34.459373474121094, "learning_rate": 4.060879125589195e-06, "epoch": 0.9417643667115125, "step": 1050 }, { "eval_loss": 1.5787107944488525, "eval_runtime": 18.3575, "eval_samples_per_second": 111.562, "eval_steps_per_second": 13.945, "epoch": 0.9417643667115125, "step": 1050 }, { "loss": 353.3853, "grad_norm": 33.25263214111328, "learning_rate": 3.9352994716783105e-06, "epoch": 0.9426612851559998, "step": 1051 }, { "loss": 350.3391, "grad_norm": 35.57413101196289, "learning_rate": 3.8116767617396298e-06, "epoch": 0.9435582036004869, "step": 1052 }, { "loss": 356.2869, "grad_norm": 33.38325881958008, "learning_rate": 3.690011978940255e-06, "epoch": 0.9444551220449741, "step": 1053 }, { "loss": 356.4574, "grad_norm": 34.5271110534668, "learning_rate": 3.570306090876024e-06, "epoch": 0.9453520404894612, "step": 1054 }, { "loss": 359.7423, "grad_norm": 35.02552795410156, "learning_rate": 3.4525600495636246e-06, "epoch": 0.9462489589339483, "step": 1055 }, { "loss": 353.1874, "grad_norm": 35.6952018737793, "learning_rate": 3.3367747914331838e-06, "epoch": 0.9471458773784355, "step": 1056 }, { "loss": 355.9973, "grad_norm": 35.45086669921875, "learning_rate": 3.222951237320915e-06, "epoch": 0.9480427958229227, "step": 1057 }, { "loss": 355.2783, "grad_norm": 32.976966857910156, "learning_rate": 3.1110902924615102e-06, "epoch": 0.9489397142674099, "step": 1058 }, { "loss": 358.506, "grad_norm": 34.06571960449219, "learning_rate": 3.0011928464811213e-06, "epoch": 0.949836632711897, "step": 1059 }, { "loss": 358.1763, "grad_norm": 33.59235382080078, "learning_rate": 2.8932597733903886e-06, "epoch": 0.9507335511563841, "step": 1060 }, { "loss": 357.5705, "grad_norm": 32.182106018066406, "learning_rate": 2.7872919315772017e-06, "epoch": 0.9516304696008713, "step": 1061 }, { "loss": 354.619, "grad_norm": 35.46062469482422, "learning_rate": 2.683290163800145e-06, "epoch": 0.9525273880453584, "step": 1062 }, { "loss": 350.0426, "grad_norm": 32.130767822265625, "learning_rate": 2.581255297181617e-06, "epoch": 0.9534243064898456, "step": 1063 }, { "loss": 351.98, "grad_norm": 32.878875732421875, "learning_rate": 2.4811881432013905e-06, "epoch": 0.9543212249343328, "step": 1064 }, { "loss": 353.1487, "grad_norm": 33.90510559082031, "learning_rate": 2.3830894976899774e-06, "epoch": 0.9552181433788199, "step": 1065 }, { "loss": 357.164, "grad_norm": 34.16891860961914, "learning_rate": 2.2869601408225805e-06, "epoch": 0.9561150618233071, "step": 1066 }, { "loss": 351.2288, "grad_norm": 33.57730484008789, "learning_rate": 2.1928008371125406e-06, "epoch": 0.9570119802677942, "step": 1067 }, { "loss": 356.0024, "grad_norm": 33.691978454589844, "learning_rate": 2.1006123354055384e-06, "epoch": 0.9579088987122814, "step": 1068 }, { "loss": 361.7596, "grad_norm": 33.60329055786133, "learning_rate": 2.0103953688734853e-06, "epoch": 0.9588058171567685, "step": 1069 }, { "loss": 354.5997, "grad_norm": 35.25307083129883, "learning_rate": 1.9221506550088365e-06, "epoch": 0.9597027356012557, "step": 1070 }, { "loss": 355.2119, "grad_norm": 34.94419860839844, "learning_rate": 1.83587889561862e-06, "epoch": 0.9605996540457429, "step": 1071 }, { "loss": 355.9485, "grad_norm": 34.35773468017578, "learning_rate": 1.7515807768192228e-06, "epoch": 0.96149657249023, "step": 1072 }, { "loss": 353.5008, "grad_norm": 33.7717170715332, "learning_rate": 1.6692569690305859e-06, "epoch": 0.9623934909347172, "step": 1073 }, { "loss": 357.9717, "grad_norm": 35.07488250732422, "learning_rate": 1.5889081269710726e-06, "epoch": 0.9632904093792043, "step": 1074 }, { "loss": 361.8947, "grad_norm": 34.685150146484375, "learning_rate": 1.5105348896522486e-06, "epoch": 0.9641873278236914, "step": 1075 }, { "loss": 357.5904, "grad_norm": 34.1632080078125, "learning_rate": 1.4341378803737204e-06, "epoch": 0.9650842462681786, "step": 1076 }, { "loss": 357.5146, "grad_norm": 34.23555374145508, "learning_rate": 1.3597177067181943e-06, "epoch": 0.9659811647126658, "step": 1077 }, { "loss": 356.91, "grad_norm": 32.962257385253906, "learning_rate": 1.2872749605468137e-06, "epoch": 0.966878083157153, "step": 1078 }, { "loss": 351.4866, "grad_norm": 34.07936096191406, "learning_rate": 1.2168102179941076e-06, "epoch": 0.9677750016016401, "step": 1079 }, { "loss": 355.5893, "grad_norm": 33.35137939453125, "learning_rate": 1.1483240394637717e-06, "epoch": 0.9686719200461272, "step": 1080 }, { "loss": 355.4586, "grad_norm": 34.09134292602539, "learning_rate": 1.0818169696239776e-06, "epoch": 0.9695688384906144, "step": 1081 }, { "loss": 354.5378, "grad_norm": 32.67642593383789, "learning_rate": 1.0172895374031265e-06, "epoch": 0.9704657569351015, "step": 1082 }, { "loss": 354.3784, "grad_norm": 32.6947021484375, "learning_rate": 9.5474225598563e-07, "epoch": 0.9713626753795886, "step": 1083 }, { "loss": 355.8788, "grad_norm": 33.51148986816406, "learning_rate": 8.941756228078579e-07, "epoch": 0.9722595938240759, "step": 1084 }, { "loss": 353.8372, "grad_norm": 33.57039260864258, "learning_rate": 8.35590119554086e-07, "epoch": 0.973156512268563, "step": 1085 }, { "loss": 353.2452, "grad_norm": 33.60462188720703, "learning_rate": 7.789862121528324e-07, "epoch": 0.9740534307130502, "step": 1086 }, { "loss": 357.0675, "grad_norm": 33.704349517822266, "learning_rate": 7.243643507729436e-07, "epoch": 0.9749503491575373, "step": 1087 }, { "loss": 354.5553, "grad_norm": 34.90256881713867, "learning_rate": 6.717249698202088e-07, "epoch": 0.9758472676020244, "step": 1088 }, { "loss": 349.4813, "grad_norm": 34.148128509521484, "learning_rate": 6.210684879337513e-07, "epoch": 0.9767441860465116, "step": 1089 }, { "loss": 357.7331, "grad_norm": 34.612762451171875, "learning_rate": 5.72395307982837e-07, "epoch": 0.9776411044909988, "step": 1090 }, { "loss": 358.809, "grad_norm": 32.881195068359375, "learning_rate": 5.257058170635709e-07, "epoch": 0.978538022935486, "step": 1091 }, { "loss": 356.2231, "grad_norm": 32.4294319152832, "learning_rate": 4.810003864958168e-07, "epoch": 0.9794349413799731, "step": 1092 }, { "loss": 354.6883, "grad_norm": 35.39781951904297, "learning_rate": 4.3827937182033815e-07, "epoch": 0.9803318598244602, "step": 1093 }, { "loss": 352.7607, "grad_norm": 34.17608642578125, "learning_rate": 3.9754311279582844e-07, "epoch": 0.9812287782689474, "step": 1094 }, { "loss": 353.8497, "grad_norm": 31.340768814086914, "learning_rate": 3.587919333963574e-07, "epoch": 0.9821256967134345, "step": 1095 }, { "loss": 357.9939, "grad_norm": 33.75115966796875, "learning_rate": 3.2202614180870673e-07, "epoch": 0.9830226151579217, "step": 1096 }, { "loss": 356.0656, "grad_norm": 32.56006622314453, "learning_rate": 2.872460304299274e-07, "epoch": 0.9839195336024089, "step": 1097 }, { "loss": 353.62, "grad_norm": 34.134193420410156, "learning_rate": 2.5445187586503603e-07, "epoch": 0.984816452046896, "step": 1098 }, { "loss": 355.838, "grad_norm": 34.15678024291992, "learning_rate": 2.2364393892479462e-07, "epoch": 0.9857133704913832, "step": 1099 }, { "loss": 358.3669, "grad_norm": 32.837039947509766, "learning_rate": 1.9482246462365626e-07, "epoch": 0.9866102889358703, "step": 1100 }, { "eval_loss": 1.5716547966003418, "eval_runtime": 18.217, "eval_samples_per_second": 112.422, "eval_steps_per_second": 14.053, "epoch": 0.9866102889358703, "step": 1100 }, { "loss": 356.8408, "grad_norm": 33.33000183105469, "learning_rate": 1.6798768217776706e-07, "epoch": 0.9875072073803575, "step": 1101 }, { "loss": 356.4636, "grad_norm": 34.879573822021484, "learning_rate": 1.4313980500327283e-07, "epoch": 0.9884041258248446, "step": 1102 }, { "loss": 356.378, "grad_norm": 33.825469970703125, "learning_rate": 1.2027903071440415e-07, "epoch": 0.9893010442693317, "step": 1103 }, { "loss": 359.4078, "grad_norm": 34.18437957763672, "learning_rate": 9.94055411221717e-08, "epoch": 0.990197962713819, "step": 1104 }, { "loss": 356.8303, "grad_norm": 35.02104187011719, "learning_rate": 8.051950223267323e-08, "epoch": 0.9910948811583061, "step": 1105 }, { "loss": 351.9132, "grad_norm": 33.7501220703125, "learning_rate": 6.362106424590009e-08, "epoch": 0.9919917996027933, "step": 1106 }, { "loss": 356.2349, "grad_norm": 34.74052810668945, "learning_rate": 4.871036155454367e-08, "epoch": 0.9928887180472804, "step": 1107 }, { "loss": 357.3864, "grad_norm": 33.26545715332031, "learning_rate": 3.578751274294079e-08, "epoch": 0.9937856364917675, "step": 1108 }, { "loss": 358.4432, "grad_norm": 33.61418914794922, "learning_rate": 2.4852620586046647e-08, "epoch": 0.9946825549362547, "step": 1109 }, { "loss": 356.3781, "grad_norm": 33.90690612792969, "learning_rate": 1.5905772048629975e-08, "epoch": 0.9955794733807419, "step": 1110 }, { "loss": 355.2562, "grad_norm": 36.185489654541016, "learning_rate": 8.947038284717879e-09, "epoch": 0.9964763918252291, "step": 1111 }, { "loss": 353.4495, "grad_norm": 35.645416259765625, "learning_rate": 3.976474636874228e-09, "epoch": 0.9973733102697162, "step": 1112 }, { "loss": 358.9317, "grad_norm": 34.38767623901367, "learning_rate": 9.941206357555465e-10, "epoch": 0.9982702287142033, "step": 1113 }, { "loss": 355.1901, "grad_norm": 33.96023941040039, "learning_rate": 0.0, "epoch": 0.9991671471586905, "step": 1114 }, { "train_runtime": 10703.3349, "train_samples_per_second": 186.666, "train_steps_per_second": 0.104, "total_flos": 6.811715592467251e+17, "train_loss": 100.33408414611269, "epoch": 0.9991671471586905, "step": 1114 }, { "eval_loss": 1.585738182067871, "eval_runtime": 19.5932, "eval_samples_per_second": 104.526, "eval_steps_per_second": 13.066, "epoch": 0.9991671471586905, "step": 1114 } ], "best_metric": null, "best_model_checkpoint": null, "is_local_process_zero": true, "is_world_process_zero": true, "is_hyper_param_search": false, "trial_name": null, "trial_params": null, "stateful_callbacks": { "TrainerControl": { "args": { "should_training_stop": true, "should_epoch_stop": false, "should_save": true, "should_evaluate": false, "should_log": false }, "attributes": {} } } }