{ "best_metric": 0.6575854420661926, "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256", "epoch": 10.0, "eval_steps": 1.0, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 0.5230235555406132, "learning_rate": 0.0, "loss": 1.5809, "step": 1 }, { "epoch": 0.03125, "eval_loss": 1.6275018453598022, "eval_runtime": 82.059, "eval_samples_per_second": 2.437, "eval_steps_per_second": 0.305, "step": 1 }, { "epoch": 0.0625, "grad_norm": 0.5095402010892089, "learning_rate": 2e-05, "loss": 1.4958, "step": 2 }, { "epoch": 0.0625, "eval_loss": 1.6275018453598022, "eval_runtime": 76.5747, "eval_samples_per_second": 2.612, "eval_steps_per_second": 0.326, "step": 2 }, { "epoch": 0.09375, "grad_norm": 0.4998514282504938, "learning_rate": 2e-05, "loss": 1.5552, "step": 3 }, { "epoch": 0.09375, "eval_loss": 1.5956931114196777, "eval_runtime": 76.1563, "eval_samples_per_second": 2.626, "eval_steps_per_second": 0.328, "step": 3 }, { "epoch": 0.125, "grad_norm": 0.4280580315108126, "learning_rate": 2e-05, "loss": 1.4846, "step": 4 }, { "epoch": 0.125, "eval_loss": 1.5584176778793335, "eval_runtime": 76.1235, "eval_samples_per_second": 2.627, "eval_steps_per_second": 0.328, "step": 4 }, { "epoch": 0.15625, "grad_norm": 0.5678499435986384, "learning_rate": 2e-05, "loss": 1.5036, "step": 5 }, { "epoch": 0.15625, "eval_loss": 1.5207562446594238, "eval_runtime": 76.1514, "eval_samples_per_second": 2.626, "eval_steps_per_second": 0.328, "step": 5 }, { "epoch": 0.1875, "grad_norm": 0.5368461657542534, "learning_rate": 2e-05, "loss": 1.476, "step": 6 }, { "epoch": 0.1875, "eval_loss": 1.4807783365249634, "eval_runtime": 77.3444, "eval_samples_per_second": 2.586, "eval_steps_per_second": 0.323, "step": 6 }, { "epoch": 0.21875, "grad_norm": 0.5549950083087136, "learning_rate": 2e-05, "loss": 1.4358, "step": 7 }, { "epoch": 0.21875, "eval_loss": 1.4411544799804688, "eval_runtime": 77.066, "eval_samples_per_second": 2.595, "eval_steps_per_second": 0.324, "step": 7 }, { "epoch": 0.25, "grad_norm": 0.5549950083087136, "learning_rate": 2e-05, "loss": 1.4369, "step": 8 }, { "epoch": 0.25, "eval_loss": 1.4411544799804688, "eval_runtime": 77.2807, "eval_samples_per_second": 2.588, "eval_steps_per_second": 0.323, "step": 8 }, { "epoch": 0.28125, "grad_norm": 0.5292240951443854, "learning_rate": 2e-05, "loss": 1.4471, "step": 9 }, { "epoch": 0.28125, "eval_loss": 1.4036556482315063, "eval_runtime": 78.1562, "eval_samples_per_second": 2.559, "eval_steps_per_second": 0.32, "step": 9 }, { "epoch": 0.3125, "grad_norm": 0.5292240951443854, "learning_rate": 2e-05, "loss": 1.3666, "step": 10 }, { "epoch": 0.3125, "eval_loss": 1.4036556482315063, "eval_runtime": 77.1645, "eval_samples_per_second": 2.592, "eval_steps_per_second": 0.324, "step": 10 }, { "epoch": 0.34375, "grad_norm": 0.5292240951443854, "learning_rate": 2e-05, "loss": 1.4149, "step": 11 }, { "epoch": 0.34375, "eval_loss": 1.4036556482315063, "eval_runtime": 78.7627, "eval_samples_per_second": 2.539, "eval_steps_per_second": 0.317, "step": 11 }, { "epoch": 0.375, "grad_norm": 0.684588966714067, "learning_rate": 2e-05, "loss": 1.3883, "step": 12 }, { "epoch": 0.375, "eval_loss": 1.3679308891296387, "eval_runtime": 78.4315, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 12 }, { "epoch": 0.40625, "grad_norm": 0.6261826769491422, "learning_rate": 2e-05, "loss": 1.4271, "step": 13 }, { "epoch": 0.40625, "eval_loss": 1.3369851112365723, "eval_runtime": 78.685, "eval_samples_per_second": 2.542, "eval_steps_per_second": 0.318, "step": 13 }, { "epoch": 0.4375, "grad_norm": 0.6261826769491422, "learning_rate": 2e-05, "loss": 1.2495, "step": 14 }, { "epoch": 0.4375, "eval_loss": 1.3369851112365723, "eval_runtime": 78.0511, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 14 }, { "epoch": 0.46875, "grad_norm": 0.6028103951693778, "learning_rate": 2e-05, "loss": 1.3513, "step": 15 }, { "epoch": 0.46875, "eval_loss": 1.3032653331756592, "eval_runtime": 78.0271, "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.32, "step": 15 }, { "epoch": 0.5, "grad_norm": 0.769290402283396, "learning_rate": 2e-05, "loss": 1.3117, "step": 16 }, { "epoch": 0.5, "eval_loss": 1.2661188840866089, "eval_runtime": 78.1857, "eval_samples_per_second": 2.558, "eval_steps_per_second": 0.32, "step": 16 }, { "epoch": 0.53125, "grad_norm": 1.3279338025863765, "learning_rate": 2e-05, "loss": 1.2768, "step": 17 }, { "epoch": 0.53125, "eval_loss": 1.2299447059631348, "eval_runtime": 78.2064, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 17 }, { "epoch": 0.5625, "grad_norm": 0.7410327159336384, "learning_rate": 2e-05, "loss": 1.256, "step": 18 }, { "epoch": 0.5625, "eval_loss": 1.2044258117675781, "eval_runtime": 78.072, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 18 }, { "epoch": 0.59375, "grad_norm": 0.44078820770408506, "learning_rate": 2e-05, "loss": 1.1252, "step": 19 }, { "epoch": 0.59375, "eval_loss": 1.1826122999191284, "eval_runtime": 78.7312, "eval_samples_per_second": 2.54, "eval_steps_per_second": 0.318, "step": 19 }, { "epoch": 0.625, "grad_norm": 0.49020841613371097, "learning_rate": 2e-05, "loss": 1.2249, "step": 20 }, { "epoch": 0.625, "eval_loss": 1.1616511344909668, "eval_runtime": 78.2736, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 20 }, { "epoch": 0.65625, "grad_norm": 0.43031322695269714, "learning_rate": 2e-05, "loss": 1.1466, "step": 21 }, { "epoch": 0.65625, "eval_loss": 1.1410629749298096, "eval_runtime": 79.6432, "eval_samples_per_second": 2.511, "eval_steps_per_second": 0.314, "step": 21 }, { "epoch": 0.6875, "grad_norm": 0.45632085445955545, "learning_rate": 2e-05, "loss": 1.1951, "step": 22 }, { "epoch": 0.6875, "eval_loss": 1.1204684972763062, "eval_runtime": 79.0609, "eval_samples_per_second": 2.53, "eval_steps_per_second": 0.316, "step": 22 }, { "epoch": 0.71875, "grad_norm": 0.40048586945364495, "learning_rate": 2e-05, "loss": 1.1826, "step": 23 }, { "epoch": 0.71875, "eval_loss": 1.1002545356750488, "eval_runtime": 82.8578, "eval_samples_per_second": 2.414, "eval_steps_per_second": 0.302, "step": 23 }, { "epoch": 0.75, "grad_norm": 0.3703033261027938, "learning_rate": 2e-05, "loss": 1.1543, "step": 24 }, { "epoch": 0.75, "eval_loss": 1.0805977582931519, "eval_runtime": 76.1407, "eval_samples_per_second": 2.627, "eval_steps_per_second": 0.328, "step": 24 }, { "epoch": 0.78125, "grad_norm": 0.3986313105418924, "learning_rate": 2e-05, "loss": 1.1046, "step": 25 }, { "epoch": 0.78125, "eval_loss": 1.0610157251358032, "eval_runtime": 76.3083, "eval_samples_per_second": 2.621, "eval_steps_per_second": 0.328, "step": 25 }, { "epoch": 0.8125, "grad_norm": 0.36265027203577943, "learning_rate": 2e-05, "loss": 1.1048, "step": 26 }, { "epoch": 0.8125, "eval_loss": 1.0421289205551147, "eval_runtime": 77.2186, "eval_samples_per_second": 2.59, "eval_steps_per_second": 0.324, "step": 26 }, { "epoch": 0.84375, "grad_norm": 0.3881748990218768, "learning_rate": 2e-05, "loss": 1.0425, "step": 27 }, { "epoch": 0.84375, "eval_loss": 1.0240073204040527, "eval_runtime": 77.8662, "eval_samples_per_second": 2.569, "eval_steps_per_second": 0.321, "step": 27 }, { "epoch": 0.875, "grad_norm": 0.3734031294324286, "learning_rate": 2e-05, "loss": 1.0484, "step": 28 }, { "epoch": 0.875, "eval_loss": 1.0066957473754883, "eval_runtime": 77.269, "eval_samples_per_second": 2.588, "eval_steps_per_second": 0.324, "step": 28 }, { "epoch": 0.90625, "grad_norm": 0.29695383079342563, "learning_rate": 2e-05, "loss": 1.0387, "step": 29 }, { "epoch": 0.90625, "eval_loss": 0.9906074404716492, "eval_runtime": 77.2245, "eval_samples_per_second": 2.59, "eval_steps_per_second": 0.324, "step": 29 }, { "epoch": 0.9375, "grad_norm": 0.29273146875026623, "learning_rate": 2e-05, "loss": 1.0568, "step": 30 }, { "epoch": 0.9375, "eval_loss": 0.975755512714386, "eval_runtime": 78.0056, "eval_samples_per_second": 2.564, "eval_steps_per_second": 0.32, "step": 30 }, { "epoch": 0.96875, "grad_norm": 0.35070440686850546, "learning_rate": 2e-05, "loss": 0.9114, "step": 31 }, { "epoch": 0.96875, "eval_loss": 0.9615123271942139, "eval_runtime": 77.9051, "eval_samples_per_second": 2.567, "eval_steps_per_second": 0.321, "step": 31 }, { "epoch": 1.0, "grad_norm": 0.30846157140439384, "learning_rate": 2e-05, "loss": 0.9941, "step": 32 }, { "epoch": 1.0, "eval_loss": 0.9480571150779724, "eval_runtime": 77.2322, "eval_samples_per_second": 2.59, "eval_steps_per_second": 0.324, "step": 32 }, { "epoch": 1.03125, "grad_norm": 0.2950381371932973, "learning_rate": 2e-05, "loss": 1.0297, "step": 33 }, { "epoch": 1.03125, "eval_loss": 0.9356330037117004, "eval_runtime": 81.8443, "eval_samples_per_second": 2.444, "eval_steps_per_second": 0.305, "step": 33 }, { "epoch": 1.0625, "grad_norm": 0.27080038065834283, "learning_rate": 2e-05, "loss": 1.021, "step": 34 }, { "epoch": 1.0625, "eval_loss": 0.9245791435241699, "eval_runtime": 76.2071, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 34 }, { "epoch": 1.09375, "grad_norm": 0.23165081252649894, "learning_rate": 2e-05, "loss": 1.0366, "step": 35 }, { "epoch": 1.09375, "eval_loss": 0.9151126146316528, "eval_runtime": 77.0412, "eval_samples_per_second": 2.596, "eval_steps_per_second": 0.325, "step": 35 }, { "epoch": 1.125, "grad_norm": 0.4033780922500775, "learning_rate": 2e-05, "loss": 1.0127, "step": 36 }, { "epoch": 1.125, "eval_loss": 0.9063960313796997, "eval_runtime": 76.9327, "eval_samples_per_second": 2.6, "eval_steps_per_second": 0.325, "step": 36 }, { "epoch": 1.15625, "grad_norm": 0.2398039831439168, "learning_rate": 2e-05, "loss": 0.9418, "step": 37 }, { "epoch": 1.15625, "eval_loss": 0.8982363939285278, "eval_runtime": 76.1234, "eval_samples_per_second": 2.627, "eval_steps_per_second": 0.328, "step": 37 }, { "epoch": 1.1875, "grad_norm": 0.28793451241246804, "learning_rate": 2e-05, "loss": 0.9643, "step": 38 }, { "epoch": 1.1875, "eval_loss": 0.8908895254135132, "eval_runtime": 76.2877, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 38 }, { "epoch": 1.21875, "grad_norm": 0.2927691606307197, "learning_rate": 2e-05, "loss": 1.0087, "step": 39 }, { "epoch": 1.21875, "eval_loss": 0.8845618367195129, "eval_runtime": 76.2282, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 39 }, { "epoch": 1.25, "grad_norm": 0.26410982001408806, "learning_rate": 2e-05, "loss": 0.986, "step": 40 }, { "epoch": 1.25, "eval_loss": 0.8784474730491638, "eval_runtime": 76.2512, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 40 }, { "epoch": 1.28125, "grad_norm": 0.29182630949665306, "learning_rate": 2e-05, "loss": 0.9711, "step": 41 }, { "epoch": 1.28125, "eval_loss": 0.8725223541259766, "eval_runtime": 77.1229, "eval_samples_per_second": 2.593, "eval_steps_per_second": 0.324, "step": 41 }, { "epoch": 1.3125, "grad_norm": 0.36402838796832665, "learning_rate": 2e-05, "loss": 0.9263, "step": 42 }, { "epoch": 1.3125, "eval_loss": 0.8662790060043335, "eval_runtime": 77.2362, "eval_samples_per_second": 2.589, "eval_steps_per_second": 0.324, "step": 42 }, { "epoch": 1.34375, "grad_norm": 0.29338184478895163, "learning_rate": 2e-05, "loss": 0.8947, "step": 43 }, { "epoch": 1.34375, "eval_loss": 0.8600431680679321, "eval_runtime": 77.1213, "eval_samples_per_second": 2.593, "eval_steps_per_second": 0.324, "step": 43 }, { "epoch": 1.375, "grad_norm": 0.2201714229702277, "learning_rate": 2e-05, "loss": 0.9059, "step": 44 }, { "epoch": 1.375, "eval_loss": 0.8545799255371094, "eval_runtime": 77.991, "eval_samples_per_second": 2.564, "eval_steps_per_second": 0.321, "step": 44 }, { "epoch": 1.40625, "grad_norm": 0.2254966625243654, "learning_rate": 2e-05, "loss": 0.8942, "step": 45 }, { "epoch": 1.40625, "eval_loss": 0.8497399687767029, "eval_runtime": 77.2698, "eval_samples_per_second": 2.588, "eval_steps_per_second": 0.324, "step": 45 }, { "epoch": 1.4375, "grad_norm": 0.21753318432075458, "learning_rate": 2e-05, "loss": 0.9376, "step": 46 }, { "epoch": 1.4375, "eval_loss": 0.8452473282814026, "eval_runtime": 77.0568, "eval_samples_per_second": 2.595, "eval_steps_per_second": 0.324, "step": 46 }, { "epoch": 1.46875, "grad_norm": 0.21449718265972945, "learning_rate": 2e-05, "loss": 0.9369, "step": 47 }, { "epoch": 1.46875, "eval_loss": 0.841134786605835, "eval_runtime": 77.225, "eval_samples_per_second": 2.59, "eval_steps_per_second": 0.324, "step": 47 }, { "epoch": 1.5, "grad_norm": 0.2109063266748924, "learning_rate": 2e-05, "loss": 0.8511, "step": 48 }, { "epoch": 1.5, "eval_loss": 0.8373770117759705, "eval_runtime": 76.2309, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 48 }, { "epoch": 1.53125, "grad_norm": 0.232838633689838, "learning_rate": 2e-05, "loss": 0.8694, "step": 49 }, { "epoch": 1.53125, "eval_loss": 0.8338289856910706, "eval_runtime": 76.277, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 49 }, { "epoch": 1.5625, "grad_norm": 0.4189704940803984, "learning_rate": 2e-05, "loss": 0.8464, "step": 50 }, { "epoch": 1.5625, "eval_loss": 0.8297132849693298, "eval_runtime": 76.2872, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 50 }, { "epoch": 1.59375, "grad_norm": 0.2171618165123276, "learning_rate": 2e-05, "loss": 0.8785, "step": 51 }, { "epoch": 1.59375, "eval_loss": 0.8257431983947754, "eval_runtime": 76.2639, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 51 }, { "epoch": 1.625, "grad_norm": 0.21934651037670305, "learning_rate": 2e-05, "loss": 0.7645, "step": 52 }, { "epoch": 1.625, "eval_loss": 0.8223557472229004, "eval_runtime": 76.2383, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 52 }, { "epoch": 1.65625, "grad_norm": 0.24183530733164746, "learning_rate": 2e-05, "loss": 0.9218, "step": 53 }, { "epoch": 1.65625, "eval_loss": 0.8189653158187866, "eval_runtime": 76.9819, "eval_samples_per_second": 2.598, "eval_steps_per_second": 0.325, "step": 53 }, { "epoch": 1.6875, "grad_norm": 0.23450930244279267, "learning_rate": 2e-05, "loss": 0.8896, "step": 54 }, { "epoch": 1.6875, "eval_loss": 0.8152530193328857, "eval_runtime": 76.2378, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 54 }, { "epoch": 1.71875, "grad_norm": 0.22081665899796085, "learning_rate": 2e-05, "loss": 0.8798, "step": 55 }, { "epoch": 1.71875, "eval_loss": 0.8122122287750244, "eval_runtime": 76.289, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 55 }, { "epoch": 1.75, "grad_norm": 0.21311746114111046, "learning_rate": 2e-05, "loss": 0.9482, "step": 56 }, { "epoch": 1.75, "eval_loss": 0.8092318773269653, "eval_runtime": 77.8321, "eval_samples_per_second": 2.57, "eval_steps_per_second": 0.321, "step": 56 }, { "epoch": 1.78125, "grad_norm": 0.2496565307107556, "learning_rate": 2e-05, "loss": 0.8917, "step": 57 }, { "epoch": 1.78125, "eval_loss": 0.8070546984672546, "eval_runtime": 77.2651, "eval_samples_per_second": 2.588, "eval_steps_per_second": 0.324, "step": 57 }, { "epoch": 1.8125, "grad_norm": 0.2137866456424736, "learning_rate": 2e-05, "loss": 0.909, "step": 58 }, { "epoch": 1.8125, "eval_loss": 0.8049566745758057, "eval_runtime": 78.0925, "eval_samples_per_second": 2.561, "eval_steps_per_second": 0.32, "step": 58 }, { "epoch": 1.84375, "grad_norm": 0.22567502859345095, "learning_rate": 2e-05, "loss": 0.8611, "step": 59 }, { "epoch": 1.84375, "eval_loss": 0.8028810024261475, "eval_runtime": 78.0553, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 59 }, { "epoch": 1.875, "grad_norm": 0.23303796552302508, "learning_rate": 2e-05, "loss": 0.9209, "step": 60 }, { "epoch": 1.875, "eval_loss": 0.800568699836731, "eval_runtime": 78.052, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 60 }, { "epoch": 1.90625, "grad_norm": 0.24566727726974544, "learning_rate": 2e-05, "loss": 0.8239, "step": 61 }, { "epoch": 1.90625, "eval_loss": 0.7976545691490173, "eval_runtime": 77.3056, "eval_samples_per_second": 2.587, "eval_steps_per_second": 0.323, "step": 61 }, { "epoch": 1.9375, "grad_norm": 0.23014192522354907, "learning_rate": 2e-05, "loss": 0.8814, "step": 62 }, { "epoch": 1.9375, "eval_loss": 0.7945474982261658, "eval_runtime": 77.3398, "eval_samples_per_second": 2.586, "eval_steps_per_second": 0.323, "step": 62 }, { "epoch": 1.96875, "grad_norm": 0.23042819102671622, "learning_rate": 2e-05, "loss": 0.9064, "step": 63 }, { "epoch": 1.96875, "eval_loss": 0.7918359637260437, "eval_runtime": 77.4272, "eval_samples_per_second": 2.583, "eval_steps_per_second": 0.323, "step": 63 }, { "epoch": 2.0, "grad_norm": 0.23940667173206315, "learning_rate": 2e-05, "loss": 0.8658, "step": 64 }, { "epoch": 2.0, "eval_loss": 0.7891160845756531, "eval_runtime": 77.3236, "eval_samples_per_second": 2.587, "eval_steps_per_second": 0.323, "step": 64 }, { "epoch": 2.03125, "grad_norm": 0.22630342930143643, "learning_rate": 2e-05, "loss": 0.8403, "step": 65 }, { "epoch": 2.03125, "eval_loss": 0.7859742641448975, "eval_runtime": 77.2001, "eval_samples_per_second": 2.591, "eval_steps_per_second": 0.324, "step": 65 }, { "epoch": 2.0625, "grad_norm": 0.20949240460260976, "learning_rate": 2e-05, "loss": 0.8472, "step": 66 }, { "epoch": 2.0625, "eval_loss": 0.7834083437919617, "eval_runtime": 78.9646, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.317, "step": 66 }, { "epoch": 2.09375, "grad_norm": 0.22714400479820654, "learning_rate": 2e-05, "loss": 0.841, "step": 67 }, { "epoch": 2.09375, "eval_loss": 0.7805308699607849, "eval_runtime": 78.7552, "eval_samples_per_second": 2.54, "eval_steps_per_second": 0.317, "step": 67 }, { "epoch": 2.125, "grad_norm": 0.23345123077006047, "learning_rate": 2e-05, "loss": 0.9028, "step": 68 }, { "epoch": 2.125, "eval_loss": 0.7779514789581299, "eval_runtime": 78.3387, "eval_samples_per_second": 2.553, "eval_steps_per_second": 0.319, "step": 68 }, { "epoch": 2.15625, "grad_norm": 0.251841542575211, "learning_rate": 2e-05, "loss": 0.8381, "step": 69 }, { "epoch": 2.15625, "eval_loss": 0.7756664752960205, "eval_runtime": 78.3109, "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 69 }, { "epoch": 2.1875, "grad_norm": 0.23548386839773608, "learning_rate": 2e-05, "loss": 0.7914, "step": 70 }, { "epoch": 2.1875, "eval_loss": 0.7733604907989502, "eval_runtime": 78.9712, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.317, "step": 70 }, { "epoch": 2.21875, "grad_norm": 0.23262740912668387, "learning_rate": 2e-05, "loss": 0.8778, "step": 71 }, { "epoch": 2.21875, "eval_loss": 0.771755576133728, "eval_runtime": 78.2633, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 71 }, { "epoch": 2.25, "grad_norm": 0.22075289612357513, "learning_rate": 2e-05, "loss": 0.7945, "step": 72 }, { "epoch": 2.25, "eval_loss": 0.7705450654029846, "eval_runtime": 78.3151, "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 72 }, { "epoch": 2.28125, "grad_norm": 0.25520381955936466, "learning_rate": 2e-05, "loss": 0.8387, "step": 73 }, { "epoch": 2.28125, "eval_loss": 0.7695029973983765, "eval_runtime": 78.2901, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 73 }, { "epoch": 2.3125, "grad_norm": 0.2047305385827267, "learning_rate": 2e-05, "loss": 0.8404, "step": 74 }, { "epoch": 2.3125, "eval_loss": 0.7684457302093506, "eval_runtime": 78.3875, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 74 }, { "epoch": 2.34375, "grad_norm": 0.2262323045133288, "learning_rate": 2e-05, "loss": 0.8811, "step": 75 }, { "epoch": 2.34375, "eval_loss": 0.7671162486076355, "eval_runtime": 78.202, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 75 }, { "epoch": 2.375, "grad_norm": 0.21885464923925876, "learning_rate": 2e-05, "loss": 0.7942, "step": 76 }, { "epoch": 2.375, "eval_loss": 0.7658494710922241, "eval_runtime": 78.1746, "eval_samples_per_second": 2.558, "eval_steps_per_second": 0.32, "step": 76 }, { "epoch": 2.40625, "grad_norm": 0.21717306953626966, "learning_rate": 2e-05, "loss": 0.8497, "step": 77 }, { "epoch": 2.40625, "eval_loss": 0.7642120122909546, "eval_runtime": 78.2026, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 77 }, { "epoch": 2.4375, "grad_norm": 0.2530725583748258, "learning_rate": 2e-05, "loss": 0.8584, "step": 78 }, { "epoch": 2.4375, "eval_loss": 0.7625510692596436, "eval_runtime": 78.1991, "eval_samples_per_second": 2.558, "eval_steps_per_second": 0.32, "step": 78 }, { "epoch": 2.46875, "grad_norm": 0.25354787036627263, "learning_rate": 2e-05, "loss": 0.8569, "step": 79 }, { "epoch": 2.46875, "eval_loss": 0.7616268396377563, "eval_runtime": 78.2915, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 79 }, { "epoch": 2.5, "grad_norm": 0.2800865746664007, "learning_rate": 2e-05, "loss": 0.9116, "step": 80 }, { "epoch": 2.5, "eval_loss": 0.7603214979171753, "eval_runtime": 78.2749, "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 80 }, { "epoch": 2.53125, "grad_norm": 0.268139688449618, "learning_rate": 2e-05, "loss": 0.8397, "step": 81 }, { "epoch": 2.53125, "eval_loss": 0.7584869265556335, "eval_runtime": 79.1445, "eval_samples_per_second": 2.527, "eval_steps_per_second": 0.316, "step": 81 }, { "epoch": 2.5625, "grad_norm": 0.3128648654463789, "learning_rate": 2e-05, "loss": 0.8888, "step": 82 }, { "epoch": 2.5625, "eval_loss": 0.7566561102867126, "eval_runtime": 79.2089, "eval_samples_per_second": 2.525, "eval_steps_per_second": 0.316, "step": 82 }, { "epoch": 2.59375, "grad_norm": 0.2502355211215609, "learning_rate": 2e-05, "loss": 0.8346, "step": 83 }, { "epoch": 2.59375, "eval_loss": 0.7547345161437988, "eval_runtime": 79.2691, "eval_samples_per_second": 2.523, "eval_steps_per_second": 0.315, "step": 83 }, { "epoch": 2.625, "grad_norm": 0.25281184629018644, "learning_rate": 2e-05, "loss": 0.795, "step": 84 }, { "epoch": 2.625, "eval_loss": 0.7527951598167419, "eval_runtime": 79.4068, "eval_samples_per_second": 2.519, "eval_steps_per_second": 0.315, "step": 84 }, { "epoch": 2.65625, "grad_norm": 0.24246729562645003, "learning_rate": 2e-05, "loss": 0.7649, "step": 85 }, { "epoch": 2.65625, "eval_loss": 0.7509815096855164, "eval_runtime": 79.1612, "eval_samples_per_second": 2.526, "eval_steps_per_second": 0.316, "step": 85 }, { "epoch": 2.6875, "grad_norm": 0.27005475109453947, "learning_rate": 2e-05, "loss": 0.7964, "step": 86 }, { "epoch": 2.6875, "eval_loss": 0.7485950589179993, "eval_runtime": 80.0714, "eval_samples_per_second": 2.498, "eval_steps_per_second": 0.312, "step": 86 }, { "epoch": 2.71875, "grad_norm": 0.2723492355800971, "learning_rate": 2e-05, "loss": 0.8117, "step": 87 }, { "epoch": 2.71875, "eval_loss": 0.7459420561790466, "eval_runtime": 79.4075, "eval_samples_per_second": 2.519, "eval_steps_per_second": 0.315, "step": 87 }, { "epoch": 2.75, "grad_norm": 0.2946493898427159, "learning_rate": 2e-05, "loss": 0.8986, "step": 88 }, { "epoch": 2.75, "eval_loss": 0.7436455488204956, "eval_runtime": 79.3721, "eval_samples_per_second": 2.52, "eval_steps_per_second": 0.315, "step": 88 }, { "epoch": 2.78125, "grad_norm": 0.26411214734213284, "learning_rate": 2e-05, "loss": 0.8145, "step": 89 }, { "epoch": 2.78125, "eval_loss": 0.7424752712249756, "eval_runtime": 79.2988, "eval_samples_per_second": 2.522, "eval_steps_per_second": 0.315, "step": 89 }, { "epoch": 2.8125, "grad_norm": 0.27115747269014817, "learning_rate": 2e-05, "loss": 0.8457, "step": 90 }, { "epoch": 2.8125, "eval_loss": 0.7416408658027649, "eval_runtime": 79.4004, "eval_samples_per_second": 2.519, "eval_steps_per_second": 0.315, "step": 90 }, { "epoch": 2.84375, "grad_norm": 0.25831877964821937, "learning_rate": 2e-05, "loss": 0.7568, "step": 91 }, { "epoch": 2.84375, "eval_loss": 0.7404463291168213, "eval_runtime": 81.7767, "eval_samples_per_second": 2.446, "eval_steps_per_second": 0.306, "step": 91 }, { "epoch": 2.875, "grad_norm": 0.31273388454942935, "learning_rate": 2e-05, "loss": 0.8562, "step": 92 }, { "epoch": 2.875, "eval_loss": 0.7384185791015625, "eval_runtime": 82.3443, "eval_samples_per_second": 2.429, "eval_steps_per_second": 0.304, "step": 92 }, { "epoch": 2.90625, "grad_norm": 0.2838267071008901, "learning_rate": 2e-05, "loss": 0.7869, "step": 93 }, { "epoch": 2.90625, "eval_loss": 0.7366807460784912, "eval_runtime": 82.2622, "eval_samples_per_second": 2.431, "eval_steps_per_second": 0.304, "step": 93 }, { "epoch": 2.9375, "grad_norm": 0.28625827941831467, "learning_rate": 2e-05, "loss": 0.8618, "step": 94 }, { "epoch": 2.9375, "eval_loss": 0.7357398867607117, "eval_runtime": 81.9471, "eval_samples_per_second": 2.441, "eval_steps_per_second": 0.305, "step": 94 }, { "epoch": 2.96875, "grad_norm": 0.25548002643954326, "learning_rate": 2e-05, "loss": 0.8085, "step": 95 }, { "epoch": 2.96875, "eval_loss": 0.7356534004211426, "eval_runtime": 82.1186, "eval_samples_per_second": 2.436, "eval_steps_per_second": 0.304, "step": 95 }, { "epoch": 3.0, "grad_norm": 0.27081450830961107, "learning_rate": 2e-05, "loss": 0.7684, "step": 96 }, { "epoch": 3.0, "eval_loss": 0.7346957921981812, "eval_runtime": 81.5463, "eval_samples_per_second": 2.453, "eval_steps_per_second": 0.307, "step": 96 }, { "epoch": 3.03125, "grad_norm": 0.2985486737236676, "learning_rate": 2e-05, "loss": 0.7274, "step": 97 }, { "epoch": 3.03125, "eval_loss": 0.7325752377510071, "eval_runtime": 81.7804, "eval_samples_per_second": 2.446, "eval_steps_per_second": 0.306, "step": 97 }, { "epoch": 3.0625, "grad_norm": 0.29149719690624026, "learning_rate": 2e-05, "loss": 0.8119, "step": 98 }, { "epoch": 3.0625, "eval_loss": 0.7298976182937622, "eval_runtime": 76.2764, "eval_samples_per_second": 2.622, "eval_steps_per_second": 0.328, "step": 98 }, { "epoch": 3.09375, "grad_norm": 0.25227859825215865, "learning_rate": 2e-05, "loss": 0.7888, "step": 99 }, { "epoch": 3.09375, "eval_loss": 0.727373480796814, "eval_runtime": 76.2418, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 99 }, { "epoch": 3.125, "grad_norm": 0.27316954971752555, "learning_rate": 2e-05, "loss": 0.8224, "step": 100 }, { "epoch": 3.125, "eval_loss": 0.7254325747489929, "eval_runtime": 76.1474, "eval_samples_per_second": 2.626, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 3.15625, "grad_norm": 0.24239788607957785, "learning_rate": 2e-05, "loss": 0.7535, "step": 101 }, { "epoch": 3.15625, "eval_loss": 0.724058985710144, "eval_runtime": 76.2391, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 101 }, { "epoch": 3.1875, "grad_norm": 0.25648385925427025, "learning_rate": 2e-05, "loss": 0.8195, "step": 102 }, { "epoch": 3.1875, "eval_loss": 0.7235870957374573, "eval_runtime": 76.9134, "eval_samples_per_second": 2.6, "eval_steps_per_second": 0.325, "step": 102 }, { "epoch": 3.21875, "grad_norm": 0.29620170789161204, "learning_rate": 2e-05, "loss": 0.8224, "step": 103 }, { "epoch": 3.21875, "eval_loss": 0.7228152751922607, "eval_runtime": 76.095, "eval_samples_per_second": 2.628, "eval_steps_per_second": 0.329, "step": 103 }, { "epoch": 3.25, "grad_norm": 0.3484116181139593, "learning_rate": 2e-05, "loss": 0.7478, "step": 104 }, { "epoch": 3.25, "eval_loss": 0.7209363579750061, "eval_runtime": 76.9377, "eval_samples_per_second": 2.6, "eval_steps_per_second": 0.325, "step": 104 }, { "epoch": 3.28125, "grad_norm": 0.25212350156184643, "learning_rate": 2e-05, "loss": 0.7885, "step": 105 }, { "epoch": 3.28125, "eval_loss": 0.7197096347808838, "eval_runtime": 76.2008, "eval_samples_per_second": 2.625, "eval_steps_per_second": 0.328, "step": 105 }, { "epoch": 3.3125, "grad_norm": 0.264200147608962, "learning_rate": 2e-05, "loss": 0.8371, "step": 106 }, { "epoch": 3.3125, "eval_loss": 0.7197055220603943, "eval_runtime": 78.1542, "eval_samples_per_second": 2.559, "eval_steps_per_second": 0.32, "step": 106 }, { "epoch": 3.34375, "grad_norm": 0.3309431084940201, "learning_rate": 2e-05, "loss": 0.6999, "step": 107 }, { "epoch": 3.34375, "eval_loss": 0.7187016010284424, "eval_runtime": 78.4259, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 107 }, { "epoch": 3.375, "grad_norm": 0.3131644456919823, "learning_rate": 2e-05, "loss": 0.7587, "step": 108 }, { "epoch": 3.375, "eval_loss": 0.717018187046051, "eval_runtime": 78.4558, "eval_samples_per_second": 2.549, "eval_steps_per_second": 0.319, "step": 108 }, { "epoch": 3.40625, "grad_norm": 0.33527684120780293, "learning_rate": 2e-05, "loss": 0.7468, "step": 109 }, { "epoch": 3.40625, "eval_loss": 0.7147062420845032, "eval_runtime": 78.2334, "eval_samples_per_second": 2.556, "eval_steps_per_second": 0.32, "step": 109 }, { "epoch": 3.4375, "grad_norm": 0.29542683956231724, "learning_rate": 2e-05, "loss": 0.7477, "step": 110 }, { "epoch": 3.4375, "eval_loss": 0.7130224704742432, "eval_runtime": 79.1179, "eval_samples_per_second": 2.528, "eval_steps_per_second": 0.316, "step": 110 }, { "epoch": 3.46875, "grad_norm": 0.31128698002926114, "learning_rate": 2e-05, "loss": 0.8153, "step": 111 }, { "epoch": 3.46875, "eval_loss": 0.7120551466941833, "eval_runtime": 80.292, "eval_samples_per_second": 2.491, "eval_steps_per_second": 0.311, "step": 111 }, { "epoch": 3.5, "grad_norm": 0.32502558864214215, "learning_rate": 2e-05, "loss": 0.8043, "step": 112 }, { "epoch": 3.5, "eval_loss": 0.7117202877998352, "eval_runtime": 79.7539, "eval_samples_per_second": 2.508, "eval_steps_per_second": 0.313, "step": 112 }, { "epoch": 3.53125, "grad_norm": 0.34335720855758517, "learning_rate": 2e-05, "loss": 0.871, "step": 113 }, { "epoch": 3.53125, "eval_loss": 0.7117029428482056, "eval_runtime": 80.0281, "eval_samples_per_second": 2.499, "eval_steps_per_second": 0.312, "step": 113 }, { "epoch": 3.5625, "grad_norm": 0.31951931695644, "learning_rate": 2e-05, "loss": 0.7453, "step": 114 }, { "epoch": 3.5625, "eval_loss": 0.7116554379463196, "eval_runtime": 79.7209, "eval_samples_per_second": 2.509, "eval_steps_per_second": 0.314, "step": 114 }, { "epoch": 3.59375, "grad_norm": 0.28067192963874266, "learning_rate": 2e-05, "loss": 0.8045, "step": 115 }, { "epoch": 3.59375, "eval_loss": 0.7118353843688965, "eval_runtime": 80.0195, "eval_samples_per_second": 2.499, "eval_steps_per_second": 0.312, "step": 115 }, { "epoch": 3.625, "grad_norm": 0.2739718257400276, "learning_rate": 2e-05, "loss": 0.775, "step": 116 }, { "epoch": 3.625, "eval_loss": 0.7122579216957092, "eval_runtime": 76.2052, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 116 }, { "epoch": 3.65625, "grad_norm": 0.31401723658881836, "learning_rate": 2e-05, "loss": 0.7826, "step": 117 }, { "epoch": 3.65625, "eval_loss": 0.7118574380874634, "eval_runtime": 76.1509, "eval_samples_per_second": 2.626, "eval_steps_per_second": 0.328, "step": 117 }, { "epoch": 3.6875, "grad_norm": 0.36925964858634625, "learning_rate": 2e-05, "loss": 0.7884, "step": 118 }, { "epoch": 3.6875, "eval_loss": 0.710691511631012, "eval_runtime": 76.2305, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 118 }, { "epoch": 3.71875, "grad_norm": 0.3050583880654791, "learning_rate": 2e-05, "loss": 0.8402, "step": 119 }, { "epoch": 3.71875, "eval_loss": 0.7096763849258423, "eval_runtime": 77.0581, "eval_samples_per_second": 2.595, "eval_steps_per_second": 0.324, "step": 119 }, { "epoch": 3.75, "grad_norm": 0.2648625651290031, "learning_rate": 2e-05, "loss": 0.7889, "step": 120 }, { "epoch": 3.75, "eval_loss": 0.7094223499298096, "eval_runtime": 76.1379, "eval_samples_per_second": 2.627, "eval_steps_per_second": 0.328, "step": 120 }, { "epoch": 3.78125, "grad_norm": 0.3107221696449271, "learning_rate": 2e-05, "loss": 0.7615, "step": 121 }, { "epoch": 3.78125, "eval_loss": 0.7081363201141357, "eval_runtime": 76.626, "eval_samples_per_second": 2.61, "eval_steps_per_second": 0.326, "step": 121 }, { "epoch": 3.8125, "grad_norm": 0.3455151299995048, "learning_rate": 2e-05, "loss": 0.8342, "step": 122 }, { "epoch": 3.8125, "eval_loss": 0.7063001990318298, "eval_runtime": 77.0293, "eval_samples_per_second": 2.596, "eval_steps_per_second": 0.325, "step": 122 }, { "epoch": 3.84375, "grad_norm": 0.28847071926472523, "learning_rate": 2e-05, "loss": 0.7477, "step": 123 }, { "epoch": 3.84375, "eval_loss": 0.7044610381126404, "eval_runtime": 76.2385, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 123 }, { "epoch": 3.875, "grad_norm": 0.26753816515069856, "learning_rate": 2e-05, "loss": 0.7653, "step": 124 }, { "epoch": 3.875, "eval_loss": 0.7033799886703491, "eval_runtime": 76.1985, "eval_samples_per_second": 2.625, "eval_steps_per_second": 0.328, "step": 124 }, { "epoch": 3.90625, "grad_norm": 0.3465046292893005, "learning_rate": 2e-05, "loss": 0.8144, "step": 125 }, { "epoch": 3.90625, "eval_loss": 0.7021930813789368, "eval_runtime": 76.2234, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 125 }, { "epoch": 3.9375, "grad_norm": 0.3451690427620698, "learning_rate": 2e-05, "loss": 0.7871, "step": 126 }, { "epoch": 3.9375, "eval_loss": 0.7013542652130127, "eval_runtime": 78.0752, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 126 }, { "epoch": 3.96875, "grad_norm": 0.31571858642673567, "learning_rate": 2e-05, "loss": 0.7568, "step": 127 }, { "epoch": 3.96875, "eval_loss": 0.7007560729980469, "eval_runtime": 78.3558, "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.319, "step": 127 }, { "epoch": 4.0, "grad_norm": 0.3247003540270338, "learning_rate": 2e-05, "loss": 0.6714, "step": 128 }, { "epoch": 4.0, "eval_loss": 0.6999780535697937, "eval_runtime": 78.9788, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.317, "step": 128 }, { "epoch": 4.03125, "grad_norm": 0.2814983490019739, "learning_rate": 2e-05, "loss": 0.7797, "step": 129 }, { "epoch": 4.03125, "eval_loss": 0.6998200416564941, "eval_runtime": 78.3093, "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 129 }, { "epoch": 4.0625, "grad_norm": 0.31961631715145106, "learning_rate": 2e-05, "loss": 0.7993, "step": 130 }, { "epoch": 4.0625, "eval_loss": 0.6995271444320679, "eval_runtime": 78.2172, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 130 }, { "epoch": 4.09375, "grad_norm": 0.32333364662215863, "learning_rate": 2e-05, "loss": 0.7896, "step": 131 }, { "epoch": 4.09375, "eval_loss": 0.6992727518081665, "eval_runtime": 79.0125, "eval_samples_per_second": 2.531, "eval_steps_per_second": 0.316, "step": 131 }, { "epoch": 4.125, "grad_norm": 0.3255859640449829, "learning_rate": 2e-05, "loss": 0.7542, "step": 132 }, { "epoch": 4.125, "eval_loss": 0.6988572478294373, "eval_runtime": 79.0, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.316, "step": 132 }, { "epoch": 4.15625, "grad_norm": 0.3307068947429175, "learning_rate": 2e-05, "loss": 0.8416, "step": 133 }, { "epoch": 4.15625, "eval_loss": 0.6981343030929565, "eval_runtime": 78.3309, "eval_samples_per_second": 2.553, "eval_steps_per_second": 0.319, "step": 133 }, { "epoch": 4.1875, "grad_norm": 0.3842303818116732, "learning_rate": 2e-05, "loss": 0.7605, "step": 134 }, { "epoch": 4.1875, "eval_loss": 0.6968980431556702, "eval_runtime": 78.5608, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.318, "step": 134 }, { "epoch": 4.21875, "grad_norm": 0.331839472419003, "learning_rate": 2e-05, "loss": 0.7643, "step": 135 }, { "epoch": 4.21875, "eval_loss": 0.6955949664115906, "eval_runtime": 78.3566, "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.319, "step": 135 }, { "epoch": 4.25, "grad_norm": 0.31864813130499836, "learning_rate": 2e-05, "loss": 0.7369, "step": 136 }, { "epoch": 4.25, "eval_loss": 0.6951528787612915, "eval_runtime": 79.7802, "eval_samples_per_second": 2.507, "eval_steps_per_second": 0.313, "step": 136 }, { "epoch": 4.28125, "grad_norm": 0.352549164434451, "learning_rate": 2e-05, "loss": 0.7332, "step": 137 }, { "epoch": 4.28125, "eval_loss": 0.6947290897369385, "eval_runtime": 79.8171, "eval_samples_per_second": 2.506, "eval_steps_per_second": 0.313, "step": 137 }, { "epoch": 4.3125, "grad_norm": 0.37128812818896284, "learning_rate": 2e-05, "loss": 0.7542, "step": 138 }, { "epoch": 4.3125, "eval_loss": 0.6937370300292969, "eval_runtime": 79.7782, "eval_samples_per_second": 2.507, "eval_steps_per_second": 0.313, "step": 138 }, { "epoch": 4.34375, "grad_norm": 0.3348014941412048, "learning_rate": 2e-05, "loss": 0.7079, "step": 139 }, { "epoch": 4.34375, "eval_loss": 0.692456066608429, "eval_runtime": 79.9308, "eval_samples_per_second": 2.502, "eval_steps_per_second": 0.313, "step": 139 }, { "epoch": 4.375, "grad_norm": 0.34411051658527964, "learning_rate": 2e-05, "loss": 0.7465, "step": 140 }, { "epoch": 4.375, "eval_loss": 0.6915809512138367, "eval_runtime": 79.943, "eval_samples_per_second": 2.502, "eval_steps_per_second": 0.313, "step": 140 }, { "epoch": 4.40625, "grad_norm": 0.3373909601921749, "learning_rate": 2e-05, "loss": 0.7648, "step": 141 }, { "epoch": 4.40625, "eval_loss": 0.6912103295326233, "eval_runtime": 79.8515, "eval_samples_per_second": 2.505, "eval_steps_per_second": 0.313, "step": 141 }, { "epoch": 4.4375, "grad_norm": 0.33253827371305456, "learning_rate": 2e-05, "loss": 0.7224, "step": 142 }, { "epoch": 4.4375, "eval_loss": 0.6912806630134583, "eval_runtime": 80.6475, "eval_samples_per_second": 2.48, "eval_steps_per_second": 0.31, "step": 142 }, { "epoch": 4.46875, "grad_norm": 0.38458075172588313, "learning_rate": 2e-05, "loss": 0.7261, "step": 143 }, { "epoch": 4.46875, "eval_loss": 0.6905419230461121, "eval_runtime": 80.2606, "eval_samples_per_second": 2.492, "eval_steps_per_second": 0.311, "step": 143 }, { "epoch": 4.5, "grad_norm": 0.31351962640463144, "learning_rate": 2e-05, "loss": 0.6909, "step": 144 }, { "epoch": 4.5, "eval_loss": 0.6898491382598877, "eval_runtime": 79.9965, "eval_samples_per_second": 2.5, "eval_steps_per_second": 0.313, "step": 144 }, { "epoch": 4.53125, "grad_norm": 0.35474372115704583, "learning_rate": 2e-05, "loss": 0.7605, "step": 145 }, { "epoch": 4.53125, "eval_loss": 0.6893147230148315, "eval_runtime": 1475.5758, "eval_samples_per_second": 0.136, "eval_steps_per_second": 0.017, "step": 145 }, { "epoch": 4.5625, "grad_norm": 0.3479568917421202, "learning_rate": 2e-05, "loss": 0.6638, "step": 146 }, { "epoch": 4.5625, "eval_loss": 0.6884538531303406, "eval_runtime": 84.6835, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.295, "step": 146 }, { "epoch": 4.59375, "grad_norm": 0.3421823344428645, "learning_rate": 2e-05, "loss": 0.7339, "step": 147 }, { "epoch": 4.59375, "eval_loss": 0.6873475909233093, "eval_runtime": 83.3138, "eval_samples_per_second": 2.401, "eval_steps_per_second": 0.3, "step": 147 }, { "epoch": 4.625, "grad_norm": 0.3642187020830788, "learning_rate": 2e-05, "loss": 0.6825, "step": 148 }, { "epoch": 4.625, "eval_loss": 0.6858401298522949, "eval_runtime": 82.1066, "eval_samples_per_second": 2.436, "eval_steps_per_second": 0.304, "step": 148 }, { "epoch": 4.65625, "grad_norm": 0.35097547901391785, "learning_rate": 2e-05, "loss": 0.7986, "step": 149 }, { "epoch": 4.65625, "eval_loss": 0.6848779320716858, "eval_runtime": 84.4076, "eval_samples_per_second": 2.369, "eval_steps_per_second": 0.296, "step": 149 }, { "epoch": 4.6875, "grad_norm": 0.3568694843794629, "learning_rate": 2e-05, "loss": 0.7176, "step": 150 }, { "epoch": 4.6875, "eval_loss": 0.6842290759086609, "eval_runtime": 82.5945, "eval_samples_per_second": 2.421, "eval_steps_per_second": 0.303, "step": 150 }, { "epoch": 4.71875, "grad_norm": 0.34258633585260334, "learning_rate": 2e-05, "loss": 0.7363, "step": 151 }, { "epoch": 4.71875, "eval_loss": 0.6838659048080444, "eval_runtime": 85.9626, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.291, "step": 151 }, { "epoch": 4.75, "grad_norm": 0.42319523894659655, "learning_rate": 2e-05, "loss": 0.7675, "step": 152 }, { "epoch": 4.75, "eval_loss": 0.6830299496650696, "eval_runtime": 85.7189, "eval_samples_per_second": 2.333, "eval_steps_per_second": 0.292, "step": 152 }, { "epoch": 4.78125, "grad_norm": 0.3632195533127194, "learning_rate": 2e-05, "loss": 0.715, "step": 153 }, { "epoch": 4.78125, "eval_loss": 0.6826379895210266, "eval_runtime": 87.8244, "eval_samples_per_second": 2.277, "eval_steps_per_second": 0.285, "step": 153 }, { "epoch": 4.8125, "grad_norm": 0.3738308004604413, "learning_rate": 2e-05, "loss": 0.7344, "step": 154 }, { "epoch": 4.8125, "eval_loss": 0.6826817393302917, "eval_runtime": 86.5822, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 154 }, { "epoch": 4.84375, "grad_norm": 0.3618696330632776, "learning_rate": 2e-05, "loss": 0.6632, "step": 155 }, { "epoch": 4.84375, "eval_loss": 0.6827967166900635, "eval_runtime": 82.1829, "eval_samples_per_second": 2.434, "eval_steps_per_second": 0.304, "step": 155 }, { "epoch": 4.875, "grad_norm": 0.38901912569992203, "learning_rate": 2e-05, "loss": 0.7788, "step": 156 }, { "epoch": 4.875, "eval_loss": 0.6821711659431458, "eval_runtime": 84.4511, "eval_samples_per_second": 2.368, "eval_steps_per_second": 0.296, "step": 156 }, { "epoch": 4.90625, "grad_norm": 0.3516096507348829, "learning_rate": 2e-05, "loss": 0.7794, "step": 157 }, { "epoch": 4.90625, "eval_loss": 0.6819837689399719, "eval_runtime": 84.1594, "eval_samples_per_second": 2.376, "eval_steps_per_second": 0.297, "step": 157 }, { "epoch": 4.9375, "grad_norm": 0.36066902463794986, "learning_rate": 2e-05, "loss": 0.7674, "step": 158 }, { "epoch": 4.9375, "eval_loss": 0.6817716956138611, "eval_runtime": 83.8929, "eval_samples_per_second": 2.384, "eval_steps_per_second": 0.298, "step": 158 }, { "epoch": 4.96875, "grad_norm": 0.36641784926154175, "learning_rate": 2e-05, "loss": 0.7116, "step": 159 }, { "epoch": 4.96875, "eval_loss": 0.6816902160644531, "eval_runtime": 84.4431, "eval_samples_per_second": 2.368, "eval_steps_per_second": 0.296, "step": 159 }, { "epoch": 5.0, "grad_norm": 0.4020716293225933, "learning_rate": 2e-05, "loss": 0.7142, "step": 160 }, { "epoch": 5.0, "eval_loss": 0.6811469793319702, "eval_runtime": 86.0681, "eval_samples_per_second": 2.324, "eval_steps_per_second": 0.29, "step": 160 }, { "epoch": 5.03125, "grad_norm": 0.38360882669254054, "learning_rate": 2e-05, "loss": 0.6756, "step": 161 }, { "epoch": 5.03125, "eval_loss": 0.6798409223556519, "eval_runtime": 81.9903, "eval_samples_per_second": 2.439, "eval_steps_per_second": 0.305, "step": 161 }, { "epoch": 5.0625, "grad_norm": 0.34966156213066135, "learning_rate": 2e-05, "loss": 0.827, "step": 162 }, { "epoch": 5.0625, "eval_loss": 0.6788859367370605, "eval_runtime": 76.1753, "eval_samples_per_second": 2.626, "eval_steps_per_second": 0.328, "step": 162 }, { "epoch": 5.09375, "grad_norm": 0.41140842939901384, "learning_rate": 2e-05, "loss": 0.6409, "step": 163 }, { "epoch": 5.09375, "eval_loss": 0.6787077188491821, "eval_runtime": 76.2239, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 163 }, { "epoch": 5.125, "grad_norm": 0.4222084070163774, "learning_rate": 2e-05, "loss": 0.7774, "step": 164 }, { "epoch": 5.125, "eval_loss": 0.6796822547912598, "eval_runtime": 76.2141, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 164 }, { "epoch": 5.15625, "grad_norm": 0.4644454724424921, "learning_rate": 2e-05, "loss": 0.6057, "step": 165 }, { "epoch": 5.15625, "eval_loss": 0.6794346570968628, "eval_runtime": 76.3216, "eval_samples_per_second": 2.62, "eval_steps_per_second": 0.328, "step": 165 }, { "epoch": 5.1875, "grad_norm": 0.46128725263272996, "learning_rate": 2e-05, "loss": 0.7158, "step": 166 }, { "epoch": 5.1875, "eval_loss": 0.6791612505912781, "eval_runtime": 78.4909, "eval_samples_per_second": 2.548, "eval_steps_per_second": 0.319, "step": 166 }, { "epoch": 5.21875, "grad_norm": 0.37300666872025545, "learning_rate": 2e-05, "loss": 0.7363, "step": 167 }, { "epoch": 5.21875, "eval_loss": 0.6788016557693481, "eval_runtime": 78.5697, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.318, "step": 167 }, { "epoch": 5.25, "grad_norm": 0.41454648576180214, "learning_rate": 2e-05, "loss": 0.7759, "step": 168 }, { "epoch": 5.25, "eval_loss": 0.6787048578262329, "eval_runtime": 78.5317, "eval_samples_per_second": 2.547, "eval_steps_per_second": 0.318, "step": 168 }, { "epoch": 5.28125, "grad_norm": 0.40724665091386236, "learning_rate": 2e-05, "loss": 0.6944, "step": 169 }, { "epoch": 5.28125, "eval_loss": 0.679679811000824, "eval_runtime": 78.6899, "eval_samples_per_second": 2.542, "eval_steps_per_second": 0.318, "step": 169 }, { "epoch": 5.3125, "grad_norm": 0.3875110486208986, "learning_rate": 2e-05, "loss": 0.6634, "step": 170 }, { "epoch": 5.3125, "eval_loss": 0.6819935441017151, "eval_runtime": 78.3617, "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.319, "step": 170 }, { "epoch": 5.34375, "grad_norm": 0.47956532155617193, "learning_rate": 2e-05, "loss": 0.687, "step": 171 }, { "epoch": 5.34375, "eval_loss": 0.6825206875801086, "eval_runtime": 78.4435, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 171 }, { "epoch": 5.375, "grad_norm": 0.4599359590587781, "learning_rate": 2e-05, "loss": 0.7718, "step": 172 }, { "epoch": 5.375, "eval_loss": 0.6816768050193787, "eval_runtime": 78.3005, "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 172 }, { "epoch": 5.40625, "grad_norm": 0.4057490487995386, "learning_rate": 2e-05, "loss": 0.7292, "step": 173 }, { "epoch": 5.40625, "eval_loss": 0.6806090474128723, "eval_runtime": 78.3313, "eval_samples_per_second": 2.553, "eval_steps_per_second": 0.319, "step": 173 }, { "epoch": 5.4375, "grad_norm": 0.4143979315360467, "learning_rate": 2e-05, "loss": 0.7697, "step": 174 }, { "epoch": 5.4375, "eval_loss": 0.6795693039894104, "eval_runtime": 78.4526, "eval_samples_per_second": 2.549, "eval_steps_per_second": 0.319, "step": 174 }, { "epoch": 5.46875, "grad_norm": 0.4219663662343445, "learning_rate": 2e-05, "loss": 0.7534, "step": 175 }, { "epoch": 5.46875, "eval_loss": 0.6793847680091858, "eval_runtime": 78.8009, "eval_samples_per_second": 2.538, "eval_steps_per_second": 0.317, "step": 175 }, { "epoch": 5.5, "grad_norm": 0.4491811321927657, "learning_rate": 2e-05, "loss": 0.7004, "step": 176 }, { "epoch": 5.5, "eval_loss": 0.6775352358818054, "eval_runtime": 80.0685, "eval_samples_per_second": 2.498, "eval_steps_per_second": 0.312, "step": 176 }, { "epoch": 5.53125, "grad_norm": 0.46366516532638885, "learning_rate": 2e-05, "loss": 0.7357, "step": 177 }, { "epoch": 5.53125, "eval_loss": 0.6748698949813843, "eval_runtime": 80.0487, "eval_samples_per_second": 2.498, "eval_steps_per_second": 0.312, "step": 177 }, { "epoch": 5.5625, "grad_norm": 0.3815188640227797, "learning_rate": 2e-05, "loss": 0.7592, "step": 178 }, { "epoch": 5.5625, "eval_loss": 0.6728273034095764, "eval_runtime": 80.0318, "eval_samples_per_second": 2.499, "eval_steps_per_second": 0.312, "step": 178 }, { "epoch": 5.59375, "grad_norm": 0.41025429416666304, "learning_rate": 2e-05, "loss": 0.6585, "step": 179 }, { "epoch": 5.59375, "eval_loss": 0.6718859672546387, "eval_runtime": 79.8801, "eval_samples_per_second": 2.504, "eval_steps_per_second": 0.313, "step": 179 }, { "epoch": 5.625, "grad_norm": 0.40652817592240054, "learning_rate": 2e-05, "loss": 0.6611, "step": 180 }, { "epoch": 5.625, "eval_loss": 0.6715708374977112, "eval_runtime": 76.7261, "eval_samples_per_second": 2.607, "eval_steps_per_second": 0.326, "step": 180 }, { "epoch": 5.65625, "grad_norm": 0.40753961326688415, "learning_rate": 2e-05, "loss": 0.6779, "step": 181 }, { "epoch": 5.65625, "eval_loss": 0.6719761490821838, "eval_runtime": 77.0136, "eval_samples_per_second": 2.597, "eval_steps_per_second": 0.325, "step": 181 }, { "epoch": 5.6875, "grad_norm": 0.4232811980671673, "learning_rate": 2e-05, "loss": 0.6475, "step": 182 }, { "epoch": 5.6875, "eval_loss": 0.6724664568901062, "eval_runtime": 76.9731, "eval_samples_per_second": 2.598, "eval_steps_per_second": 0.325, "step": 182 }, { "epoch": 5.71875, "grad_norm": 0.5132756318549849, "learning_rate": 2e-05, "loss": 0.6801, "step": 183 }, { "epoch": 5.71875, "eval_loss": 0.6723365783691406, "eval_runtime": 76.4132, "eval_samples_per_second": 2.617, "eval_steps_per_second": 0.327, "step": 183 }, { "epoch": 5.75, "grad_norm": 0.43526879230161264, "learning_rate": 2e-05, "loss": 0.6673, "step": 184 }, { "epoch": 5.75, "eval_loss": 0.672926664352417, "eval_runtime": 76.1936, "eval_samples_per_second": 2.625, "eval_steps_per_second": 0.328, "step": 184 }, { "epoch": 5.78125, "grad_norm": 0.46965560853038507, "learning_rate": 2e-05, "loss": 0.7074, "step": 185 }, { "epoch": 5.78125, "eval_loss": 0.6731134057044983, "eval_runtime": 76.2345, "eval_samples_per_second": 2.623, "eval_steps_per_second": 0.328, "step": 185 }, { "epoch": 5.8125, "grad_norm": 0.4733296318676217, "learning_rate": 2e-05, "loss": 0.6791, "step": 186 }, { "epoch": 5.8125, "eval_loss": 0.6726363301277161, "eval_runtime": 78.3939, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 186 }, { "epoch": 5.84375, "grad_norm": 0.4662943253655961, "learning_rate": 2e-05, "loss": 0.7371, "step": 187 }, { "epoch": 5.84375, "eval_loss": 0.6726526021957397, "eval_runtime": 79.1834, "eval_samples_per_second": 2.526, "eval_steps_per_second": 0.316, "step": 187 }, { "epoch": 5.875, "grad_norm": 0.4420962889993382, "learning_rate": 2e-05, "loss": 0.675, "step": 188 }, { "epoch": 5.875, "eval_loss": 0.6727125644683838, "eval_runtime": 78.252, "eval_samples_per_second": 2.556, "eval_steps_per_second": 0.319, "step": 188 }, { "epoch": 5.90625, "grad_norm": 0.4345166976944551, "learning_rate": 2e-05, "loss": 0.6748, "step": 189 }, { "epoch": 5.90625, "eval_loss": 0.6725904941558838, "eval_runtime": 78.3914, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 189 }, { "epoch": 5.9375, "grad_norm": 0.45109463315374526, "learning_rate": 2e-05, "loss": 0.7024, "step": 190 }, { "epoch": 5.9375, "eval_loss": 0.6718384027481079, "eval_runtime": 78.4361, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 190 }, { "epoch": 5.96875, "grad_norm": 0.42953871838795626, "learning_rate": 2e-05, "loss": 0.6904, "step": 191 }, { "epoch": 5.96875, "eval_loss": 0.6703083515167236, "eval_runtime": 78.3863, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 191 }, { "epoch": 6.0, "grad_norm": 0.4248607379284984, "learning_rate": 2e-05, "loss": 0.6659, "step": 192 }, { "epoch": 6.0, "eval_loss": 0.6693080067634583, "eval_runtime": 78.4373, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 192 }, { "epoch": 6.03125, "grad_norm": 0.42839417453459494, "learning_rate": 2e-05, "loss": 0.7457, "step": 193 }, { "epoch": 6.03125, "eval_loss": 0.6689594984054565, "eval_runtime": 78.4169, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 193 }, { "epoch": 6.0625, "grad_norm": 0.4216922788166874, "learning_rate": 2e-05, "loss": 0.7189, "step": 194 }, { "epoch": 6.0625, "eval_loss": 0.6689300537109375, "eval_runtime": 78.9793, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.317, "step": 194 }, { "epoch": 6.09375, "grad_norm": 0.45199575791858004, "learning_rate": 2e-05, "loss": 0.6438, "step": 195 }, { "epoch": 6.09375, "eval_loss": 0.6690151691436768, "eval_runtime": 78.5002, "eval_samples_per_second": 2.548, "eval_steps_per_second": 0.318, "step": 195 }, { "epoch": 6.125, "grad_norm": 0.4166923177293841, "learning_rate": 2e-05, "loss": 0.6885, "step": 196 }, { "epoch": 6.125, "eval_loss": 0.6688613891601562, "eval_runtime": 80.5497, "eval_samples_per_second": 2.483, "eval_steps_per_second": 0.31, "step": 196 }, { "epoch": 6.15625, "grad_norm": 0.45164281863366285, "learning_rate": 2e-05, "loss": 0.7197, "step": 197 }, { "epoch": 6.15625, "eval_loss": 0.6687932014465332, "eval_runtime": 80.1482, "eval_samples_per_second": 2.495, "eval_steps_per_second": 0.312, "step": 197 }, { "epoch": 6.1875, "grad_norm": 0.45653924787504446, "learning_rate": 2e-05, "loss": 0.776, "step": 198 }, { "epoch": 6.1875, "eval_loss": 0.6690963506698608, "eval_runtime": 80.4464, "eval_samples_per_second": 2.486, "eval_steps_per_second": 0.311, "step": 198 }, { "epoch": 6.21875, "grad_norm": 0.4966562341334706, "learning_rate": 2e-05, "loss": 0.6532, "step": 199 }, { "epoch": 6.21875, "eval_loss": 0.669116735458374, "eval_runtime": 79.8294, "eval_samples_per_second": 2.505, "eval_steps_per_second": 0.313, "step": 199 }, { "epoch": 6.25, "grad_norm": 0.4838469303220975, "learning_rate": 2e-05, "loss": 0.6883, "step": 200 }, { "epoch": 6.25, "eval_loss": 0.6693156957626343, "eval_runtime": 80.25, "eval_samples_per_second": 2.492, "eval_steps_per_second": 0.312, "step": 200 }, { "epoch": 6.28125, "grad_norm": 0.4836820906895964, "learning_rate": 2e-05, "loss": 0.7106, "step": 201 }, { "epoch": 6.28125, "eval_loss": 0.6704170107841492, "eval_runtime": 79.9636, "eval_samples_per_second": 2.501, "eval_steps_per_second": 0.313, "step": 201 }, { "epoch": 6.3125, "grad_norm": 0.4945855983140219, "learning_rate": 2e-05, "loss": 0.6336, "step": 202 }, { "epoch": 6.3125, "eval_loss": 0.6708824038505554, "eval_runtime": 80.8044, "eval_samples_per_second": 2.475, "eval_steps_per_second": 0.309, "step": 202 }, { "epoch": 6.34375, "grad_norm": 0.44587847230103017, "learning_rate": 2e-05, "loss": 0.7811, "step": 203 }, { "epoch": 6.34375, "eval_loss": 0.6723968982696533, "eval_runtime": 80.1715, "eval_samples_per_second": 2.495, "eval_steps_per_second": 0.312, "step": 203 }, { "epoch": 6.375, "grad_norm": 0.5351063503195825, "learning_rate": 2e-05, "loss": 0.6222, "step": 204 }, { "epoch": 6.375, "eval_loss": 0.672196626663208, "eval_runtime": 79.927, "eval_samples_per_second": 2.502, "eval_steps_per_second": 0.313, "step": 204 }, { "epoch": 6.40625, "grad_norm": 0.4742985088010474, "learning_rate": 2e-05, "loss": 0.6157, "step": 205 }, { "epoch": 6.40625, "eval_loss": 0.671062171459198, "eval_runtime": 80.1997, "eval_samples_per_second": 2.494, "eval_steps_per_second": 0.312, "step": 205 }, { "epoch": 6.4375, "grad_norm": 0.5188882333349506, "learning_rate": 2e-05, "loss": 0.6462, "step": 206 }, { "epoch": 6.4375, "eval_loss": 0.6701972484588623, "eval_runtime": 81.6643, "eval_samples_per_second": 2.449, "eval_steps_per_second": 0.306, "step": 206 }, { "epoch": 6.46875, "grad_norm": 0.45328063593983603, "learning_rate": 2e-05, "loss": 0.7058, "step": 207 }, { "epoch": 6.46875, "eval_loss": 0.6699164509773254, "eval_runtime": 81.2228, "eval_samples_per_second": 2.462, "eval_steps_per_second": 0.308, "step": 207 }, { "epoch": 6.5, "grad_norm": 0.5197645538332801, "learning_rate": 2e-05, "loss": 0.6462, "step": 208 }, { "epoch": 6.5, "eval_loss": 0.6702597141265869, "eval_runtime": 81.1451, "eval_samples_per_second": 2.465, "eval_steps_per_second": 0.308, "step": 208 }, { "epoch": 6.53125, "grad_norm": 0.5762528184834232, "learning_rate": 2e-05, "loss": 0.6259, "step": 209 }, { "epoch": 6.53125, "eval_loss": 0.6696366667747498, "eval_runtime": 81.1643, "eval_samples_per_second": 2.464, "eval_steps_per_second": 0.308, "step": 209 }, { "epoch": 6.5625, "grad_norm": 0.5249503180293145, "learning_rate": 2e-05, "loss": 0.6045, "step": 210 }, { "epoch": 6.5625, "eval_loss": 0.6688054800033569, "eval_runtime": 80.9492, "eval_samples_per_second": 2.471, "eval_steps_per_second": 0.309, "step": 210 }, { "epoch": 6.59375, "grad_norm": 0.543503888655844, "learning_rate": 2e-05, "loss": 0.6496, "step": 211 }, { "epoch": 6.59375, "eval_loss": 0.6689916849136353, "eval_runtime": 81.6473, "eval_samples_per_second": 2.45, "eval_steps_per_second": 0.306, "step": 211 }, { "epoch": 6.625, "grad_norm": 0.48119553592193554, "learning_rate": 2e-05, "loss": 0.6211, "step": 212 }, { "epoch": 6.625, "eval_loss": 0.6703050136566162, "eval_runtime": 81.9207, "eval_samples_per_second": 2.441, "eval_steps_per_second": 0.305, "step": 212 }, { "epoch": 6.65625, "grad_norm": 0.5153356086819314, "learning_rate": 2e-05, "loss": 0.7135, "step": 213 }, { "epoch": 6.65625, "eval_loss": 0.6702842116355896, "eval_runtime": 81.1503, "eval_samples_per_second": 2.465, "eval_steps_per_second": 0.308, "step": 213 }, { "epoch": 6.6875, "grad_norm": 0.5249915042825578, "learning_rate": 2e-05, "loss": 0.6635, "step": 214 }, { "epoch": 6.6875, "eval_loss": 0.6687333583831787, "eval_runtime": 81.6743, "eval_samples_per_second": 2.449, "eval_steps_per_second": 0.306, "step": 214 }, { "epoch": 6.71875, "grad_norm": 0.5204840219868723, "learning_rate": 2e-05, "loss": 0.6701, "step": 215 }, { "epoch": 6.71875, "eval_loss": 0.6657728552818298, "eval_runtime": 81.106, "eval_samples_per_second": 2.466, "eval_steps_per_second": 0.308, "step": 215 }, { "epoch": 6.75, "grad_norm": 0.5266935225120133, "learning_rate": 2e-05, "loss": 0.6637, "step": 216 }, { "epoch": 6.75, "eval_loss": 0.6641908884048462, "eval_runtime": 82.2613, "eval_samples_per_second": 2.431, "eval_steps_per_second": 0.304, "step": 216 }, { "epoch": 6.78125, "grad_norm": 0.5438859451742696, "learning_rate": 2e-05, "loss": 0.6168, "step": 217 }, { "epoch": 6.78125, "eval_loss": 0.6652233600616455, "eval_runtime": 82.042, "eval_samples_per_second": 2.438, "eval_steps_per_second": 0.305, "step": 217 }, { "epoch": 6.8125, "grad_norm": 0.5716385253433929, "learning_rate": 2e-05, "loss": 0.6062, "step": 218 }, { "epoch": 6.8125, "eval_loss": 0.6656240820884705, "eval_runtime": 81.233, "eval_samples_per_second": 2.462, "eval_steps_per_second": 0.308, "step": 218 }, { "epoch": 6.84375, "grad_norm": 1.0572787630142522, "learning_rate": 2e-05, "loss": 0.7037, "step": 219 }, { "epoch": 6.84375, "eval_loss": 0.6645559072494507, "eval_runtime": 81.2099, "eval_samples_per_second": 2.463, "eval_steps_per_second": 0.308, "step": 219 }, { "epoch": 6.875, "grad_norm": 0.5924889323251107, "learning_rate": 2e-05, "loss": 0.712, "step": 220 }, { "epoch": 6.875, "eval_loss": 0.6619111895561218, "eval_runtime": 81.7826, "eval_samples_per_second": 2.446, "eval_steps_per_second": 0.306, "step": 220 }, { "epoch": 6.90625, "grad_norm": 0.5290576915218269, "learning_rate": 2e-05, "loss": 0.6659, "step": 221 }, { "epoch": 6.90625, "eval_loss": 0.6609540581703186, "eval_runtime": 82.9922, "eval_samples_per_second": 2.41, "eval_steps_per_second": 0.301, "step": 221 }, { "epoch": 6.9375, "grad_norm": 0.5831209517049147, "learning_rate": 2e-05, "loss": 0.6547, "step": 222 }, { "epoch": 6.9375, "eval_loss": 0.660676896572113, "eval_runtime": 83.6541, "eval_samples_per_second": 2.391, "eval_steps_per_second": 0.299, "step": 222 }, { "epoch": 6.96875, "grad_norm": 0.5320966369511158, "learning_rate": 2e-05, "loss": 0.6968, "step": 223 }, { "epoch": 6.96875, "eval_loss": 0.6618594527244568, "eval_runtime": 83.1148, "eval_samples_per_second": 2.406, "eval_steps_per_second": 0.301, "step": 223 }, { "epoch": 7.0, "grad_norm": 0.5829636446837394, "learning_rate": 2e-05, "loss": 0.7407, "step": 224 }, { "epoch": 7.0, "eval_loss": 0.6635661125183105, "eval_runtime": 82.8183, "eval_samples_per_second": 2.415, "eval_steps_per_second": 0.302, "step": 224 }, { "epoch": 7.03125, "grad_norm": 0.4975095056459566, "learning_rate": 2e-05, "loss": 0.6535, "step": 225 }, { "epoch": 7.03125, "eval_loss": 0.6641671657562256, "eval_runtime": 83.0267, "eval_samples_per_second": 2.409, "eval_steps_per_second": 0.301, "step": 225 }, { "epoch": 7.0625, "grad_norm": 0.5625698523064815, "learning_rate": 2e-05, "loss": 0.6012, "step": 226 }, { "epoch": 7.0625, "eval_loss": 0.6639044880867004, "eval_runtime": 83.3881, "eval_samples_per_second": 2.398, "eval_steps_per_second": 0.3, "step": 226 }, { "epoch": 7.09375, "grad_norm": 0.5436196850683295, "learning_rate": 2e-05, "loss": 0.6485, "step": 227 }, { "epoch": 7.09375, "eval_loss": 0.6651788353919983, "eval_runtime": 82.7096, "eval_samples_per_second": 2.418, "eval_steps_per_second": 0.302, "step": 227 }, { "epoch": 7.125, "grad_norm": 0.5598906287609361, "learning_rate": 2e-05, "loss": 0.6142, "step": 228 }, { "epoch": 7.125, "eval_loss": 0.6688636541366577, "eval_runtime": 82.601, "eval_samples_per_second": 2.421, "eval_steps_per_second": 0.303, "step": 228 }, { "epoch": 7.15625, "grad_norm": 0.7572979310697923, "learning_rate": 2e-05, "loss": 0.6221, "step": 229 }, { "epoch": 7.15625, "eval_loss": 0.6699694991111755, "eval_runtime": 82.6032, "eval_samples_per_second": 2.421, "eval_steps_per_second": 0.303, "step": 229 }, { "epoch": 7.1875, "grad_norm": 0.6173309690580897, "learning_rate": 2e-05, "loss": 0.5919, "step": 230 }, { "epoch": 7.1875, "eval_loss": 0.6706527471542358, "eval_runtime": 82.9732, "eval_samples_per_second": 2.41, "eval_steps_per_second": 0.301, "step": 230 }, { "epoch": 7.21875, "grad_norm": 0.643241771517866, "learning_rate": 2e-05, "loss": 0.7081, "step": 231 }, { "epoch": 7.21875, "eval_loss": 0.6700320243835449, "eval_runtime": 84.5621, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "step": 231 }, { "epoch": 7.25, "grad_norm": 0.577638137570571, "learning_rate": 2e-05, "loss": 0.6873, "step": 232 }, { "epoch": 7.25, "eval_loss": 0.669111430644989, "eval_runtime": 84.5124, "eval_samples_per_second": 2.367, "eval_steps_per_second": 0.296, "step": 232 }, { "epoch": 7.28125, "grad_norm": 0.7229488296023369, "learning_rate": 2e-05, "loss": 0.6301, "step": 233 }, { "epoch": 7.28125, "eval_loss": 0.6664154529571533, "eval_runtime": 84.6437, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.295, "step": 233 }, { "epoch": 7.3125, "grad_norm": 0.5827815449039045, "learning_rate": 2e-05, "loss": 0.669, "step": 234 }, { "epoch": 7.3125, "eval_loss": 0.6641202569007874, "eval_runtime": 84.489, "eval_samples_per_second": 2.367, "eval_steps_per_second": 0.296, "step": 234 }, { "epoch": 7.34375, "grad_norm": 0.57507354017269, "learning_rate": 2e-05, "loss": 0.6474, "step": 235 }, { "epoch": 7.34375, "eval_loss": 0.6623325347900391, "eval_runtime": 84.5536, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "step": 235 }, { "epoch": 7.375, "grad_norm": 0.5810844862533651, "learning_rate": 2e-05, "loss": 0.6048, "step": 236 }, { "epoch": 7.375, "eval_loss": 0.6619194746017456, "eval_runtime": 84.2296, "eval_samples_per_second": 2.374, "eval_steps_per_second": 0.297, "step": 236 }, { "epoch": 7.40625, "grad_norm": 0.6075032415813726, "learning_rate": 2e-05, "loss": 0.6529, "step": 237 }, { "epoch": 7.40625, "eval_loss": 0.6626202464103699, "eval_runtime": 84.9703, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.294, "step": 237 }, { "epoch": 7.4375, "grad_norm": 0.6402642234375245, "learning_rate": 2e-05, "loss": 0.6433, "step": 238 }, { "epoch": 7.4375, "eval_loss": 0.663289487361908, "eval_runtime": 84.8924, "eval_samples_per_second": 2.356, "eval_steps_per_second": 0.294, "step": 238 }, { "epoch": 7.46875, "grad_norm": 0.6335996982657431, "learning_rate": 2e-05, "loss": 0.6815, "step": 239 }, { "epoch": 7.46875, "eval_loss": 0.6636109948158264, "eval_runtime": 85.0551, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 239 }, { "epoch": 7.5, "grad_norm": 0.5796846795848909, "learning_rate": 2e-05, "loss": 0.6236, "step": 240 }, { "epoch": 7.5, "eval_loss": 0.6652829051017761, "eval_runtime": 84.7574, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.295, "step": 240 }, { "epoch": 7.53125, "grad_norm": 0.5380402145760035, "learning_rate": 2e-05, "loss": 0.6564, "step": 241 }, { "epoch": 7.53125, "eval_loss": 0.6676375865936279, "eval_runtime": 86.2058, "eval_samples_per_second": 2.32, "eval_steps_per_second": 0.29, "step": 241 }, { "epoch": 7.5625, "grad_norm": 0.5964298255824012, "learning_rate": 2e-05, "loss": 0.6475, "step": 242 }, { "epoch": 7.5625, "eval_loss": 0.6698520183563232, "eval_runtime": 85.8955, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.291, "step": 242 }, { "epoch": 7.59375, "grad_norm": 0.561279296875, "learning_rate": 2e-05, "loss": 0.6395, "step": 243 }, { "epoch": 7.59375, "eval_loss": 0.6705803871154785, "eval_runtime": 86.0036, "eval_samples_per_second": 2.325, "eval_steps_per_second": 0.291, "step": 243 }, { "epoch": 7.625, "grad_norm": 0.6757292755073548, "learning_rate": 2e-05, "loss": 0.7074, "step": 244 }, { "epoch": 7.625, "eval_loss": 0.6679538488388062, "eval_runtime": 85.5379, "eval_samples_per_second": 2.338, "eval_steps_per_second": 0.292, "step": 244 }, { "epoch": 7.65625, "grad_norm": 0.659077163070129, "learning_rate": 2e-05, "loss": 0.6078, "step": 245 }, { "epoch": 7.65625, "eval_loss": 0.6667564511299133, "eval_runtime": 85.752, "eval_samples_per_second": 2.332, "eval_steps_per_second": 0.292, "step": 245 }, { "epoch": 7.6875, "grad_norm": 0.6215405566454576, "learning_rate": 2e-05, "loss": 0.6603, "step": 246 }, { "epoch": 7.6875, "eval_loss": 0.665945291519165, "eval_runtime": 92.3086, "eval_samples_per_second": 2.167, "eval_steps_per_second": 0.271, "step": 246 }, { "epoch": 7.71875, "grad_norm": 0.6130534921490498, "learning_rate": 2e-05, "loss": 0.6435, "step": 247 }, { "epoch": 7.71875, "eval_loss": 0.6661685109138489, "eval_runtime": 87.1917, "eval_samples_per_second": 2.294, "eval_steps_per_second": 0.287, "step": 247 }, { "epoch": 7.75, "grad_norm": 0.6025415602868736, "learning_rate": 2e-05, "loss": 0.6308, "step": 248 }, { "epoch": 7.75, "eval_loss": 0.6658704280853271, "eval_runtime": 86.8233, "eval_samples_per_second": 2.304, "eval_steps_per_second": 0.288, "step": 248 }, { "epoch": 7.78125, "grad_norm": 0.6901593792019413, "learning_rate": 2e-05, "loss": 0.6777, "step": 249 }, { "epoch": 7.78125, "eval_loss": 0.6652414202690125, "eval_runtime": 86.7625, "eval_samples_per_second": 2.305, "eval_steps_per_second": 0.288, "step": 249 }, { "epoch": 7.8125, "grad_norm": 0.6436454697341579, "learning_rate": 2e-05, "loss": 0.6912, "step": 250 }, { "epoch": 7.8125, "eval_loss": 0.6654212474822998, "eval_runtime": 86.871, "eval_samples_per_second": 2.302, "eval_steps_per_second": 0.288, "step": 250 }, { "epoch": 7.84375, "grad_norm": 0.649040103024529, "learning_rate": 2e-05, "loss": 0.6025, "step": 251 }, { "epoch": 7.84375, "eval_loss": 0.6654068231582642, "eval_runtime": 86.7458, "eval_samples_per_second": 2.306, "eval_steps_per_second": 0.288, "step": 251 }, { "epoch": 7.875, "grad_norm": 0.6595522131680224, "learning_rate": 2e-05, "loss": 0.5973, "step": 252 }, { "epoch": 7.875, "eval_loss": 0.6644830107688904, "eval_runtime": 86.8739, "eval_samples_per_second": 2.302, "eval_steps_per_second": 0.288, "step": 252 }, { "epoch": 7.90625, "grad_norm": 0.6689891717273936, "learning_rate": 2e-05, "loss": 0.687, "step": 253 }, { "epoch": 7.90625, "eval_loss": 0.6616199612617493, "eval_runtime": 86.8222, "eval_samples_per_second": 2.304, "eval_steps_per_second": 0.288, "step": 253 }, { "epoch": 7.9375, "grad_norm": 0.6306846778314292, "learning_rate": 2e-05, "loss": 0.6599, "step": 254 }, { "epoch": 7.9375, "eval_loss": 0.6592965126037598, "eval_runtime": 86.8577, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 254 }, { "epoch": 7.96875, "grad_norm": 0.6021327993890785, "learning_rate": 2e-05, "loss": 0.575, "step": 255 }, { "epoch": 7.96875, "eval_loss": 0.6580593585968018, "eval_runtime": 86.7582, "eval_samples_per_second": 2.305, "eval_steps_per_second": 0.288, "step": 255 }, { "epoch": 8.0, "grad_norm": 0.6174712675568311, "learning_rate": 2e-05, "loss": 0.6341, "step": 256 }, { "epoch": 8.0, "eval_loss": 0.6575854420661926, "eval_runtime": 76.7634, "eval_samples_per_second": 2.605, "eval_steps_per_second": 0.326, "step": 256 }, { "epoch": 8.03125, "grad_norm": 0.6551281786490154, "learning_rate": 2e-05, "loss": 0.6032, "step": 257 }, { "epoch": 8.03125, "eval_loss": 0.6583926677703857, "eval_runtime": 83.4222, "eval_samples_per_second": 2.397, "eval_steps_per_second": 0.3, "step": 257 }, { "epoch": 8.0625, "grad_norm": 0.6033798361300539, "learning_rate": 2e-05, "loss": 0.6352, "step": 258 }, { "epoch": 8.0625, "eval_loss": 0.6615632772445679, "eval_runtime": 76.7227, "eval_samples_per_second": 2.607, "eval_steps_per_second": 0.326, "step": 258 }, { "epoch": 8.09375, "grad_norm": 0.557538857110867, "learning_rate": 2e-05, "loss": 0.6472, "step": 259 }, { "epoch": 8.09375, "eval_loss": 0.6674608588218689, "eval_runtime": 76.6215, "eval_samples_per_second": 2.61, "eval_steps_per_second": 0.326, "step": 259 }, { "epoch": 8.125, "grad_norm": 0.7828450894757938, "learning_rate": 2e-05, "loss": 0.6576, "step": 260 }, { "epoch": 8.125, "eval_loss": 0.670245349407196, "eval_runtime": 76.685, "eval_samples_per_second": 2.608, "eval_steps_per_second": 0.326, "step": 260 }, { "epoch": 8.15625, "grad_norm": 0.7969830757603331, "learning_rate": 2e-05, "loss": 0.5809, "step": 261 }, { "epoch": 8.15625, "eval_loss": 0.6711975336074829, "eval_runtime": 78.0022, "eval_samples_per_second": 2.564, "eval_steps_per_second": 0.321, "step": 261 }, { "epoch": 8.1875, "grad_norm": 0.6431174985709492, "learning_rate": 2e-05, "loss": 0.6971, "step": 262 }, { "epoch": 8.1875, "eval_loss": 0.6719404458999634, "eval_runtime": 78.7599, "eval_samples_per_second": 2.539, "eval_steps_per_second": 0.317, "step": 262 }, { "epoch": 8.21875, "grad_norm": 0.7025583314944188, "learning_rate": 2e-05, "loss": 0.5751, "step": 263 }, { "epoch": 8.21875, "eval_loss": 0.6719526648521423, "eval_runtime": 78.0188, "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.32, "step": 263 }, { "epoch": 8.25, "grad_norm": 0.7114355417811269, "learning_rate": 2e-05, "loss": 0.623, "step": 264 }, { "epoch": 8.25, "eval_loss": 0.6717848181724548, "eval_runtime": 78.6366, "eval_samples_per_second": 2.543, "eval_steps_per_second": 0.318, "step": 264 }, { "epoch": 8.28125, "grad_norm": 0.8272269435769467, "learning_rate": 2e-05, "loss": 0.6509, "step": 265 }, { "epoch": 8.28125, "eval_loss": 0.6701865196228027, "eval_runtime": 78.7279, "eval_samples_per_second": 2.54, "eval_steps_per_second": 0.318, "step": 265 }, { "epoch": 8.3125, "grad_norm": 0.7215994453471393, "learning_rate": 2e-05, "loss": 0.6263, "step": 266 }, { "epoch": 8.3125, "eval_loss": 0.6682087182998657, "eval_runtime": 78.1433, "eval_samples_per_second": 2.559, "eval_steps_per_second": 0.32, "step": 266 }, { "epoch": 8.34375, "grad_norm": 0.6425448006102333, "learning_rate": 2e-05, "loss": 0.5613, "step": 267 }, { "epoch": 8.34375, "eval_loss": 0.6686681509017944, "eval_runtime": 78.0964, "eval_samples_per_second": 2.561, "eval_steps_per_second": 0.32, "step": 267 }, { "epoch": 8.375, "grad_norm": 0.7207053166384572, "learning_rate": 2e-05, "loss": 0.6239, "step": 268 }, { "epoch": 8.375, "eval_loss": 0.6676305532455444, "eval_runtime": 77.9986, "eval_samples_per_second": 2.564, "eval_steps_per_second": 0.321, "step": 268 }, { "epoch": 8.40625, "grad_norm": 0.7459344743811905, "learning_rate": 2e-05, "loss": 0.6159, "step": 269 }, { "epoch": 8.40625, "eval_loss": 0.6660167574882507, "eval_runtime": 78.4159, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 269 }, { "epoch": 8.4375, "grad_norm": 0.7179805119560739, "learning_rate": 2e-05, "loss": 0.6192, "step": 270 }, { "epoch": 8.4375, "eval_loss": 0.6636325716972351, "eval_runtime": 78.2224, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 270 }, { "epoch": 8.46875, "grad_norm": 0.724792498458059, "learning_rate": 2e-05, "loss": 0.5234, "step": 271 }, { "epoch": 8.46875, "eval_loss": 0.6647288799285889, "eval_runtime": 79.0573, "eval_samples_per_second": 2.53, "eval_steps_per_second": 0.316, "step": 271 }, { "epoch": 8.5, "grad_norm": 0.6544107138826364, "learning_rate": 2e-05, "loss": 0.6067, "step": 272 }, { "epoch": 8.5, "eval_loss": 0.6689667701721191, "eval_runtime": 79.2898, "eval_samples_per_second": 2.522, "eval_steps_per_second": 0.315, "step": 272 }, { "epoch": 8.53125, "grad_norm": 0.71580236810568, "learning_rate": 2e-05, "loss": 0.6215, "step": 273 }, { "epoch": 8.53125, "eval_loss": 0.6723271012306213, "eval_runtime": 79.0759, "eval_samples_per_second": 2.529, "eval_steps_per_second": 0.316, "step": 273 }, { "epoch": 8.5625, "grad_norm": 0.7741383931390255, "learning_rate": 2e-05, "loss": 0.6012, "step": 274 }, { "epoch": 8.5625, "eval_loss": 0.6743794083595276, "eval_runtime": 79.0509, "eval_samples_per_second": 2.53, "eval_steps_per_second": 0.316, "step": 274 }, { "epoch": 8.59375, "grad_norm": 0.7927343087738151, "learning_rate": 2e-05, "loss": 0.6241, "step": 275 }, { "epoch": 8.59375, "eval_loss": 0.6728585958480835, "eval_runtime": 79.2296, "eval_samples_per_second": 2.524, "eval_steps_per_second": 0.316, "step": 275 }, { "epoch": 8.625, "grad_norm": 0.759468785526614, "learning_rate": 2e-05, "loss": 0.6209, "step": 276 }, { "epoch": 8.625, "eval_loss": 0.6686221957206726, "eval_runtime": 76.7494, "eval_samples_per_second": 2.606, "eval_steps_per_second": 0.326, "step": 276 }, { "epoch": 8.65625, "grad_norm": 0.7345386079388437, "learning_rate": 2e-05, "loss": 0.5618, "step": 277 }, { "epoch": 8.65625, "eval_loss": 0.6659188270568848, "eval_runtime": 77.4511, "eval_samples_per_second": 2.582, "eval_steps_per_second": 0.323, "step": 277 }, { "epoch": 8.6875, "grad_norm": 0.6822491965046279, "learning_rate": 2e-05, "loss": 0.6064, "step": 278 }, { "epoch": 8.6875, "eval_loss": 0.664726734161377, "eval_runtime": 76.7108, "eval_samples_per_second": 2.607, "eval_steps_per_second": 0.326, "step": 278 }, { "epoch": 8.71875, "grad_norm": 0.7329120674082968, "learning_rate": 2e-05, "loss": 0.5843, "step": 279 }, { "epoch": 8.71875, "eval_loss": 0.6635715961456299, "eval_runtime": 76.7921, "eval_samples_per_second": 2.604, "eval_steps_per_second": 0.326, "step": 279 }, { "epoch": 8.75, "grad_norm": 0.7950781591249908, "learning_rate": 2e-05, "loss": 0.6383, "step": 280 }, { "epoch": 8.75, "eval_loss": 0.664521336555481, "eval_runtime": 76.6952, "eval_samples_per_second": 2.608, "eval_steps_per_second": 0.326, "step": 280 }, { "epoch": 8.78125, "grad_norm": 0.6791182798182671, "learning_rate": 2e-05, "loss": 0.5932, "step": 281 }, { "epoch": 8.78125, "eval_loss": 0.6673008799552917, "eval_runtime": 76.794, "eval_samples_per_second": 2.604, "eval_steps_per_second": 0.326, "step": 281 }, { "epoch": 8.8125, "grad_norm": 0.7633434086832942, "learning_rate": 2e-05, "loss": 0.5754, "step": 282 }, { "epoch": 8.8125, "eval_loss": 0.6692779064178467, "eval_runtime": 76.7749, "eval_samples_per_second": 2.605, "eval_steps_per_second": 0.326, "step": 282 }, { "epoch": 8.84375, "grad_norm": 0.6857090076317197, "learning_rate": 2e-05, "loss": 0.5585, "step": 283 }, { "epoch": 8.84375, "eval_loss": 0.6702080368995667, "eval_runtime": 76.6913, "eval_samples_per_second": 2.608, "eval_steps_per_second": 0.326, "step": 283 }, { "epoch": 8.875, "grad_norm": 0.6961298007385132, "learning_rate": 2e-05, "loss": 0.5093, "step": 284 }, { "epoch": 8.875, "eval_loss": 0.6708166599273682, "eval_runtime": 76.7725, "eval_samples_per_second": 2.605, "eval_steps_per_second": 0.326, "step": 284 }, { "epoch": 8.90625, "grad_norm": 0.7783752192295856, "learning_rate": 2e-05, "loss": 0.5656, "step": 285 }, { "epoch": 8.90625, "eval_loss": 0.6697121262550354, "eval_runtime": 76.7888, "eval_samples_per_second": 2.605, "eval_steps_per_second": 0.326, "step": 285 }, { "epoch": 8.9375, "grad_norm": 0.7327581828795048, "learning_rate": 2e-05, "loss": 0.6984, "step": 286 }, { "epoch": 8.9375, "eval_loss": 0.6684187054634094, "eval_runtime": 78.6657, "eval_samples_per_second": 2.542, "eval_steps_per_second": 0.318, "step": 286 }, { "epoch": 8.96875, "grad_norm": 0.689919829790507, "learning_rate": 2e-05, "loss": 0.6173, "step": 287 }, { "epoch": 8.96875, "eval_loss": 0.6675245761871338, "eval_runtime": 78.1275, "eval_samples_per_second": 2.56, "eval_steps_per_second": 0.32, "step": 287 }, { "epoch": 9.0, "grad_norm": 0.6812947879732435, "learning_rate": 2e-05, "loss": 0.5499, "step": 288 }, { "epoch": 9.0, "eval_loss": 0.6678825616836548, "eval_runtime": 78.8588, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.317, "step": 288 }, { "epoch": 9.03125, "grad_norm": 0.715716761740314, "learning_rate": 2e-05, "loss": 0.5699, "step": 289 }, { "epoch": 9.03125, "eval_loss": 0.6692755222320557, "eval_runtime": 83.098, "eval_samples_per_second": 2.407, "eval_steps_per_second": 0.301, "step": 289 }, { "epoch": 9.0625, "grad_norm": 0.7438930389955494, "learning_rate": 2e-05, "loss": 0.5974, "step": 290 }, { "epoch": 9.0625, "eval_loss": 0.6735746264457703, "eval_runtime": 77.384, "eval_samples_per_second": 2.585, "eval_steps_per_second": 0.323, "step": 290 }, { "epoch": 9.09375, "grad_norm": 0.7271043131369198, "learning_rate": 2e-05, "loss": 0.601, "step": 291 }, { "epoch": 9.09375, "eval_loss": 0.6790977716445923, "eval_runtime": 78.0312, "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.32, "step": 291 }, { "epoch": 9.125, "grad_norm": 0.851687675865168, "learning_rate": 2e-05, "loss": 0.5681, "step": 292 }, { "epoch": 9.125, "eval_loss": 0.6834170818328857, "eval_runtime": 77.8688, "eval_samples_per_second": 2.568, "eval_steps_per_second": 0.321, "step": 292 }, { "epoch": 9.15625, "grad_norm": 0.7905287763218567, "learning_rate": 2e-05, "loss": 0.6222, "step": 293 }, { "epoch": 9.15625, "eval_loss": 0.6843841671943665, "eval_runtime": 77.985, "eval_samples_per_second": 2.565, "eval_steps_per_second": 0.321, "step": 293 }, { "epoch": 9.1875, "grad_norm": 0.7301520002532459, "learning_rate": 2e-05, "loss": 0.5549, "step": 294 }, { "epoch": 9.1875, "eval_loss": 0.6860540509223938, "eval_runtime": 78.0163, "eval_samples_per_second": 2.564, "eval_steps_per_second": 0.32, "step": 294 }, { "epoch": 9.21875, "grad_norm": 0.899999206595601, "learning_rate": 2e-05, "loss": 0.5128, "step": 295 }, { "epoch": 9.21875, "eval_loss": 0.685759425163269, "eval_runtime": 78.4339, "eval_samples_per_second": 2.55, "eval_steps_per_second": 0.319, "step": 295 }, { "epoch": 9.25, "grad_norm": 0.8064287475451557, "learning_rate": 2e-05, "loss": 0.5261, "step": 296 }, { "epoch": 9.25, "eval_loss": 0.6864770650863647, "eval_runtime": 79.6129, "eval_samples_per_second": 2.512, "eval_steps_per_second": 0.314, "step": 296 }, { "epoch": 9.28125, "grad_norm": 0.8837240795882767, "learning_rate": 2e-05, "loss": 0.621, "step": 297 }, { "epoch": 9.28125, "eval_loss": 0.6871599555015564, "eval_runtime": 78.9778, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.317, "step": 297 }, { "epoch": 9.3125, "grad_norm": 0.9676184044078363, "learning_rate": 2e-05, "loss": 0.5655, "step": 298 }, { "epoch": 9.3125, "eval_loss": 0.6881282329559326, "eval_runtime": 78.9944, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.316, "step": 298 }, { "epoch": 9.34375, "grad_norm": 0.8723474213941232, "learning_rate": 2e-05, "loss": 0.5449, "step": 299 }, { "epoch": 9.34375, "eval_loss": 0.6879245638847351, "eval_runtime": 79.0056, "eval_samples_per_second": 2.531, "eval_steps_per_second": 0.316, "step": 299 }, { "epoch": 9.375, "grad_norm": 0.848833488380702, "learning_rate": 2e-05, "loss": 0.5683, "step": 300 }, { "epoch": 9.375, "eval_loss": 0.6846978664398193, "eval_runtime": 78.9003, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.317, "step": 300 }, { "epoch": 9.40625, "grad_norm": 0.8586391766708288, "learning_rate": 2e-05, "loss": 0.5358, "step": 301 }, { "epoch": 9.40625, "eval_loss": 0.6798649430274963, "eval_runtime": 80.0404, "eval_samples_per_second": 2.499, "eval_steps_per_second": 0.312, "step": 301 }, { "epoch": 9.4375, "grad_norm": 0.8007832596916474, "learning_rate": 2e-05, "loss": 0.5792, "step": 302 }, { "epoch": 9.4375, "eval_loss": 0.6757382750511169, "eval_runtime": 79.962, "eval_samples_per_second": 2.501, "eval_steps_per_second": 0.313, "step": 302 }, { "epoch": 9.46875, "grad_norm": 0.7839805948862919, "learning_rate": 2e-05, "loss": 0.5917, "step": 303 }, { "epoch": 9.46875, "eval_loss": 0.6754000782966614, "eval_runtime": 80.738, "eval_samples_per_second": 2.477, "eval_steps_per_second": 0.31, "step": 303 }, { "epoch": 9.5, "grad_norm": 0.7397772754102683, "learning_rate": 2e-05, "loss": 0.6249, "step": 304 }, { "epoch": 9.5, "eval_loss": 0.6777495741844177, "eval_runtime": 80.5144, "eval_samples_per_second": 2.484, "eval_steps_per_second": 0.311, "step": 304 }, { "epoch": 9.53125, "grad_norm": 0.857390001265035, "learning_rate": 2e-05, "loss": 0.5932, "step": 305 }, { "epoch": 9.53125, "eval_loss": 0.6778848171234131, "eval_runtime": 80.1508, "eval_samples_per_second": 2.495, "eval_steps_per_second": 0.312, "step": 305 }, { "epoch": 9.5625, "grad_norm": 0.9430180281536945, "learning_rate": 2e-05, "loss": 0.5793, "step": 306 }, { "epoch": 9.5625, "eval_loss": 0.6771917939186096, "eval_runtime": 76.7109, "eval_samples_per_second": 2.607, "eval_steps_per_second": 0.326, "step": 306 }, { "epoch": 9.59375, "grad_norm": 0.8705050270903875, "learning_rate": 2e-05, "loss": 0.5601, "step": 307 }, { "epoch": 9.59375, "eval_loss": 0.6808632016181946, "eval_runtime": 76.6965, "eval_samples_per_second": 2.608, "eval_steps_per_second": 0.326, "step": 307 }, { "epoch": 9.625, "grad_norm": 0.8611871513168323, "learning_rate": 2e-05, "loss": 0.5953, "step": 308 }, { "epoch": 9.625, "eval_loss": 0.6875945329666138, "eval_runtime": 76.6592, "eval_samples_per_second": 2.609, "eval_steps_per_second": 0.326, "step": 308 }, { "epoch": 9.65625, "grad_norm": 0.9066952565245906, "learning_rate": 2e-05, "loss": 0.5815, "step": 309 }, { "epoch": 9.65625, "eval_loss": 0.6910049319267273, "eval_runtime": 76.7021, "eval_samples_per_second": 2.607, "eval_steps_per_second": 0.326, "step": 309 }, { "epoch": 9.6875, "grad_norm": 1.0666864048105145, "learning_rate": 2e-05, "loss": 0.5663, "step": 310 }, { "epoch": 9.6875, "eval_loss": 0.6869986057281494, "eval_runtime": 76.6344, "eval_samples_per_second": 2.61, "eval_steps_per_second": 0.326, "step": 310 }, { "epoch": 9.71875, "grad_norm": 0.9413311560347162, "learning_rate": 2e-05, "loss": 0.5106, "step": 311 }, { "epoch": 9.71875, "eval_loss": 0.6825075745582581, "eval_runtime": 78.7857, "eval_samples_per_second": 2.539, "eval_steps_per_second": 0.317, "step": 311 }, { "epoch": 9.75, "grad_norm": 0.9175579044457436, "learning_rate": 2e-05, "loss": 0.5821, "step": 312 }, { "epoch": 9.75, "eval_loss": 0.6794223189353943, "eval_runtime": 78.0368, "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.32, "step": 312 }, { "epoch": 9.78125, "grad_norm": 0.7982785075945665, "learning_rate": 2e-05, "loss": 0.5781, "step": 313 }, { "epoch": 9.78125, "eval_loss": 0.679649829864502, "eval_runtime": 78.0513, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 313 }, { "epoch": 9.8125, "grad_norm": 0.9284642289974022, "learning_rate": 2e-05, "loss": 0.5394, "step": 314 }, { "epoch": 9.8125, "eval_loss": 0.6805163025856018, "eval_runtime": 78.2229, "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 314 }, { "epoch": 9.84375, "grad_norm": 0.8816568355396782, "learning_rate": 2e-05, "loss": 0.5722, "step": 315 }, { "epoch": 9.84375, "eval_loss": 0.6801097393035889, "eval_runtime": 78.9282, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.317, "step": 315 }, { "epoch": 9.875, "grad_norm": 0.8137119863863306, "learning_rate": 2e-05, "loss": 0.5831, "step": 316 }, { "epoch": 9.875, "eval_loss": 0.6792600750923157, "eval_runtime": 78.8166, "eval_samples_per_second": 2.538, "eval_steps_per_second": 0.317, "step": 316 }, { "epoch": 9.90625, "grad_norm": 0.9595174764400289, "learning_rate": 2e-05, "loss": 0.5489, "step": 317 }, { "epoch": 9.90625, "eval_loss": 0.6755692958831787, "eval_runtime": 78.1426, "eval_samples_per_second": 2.559, "eval_steps_per_second": 0.32, "step": 317 }, { "epoch": 9.9375, "grad_norm": 0.8612490247878711, "learning_rate": 2e-05, "loss": 0.5508, "step": 318 }, { "epoch": 9.9375, "eval_loss": 0.673053503036499, "eval_runtime": 78.0565, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 318 }, { "epoch": 9.96875, "grad_norm": 0.9474068762478358, "learning_rate": 2e-05, "loss": 0.5859, "step": 319 }, { "epoch": 9.96875, "eval_loss": 0.6695602536201477, "eval_runtime": 78.051, "eval_samples_per_second": 2.562, "eval_steps_per_second": 0.32, "step": 319 }, { "epoch": 10.0, "grad_norm": 0.8401643717683449, "learning_rate": 2e-05, "loss": 0.5277, "step": 320 }, { "epoch": 10.0, "eval_loss": 0.6707890033721924, "eval_runtime": 78.9959, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.316, "step": 320 }, { "epoch": 10.0, "step": 320, "total_flos": 613933061373952.0, "train_loss": 0.056994458101689814, "train_runtime": 3241.7031, "train_samples_per_second": 3.085, "train_steps_per_second": 0.099 } ], "logging_steps": 1.0, "max_steps": 320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 613933061373952.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }