{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.076923076923077, "eval_steps": 10, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015384615384615385, "grad_norm": 10.082767486572266, "learning_rate": 3.0769230769230774e-07, "loss": 2.2325, "step": 10 }, { "epoch": 0.015384615384615385, "eval_loss": 1.8899363279342651, "eval_runtime": 3.2461, "eval_samples_per_second": 35.119, "eval_steps_per_second": 1.848, "step": 10 }, { "epoch": 0.03076923076923077, "grad_norm": 11.988937377929688, "learning_rate": 6.153846153846155e-07, "loss": 1.8704, "step": 20 }, { "epoch": 0.03076923076923077, "eval_loss": 1.7930843830108643, "eval_runtime": 3.2368, "eval_samples_per_second": 35.22, "eval_steps_per_second": 1.854, "step": 20 }, { "epoch": 0.046153846153846156, "grad_norm": 12.660626411437988, "learning_rate": 9.230769230769232e-07, "loss": 1.5485, "step": 30 }, { "epoch": 0.046153846153846156, "eval_loss": 1.3351986408233643, "eval_runtime": 3.2549, "eval_samples_per_second": 35.024, "eval_steps_per_second": 1.843, "step": 30 }, { "epoch": 0.06153846153846154, "grad_norm": 5.869946002960205, "learning_rate": 1.230769230769231e-06, "loss": 1.1465, "step": 40 }, { "epoch": 0.06153846153846154, "eval_loss": 0.9145882725715637, "eval_runtime": 3.2573, "eval_samples_per_second": 34.998, "eval_steps_per_second": 1.842, "step": 40 }, { "epoch": 0.07692307692307693, "grad_norm": 5.386349201202393, "learning_rate": 1.5384615384615387e-06, "loss": 0.7239, "step": 50 }, { "epoch": 0.07692307692307693, "eval_loss": 0.7560557126998901, "eval_runtime": 3.2454, "eval_samples_per_second": 35.126, "eval_steps_per_second": 1.849, "step": 50 }, { "epoch": 0.09230769230769231, "grad_norm": 13.984370231628418, "learning_rate": 1.8461538461538465e-06, "loss": 0.7261, "step": 60 }, { "epoch": 0.09230769230769231, "eval_loss": 0.7075957655906677, "eval_runtime": 3.2359, "eval_samples_per_second": 35.229, "eval_steps_per_second": 1.854, "step": 60 }, { "epoch": 0.1076923076923077, "grad_norm": 12.205921173095703, "learning_rate": 2.153846153846154e-06, "loss": 0.6461, "step": 70 }, { "epoch": 0.1076923076923077, "eval_loss": 0.6797388792037964, "eval_runtime": 3.2439, "eval_samples_per_second": 35.143, "eval_steps_per_second": 1.85, "step": 70 }, { "epoch": 0.12307692307692308, "grad_norm": 8.89425277709961, "learning_rate": 2.461538461538462e-06, "loss": 0.5958, "step": 80 }, { "epoch": 0.12307692307692308, "eval_loss": 0.6671402454376221, "eval_runtime": 3.2181, "eval_samples_per_second": 35.425, "eval_steps_per_second": 1.864, "step": 80 }, { "epoch": 0.13846153846153847, "grad_norm": 19.930360794067383, "learning_rate": 2.7692307692307697e-06, "loss": 0.6231, "step": 90 }, { "epoch": 0.13846153846153847, "eval_loss": 0.650134265422821, "eval_runtime": 3.2427, "eval_samples_per_second": 35.156, "eval_steps_per_second": 1.85, "step": 90 }, { "epoch": 0.15384615384615385, "grad_norm": 9.43468189239502, "learning_rate": 3.0769230769230774e-06, "loss": 0.6271, "step": 100 }, { "epoch": 0.15384615384615385, "eval_loss": 0.6546280384063721, "eval_runtime": 3.2567, "eval_samples_per_second": 35.005, "eval_steps_per_second": 1.842, "step": 100 }, { "epoch": 0.16923076923076924, "grad_norm": 9.599658012390137, "learning_rate": 3.384615384615385e-06, "loss": 0.5577, "step": 110 }, { "epoch": 0.16923076923076924, "eval_loss": 0.6422574520111084, "eval_runtime": 3.2521, "eval_samples_per_second": 35.054, "eval_steps_per_second": 1.845, "step": 110 }, { "epoch": 0.18461538461538463, "grad_norm": 10.455623626708984, "learning_rate": 3.692307692307693e-06, "loss": 0.5193, "step": 120 }, { "epoch": 0.18461538461538463, "eval_loss": 0.6282245516777039, "eval_runtime": 3.2576, "eval_samples_per_second": 34.996, "eval_steps_per_second": 1.842, "step": 120 }, { "epoch": 0.2, "grad_norm": 5.3797407150268555, "learning_rate": 4.000000000000001e-06, "loss": 0.5225, "step": 130 }, { "epoch": 0.2, "eval_loss": 0.6316070556640625, "eval_runtime": 3.2288, "eval_samples_per_second": 35.307, "eval_steps_per_second": 1.858, "step": 130 }, { "epoch": 0.2153846153846154, "grad_norm": 2.727918863296509, "learning_rate": 4.307692307692308e-06, "loss": 0.5738, "step": 140 }, { "epoch": 0.2153846153846154, "eval_loss": 0.6229422688484192, "eval_runtime": 3.2458, "eval_samples_per_second": 35.122, "eval_steps_per_second": 1.849, "step": 140 }, { "epoch": 0.23076923076923078, "grad_norm": 43.989776611328125, "learning_rate": 4.615384615384616e-06, "loss": 0.5529, "step": 150 }, { "epoch": 0.23076923076923078, "eval_loss": 0.6122006773948669, "eval_runtime": 3.2647, "eval_samples_per_second": 34.919, "eval_steps_per_second": 1.838, "step": 150 }, { "epoch": 0.24615384615384617, "grad_norm": 4.098404407501221, "learning_rate": 4.923076923076924e-06, "loss": 0.519, "step": 160 }, { "epoch": 0.24615384615384617, "eval_loss": 0.6218054294586182, "eval_runtime": 3.2453, "eval_samples_per_second": 35.128, "eval_steps_per_second": 1.849, "step": 160 }, { "epoch": 0.26153846153846155, "grad_norm": 2.585014820098877, "learning_rate": 5.230769230769232e-06, "loss": 0.5713, "step": 170 }, { "epoch": 0.26153846153846155, "eval_loss": 0.6205213069915771, "eval_runtime": 3.2516, "eval_samples_per_second": 35.059, "eval_steps_per_second": 1.845, "step": 170 }, { "epoch": 0.27692307692307694, "grad_norm": 2.8192200660705566, "learning_rate": 5.538461538461539e-06, "loss": 0.5355, "step": 180 }, { "epoch": 0.27692307692307694, "eval_loss": 0.6130949258804321, "eval_runtime": 3.2451, "eval_samples_per_second": 35.13, "eval_steps_per_second": 1.849, "step": 180 }, { "epoch": 0.2923076923076923, "grad_norm": 4.156994819641113, "learning_rate": 5.846153846153847e-06, "loss": 0.4487, "step": 190 }, { "epoch": 0.2923076923076923, "eval_loss": 0.6160268783569336, "eval_runtime": 3.2251, "eval_samples_per_second": 35.348, "eval_steps_per_second": 1.86, "step": 190 }, { "epoch": 0.3076923076923077, "grad_norm": 3.249791383743286, "learning_rate": 6.153846153846155e-06, "loss": 0.5708, "step": 200 }, { "epoch": 0.3076923076923077, "eval_loss": 0.6122933626174927, "eval_runtime": 3.2483, "eval_samples_per_second": 35.095, "eval_steps_per_second": 1.847, "step": 200 }, { "epoch": 0.3230769230769231, "grad_norm": 3.2924177646636963, "learning_rate": 6.461538461538463e-06, "loss": 0.5607, "step": 210 }, { "epoch": 0.3230769230769231, "eval_loss": 0.6078358888626099, "eval_runtime": 3.2453, "eval_samples_per_second": 35.128, "eval_steps_per_second": 1.849, "step": 210 }, { "epoch": 0.3384615384615385, "grad_norm": 3.444439649581909, "learning_rate": 6.76923076923077e-06, "loss": 0.5026, "step": 220 }, { "epoch": 0.3384615384615385, "eval_loss": 0.6209812760353088, "eval_runtime": 3.2344, "eval_samples_per_second": 35.246, "eval_steps_per_second": 1.855, "step": 220 }, { "epoch": 0.35384615384615387, "grad_norm": 2.5516700744628906, "learning_rate": 7.076923076923078e-06, "loss": 0.4938, "step": 230 }, { "epoch": 0.35384615384615387, "eval_loss": 0.6547431349754333, "eval_runtime": 3.2496, "eval_samples_per_second": 35.081, "eval_steps_per_second": 1.846, "step": 230 }, { "epoch": 0.36923076923076925, "grad_norm": 25.815881729125977, "learning_rate": 7.384615384615386e-06, "loss": 0.5766, "step": 240 }, { "epoch": 0.36923076923076925, "eval_loss": 0.6063669919967651, "eval_runtime": 3.2779, "eval_samples_per_second": 34.778, "eval_steps_per_second": 1.83, "step": 240 }, { "epoch": 0.38461538461538464, "grad_norm": 2.2970821857452393, "learning_rate": 7.692307692307694e-06, "loss": 0.5924, "step": 250 }, { "epoch": 0.38461538461538464, "eval_loss": 0.6134644746780396, "eval_runtime": 3.2301, "eval_samples_per_second": 35.293, "eval_steps_per_second": 1.858, "step": 250 }, { "epoch": 0.4, "grad_norm": 2.2378745079040527, "learning_rate": 8.000000000000001e-06, "loss": 0.5715, "step": 260 }, { "epoch": 0.4, "eval_loss": 0.6102377772331238, "eval_runtime": 3.2286, "eval_samples_per_second": 35.309, "eval_steps_per_second": 1.858, "step": 260 }, { "epoch": 0.4153846153846154, "grad_norm": 2.2380335330963135, "learning_rate": 8.307692307692309e-06, "loss": 0.5491, "step": 270 }, { "epoch": 0.4153846153846154, "eval_loss": 0.6195328831672668, "eval_runtime": 3.2393, "eval_samples_per_second": 35.193, "eval_steps_per_second": 1.852, "step": 270 }, { "epoch": 0.4307692307692308, "grad_norm": 3.0623953342437744, "learning_rate": 8.615384615384617e-06, "loss": 0.6217, "step": 280 }, { "epoch": 0.4307692307692308, "eval_loss": 0.6168036460876465, "eval_runtime": 3.2516, "eval_samples_per_second": 35.06, "eval_steps_per_second": 1.845, "step": 280 }, { "epoch": 0.4461538461538462, "grad_norm": 2.4642109870910645, "learning_rate": 8.923076923076925e-06, "loss": 0.4625, "step": 290 }, { "epoch": 0.4461538461538462, "eval_loss": 0.6237349510192871, "eval_runtime": 3.243, "eval_samples_per_second": 35.153, "eval_steps_per_second": 1.85, "step": 290 }, { "epoch": 0.46153846153846156, "grad_norm": 2.919198751449585, "learning_rate": 9.230769230769232e-06, "loss": 0.554, "step": 300 }, { "epoch": 0.46153846153846156, "eval_loss": 0.6107349991798401, "eval_runtime": 3.2321, "eval_samples_per_second": 35.271, "eval_steps_per_second": 1.856, "step": 300 }, { "epoch": 0.47692307692307695, "grad_norm": 2.186372756958008, "learning_rate": 9.53846153846154e-06, "loss": 0.4941, "step": 310 }, { "epoch": 0.47692307692307695, "eval_loss": 0.614315390586853, "eval_runtime": 3.2373, "eval_samples_per_second": 35.214, "eval_steps_per_second": 1.853, "step": 310 }, { "epoch": 0.49230769230769234, "grad_norm": 2.5454864501953125, "learning_rate": 9.846153846153848e-06, "loss": 0.5021, "step": 320 }, { "epoch": 0.49230769230769234, "eval_loss": 0.6134229302406311, "eval_runtime": 3.2312, "eval_samples_per_second": 35.281, "eval_steps_per_second": 1.857, "step": 320 }, { "epoch": 0.5076923076923077, "grad_norm": 2.518843412399292, "learning_rate": 1.0153846153846154e-05, "loss": 0.587, "step": 330 }, { "epoch": 0.5076923076923077, "eval_loss": 0.6188045144081116, "eval_runtime": 3.2447, "eval_samples_per_second": 35.135, "eval_steps_per_second": 1.849, "step": 330 }, { "epoch": 0.5230769230769231, "grad_norm": 2.2720816135406494, "learning_rate": 1.0461538461538463e-05, "loss": 0.6655, "step": 340 }, { "epoch": 0.5230769230769231, "eval_loss": 0.6142882108688354, "eval_runtime": 3.2478, "eval_samples_per_second": 35.101, "eval_steps_per_second": 1.847, "step": 340 }, { "epoch": 0.5384615384615384, "grad_norm": 2.510495662689209, "learning_rate": 1.076923076923077e-05, "loss": 0.6116, "step": 350 }, { "epoch": 0.5384615384615384, "eval_loss": 0.615568995475769, "eval_runtime": 3.2332, "eval_samples_per_second": 35.259, "eval_steps_per_second": 1.856, "step": 350 }, { "epoch": 0.5538461538461539, "grad_norm": 3.122288942337036, "learning_rate": 1.1076923076923079e-05, "loss": 0.5906, "step": 360 }, { "epoch": 0.5538461538461539, "eval_loss": 0.6134847402572632, "eval_runtime": 3.23, "eval_samples_per_second": 35.294, "eval_steps_per_second": 1.858, "step": 360 }, { "epoch": 0.5692307692307692, "grad_norm": 2.096451759338379, "learning_rate": 1.1384615384615385e-05, "loss": 0.4887, "step": 370 }, { "epoch": 0.5692307692307692, "eval_loss": 0.6209902763366699, "eval_runtime": 3.2877, "eval_samples_per_second": 34.674, "eval_steps_per_second": 1.825, "step": 370 }, { "epoch": 0.5846153846153846, "grad_norm": 1.9950298070907593, "learning_rate": 1.1692307692307694e-05, "loss": 0.5759, "step": 380 }, { "epoch": 0.5846153846153846, "eval_loss": 0.623406171798706, "eval_runtime": 3.2604, "eval_samples_per_second": 34.965, "eval_steps_per_second": 1.84, "step": 380 }, { "epoch": 0.6, "grad_norm": 1.6859853267669678, "learning_rate": 1.2e-05, "loss": 0.5436, "step": 390 }, { "epoch": 0.6, "eval_loss": 0.622590184211731, "eval_runtime": 3.2828, "eval_samples_per_second": 34.726, "eval_steps_per_second": 1.828, "step": 390 }, { "epoch": 0.6153846153846154, "grad_norm": 2.1276960372924805, "learning_rate": 1.230769230769231e-05, "loss": 0.5281, "step": 400 }, { "epoch": 0.6153846153846154, "eval_loss": 0.6532315611839294, "eval_runtime": 3.2611, "eval_samples_per_second": 34.958, "eval_steps_per_second": 1.84, "step": 400 }, { "epoch": 0.6307692307692307, "grad_norm": 2.362884998321533, "learning_rate": 1.2615384615384616e-05, "loss": 0.5469, "step": 410 }, { "epoch": 0.6307692307692307, "eval_loss": 0.6342897415161133, "eval_runtime": 3.2382, "eval_samples_per_second": 35.205, "eval_steps_per_second": 1.853, "step": 410 }, { "epoch": 0.6461538461538462, "grad_norm": 2.6885156631469727, "learning_rate": 1.2923076923076925e-05, "loss": 0.5795, "step": 420 }, { "epoch": 0.6461538461538462, "eval_loss": 0.6264632940292358, "eval_runtime": 3.238, "eval_samples_per_second": 35.207, "eval_steps_per_second": 1.853, "step": 420 }, { "epoch": 0.6615384615384615, "grad_norm": 2.579245090484619, "learning_rate": 1.3230769230769231e-05, "loss": 0.5347, "step": 430 }, { "epoch": 0.6615384615384615, "eval_loss": 0.6273682117462158, "eval_runtime": 3.2364, "eval_samples_per_second": 35.224, "eval_steps_per_second": 1.854, "step": 430 }, { "epoch": 0.676923076923077, "grad_norm": 1.1545597314834595, "learning_rate": 1.353846153846154e-05, "loss": 0.4647, "step": 440 }, { "epoch": 0.676923076923077, "eval_loss": 0.6299084424972534, "eval_runtime": 3.2446, "eval_samples_per_second": 35.136, "eval_steps_per_second": 1.849, "step": 440 }, { "epoch": 0.6923076923076923, "grad_norm": 2.961758613586426, "learning_rate": 1.3846153846153847e-05, "loss": 0.461, "step": 450 }, { "epoch": 0.6923076923076923, "eval_loss": 0.6325281858444214, "eval_runtime": 3.2395, "eval_samples_per_second": 35.191, "eval_steps_per_second": 1.852, "step": 450 }, { "epoch": 0.7076923076923077, "grad_norm": 1.9980833530426025, "learning_rate": 1.4153846153846156e-05, "loss": 0.5936, "step": 460 }, { "epoch": 0.7076923076923077, "eval_loss": 0.6283787488937378, "eval_runtime": 3.2222, "eval_samples_per_second": 35.379, "eval_steps_per_second": 1.862, "step": 460 }, { "epoch": 0.7230769230769231, "grad_norm": 1.7074766159057617, "learning_rate": 1.4461538461538462e-05, "loss": 0.5754, "step": 470 }, { "epoch": 0.7230769230769231, "eval_loss": 0.6299780011177063, "eval_runtime": 3.4089, "eval_samples_per_second": 33.442, "eval_steps_per_second": 1.76, "step": 470 }, { "epoch": 0.7384615384615385, "grad_norm": 3.0761687755584717, "learning_rate": 1.4769230769230772e-05, "loss": 0.5832, "step": 480 }, { "epoch": 0.7384615384615385, "eval_loss": 0.6351837515830994, "eval_runtime": 3.2325, "eval_samples_per_second": 35.266, "eval_steps_per_second": 1.856, "step": 480 }, { "epoch": 0.7538461538461538, "grad_norm": 2.6451804637908936, "learning_rate": 1.5076923076923078e-05, "loss": 0.5678, "step": 490 }, { "epoch": 0.7538461538461538, "eval_loss": 0.6302112340927124, "eval_runtime": 3.2461, "eval_samples_per_second": 35.119, "eval_steps_per_second": 1.848, "step": 490 }, { "epoch": 0.7692307692307693, "grad_norm": 1.2752690315246582, "learning_rate": 1.5384615384615387e-05, "loss": 0.5067, "step": 500 }, { "epoch": 0.7692307692307693, "eval_loss": 0.6335285902023315, "eval_runtime": 3.233, "eval_samples_per_second": 35.261, "eval_steps_per_second": 1.856, "step": 500 }, { "epoch": 0.7846153846153846, "grad_norm": 1.623413324356079, "learning_rate": 1.5692307692307693e-05, "loss": 0.565, "step": 510 }, { "epoch": 0.7846153846153846, "eval_loss": 0.6364890933036804, "eval_runtime": 3.2571, "eval_samples_per_second": 35.001, "eval_steps_per_second": 1.842, "step": 510 }, { "epoch": 0.8, "grad_norm": 1.4268816709518433, "learning_rate": 1.6000000000000003e-05, "loss": 0.6637, "step": 520 }, { "epoch": 0.8, "eval_loss": 0.6888372898101807, "eval_runtime": 3.2161, "eval_samples_per_second": 35.446, "eval_steps_per_second": 1.866, "step": 520 }, { "epoch": 0.8153846153846154, "grad_norm": 2.0327444076538086, "learning_rate": 1.630769230769231e-05, "loss": 0.5415, "step": 530 }, { "epoch": 0.8153846153846154, "eval_loss": 0.6402605175971985, "eval_runtime": 3.249, "eval_samples_per_second": 35.088, "eval_steps_per_second": 1.847, "step": 530 }, { "epoch": 0.8307692307692308, "grad_norm": 2.4058265686035156, "learning_rate": 1.6615384615384618e-05, "loss": 0.6001, "step": 540 }, { "epoch": 0.8307692307692308, "eval_loss": 0.6381077170372009, "eval_runtime": 3.2304, "eval_samples_per_second": 35.29, "eval_steps_per_second": 1.857, "step": 540 }, { "epoch": 0.8461538461538461, "grad_norm": 3.1266651153564453, "learning_rate": 1.6923076923076924e-05, "loss": 0.6938, "step": 550 }, { "epoch": 0.8461538461538461, "eval_loss": 0.6406418681144714, "eval_runtime": 3.2645, "eval_samples_per_second": 34.921, "eval_steps_per_second": 1.838, "step": 550 }, { "epoch": 0.8615384615384616, "grad_norm": 2.09586501121521, "learning_rate": 1.7230769230769234e-05, "loss": 0.5671, "step": 560 }, { "epoch": 0.8615384615384616, "eval_loss": 0.6451361775398254, "eval_runtime": 3.3107, "eval_samples_per_second": 34.434, "eval_steps_per_second": 1.812, "step": 560 }, { "epoch": 0.8769230769230769, "grad_norm": 3.3246548175811768, "learning_rate": 1.753846153846154e-05, "loss": 0.5231, "step": 570 }, { "epoch": 0.8769230769230769, "eval_loss": 0.6457281708717346, "eval_runtime": 3.2516, "eval_samples_per_second": 35.059, "eval_steps_per_second": 1.845, "step": 570 }, { "epoch": 0.8923076923076924, "grad_norm": 1.8420376777648926, "learning_rate": 1.784615384615385e-05, "loss": 0.6566, "step": 580 }, { "epoch": 0.8923076923076924, "eval_loss": 0.6426037549972534, "eval_runtime": 3.2569, "eval_samples_per_second": 35.003, "eval_steps_per_second": 1.842, "step": 580 }, { "epoch": 0.9076923076923077, "grad_norm": 1.7587623596191406, "learning_rate": 1.8153846153846155e-05, "loss": 0.5607, "step": 590 }, { "epoch": 0.9076923076923077, "eval_loss": 0.6446419358253479, "eval_runtime": 3.2735, "eval_samples_per_second": 34.825, "eval_steps_per_second": 1.833, "step": 590 }, { "epoch": 0.9230769230769231, "grad_norm": 2.1630938053131104, "learning_rate": 1.8461538461538465e-05, "loss": 0.7058, "step": 600 }, { "epoch": 0.9230769230769231, "eval_loss": 0.6553606986999512, "eval_runtime": 3.2562, "eval_samples_per_second": 35.01, "eval_steps_per_second": 1.843, "step": 600 }, { "epoch": 0.9384615384615385, "grad_norm": 1.9333707094192505, "learning_rate": 1.876923076923077e-05, "loss": 0.6126, "step": 610 }, { "epoch": 0.9384615384615385, "eval_loss": 0.6482510566711426, "eval_runtime": 3.2568, "eval_samples_per_second": 35.004, "eval_steps_per_second": 1.842, "step": 610 }, { "epoch": 0.9538461538461539, "grad_norm": 1.994057297706604, "learning_rate": 1.907692307692308e-05, "loss": 0.6155, "step": 620 }, { "epoch": 0.9538461538461539, "eval_loss": 0.6493787169456482, "eval_runtime": 3.2411, "eval_samples_per_second": 35.174, "eval_steps_per_second": 1.851, "step": 620 }, { "epoch": 0.9692307692307692, "grad_norm": 2.5365986824035645, "learning_rate": 1.9384615384615386e-05, "loss": 0.5934, "step": 630 }, { "epoch": 0.9692307692307692, "eval_loss": 0.6541261076927185, "eval_runtime": 3.2414, "eval_samples_per_second": 35.17, "eval_steps_per_second": 1.851, "step": 630 }, { "epoch": 0.9846153846153847, "grad_norm": 2.831939697265625, "learning_rate": 1.9692307692307696e-05, "loss": 0.4716, "step": 640 }, { "epoch": 0.9846153846153847, "eval_loss": 0.6611928343772888, "eval_runtime": 3.2346, "eval_samples_per_second": 35.244, "eval_steps_per_second": 1.855, "step": 640 }, { "epoch": 1.0, "grad_norm": 2.3626530170440674, "learning_rate": 2e-05, "loss": 0.5363, "step": 650 }, { "epoch": 1.0, "eval_loss": 0.6603513956069946, "eval_runtime": 3.238, "eval_samples_per_second": 35.206, "eval_steps_per_second": 1.853, "step": 650 }, { "epoch": 1.0153846153846153, "grad_norm": 1.5879381895065308, "learning_rate": 1.9999855802751384e-05, "loss": 0.3579, "step": 660 }, { "epoch": 1.0153846153846153, "eval_loss": 0.6978694200515747, "eval_runtime": 3.2447, "eval_samples_per_second": 35.135, "eval_steps_per_second": 1.849, "step": 660 }, { "epoch": 1.0307692307692307, "grad_norm": 1.9470375776290894, "learning_rate": 1.9999423215164105e-05, "loss": 0.3559, "step": 670 }, { "epoch": 1.0307692307692307, "eval_loss": 0.6810071468353271, "eval_runtime": 3.2514, "eval_samples_per_second": 35.062, "eval_steps_per_second": 1.845, "step": 670 }, { "epoch": 1.0461538461538462, "grad_norm": 1.9423273801803589, "learning_rate": 1.9998702249713747e-05, "loss": 0.3977, "step": 680 }, { "epoch": 1.0461538461538462, "eval_loss": 0.6764042377471924, "eval_runtime": 3.2336, "eval_samples_per_second": 35.255, "eval_steps_per_second": 1.856, "step": 680 }, { "epoch": 1.0615384615384615, "grad_norm": 2.47997784614563, "learning_rate": 1.9997692927192562e-05, "loss": 0.3449, "step": 690 }, { "epoch": 1.0615384615384615, "eval_loss": 0.6773045659065247, "eval_runtime": 3.2495, "eval_samples_per_second": 35.082, "eval_steps_per_second": 1.846, "step": 690 }, { "epoch": 1.0769230769230769, "grad_norm": 1.2528847455978394, "learning_rate": 1.9996395276708856e-05, "loss": 0.3426, "step": 700 }, { "epoch": 1.0769230769230769, "eval_loss": 0.6868980526924133, "eval_runtime": 3.2469, "eval_samples_per_second": 35.111, "eval_steps_per_second": 1.848, "step": 700 }, { "epoch": 1.0923076923076924, "grad_norm": 1.9821579456329346, "learning_rate": 1.9994809335686152e-05, "loss": 0.4387, "step": 710 }, { "epoch": 1.0923076923076924, "eval_loss": 0.6710843443870544, "eval_runtime": 3.2766, "eval_samples_per_second": 34.792, "eval_steps_per_second": 1.831, "step": 710 }, { "epoch": 1.1076923076923078, "grad_norm": 1.4338393211364746, "learning_rate": 1.9992935149862116e-05, "loss": 0.3443, "step": 720 }, { "epoch": 1.1076923076923078, "eval_loss": 0.6952248215675354, "eval_runtime": 3.2614, "eval_samples_per_second": 34.954, "eval_steps_per_second": 1.84, "step": 720 }, { "epoch": 1.123076923076923, "grad_norm": 1.1517648696899414, "learning_rate": 1.999077277328724e-05, "loss": 0.3484, "step": 730 }, { "epoch": 1.123076923076923, "eval_loss": 0.6964479684829712, "eval_runtime": 3.2521, "eval_samples_per_second": 35.054, "eval_steps_per_second": 1.845, "step": 730 }, { "epoch": 1.1384615384615384, "grad_norm": 1.650405764579773, "learning_rate": 1.998832226832327e-05, "loss": 0.4018, "step": 740 }, { "epoch": 1.1384615384615384, "eval_loss": 0.6902267932891846, "eval_runtime": 3.2586, "eval_samples_per_second": 34.984, "eval_steps_per_second": 1.841, "step": 740 }, { "epoch": 1.1538461538461537, "grad_norm": 2.2939112186431885, "learning_rate": 1.9985583705641418e-05, "loss": 0.3984, "step": 750 }, { "epoch": 1.1538461538461537, "eval_loss": 0.6953668594360352, "eval_runtime": 3.2666, "eval_samples_per_second": 34.899, "eval_steps_per_second": 1.837, "step": 750 }, { "epoch": 1.1692307692307693, "grad_norm": 1.588689923286438, "learning_rate": 1.9982557164220335e-05, "loss": 0.3423, "step": 760 }, { "epoch": 1.1692307692307693, "eval_loss": 0.6961036324501038, "eval_runtime": 3.2728, "eval_samples_per_second": 34.832, "eval_steps_per_second": 1.833, "step": 760 }, { "epoch": 1.1846153846153846, "grad_norm": 2.06250262260437, "learning_rate": 1.9979242731343803e-05, "loss": 0.2843, "step": 770 }, { "epoch": 1.1846153846153846, "eval_loss": 0.7108862400054932, "eval_runtime": 3.2657, "eval_samples_per_second": 34.908, "eval_steps_per_second": 1.837, "step": 770 }, { "epoch": 1.2, "grad_norm": 2.609130382537842, "learning_rate": 1.9975640502598243e-05, "loss": 0.3172, "step": 780 }, { "epoch": 1.2, "eval_loss": 0.6978670358657837, "eval_runtime": 3.2647, "eval_samples_per_second": 34.919, "eval_steps_per_second": 1.838, "step": 780 }, { "epoch": 1.2153846153846155, "grad_norm": 2.1612465381622314, "learning_rate": 1.9971750581869955e-05, "loss": 0.4031, "step": 790 }, { "epoch": 1.2153846153846155, "eval_loss": 0.7043502330780029, "eval_runtime": 3.2943, "eval_samples_per_second": 34.605, "eval_steps_per_second": 1.821, "step": 790 }, { "epoch": 1.2307692307692308, "grad_norm": 2.465644121170044, "learning_rate": 1.9967573081342103e-05, "loss": 0.3869, "step": 800 }, { "epoch": 1.2307692307692308, "eval_loss": 0.694877564907074, "eval_runtime": 3.2465, "eval_samples_per_second": 35.115, "eval_steps_per_second": 1.848, "step": 800 }, { "epoch": 1.2461538461538462, "grad_norm": 2.611905097961426, "learning_rate": 1.9963108121491508e-05, "loss": 0.3364, "step": 810 }, { "epoch": 1.2461538461538462, "eval_loss": 0.6959603428840637, "eval_runtime": 3.2287, "eval_samples_per_second": 35.309, "eval_steps_per_second": 1.858, "step": 810 }, { "epoch": 1.2615384615384615, "grad_norm": 2.0148117542266846, "learning_rate": 1.9958355831085155e-05, "loss": 0.3699, "step": 820 }, { "epoch": 1.2615384615384615, "eval_loss": 0.695041835308075, "eval_runtime": 3.2511, "eval_samples_per_second": 35.065, "eval_steps_per_second": 1.846, "step": 820 }, { "epoch": 1.2769230769230768, "grad_norm": 2.213994264602661, "learning_rate": 1.995331634717649e-05, "loss": 0.4101, "step": 830 }, { "epoch": 1.2769230769230768, "eval_loss": 0.6806493997573853, "eval_runtime": 3.2367, "eval_samples_per_second": 35.221, "eval_steps_per_second": 1.854, "step": 830 }, { "epoch": 1.2923076923076924, "grad_norm": 1.7559289932250977, "learning_rate": 1.9947989815101444e-05, "loss": 0.4012, "step": 840 }, { "epoch": 1.2923076923076924, "eval_loss": 0.6939857006072998, "eval_runtime": 3.2502, "eval_samples_per_second": 35.075, "eval_steps_per_second": 1.846, "step": 840 }, { "epoch": 1.3076923076923077, "grad_norm": 0.8075680136680603, "learning_rate": 1.9942376388474282e-05, "loss": 0.3202, "step": 850 }, { "epoch": 1.3076923076923077, "eval_loss": 0.7051680684089661, "eval_runtime": 3.2612, "eval_samples_per_second": 34.956, "eval_steps_per_second": 1.84, "step": 850 }, { "epoch": 1.323076923076923, "grad_norm": 1.8492660522460938, "learning_rate": 1.9936476229183133e-05, "loss": 0.3985, "step": 860 }, { "epoch": 1.323076923076923, "eval_loss": 0.6843434572219849, "eval_runtime": 3.2739, "eval_samples_per_second": 34.821, "eval_steps_per_second": 1.833, "step": 860 }, { "epoch": 1.3384615384615386, "grad_norm": 1.6737396717071533, "learning_rate": 1.9930289507385344e-05, "loss": 0.4244, "step": 870 }, { "epoch": 1.3384615384615386, "eval_loss": 0.6972203254699707, "eval_runtime": 3.2498, "eval_samples_per_second": 35.079, "eval_steps_per_second": 1.846, "step": 870 }, { "epoch": 1.353846153846154, "grad_norm": 2.5821003913879395, "learning_rate": 1.992381640150257e-05, "loss": 0.3924, "step": 880 }, { "epoch": 1.353846153846154, "eval_loss": 0.6854589581489563, "eval_runtime": 3.2524, "eval_samples_per_second": 35.051, "eval_steps_per_second": 1.845, "step": 880 }, { "epoch": 1.3692307692307693, "grad_norm": 1.248367428779602, "learning_rate": 1.9917057098215624e-05, "loss": 0.3659, "step": 890 }, { "epoch": 1.3692307692307693, "eval_loss": 0.6994220614433289, "eval_runtime": 3.2745, "eval_samples_per_second": 34.814, "eval_steps_per_second": 1.832, "step": 890 }, { "epoch": 1.3846153846153846, "grad_norm": 0.9339770674705505, "learning_rate": 1.9910011792459086e-05, "loss": 0.309, "step": 900 }, { "epoch": 1.3846153846153846, "eval_loss": 0.7046116590499878, "eval_runtime": 3.2642, "eval_samples_per_second": 34.924, "eval_steps_per_second": 1.838, "step": 900 }, { "epoch": 1.4, "grad_norm": 27.820077896118164, "learning_rate": 1.9902680687415704e-05, "loss": 0.3788, "step": 910 }, { "epoch": 1.4, "eval_loss": 0.6996982097625732, "eval_runtime": 3.2777, "eval_samples_per_second": 34.781, "eval_steps_per_second": 1.831, "step": 910 }, { "epoch": 1.4153846153846155, "grad_norm": 2.5190956592559814, "learning_rate": 1.9895063994510512e-05, "loss": 0.3372, "step": 920 }, { "epoch": 1.4153846153846155, "eval_loss": 0.7020460963249207, "eval_runtime": 3.242, "eval_samples_per_second": 35.164, "eval_steps_per_second": 1.851, "step": 920 }, { "epoch": 1.4307692307692308, "grad_norm": 1.8096344470977783, "learning_rate": 1.9887161933404743e-05, "loss": 0.3812, "step": 930 }, { "epoch": 1.4307692307692308, "eval_loss": 0.6968725323677063, "eval_runtime": 3.3598, "eval_samples_per_second": 33.93, "eval_steps_per_second": 1.786, "step": 930 }, { "epoch": 1.4461538461538461, "grad_norm": 1.8657336235046387, "learning_rate": 1.9878974731989487e-05, "loss": 0.414, "step": 940 }, { "epoch": 1.4461538461538461, "eval_loss": 0.6909111142158508, "eval_runtime": 3.2552, "eval_samples_per_second": 35.021, "eval_steps_per_second": 1.843, "step": 940 }, { "epoch": 1.4615384615384617, "grad_norm": 2.4143567085266113, "learning_rate": 1.9870502626379127e-05, "loss": 0.3813, "step": 950 }, { "epoch": 1.4615384615384617, "eval_loss": 0.6953186392784119, "eval_runtime": 3.2929, "eval_samples_per_second": 34.619, "eval_steps_per_second": 1.822, "step": 950 }, { "epoch": 1.476923076923077, "grad_norm": 2.3205642700195312, "learning_rate": 1.9861745860904538e-05, "loss": 0.3701, "step": 960 }, { "epoch": 1.476923076923077, "eval_loss": 0.695374608039856, "eval_runtime": 3.2455, "eval_samples_per_second": 35.126, "eval_steps_per_second": 1.849, "step": 960 }, { "epoch": 1.4923076923076923, "grad_norm": 1.55659818649292, "learning_rate": 1.9852704688106003e-05, "loss": 0.3437, "step": 970 }, { "epoch": 1.4923076923076923, "eval_loss": 0.7013147473335266, "eval_runtime": 3.2638, "eval_samples_per_second": 34.929, "eval_steps_per_second": 1.838, "step": 970 }, { "epoch": 1.5076923076923077, "grad_norm": 2.180811882019043, "learning_rate": 1.9843379368725978e-05, "loss": 0.3343, "step": 980 }, { "epoch": 1.5076923076923077, "eval_loss": 0.7043299078941345, "eval_runtime": 3.2392, "eval_samples_per_second": 35.194, "eval_steps_per_second": 1.852, "step": 980 }, { "epoch": 1.523076923076923, "grad_norm": 1.474899411201477, "learning_rate": 1.983377017170154e-05, "loss": 0.3601, "step": 990 }, { "epoch": 1.523076923076923, "eval_loss": 0.6996614336967468, "eval_runtime": 3.2475, "eval_samples_per_second": 35.104, "eval_steps_per_second": 1.848, "step": 990 }, { "epoch": 1.5384615384615383, "grad_norm": 1.9230364561080933, "learning_rate": 1.9823877374156647e-05, "loss": 0.3752, "step": 1000 }, { "epoch": 1.5384615384615383, "eval_loss": 0.6967916488647461, "eval_runtime": 3.2278, "eval_samples_per_second": 35.318, "eval_steps_per_second": 1.859, "step": 1000 }, { "epoch": 1.5538461538461539, "grad_norm": 1.0019750595092773, "learning_rate": 1.9813701261394136e-05, "loss": 0.3406, "step": 1010 }, { "epoch": 1.5538461538461539, "eval_loss": 0.6963152885437012, "eval_runtime": 3.2442, "eval_samples_per_second": 35.14, "eval_steps_per_second": 1.849, "step": 1010 }, { "epoch": 1.5692307692307692, "grad_norm": 1.7724684476852417, "learning_rate": 1.9803242126887496e-05, "loss": 0.4573, "step": 1020 }, { "epoch": 1.5692307692307692, "eval_loss": 0.7392306327819824, "eval_runtime": 3.2664, "eval_samples_per_second": 34.901, "eval_steps_per_second": 1.837, "step": 1020 }, { "epoch": 1.5846153846153848, "grad_norm": 1.7095409631729126, "learning_rate": 1.979250027227241e-05, "loss": 0.3882, "step": 1030 }, { "epoch": 1.5846153846153848, "eval_loss": 0.708345890045166, "eval_runtime": 3.276, "eval_samples_per_second": 34.799, "eval_steps_per_second": 1.832, "step": 1030 }, { "epoch": 1.6, "grad_norm": 1.9319413900375366, "learning_rate": 1.9781476007338058e-05, "loss": 0.3933, "step": 1040 }, { "epoch": 1.6, "eval_loss": 0.697462797164917, "eval_runtime": 3.243, "eval_samples_per_second": 35.153, "eval_steps_per_second": 1.85, "step": 1040 }, { "epoch": 1.6153846153846154, "grad_norm": 1.8260385990142822, "learning_rate": 1.977016965001817e-05, "loss": 0.427, "step": 1050 }, { "epoch": 1.6153846153846154, "eval_loss": 0.6899636387825012, "eval_runtime": 3.2481, "eval_samples_per_second": 35.098, "eval_steps_per_second": 1.847, "step": 1050 }, { "epoch": 1.6307692307692307, "grad_norm": 1.6716266870498657, "learning_rate": 1.9758581526381878e-05, "loss": 0.3436, "step": 1060 }, { "epoch": 1.6307692307692307, "eval_loss": 0.6924691200256348, "eval_runtime": 3.2562, "eval_samples_per_second": 35.01, "eval_steps_per_second": 1.843, "step": 1060 }, { "epoch": 1.646153846153846, "grad_norm": 0.8898760080337524, "learning_rate": 1.9746711970624282e-05, "loss": 0.3802, "step": 1070 }, { "epoch": 1.646153846153846, "eval_loss": 0.7017173171043396, "eval_runtime": 3.2449, "eval_samples_per_second": 35.132, "eval_steps_per_second": 1.849, "step": 1070 }, { "epoch": 1.6615384615384614, "grad_norm": 1.6189157962799072, "learning_rate": 1.973456132505684e-05, "loss": 0.3668, "step": 1080 }, { "epoch": 1.6615384615384614, "eval_loss": 0.6917209029197693, "eval_runtime": 3.2318, "eval_samples_per_second": 35.275, "eval_steps_per_second": 1.857, "step": 1080 }, { "epoch": 1.676923076923077, "grad_norm": 1.77718985080719, "learning_rate": 1.972212994009749e-05, "loss": 0.3996, "step": 1090 }, { "epoch": 1.676923076923077, "eval_loss": 0.6930002570152283, "eval_runtime": 3.2419, "eval_samples_per_second": 35.165, "eval_steps_per_second": 1.851, "step": 1090 }, { "epoch": 1.6923076923076923, "grad_norm": 1.6316412687301636, "learning_rate": 1.9709418174260523e-05, "loss": 0.4447, "step": 1100 }, { "epoch": 1.6923076923076923, "eval_loss": 0.6898515820503235, "eval_runtime": 3.2294, "eval_samples_per_second": 35.3, "eval_steps_per_second": 1.858, "step": 1100 }, { "epoch": 1.7076923076923078, "grad_norm": 1.2730180025100708, "learning_rate": 1.9696426394146278e-05, "loss": 0.4221, "step": 1110 }, { "epoch": 1.7076923076923078, "eval_loss": 0.693137526512146, "eval_runtime": 3.23, "eval_samples_per_second": 35.294, "eval_steps_per_second": 1.858, "step": 1110 }, { "epoch": 1.7230769230769232, "grad_norm": 2.3808021545410156, "learning_rate": 1.9683154974430544e-05, "loss": 0.3271, "step": 1120 }, { "epoch": 1.7230769230769232, "eval_loss": 0.6987683773040771, "eval_runtime": 3.2399, "eval_samples_per_second": 35.186, "eval_steps_per_second": 1.852, "step": 1120 }, { "epoch": 1.7384615384615385, "grad_norm": 2.563812017440796, "learning_rate": 1.9669604297853766e-05, "loss": 0.3751, "step": 1130 }, { "epoch": 1.7384615384615385, "eval_loss": 0.6946467161178589, "eval_runtime": 3.2296, "eval_samples_per_second": 35.299, "eval_steps_per_second": 1.858, "step": 1130 }, { "epoch": 1.7538461538461538, "grad_norm": 1.6859829425811768, "learning_rate": 1.965577475520999e-05, "loss": 0.3433, "step": 1140 }, { "epoch": 1.7538461538461538, "eval_loss": 0.6889815926551819, "eval_runtime": 3.2391, "eval_samples_per_second": 35.195, "eval_steps_per_second": 1.852, "step": 1140 }, { "epoch": 1.7692307692307692, "grad_norm": 1.8283382654190063, "learning_rate": 1.9641666745335626e-05, "loss": 0.419, "step": 1150 }, { "epoch": 1.7692307692307692, "eval_loss": 0.6913794875144958, "eval_runtime": 3.2283, "eval_samples_per_second": 35.313, "eval_steps_per_second": 1.859, "step": 1150 }, { "epoch": 1.7846153846153845, "grad_norm": 1.826407551765442, "learning_rate": 1.962728067509791e-05, "loss": 0.3423, "step": 1160 }, { "epoch": 1.7846153846153845, "eval_loss": 0.692046046257019, "eval_runtime": 3.2427, "eval_samples_per_second": 35.156, "eval_steps_per_second": 1.85, "step": 1160 }, { "epoch": 1.8, "grad_norm": 1.4921714067459106, "learning_rate": 1.961261695938319e-05, "loss": 0.3351, "step": 1170 }, { "epoch": 1.8, "eval_loss": 0.7031008005142212, "eval_runtime": 3.249, "eval_samples_per_second": 35.088, "eval_steps_per_second": 1.847, "step": 1170 }, { "epoch": 1.8153846153846154, "grad_norm": 2.1426408290863037, "learning_rate": 1.9597676021084962e-05, "loss": 0.3521, "step": 1180 }, { "epoch": 1.8153846153846154, "eval_loss": 0.6923142671585083, "eval_runtime": 3.2308, "eval_samples_per_second": 35.286, "eval_steps_per_second": 1.857, "step": 1180 }, { "epoch": 1.830769230769231, "grad_norm": 1.8514198064804077, "learning_rate": 1.9582458291091664e-05, "loss": 0.454, "step": 1190 }, { "epoch": 1.830769230769231, "eval_loss": 0.6877439022064209, "eval_runtime": 3.2342, "eval_samples_per_second": 35.248, "eval_steps_per_second": 1.855, "step": 1190 }, { "epoch": 1.8461538461538463, "grad_norm": 2.015425205230713, "learning_rate": 1.9566964208274254e-05, "loss": 0.3908, "step": 1200 }, { "epoch": 1.8461538461538463, "eval_loss": 0.6943904161453247, "eval_runtime": 3.2272, "eval_samples_per_second": 35.324, "eval_steps_per_second": 1.859, "step": 1200 }, { "epoch": 1.8615384615384616, "grad_norm": 1.4284974336624146, "learning_rate": 1.9551194219473552e-05, "loss": 0.3538, "step": 1210 }, { "epoch": 1.8615384615384616, "eval_loss": 0.6958539485931396, "eval_runtime": 3.2638, "eval_samples_per_second": 34.929, "eval_steps_per_second": 1.838, "step": 1210 }, { "epoch": 1.876923076923077, "grad_norm": 2.42622447013855, "learning_rate": 1.9535148779487365e-05, "loss": 0.28, "step": 1220 }, { "epoch": 1.876923076923077, "eval_loss": 0.7015026211738586, "eval_runtime": 3.2491, "eval_samples_per_second": 35.086, "eval_steps_per_second": 1.847, "step": 1220 }, { "epoch": 1.8923076923076922, "grad_norm": 1.8617641925811768, "learning_rate": 1.9518828351057345e-05, "loss": 0.4324, "step": 1230 }, { "epoch": 1.8923076923076922, "eval_loss": 0.6908020377159119, "eval_runtime": 3.2376, "eval_samples_per_second": 35.211, "eval_steps_per_second": 1.853, "step": 1230 }, { "epoch": 1.9076923076923076, "grad_norm": 1.7121613025665283, "learning_rate": 1.9502233404855672e-05, "loss": 0.3713, "step": 1240 }, { "epoch": 1.9076923076923076, "eval_loss": 0.703855574131012, "eval_runtime": 3.2355, "eval_samples_per_second": 35.234, "eval_steps_per_second": 1.854, "step": 1240 }, { "epoch": 1.9230769230769231, "grad_norm": 2.0595622062683105, "learning_rate": 1.9485364419471454e-05, "loss": 0.4456, "step": 1250 }, { "epoch": 1.9230769230769231, "eval_loss": 0.686195433139801, "eval_runtime": 3.2266, "eval_samples_per_second": 35.331, "eval_steps_per_second": 1.86, "step": 1250 }, { "epoch": 1.9384615384615385, "grad_norm": 1.9807814359664917, "learning_rate": 1.946822188139696e-05, "loss": 0.2958, "step": 1260 }, { "epoch": 1.9384615384615385, "eval_loss": 0.6912775039672852, "eval_runtime": 3.2509, "eval_samples_per_second": 35.067, "eval_steps_per_second": 1.846, "step": 1260 }, { "epoch": 1.953846153846154, "grad_norm": 1.8224427700042725, "learning_rate": 1.945080628501355e-05, "loss": 0.3876, "step": 1270 }, { "epoch": 1.953846153846154, "eval_loss": 0.6861458420753479, "eval_runtime": 3.2739, "eval_samples_per_second": 34.82, "eval_steps_per_second": 1.833, "step": 1270 }, { "epoch": 1.9692307692307693, "grad_norm": 2.107452630996704, "learning_rate": 1.9433118132577432e-05, "loss": 0.3748, "step": 1280 }, { "epoch": 1.9692307692307693, "eval_loss": 0.6867597699165344, "eval_runtime": 3.2581, "eval_samples_per_second": 34.99, "eval_steps_per_second": 1.842, "step": 1280 }, { "epoch": 1.9846153846153847, "grad_norm": 5.972170352935791, "learning_rate": 1.94151579342052e-05, "loss": 0.4297, "step": 1290 }, { "epoch": 1.9846153846153847, "eval_loss": 1.595029354095459, "eval_runtime": 3.2512, "eval_samples_per_second": 35.064, "eval_steps_per_second": 1.845, "step": 1290 }, { "epoch": 2.0, "grad_norm": 1.460162878036499, "learning_rate": 1.9396926207859085e-05, "loss": 0.6569, "step": 1300 }, { "epoch": 2.0, "eval_loss": 0.7044022083282471, "eval_runtime": 3.2342, "eval_samples_per_second": 35.248, "eval_steps_per_second": 1.855, "step": 1300 }, { "epoch": 2.0153846153846153, "grad_norm": 1.5906578302383423, "learning_rate": 1.9378423479332045e-05, "loss": 0.2524, "step": 1310 }, { "epoch": 2.0153846153846153, "eval_loss": 0.7527978420257568, "eval_runtime": 3.2369, "eval_samples_per_second": 35.218, "eval_steps_per_second": 1.854, "step": 1310 }, { "epoch": 2.0307692307692307, "grad_norm": 1.2187044620513916, "learning_rate": 1.935965028223259e-05, "loss": 0.1678, "step": 1320 }, { "epoch": 2.0307692307692307, "eval_loss": 0.7567086219787598, "eval_runtime": 3.2521, "eval_samples_per_second": 35.054, "eval_steps_per_second": 1.845, "step": 1320 }, { "epoch": 2.046153846153846, "grad_norm": 0.984000563621521, "learning_rate": 1.9340607157969393e-05, "loss": 0.219, "step": 1330 }, { "epoch": 2.046153846153846, "eval_loss": 0.7520028948783875, "eval_runtime": 3.2487, "eval_samples_per_second": 35.091, "eval_steps_per_second": 1.847, "step": 1330 }, { "epoch": 2.0615384615384613, "grad_norm": 1.2751810550689697, "learning_rate": 1.932129465573568e-05, "loss": 0.2095, "step": 1340 }, { "epoch": 2.0615384615384613, "eval_loss": 0.739636242389679, "eval_runtime": 3.2269, "eval_samples_per_second": 35.328, "eval_steps_per_second": 1.859, "step": 1340 }, { "epoch": 2.076923076923077, "grad_norm": 1.4136130809783936, "learning_rate": 1.9301713332493386e-05, "loss": 0.1668, "step": 1350 }, { "epoch": 2.076923076923077, "eval_loss": 0.7536832690238953, "eval_runtime": 3.2417, "eval_samples_per_second": 35.167, "eval_steps_per_second": 1.851, "step": 1350 }, { "epoch": 2.0923076923076924, "grad_norm": 1.7586925029754639, "learning_rate": 1.9281863752957095e-05, "loss": 0.2118, "step": 1360 }, { "epoch": 2.0923076923076924, "eval_loss": 0.7858787775039673, "eval_runtime": 3.2321, "eval_samples_per_second": 35.271, "eval_steps_per_second": 1.856, "step": 1360 }, { "epoch": 2.1076923076923078, "grad_norm": 1.9954192638397217, "learning_rate": 1.9261746489577767e-05, "loss": 0.1911, "step": 1370 }, { "epoch": 2.1076923076923078, "eval_loss": 0.7683790922164917, "eval_runtime": 3.2187, "eval_samples_per_second": 35.418, "eval_steps_per_second": 1.864, "step": 1370 }, { "epoch": 2.123076923076923, "grad_norm": 1.311628818511963, "learning_rate": 1.92413621225262e-05, "loss": 0.1894, "step": 1380 }, { "epoch": 2.123076923076923, "eval_loss": 0.8233888745307922, "eval_runtime": 3.2593, "eval_samples_per_second": 34.977, "eval_steps_per_second": 1.841, "step": 1380 }, { "epoch": 2.1384615384615384, "grad_norm": 1.3510133028030396, "learning_rate": 1.9220711239676325e-05, "loss": 0.2001, "step": 1390 }, { "epoch": 2.1384615384615384, "eval_loss": 0.7833328247070312, "eval_runtime": 3.2388, "eval_samples_per_second": 35.198, "eval_steps_per_second": 1.853, "step": 1390 }, { "epoch": 2.1538461538461537, "grad_norm": 1.1844192743301392, "learning_rate": 1.9199794436588244e-05, "loss": 0.2078, "step": 1400 }, { "epoch": 2.1538461538461537, "eval_loss": 0.7819744348526001, "eval_runtime": 3.2322, "eval_samples_per_second": 35.27, "eval_steps_per_second": 1.856, "step": 1400 }, { "epoch": 2.169230769230769, "grad_norm": 1.4540330171585083, "learning_rate": 1.917861231649104e-05, "loss": 0.2046, "step": 1410 }, { "epoch": 2.169230769230769, "eval_loss": 0.7777317762374878, "eval_runtime": 3.3497, "eval_samples_per_second": 34.033, "eval_steps_per_second": 1.791, "step": 1410 }, { "epoch": 2.184615384615385, "grad_norm": 2.100379467010498, "learning_rate": 1.915716549026541e-05, "loss": 0.2192, "step": 1420 }, { "epoch": 2.184615384615385, "eval_loss": 0.7560202479362488, "eval_runtime": 3.2536, "eval_samples_per_second": 35.038, "eval_steps_per_second": 1.844, "step": 1420 }, { "epoch": 2.2, "grad_norm": 1.1869295835494995, "learning_rate": 1.913545457642601e-05, "loss": 0.2055, "step": 1430 }, { "epoch": 2.2, "eval_loss": 0.7658796906471252, "eval_runtime": 3.2561, "eval_samples_per_second": 35.011, "eval_steps_per_second": 1.843, "step": 1430 }, { "epoch": 2.2153846153846155, "grad_norm": 1.143850564956665, "learning_rate": 1.9113480201103658e-05, "loss": 0.2003, "step": 1440 }, { "epoch": 2.2153846153846155, "eval_loss": 0.7692248821258545, "eval_runtime": 3.2573, "eval_samples_per_second": 34.998, "eval_steps_per_second": 1.842, "step": 1440 }, { "epoch": 2.230769230769231, "grad_norm": 1.4219717979431152, "learning_rate": 1.909124299802724e-05, "loss": 0.1882, "step": 1450 }, { "epoch": 2.230769230769231, "eval_loss": 0.7854686975479126, "eval_runtime": 3.2376, "eval_samples_per_second": 35.211, "eval_steps_per_second": 1.853, "step": 1450 }, { "epoch": 2.246153846153846, "grad_norm": 1.727668046951294, "learning_rate": 1.9068743608505454e-05, "loss": 0.2081, "step": 1460 }, { "epoch": 2.246153846153846, "eval_loss": 0.7837368249893188, "eval_runtime": 3.2688, "eval_samples_per_second": 34.875, "eval_steps_per_second": 1.836, "step": 1460 }, { "epoch": 2.2615384615384615, "grad_norm": 1.9428791999816895, "learning_rate": 1.9045982681408324e-05, "loss": 0.2031, "step": 1470 }, { "epoch": 2.2615384615384615, "eval_loss": 0.7683539390563965, "eval_runtime": 3.2316, "eval_samples_per_second": 35.277, "eval_steps_per_second": 1.857, "step": 1470 }, { "epoch": 2.276923076923077, "grad_norm": 1.5166252851486206, "learning_rate": 1.902296087314845e-05, "loss": 0.1919, "step": 1480 }, { "epoch": 2.276923076923077, "eval_loss": 0.7894486784934998, "eval_runtime": 3.2275, "eval_samples_per_second": 35.321, "eval_steps_per_second": 1.859, "step": 1480 }, { "epoch": 2.292307692307692, "grad_norm": 1.368630290031433, "learning_rate": 1.8999678847662124e-05, "loss": 0.1998, "step": 1490 }, { "epoch": 2.292307692307692, "eval_loss": 0.7855644226074219, "eval_runtime": 3.2357, "eval_samples_per_second": 35.232, "eval_steps_per_second": 1.854, "step": 1490 }, { "epoch": 2.3076923076923075, "grad_norm": 0.9620829820632935, "learning_rate": 1.8976137276390145e-05, "loss": 0.2139, "step": 1500 }, { "epoch": 2.3076923076923075, "eval_loss": 0.797519326210022, "eval_runtime": 3.2212, "eval_samples_per_second": 35.39, "eval_steps_per_second": 1.863, "step": 1500 }, { "epoch": 2.3230769230769233, "grad_norm": 1.0639945268630981, "learning_rate": 1.895233683825847e-05, "loss": 0.2164, "step": 1510 }, { "epoch": 2.3230769230769233, "eval_loss": 0.7683231234550476, "eval_runtime": 3.2416, "eval_samples_per_second": 35.168, "eval_steps_per_second": 1.851, "step": 1510 }, { "epoch": 2.3384615384615386, "grad_norm": 2.229300022125244, "learning_rate": 1.892827821965864e-05, "loss": 0.188, "step": 1520 }, { "epoch": 2.3384615384615386, "eval_loss": 0.7734756469726562, "eval_runtime": 3.239, "eval_samples_per_second": 35.196, "eval_steps_per_second": 1.852, "step": 1520 }, { "epoch": 2.353846153846154, "grad_norm": 1.2442930936813354, "learning_rate": 1.8903962114427985e-05, "loss": 0.1762, "step": 1530 }, { "epoch": 2.353846153846154, "eval_loss": 0.7807677984237671, "eval_runtime": 3.2321, "eval_samples_per_second": 35.272, "eval_steps_per_second": 1.856, "step": 1530 }, { "epoch": 2.3692307692307693, "grad_norm": 0.7546485066413879, "learning_rate": 1.8879389223829592e-05, "loss": 0.1933, "step": 1540 }, { "epoch": 2.3692307692307693, "eval_loss": 0.7788336277008057, "eval_runtime": 3.234, "eval_samples_per_second": 35.25, "eval_steps_per_second": 1.855, "step": 1540 }, { "epoch": 2.3846153846153846, "grad_norm": 1.6050472259521484, "learning_rate": 1.8854560256532098e-05, "loss": 0.2, "step": 1550 }, { "epoch": 2.3846153846153846, "eval_loss": 0.7777507305145264, "eval_runtime": 3.2303, "eval_samples_per_second": 35.291, "eval_steps_per_second": 1.857, "step": 1550 }, { "epoch": 2.4, "grad_norm": 1.6613671779632568, "learning_rate": 1.8829475928589272e-05, "loss": 0.1959, "step": 1560 }, { "epoch": 2.4, "eval_loss": 0.7840877175331116, "eval_runtime": 3.2313, "eval_samples_per_second": 35.28, "eval_steps_per_second": 1.857, "step": 1560 }, { "epoch": 2.4153846153846152, "grad_norm": 1.127969741821289, "learning_rate": 1.8804136963419316e-05, "loss": 0.1791, "step": 1570 }, { "epoch": 2.4153846153846152, "eval_loss": 0.787642776966095, "eval_runtime": 3.2339, "eval_samples_per_second": 35.251, "eval_steps_per_second": 1.855, "step": 1570 }, { "epoch": 2.430769230769231, "grad_norm": 1.0740890502929688, "learning_rate": 1.8778544091784047e-05, "loss": 0.1952, "step": 1580 }, { "epoch": 2.430769230769231, "eval_loss": 0.7895064949989319, "eval_runtime": 3.2507, "eval_samples_per_second": 35.069, "eval_steps_per_second": 1.846, "step": 1580 }, { "epoch": 2.4461538461538463, "grad_norm": 1.3111459016799927, "learning_rate": 1.87526980517678e-05, "loss": 0.2019, "step": 1590 }, { "epoch": 2.4461538461538463, "eval_loss": 0.7794804573059082, "eval_runtime": 3.2322, "eval_samples_per_second": 35.27, "eval_steps_per_second": 1.856, "step": 1590 }, { "epoch": 2.4615384615384617, "grad_norm": 1.7549346685409546, "learning_rate": 1.8726599588756144e-05, "loss": 0.1857, "step": 1600 }, { "epoch": 2.4615384615384617, "eval_loss": 0.7962229251861572, "eval_runtime": 3.2309, "eval_samples_per_second": 35.285, "eval_steps_per_second": 1.857, "step": 1600 }, { "epoch": 2.476923076923077, "grad_norm": 1.6596492528915405, "learning_rate": 1.8700249455414394e-05, "loss": 0.2058, "step": 1610 }, { "epoch": 2.476923076923077, "eval_loss": 0.785554051399231, "eval_runtime": 3.2375, "eval_samples_per_second": 35.212, "eval_steps_per_second": 1.853, "step": 1610 }, { "epoch": 2.4923076923076923, "grad_norm": 1.5621322393417358, "learning_rate": 1.8673648411665895e-05, "loss": 0.1946, "step": 1620 }, { "epoch": 2.4923076923076923, "eval_loss": 0.7949020266532898, "eval_runtime": 3.2269, "eval_samples_per_second": 35.328, "eval_steps_per_second": 1.859, "step": 1620 }, { "epoch": 2.5076923076923077, "grad_norm": 2.000927686691284, "learning_rate": 1.864679722467011e-05, "loss": 0.1984, "step": 1630 }, { "epoch": 2.5076923076923077, "eval_loss": 0.791332483291626, "eval_runtime": 3.2389, "eval_samples_per_second": 35.197, "eval_steps_per_second": 1.852, "step": 1630 }, { "epoch": 2.523076923076923, "grad_norm": 1.7056845426559448, "learning_rate": 1.8619696668800494e-05, "loss": 0.2212, "step": 1640 }, { "epoch": 2.523076923076923, "eval_loss": 0.7772064805030823, "eval_runtime": 3.2441, "eval_samples_per_second": 35.14, "eval_steps_per_second": 1.849, "step": 1640 }, { "epoch": 2.5384615384615383, "grad_norm": 1.5976656675338745, "learning_rate": 1.859234752562217e-05, "loss": 0.1901, "step": 1650 }, { "epoch": 2.5384615384615383, "eval_loss": 0.7850207686424255, "eval_runtime": 3.2221, "eval_samples_per_second": 35.38, "eval_steps_per_second": 1.862, "step": 1650 }, { "epoch": 2.5538461538461537, "grad_norm": 1.0322597026824951, "learning_rate": 1.8564750583869374e-05, "loss": 0.2185, "step": 1660 }, { "epoch": 2.5538461538461537, "eval_loss": 0.7930358648300171, "eval_runtime": 3.2427, "eval_samples_per_second": 35.156, "eval_steps_per_second": 1.85, "step": 1660 }, { "epoch": 2.569230769230769, "grad_norm": 1.1539405584335327, "learning_rate": 1.8536906639422724e-05, "loss": 0.2056, "step": 1670 }, { "epoch": 2.569230769230769, "eval_loss": 0.7705276012420654, "eval_runtime": 3.2511, "eval_samples_per_second": 35.065, "eval_steps_per_second": 1.846, "step": 1670 }, { "epoch": 2.5846153846153848, "grad_norm": 1.2852847576141357, "learning_rate": 1.850881649528625e-05, "loss": 0.2031, "step": 1680 }, { "epoch": 2.5846153846153848, "eval_loss": 0.7809199094772339, "eval_runtime": 3.2419, "eval_samples_per_second": 35.164, "eval_steps_per_second": 1.851, "step": 1680 }, { "epoch": 2.6, "grad_norm": 2.8470299243927, "learning_rate": 1.848048096156426e-05, "loss": 0.207, "step": 1690 }, { "epoch": 2.6, "eval_loss": 0.7837203145027161, "eval_runtime": 3.2437, "eval_samples_per_second": 35.145, "eval_steps_per_second": 1.85, "step": 1690 }, { "epoch": 2.6153846153846154, "grad_norm": 1.169309139251709, "learning_rate": 1.845190085543795e-05, "loss": 0.1924, "step": 1700 }, { "epoch": 2.6153846153846154, "eval_loss": 0.8024268746376038, "eval_runtime": 3.2426, "eval_samples_per_second": 35.157, "eval_steps_per_second": 1.85, "step": 1700 }, { "epoch": 2.6307692307692307, "grad_norm": 1.3079050779342651, "learning_rate": 1.8423077001141848e-05, "loss": 0.2111, "step": 1710 }, { "epoch": 2.6307692307692307, "eval_loss": 0.7842855453491211, "eval_runtime": 3.2512, "eval_samples_per_second": 35.064, "eval_steps_per_second": 1.845, "step": 1710 }, { "epoch": 2.646153846153846, "grad_norm": 1.5863689184188843, "learning_rate": 1.839401022994006e-05, "loss": 0.2039, "step": 1720 }, { "epoch": 2.646153846153846, "eval_loss": 0.7856019735336304, "eval_runtime": 3.2563, "eval_samples_per_second": 35.009, "eval_steps_per_second": 1.843, "step": 1720 }, { "epoch": 2.6615384615384614, "grad_norm": 1.1605026721954346, "learning_rate": 1.8364701380102267e-05, "loss": 0.2183, "step": 1730 }, { "epoch": 2.6615384615384614, "eval_loss": 0.7758111953735352, "eval_runtime": 3.231, "eval_samples_per_second": 35.283, "eval_steps_per_second": 1.857, "step": 1730 }, { "epoch": 2.676923076923077, "grad_norm": 1.4007433652877808, "learning_rate": 1.8335151296879576e-05, "loss": 0.2054, "step": 1740 }, { "epoch": 2.676923076923077, "eval_loss": 0.7833234071731567, "eval_runtime": 3.2356, "eval_samples_per_second": 35.233, "eval_steps_per_second": 1.854, "step": 1740 }, { "epoch": 2.6923076923076925, "grad_norm": 1.3966948986053467, "learning_rate": 1.8305360832480118e-05, "loss": 0.1974, "step": 1750 }, { "epoch": 2.6923076923076925, "eval_loss": 0.7975159287452698, "eval_runtime": 3.2293, "eval_samples_per_second": 35.302, "eval_steps_per_second": 1.858, "step": 1750 }, { "epoch": 2.707692307692308, "grad_norm": 1.2282441854476929, "learning_rate": 1.82753308460445e-05, "loss": 0.2114, "step": 1760 }, { "epoch": 2.707692307692308, "eval_loss": 0.8000977039337158, "eval_runtime": 3.226, "eval_samples_per_second": 35.337, "eval_steps_per_second": 1.86, "step": 1760 }, { "epoch": 2.723076923076923, "grad_norm": 1.7516143321990967, "learning_rate": 1.8245062203621003e-05, "loss": 0.2081, "step": 1770 }, { "epoch": 2.723076923076923, "eval_loss": 0.7978941202163696, "eval_runtime": 3.2435, "eval_samples_per_second": 35.147, "eval_steps_per_second": 1.85, "step": 1770 }, { "epoch": 2.7384615384615385, "grad_norm": 1.6751377582550049, "learning_rate": 1.821455577814062e-05, "loss": 0.2013, "step": 1780 }, { "epoch": 2.7384615384615385, "eval_loss": 0.7863066792488098, "eval_runtime": 3.2507, "eval_samples_per_second": 35.07, "eval_steps_per_second": 1.846, "step": 1780 }, { "epoch": 2.753846153846154, "grad_norm": 1.3899345397949219, "learning_rate": 1.818381244939187e-05, "loss": 0.206, "step": 1790 }, { "epoch": 2.753846153846154, "eval_loss": 0.7733153104782104, "eval_runtime": 3.2514, "eval_samples_per_second": 35.062, "eval_steps_per_second": 1.845, "step": 1790 }, { "epoch": 2.769230769230769, "grad_norm": 1.6483854055404663, "learning_rate": 1.8152833103995443e-05, "loss": 0.1979, "step": 1800 }, { "epoch": 2.769230769230769, "eval_loss": 0.7838578224182129, "eval_runtime": 3.2273, "eval_samples_per_second": 35.323, "eval_steps_per_second": 1.859, "step": 1800 }, { "epoch": 2.7846153846153845, "grad_norm": 1.4585682153701782, "learning_rate": 1.8121618635378616e-05, "loss": 0.2093, "step": 1810 }, { "epoch": 2.7846153846153845, "eval_loss": 0.7677554488182068, "eval_runtime": 3.231, "eval_samples_per_second": 35.284, "eval_steps_per_second": 1.857, "step": 1810 }, { "epoch": 2.8, "grad_norm": 1.3923680782318115, "learning_rate": 1.8090169943749477e-05, "loss": 0.2036, "step": 1820 }, { "epoch": 2.8, "eval_loss": 0.7729052901268005, "eval_runtime": 3.2487, "eval_samples_per_second": 35.091, "eval_steps_per_second": 1.847, "step": 1820 }, { "epoch": 2.815384615384615, "grad_norm": 1.233302354812622, "learning_rate": 1.8058487936070992e-05, "loss": 0.1931, "step": 1830 }, { "epoch": 2.815384615384615, "eval_loss": 0.7708905935287476, "eval_runtime": 3.2127, "eval_samples_per_second": 35.484, "eval_steps_per_second": 1.868, "step": 1830 }, { "epoch": 2.830769230769231, "grad_norm": 1.4429056644439697, "learning_rate": 1.802657352603483e-05, "loss": 0.1929, "step": 1840 }, { "epoch": 2.830769230769231, "eval_loss": 0.7802720069885254, "eval_runtime": 3.2503, "eval_samples_per_second": 35.074, "eval_steps_per_second": 1.846, "step": 1840 }, { "epoch": 2.8461538461538463, "grad_norm": 2.0769877433776855, "learning_rate": 1.7994427634035016e-05, "loss": 0.226, "step": 1850 }, { "epoch": 2.8461538461538463, "eval_loss": 0.766547679901123, "eval_runtime": 3.2413, "eval_samples_per_second": 35.171, "eval_steps_per_second": 1.851, "step": 1850 }, { "epoch": 2.8615384615384616, "grad_norm": 1.845153570175171, "learning_rate": 1.7962051187141377e-05, "loss": 0.2257, "step": 1860 }, { "epoch": 2.8615384615384616, "eval_loss": 0.7760981321334839, "eval_runtime": 3.2416, "eval_samples_per_second": 35.168, "eval_steps_per_second": 1.851, "step": 1860 }, { "epoch": 2.876923076923077, "grad_norm": 1.8381919860839844, "learning_rate": 1.7929445119072837e-05, "loss": 0.2193, "step": 1870 }, { "epoch": 2.876923076923077, "eval_loss": 0.7926127314567566, "eval_runtime": 3.2889, "eval_samples_per_second": 34.662, "eval_steps_per_second": 1.824, "step": 1870 }, { "epoch": 2.8923076923076922, "grad_norm": 1.7760707139968872, "learning_rate": 1.7896610370170452e-05, "loss": 0.2085, "step": 1880 }, { "epoch": 2.8923076923076922, "eval_loss": 0.7870352268218994, "eval_runtime": 3.2379, "eval_samples_per_second": 35.208, "eval_steps_per_second": 1.853, "step": 1880 }, { "epoch": 2.9076923076923076, "grad_norm": 1.2421387434005737, "learning_rate": 1.786354788737031e-05, "loss": 0.2374, "step": 1890 }, { "epoch": 2.9076923076923076, "eval_loss": 0.7905800342559814, "eval_runtime": 3.2268, "eval_samples_per_second": 35.33, "eval_steps_per_second": 1.859, "step": 1890 }, { "epoch": 2.9230769230769234, "grad_norm": 1.8296164274215698, "learning_rate": 1.7830258624176224e-05, "loss": 0.1788, "step": 1900 }, { "epoch": 2.9230769230769234, "eval_loss": 0.7861989736557007, "eval_runtime": 3.2405, "eval_samples_per_second": 35.18, "eval_steps_per_second": 1.852, "step": 1900 }, { "epoch": 2.9384615384615387, "grad_norm": 1.577077865600586, "learning_rate": 1.7796743540632226e-05, "loss": 0.2296, "step": 1910 }, { "epoch": 2.9384615384615387, "eval_loss": 0.775193452835083, "eval_runtime": 3.2468, "eval_samples_per_second": 35.111, "eval_steps_per_second": 1.848, "step": 1910 }, { "epoch": 2.953846153846154, "grad_norm": 1.630001187324524, "learning_rate": 1.776300360329488e-05, "loss": 0.2115, "step": 1920 }, { "epoch": 2.953846153846154, "eval_loss": 0.7809256911277771, "eval_runtime": 3.2326, "eval_samples_per_second": 35.266, "eval_steps_per_second": 1.856, "step": 1920 }, { "epoch": 2.9692307692307693, "grad_norm": 1.5851411819458008, "learning_rate": 1.772903978520542e-05, "loss": 0.1967, "step": 1930 }, { "epoch": 2.9692307692307693, "eval_loss": 0.7896639108657837, "eval_runtime": 3.2497, "eval_samples_per_second": 35.08, "eval_steps_per_second": 1.846, "step": 1930 }, { "epoch": 2.9846153846153847, "grad_norm": 1.3122369050979614, "learning_rate": 1.769485306586166e-05, "loss": 0.2159, "step": 1940 }, { "epoch": 2.9846153846153847, "eval_loss": 0.777010977268219, "eval_runtime": 3.2294, "eval_samples_per_second": 35.3, "eval_steps_per_second": 1.858, "step": 1940 }, { "epoch": 3.0, "grad_norm": 1.2341620922088623, "learning_rate": 1.766044443118978e-05, "loss": 0.1962, "step": 1950 }, { "epoch": 3.0, "eval_loss": 0.7913311123847961, "eval_runtime": 3.226, "eval_samples_per_second": 35.338, "eval_steps_per_second": 1.86, "step": 1950 }, { "epoch": 3.0153846153846153, "grad_norm": 1.1695995330810547, "learning_rate": 1.762581487351587e-05, "loss": 0.1231, "step": 1960 }, { "epoch": 3.0153846153846153, "eval_loss": 0.8296219706535339, "eval_runtime": 3.2283, "eval_samples_per_second": 35.313, "eval_steps_per_second": 1.859, "step": 1960 }, { "epoch": 3.0307692307692307, "grad_norm": 1.0905882120132446, "learning_rate": 1.7590965391537316e-05, "loss": 0.1028, "step": 1970 }, { "epoch": 3.0307692307692307, "eval_loss": 0.8419015407562256, "eval_runtime": 3.2463, "eval_samples_per_second": 35.117, "eval_steps_per_second": 1.848, "step": 1970 }, { "epoch": 3.046153846153846, "grad_norm": 0.9968711137771606, "learning_rate": 1.7555896990294003e-05, "loss": 0.116, "step": 1980 }, { "epoch": 3.046153846153846, "eval_loss": 0.8519408106803894, "eval_runtime": 3.2373, "eval_samples_per_second": 35.214, "eval_steps_per_second": 1.853, "step": 1980 }, { "epoch": 3.0615384615384613, "grad_norm": 1.8941599130630493, "learning_rate": 1.7520610681139322e-05, "loss": 0.1195, "step": 1990 }, { "epoch": 3.0615384615384613, "eval_loss": 0.8432408571243286, "eval_runtime": 3.2477, "eval_samples_per_second": 35.102, "eval_steps_per_second": 1.847, "step": 1990 }, { "epoch": 3.076923076923077, "grad_norm": 1.5090588331222534, "learning_rate": 1.7485107481711014e-05, "loss": 0.1141, "step": 2000 }, { "epoch": 3.076923076923077, "eval_loss": 0.8407796025276184, "eval_runtime": 3.2492, "eval_samples_per_second": 35.085, "eval_steps_per_second": 1.847, "step": 2000 } ], "logging_steps": 10, "max_steps": 6500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 1.3637570942048666e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }