|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.076923076923077, |
|
"eval_steps": 10, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015384615384615385, |
|
"grad_norm": 10.082767486572266, |
|
"learning_rate": 3.0769230769230774e-07, |
|
"loss": 2.2325, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015384615384615385, |
|
"eval_loss": 1.8899363279342651, |
|
"eval_runtime": 3.2461, |
|
"eval_samples_per_second": 35.119, |
|
"eval_steps_per_second": 1.848, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 11.988937377929688, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 1.8704, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 1.7930843830108643, |
|
"eval_runtime": 3.2368, |
|
"eval_samples_per_second": 35.22, |
|
"eval_steps_per_second": 1.854, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.046153846153846156, |
|
"grad_norm": 12.660626411437988, |
|
"learning_rate": 9.230769230769232e-07, |
|
"loss": 1.5485, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.046153846153846156, |
|
"eval_loss": 1.3351986408233643, |
|
"eval_runtime": 3.2549, |
|
"eval_samples_per_second": 35.024, |
|
"eval_steps_per_second": 1.843, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 5.869946002960205, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 1.1465, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"eval_loss": 0.9145882725715637, |
|
"eval_runtime": 3.2573, |
|
"eval_samples_per_second": 34.998, |
|
"eval_steps_per_second": 1.842, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 5.386349201202393, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.7239, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"eval_loss": 0.7560557126998901, |
|
"eval_runtime": 3.2454, |
|
"eval_samples_per_second": 35.126, |
|
"eval_steps_per_second": 1.849, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 13.984370231628418, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 0.7261, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"eval_loss": 0.7075957655906677, |
|
"eval_runtime": 3.2359, |
|
"eval_samples_per_second": 35.229, |
|
"eval_steps_per_second": 1.854, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1076923076923077, |
|
"grad_norm": 12.205921173095703, |
|
"learning_rate": 2.153846153846154e-06, |
|
"loss": 0.6461, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1076923076923077, |
|
"eval_loss": 0.6797388792037964, |
|
"eval_runtime": 3.2439, |
|
"eval_samples_per_second": 35.143, |
|
"eval_steps_per_second": 1.85, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 8.89425277709961, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.5958, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"eval_loss": 0.6671402454376221, |
|
"eval_runtime": 3.2181, |
|
"eval_samples_per_second": 35.425, |
|
"eval_steps_per_second": 1.864, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13846153846153847, |
|
"grad_norm": 19.930360794067383, |
|
"learning_rate": 2.7692307692307697e-06, |
|
"loss": 0.6231, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13846153846153847, |
|
"eval_loss": 0.650134265422821, |
|
"eval_runtime": 3.2427, |
|
"eval_samples_per_second": 35.156, |
|
"eval_steps_per_second": 1.85, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 9.43468189239502, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.6271, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"eval_loss": 0.6546280384063721, |
|
"eval_runtime": 3.2567, |
|
"eval_samples_per_second": 35.005, |
|
"eval_steps_per_second": 1.842, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16923076923076924, |
|
"grad_norm": 9.599658012390137, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.5577, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16923076923076924, |
|
"eval_loss": 0.6422574520111084, |
|
"eval_runtime": 3.2521, |
|
"eval_samples_per_second": 35.054, |
|
"eval_steps_per_second": 1.845, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 10.455623626708984, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.5193, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"eval_loss": 0.6282245516777039, |
|
"eval_runtime": 3.2576, |
|
"eval_samples_per_second": 34.996, |
|
"eval_steps_per_second": 1.842, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.3797407150268555, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.5225, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.6316070556640625, |
|
"eval_runtime": 3.2288, |
|
"eval_samples_per_second": 35.307, |
|
"eval_steps_per_second": 1.858, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 2.727918863296509, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.5738, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"eval_loss": 0.6229422688484192, |
|
"eval_runtime": 3.2458, |
|
"eval_samples_per_second": 35.122, |
|
"eval_steps_per_second": 1.849, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 43.989776611328125, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.5529, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"eval_loss": 0.6122006773948669, |
|
"eval_runtime": 3.2647, |
|
"eval_samples_per_second": 34.919, |
|
"eval_steps_per_second": 1.838, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 4.098404407501221, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.519, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"eval_loss": 0.6218054294586182, |
|
"eval_runtime": 3.2453, |
|
"eval_samples_per_second": 35.128, |
|
"eval_steps_per_second": 1.849, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.26153846153846155, |
|
"grad_norm": 2.585014820098877, |
|
"learning_rate": 5.230769230769232e-06, |
|
"loss": 0.5713, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.26153846153846155, |
|
"eval_loss": 0.6205213069915771, |
|
"eval_runtime": 3.2516, |
|
"eval_samples_per_second": 35.059, |
|
"eval_steps_per_second": 1.845, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 2.8192200660705566, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.5355, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 0.6130949258804321, |
|
"eval_runtime": 3.2451, |
|
"eval_samples_per_second": 35.13, |
|
"eval_steps_per_second": 1.849, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2923076923076923, |
|
"grad_norm": 4.156994819641113, |
|
"learning_rate": 5.846153846153847e-06, |
|
"loss": 0.4487, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2923076923076923, |
|
"eval_loss": 0.6160268783569336, |
|
"eval_runtime": 3.2251, |
|
"eval_samples_per_second": 35.348, |
|
"eval_steps_per_second": 1.86, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 3.249791383743286, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.5708, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"eval_loss": 0.6122933626174927, |
|
"eval_runtime": 3.2483, |
|
"eval_samples_per_second": 35.095, |
|
"eval_steps_per_second": 1.847, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3230769230769231, |
|
"grad_norm": 3.2924177646636963, |
|
"learning_rate": 6.461538461538463e-06, |
|
"loss": 0.5607, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3230769230769231, |
|
"eval_loss": 0.6078358888626099, |
|
"eval_runtime": 3.2453, |
|
"eval_samples_per_second": 35.128, |
|
"eval_steps_per_second": 1.849, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 3.444439649581909, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.5026, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"eval_loss": 0.6209812760353088, |
|
"eval_runtime": 3.2344, |
|
"eval_samples_per_second": 35.246, |
|
"eval_steps_per_second": 1.855, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.35384615384615387, |
|
"grad_norm": 2.5516700744628906, |
|
"learning_rate": 7.076923076923078e-06, |
|
"loss": 0.4938, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.35384615384615387, |
|
"eval_loss": 0.6547431349754333, |
|
"eval_runtime": 3.2496, |
|
"eval_samples_per_second": 35.081, |
|
"eval_steps_per_second": 1.846, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 25.815881729125977, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.5766, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"eval_loss": 0.6063669919967651, |
|
"eval_runtime": 3.2779, |
|
"eval_samples_per_second": 34.778, |
|
"eval_steps_per_second": 1.83, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 2.2970821857452393, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.5924, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"eval_loss": 0.6134644746780396, |
|
"eval_runtime": 3.2301, |
|
"eval_samples_per_second": 35.293, |
|
"eval_steps_per_second": 1.858, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.2378745079040527, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5715, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6102377772331238, |
|
"eval_runtime": 3.2286, |
|
"eval_samples_per_second": 35.309, |
|
"eval_steps_per_second": 1.858, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4153846153846154, |
|
"grad_norm": 2.2380335330963135, |
|
"learning_rate": 8.307692307692309e-06, |
|
"loss": 0.5491, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4153846153846154, |
|
"eval_loss": 0.6195328831672668, |
|
"eval_runtime": 3.2393, |
|
"eval_samples_per_second": 35.193, |
|
"eval_steps_per_second": 1.852, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 3.0623953342437744, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.6217, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"eval_loss": 0.6168036460876465, |
|
"eval_runtime": 3.2516, |
|
"eval_samples_per_second": 35.06, |
|
"eval_steps_per_second": 1.845, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4461538461538462, |
|
"grad_norm": 2.4642109870910645, |
|
"learning_rate": 8.923076923076925e-06, |
|
"loss": 0.4625, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4461538461538462, |
|
"eval_loss": 0.6237349510192871, |
|
"eval_runtime": 3.243, |
|
"eval_samples_per_second": 35.153, |
|
"eval_steps_per_second": 1.85, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 2.919198751449585, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.554, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"eval_loss": 0.6107349991798401, |
|
"eval_runtime": 3.2321, |
|
"eval_samples_per_second": 35.271, |
|
"eval_steps_per_second": 1.856, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.47692307692307695, |
|
"grad_norm": 2.186372756958008, |
|
"learning_rate": 9.53846153846154e-06, |
|
"loss": 0.4941, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.47692307692307695, |
|
"eval_loss": 0.614315390586853, |
|
"eval_runtime": 3.2373, |
|
"eval_samples_per_second": 35.214, |
|
"eval_steps_per_second": 1.853, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 2.5454864501953125, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.5021, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"eval_loss": 0.6134229302406311, |
|
"eval_runtime": 3.2312, |
|
"eval_samples_per_second": 35.281, |
|
"eval_steps_per_second": 1.857, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5076923076923077, |
|
"grad_norm": 2.518843412399292, |
|
"learning_rate": 1.0153846153846154e-05, |
|
"loss": 0.587, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5076923076923077, |
|
"eval_loss": 0.6188045144081116, |
|
"eval_runtime": 3.2447, |
|
"eval_samples_per_second": 35.135, |
|
"eval_steps_per_second": 1.849, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 2.2720816135406494, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.6655, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"eval_loss": 0.6142882108688354, |
|
"eval_runtime": 3.2478, |
|
"eval_samples_per_second": 35.101, |
|
"eval_steps_per_second": 1.847, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 2.510495662689209, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.6116, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"eval_loss": 0.615568995475769, |
|
"eval_runtime": 3.2332, |
|
"eval_samples_per_second": 35.259, |
|
"eval_steps_per_second": 1.856, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 3.122288942337036, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.5906, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 0.6134847402572632, |
|
"eval_runtime": 3.23, |
|
"eval_samples_per_second": 35.294, |
|
"eval_steps_per_second": 1.858, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5692307692307692, |
|
"grad_norm": 2.096451759338379, |
|
"learning_rate": 1.1384615384615385e-05, |
|
"loss": 0.4887, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5692307692307692, |
|
"eval_loss": 0.6209902763366699, |
|
"eval_runtime": 3.2877, |
|
"eval_samples_per_second": 34.674, |
|
"eval_steps_per_second": 1.825, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 1.9950298070907593, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.5759, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"eval_loss": 0.623406171798706, |
|
"eval_runtime": 3.2604, |
|
"eval_samples_per_second": 34.965, |
|
"eval_steps_per_second": 1.84, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.6859853267669678, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.5436, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.622590184211731, |
|
"eval_runtime": 3.2828, |
|
"eval_samples_per_second": 34.726, |
|
"eval_steps_per_second": 1.828, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 2.1276960372924805, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.5281, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 0.6532315611839294, |
|
"eval_runtime": 3.2611, |
|
"eval_samples_per_second": 34.958, |
|
"eval_steps_per_second": 1.84, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6307692307692307, |
|
"grad_norm": 2.362884998321533, |
|
"learning_rate": 1.2615384615384616e-05, |
|
"loss": 0.5469, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6307692307692307, |
|
"eval_loss": 0.6342897415161133, |
|
"eval_runtime": 3.2382, |
|
"eval_samples_per_second": 35.205, |
|
"eval_steps_per_second": 1.853, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 2.6885156631469727, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.5795, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"eval_loss": 0.6264632940292358, |
|
"eval_runtime": 3.238, |
|
"eval_samples_per_second": 35.207, |
|
"eval_steps_per_second": 1.853, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6615384615384615, |
|
"grad_norm": 2.579245090484619, |
|
"learning_rate": 1.3230769230769231e-05, |
|
"loss": 0.5347, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6615384615384615, |
|
"eval_loss": 0.6273682117462158, |
|
"eval_runtime": 3.2364, |
|
"eval_samples_per_second": 35.224, |
|
"eval_steps_per_second": 1.854, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 1.1545597314834595, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.4647, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"eval_loss": 0.6299084424972534, |
|
"eval_runtime": 3.2446, |
|
"eval_samples_per_second": 35.136, |
|
"eval_steps_per_second": 1.849, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 2.961758613586426, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.461, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"eval_loss": 0.6325281858444214, |
|
"eval_runtime": 3.2395, |
|
"eval_samples_per_second": 35.191, |
|
"eval_steps_per_second": 1.852, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 1.9980833530426025, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.5936, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"eval_loss": 0.6283787488937378, |
|
"eval_runtime": 3.2222, |
|
"eval_samples_per_second": 35.379, |
|
"eval_steps_per_second": 1.862, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7230769230769231, |
|
"grad_norm": 1.7074766159057617, |
|
"learning_rate": 1.4461538461538462e-05, |
|
"loss": 0.5754, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7230769230769231, |
|
"eval_loss": 0.6299780011177063, |
|
"eval_runtime": 3.4089, |
|
"eval_samples_per_second": 33.442, |
|
"eval_steps_per_second": 1.76, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 3.0761687755584717, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.5832, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"eval_loss": 0.6351837515830994, |
|
"eval_runtime": 3.2325, |
|
"eval_samples_per_second": 35.266, |
|
"eval_steps_per_second": 1.856, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7538461538461538, |
|
"grad_norm": 2.6451804637908936, |
|
"learning_rate": 1.5076923076923078e-05, |
|
"loss": 0.5678, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7538461538461538, |
|
"eval_loss": 0.6302112340927124, |
|
"eval_runtime": 3.2461, |
|
"eval_samples_per_second": 35.119, |
|
"eval_steps_per_second": 1.848, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 1.2752690315246582, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.5067, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 0.6335285902023315, |
|
"eval_runtime": 3.233, |
|
"eval_samples_per_second": 35.261, |
|
"eval_steps_per_second": 1.856, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7846153846153846, |
|
"grad_norm": 1.623413324356079, |
|
"learning_rate": 1.5692307692307693e-05, |
|
"loss": 0.565, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7846153846153846, |
|
"eval_loss": 0.6364890933036804, |
|
"eval_runtime": 3.2571, |
|
"eval_samples_per_second": 35.001, |
|
"eval_steps_per_second": 1.842, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.4268816709518433, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.6637, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.6888372898101807, |
|
"eval_runtime": 3.2161, |
|
"eval_samples_per_second": 35.446, |
|
"eval_steps_per_second": 1.866, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8153846153846154, |
|
"grad_norm": 2.0327444076538086, |
|
"learning_rate": 1.630769230769231e-05, |
|
"loss": 0.5415, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8153846153846154, |
|
"eval_loss": 0.6402605175971985, |
|
"eval_runtime": 3.249, |
|
"eval_samples_per_second": 35.088, |
|
"eval_steps_per_second": 1.847, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 2.4058265686035156, |
|
"learning_rate": 1.6615384615384618e-05, |
|
"loss": 0.6001, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 0.6381077170372009, |
|
"eval_runtime": 3.2304, |
|
"eval_samples_per_second": 35.29, |
|
"eval_steps_per_second": 1.857, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 3.1266651153564453, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.6938, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"eval_loss": 0.6406418681144714, |
|
"eval_runtime": 3.2645, |
|
"eval_samples_per_second": 34.921, |
|
"eval_steps_per_second": 1.838, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 2.09586501121521, |
|
"learning_rate": 1.7230769230769234e-05, |
|
"loss": 0.5671, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"eval_loss": 0.6451361775398254, |
|
"eval_runtime": 3.3107, |
|
"eval_samples_per_second": 34.434, |
|
"eval_steps_per_second": 1.812, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8769230769230769, |
|
"grad_norm": 3.3246548175811768, |
|
"learning_rate": 1.753846153846154e-05, |
|
"loss": 0.5231, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8769230769230769, |
|
"eval_loss": 0.6457281708717346, |
|
"eval_runtime": 3.2516, |
|
"eval_samples_per_second": 35.059, |
|
"eval_steps_per_second": 1.845, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 1.8420376777648926, |
|
"learning_rate": 1.784615384615385e-05, |
|
"loss": 0.6566, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"eval_loss": 0.6426037549972534, |
|
"eval_runtime": 3.2569, |
|
"eval_samples_per_second": 35.003, |
|
"eval_steps_per_second": 1.842, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9076923076923077, |
|
"grad_norm": 1.7587623596191406, |
|
"learning_rate": 1.8153846153846155e-05, |
|
"loss": 0.5607, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9076923076923077, |
|
"eval_loss": 0.6446419358253479, |
|
"eval_runtime": 3.2735, |
|
"eval_samples_per_second": 34.825, |
|
"eval_steps_per_second": 1.833, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.1630938053131104, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.7058, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"eval_loss": 0.6553606986999512, |
|
"eval_runtime": 3.2562, |
|
"eval_samples_per_second": 35.01, |
|
"eval_steps_per_second": 1.843, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9384615384615385, |
|
"grad_norm": 1.9333707094192505, |
|
"learning_rate": 1.876923076923077e-05, |
|
"loss": 0.6126, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9384615384615385, |
|
"eval_loss": 0.6482510566711426, |
|
"eval_runtime": 3.2568, |
|
"eval_samples_per_second": 35.004, |
|
"eval_steps_per_second": 1.842, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 1.994057297706604, |
|
"learning_rate": 1.907692307692308e-05, |
|
"loss": 0.6155, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"eval_loss": 0.6493787169456482, |
|
"eval_runtime": 3.2411, |
|
"eval_samples_per_second": 35.174, |
|
"eval_steps_per_second": 1.851, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9692307692307692, |
|
"grad_norm": 2.5365986824035645, |
|
"learning_rate": 1.9384615384615386e-05, |
|
"loss": 0.5934, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9692307692307692, |
|
"eval_loss": 0.6541261076927185, |
|
"eval_runtime": 3.2414, |
|
"eval_samples_per_second": 35.17, |
|
"eval_steps_per_second": 1.851, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 2.831939697265625, |
|
"learning_rate": 1.9692307692307696e-05, |
|
"loss": 0.4716, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"eval_loss": 0.6611928343772888, |
|
"eval_runtime": 3.2346, |
|
"eval_samples_per_second": 35.244, |
|
"eval_steps_per_second": 1.855, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.3626530170440674, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5363, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.6603513956069946, |
|
"eval_runtime": 3.238, |
|
"eval_samples_per_second": 35.206, |
|
"eval_steps_per_second": 1.853, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 1.5879381895065308, |
|
"learning_rate": 1.9999855802751384e-05, |
|
"loss": 0.3579, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"eval_loss": 0.6978694200515747, |
|
"eval_runtime": 3.2447, |
|
"eval_samples_per_second": 35.135, |
|
"eval_steps_per_second": 1.849, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0307692307692307, |
|
"grad_norm": 1.9470375776290894, |
|
"learning_rate": 1.9999423215164105e-05, |
|
"loss": 0.3559, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0307692307692307, |
|
"eval_loss": 0.6810071468353271, |
|
"eval_runtime": 3.2514, |
|
"eval_samples_per_second": 35.062, |
|
"eval_steps_per_second": 1.845, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 1.9423273801803589, |
|
"learning_rate": 1.9998702249713747e-05, |
|
"loss": 0.3977, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"eval_loss": 0.6764042377471924, |
|
"eval_runtime": 3.2336, |
|
"eval_samples_per_second": 35.255, |
|
"eval_steps_per_second": 1.856, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0615384615384615, |
|
"grad_norm": 2.47997784614563, |
|
"learning_rate": 1.9997692927192562e-05, |
|
"loss": 0.3449, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0615384615384615, |
|
"eval_loss": 0.6773045659065247, |
|
"eval_runtime": 3.2495, |
|
"eval_samples_per_second": 35.082, |
|
"eval_steps_per_second": 1.846, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 1.2528847455978394, |
|
"learning_rate": 1.9996395276708856e-05, |
|
"loss": 0.3426, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"eval_loss": 0.6868980526924133, |
|
"eval_runtime": 3.2469, |
|
"eval_samples_per_second": 35.111, |
|
"eval_steps_per_second": 1.848, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0923076923076924, |
|
"grad_norm": 1.9821579456329346, |
|
"learning_rate": 1.9994809335686152e-05, |
|
"loss": 0.4387, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0923076923076924, |
|
"eval_loss": 0.6710843443870544, |
|
"eval_runtime": 3.2766, |
|
"eval_samples_per_second": 34.792, |
|
"eval_steps_per_second": 1.831, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 1.4338393211364746, |
|
"learning_rate": 1.9992935149862116e-05, |
|
"loss": 0.3443, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 0.6952248215675354, |
|
"eval_runtime": 3.2614, |
|
"eval_samples_per_second": 34.954, |
|
"eval_steps_per_second": 1.84, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.123076923076923, |
|
"grad_norm": 1.1517648696899414, |
|
"learning_rate": 1.999077277328724e-05, |
|
"loss": 0.3484, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.123076923076923, |
|
"eval_loss": 0.6964479684829712, |
|
"eval_runtime": 3.2521, |
|
"eval_samples_per_second": 35.054, |
|
"eval_steps_per_second": 1.845, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 1.650405764579773, |
|
"learning_rate": 1.998832226832327e-05, |
|
"loss": 0.4018, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"eval_loss": 0.6902267932891846, |
|
"eval_runtime": 3.2586, |
|
"eval_samples_per_second": 34.984, |
|
"eval_steps_per_second": 1.841, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 2.2939112186431885, |
|
"learning_rate": 1.9985583705641418e-05, |
|
"loss": 0.3984, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"eval_loss": 0.6953668594360352, |
|
"eval_runtime": 3.2666, |
|
"eval_samples_per_second": 34.899, |
|
"eval_steps_per_second": 1.837, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 1.588689923286438, |
|
"learning_rate": 1.9982557164220335e-05, |
|
"loss": 0.3423, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"eval_loss": 0.6961036324501038, |
|
"eval_runtime": 3.2728, |
|
"eval_samples_per_second": 34.832, |
|
"eval_steps_per_second": 1.833, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1846153846153846, |
|
"grad_norm": 2.06250262260437, |
|
"learning_rate": 1.9979242731343803e-05, |
|
"loss": 0.2843, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1846153846153846, |
|
"eval_loss": 0.7108862400054932, |
|
"eval_runtime": 3.2657, |
|
"eval_samples_per_second": 34.908, |
|
"eval_steps_per_second": 1.837, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.609130382537842, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.3172, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.6978670358657837, |
|
"eval_runtime": 3.2647, |
|
"eval_samples_per_second": 34.919, |
|
"eval_steps_per_second": 1.838, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2153846153846155, |
|
"grad_norm": 2.1612465381622314, |
|
"learning_rate": 1.9971750581869955e-05, |
|
"loss": 0.4031, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2153846153846155, |
|
"eval_loss": 0.7043502330780029, |
|
"eval_runtime": 3.2943, |
|
"eval_samples_per_second": 34.605, |
|
"eval_steps_per_second": 1.821, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 2.465644121170044, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 0.3869, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"eval_loss": 0.694877564907074, |
|
"eval_runtime": 3.2465, |
|
"eval_samples_per_second": 35.115, |
|
"eval_steps_per_second": 1.848, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2461538461538462, |
|
"grad_norm": 2.611905097961426, |
|
"learning_rate": 1.9963108121491508e-05, |
|
"loss": 0.3364, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2461538461538462, |
|
"eval_loss": 0.6959603428840637, |
|
"eval_runtime": 3.2287, |
|
"eval_samples_per_second": 35.309, |
|
"eval_steps_per_second": 1.858, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 2.0148117542266846, |
|
"learning_rate": 1.9958355831085155e-05, |
|
"loss": 0.3699, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"eval_loss": 0.695041835308075, |
|
"eval_runtime": 3.2511, |
|
"eval_samples_per_second": 35.065, |
|
"eval_steps_per_second": 1.846, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2769230769230768, |
|
"grad_norm": 2.213994264602661, |
|
"learning_rate": 1.995331634717649e-05, |
|
"loss": 0.4101, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2769230769230768, |
|
"eval_loss": 0.6806493997573853, |
|
"eval_runtime": 3.2367, |
|
"eval_samples_per_second": 35.221, |
|
"eval_steps_per_second": 1.854, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 1.7559289932250977, |
|
"learning_rate": 1.9947989815101444e-05, |
|
"loss": 0.4012, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"eval_loss": 0.6939857006072998, |
|
"eval_runtime": 3.2502, |
|
"eval_samples_per_second": 35.075, |
|
"eval_steps_per_second": 1.846, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 0.8075680136680603, |
|
"learning_rate": 1.9942376388474282e-05, |
|
"loss": 0.3202, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"eval_loss": 0.7051680684089661, |
|
"eval_runtime": 3.2612, |
|
"eval_samples_per_second": 34.956, |
|
"eval_steps_per_second": 1.84, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 1.8492660522460938, |
|
"learning_rate": 1.9936476229183133e-05, |
|
"loss": 0.3985, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"eval_loss": 0.6843434572219849, |
|
"eval_runtime": 3.2739, |
|
"eval_samples_per_second": 34.821, |
|
"eval_steps_per_second": 1.833, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3384615384615386, |
|
"grad_norm": 1.6737396717071533, |
|
"learning_rate": 1.9930289507385344e-05, |
|
"loss": 0.4244, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3384615384615386, |
|
"eval_loss": 0.6972203254699707, |
|
"eval_runtime": 3.2498, |
|
"eval_samples_per_second": 35.079, |
|
"eval_steps_per_second": 1.846, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 2.5821003913879395, |
|
"learning_rate": 1.992381640150257e-05, |
|
"loss": 0.3924, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"eval_loss": 0.6854589581489563, |
|
"eval_runtime": 3.2524, |
|
"eval_samples_per_second": 35.051, |
|
"eval_steps_per_second": 1.845, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3692307692307693, |
|
"grad_norm": 1.248367428779602, |
|
"learning_rate": 1.9917057098215624e-05, |
|
"loss": 0.3659, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3692307692307693, |
|
"eval_loss": 0.6994220614433289, |
|
"eval_runtime": 3.2745, |
|
"eval_samples_per_second": 34.814, |
|
"eval_steps_per_second": 1.832, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.9339770674705505, |
|
"learning_rate": 1.9910011792459086e-05, |
|
"loss": 0.309, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 0.7046116590499878, |
|
"eval_runtime": 3.2642, |
|
"eval_samples_per_second": 34.924, |
|
"eval_steps_per_second": 1.838, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 27.820077896118164, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 0.3788, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.6996982097625732, |
|
"eval_runtime": 3.2777, |
|
"eval_samples_per_second": 34.781, |
|
"eval_steps_per_second": 1.831, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 2.5190956592559814, |
|
"learning_rate": 1.9895063994510512e-05, |
|
"loss": 0.3372, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"eval_loss": 0.7020460963249207, |
|
"eval_runtime": 3.242, |
|
"eval_samples_per_second": 35.164, |
|
"eval_steps_per_second": 1.851, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4307692307692308, |
|
"grad_norm": 1.8096344470977783, |
|
"learning_rate": 1.9887161933404743e-05, |
|
"loss": 0.3812, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4307692307692308, |
|
"eval_loss": 0.6968725323677063, |
|
"eval_runtime": 3.3598, |
|
"eval_samples_per_second": 33.93, |
|
"eval_steps_per_second": 1.786, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 1.8657336235046387, |
|
"learning_rate": 1.9878974731989487e-05, |
|
"loss": 0.414, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"eval_loss": 0.6909111142158508, |
|
"eval_runtime": 3.2552, |
|
"eval_samples_per_second": 35.021, |
|
"eval_steps_per_second": 1.843, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 2.4143567085266113, |
|
"learning_rate": 1.9870502626379127e-05, |
|
"loss": 0.3813, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"eval_loss": 0.6953186392784119, |
|
"eval_runtime": 3.2929, |
|
"eval_samples_per_second": 34.619, |
|
"eval_steps_per_second": 1.822, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 2.3205642700195312, |
|
"learning_rate": 1.9861745860904538e-05, |
|
"loss": 0.3701, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"eval_loss": 0.695374608039856, |
|
"eval_runtime": 3.2455, |
|
"eval_samples_per_second": 35.126, |
|
"eval_steps_per_second": 1.849, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4923076923076923, |
|
"grad_norm": 1.55659818649292, |
|
"learning_rate": 1.9852704688106003e-05, |
|
"loss": 0.3437, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4923076923076923, |
|
"eval_loss": 0.7013147473335266, |
|
"eval_runtime": 3.2638, |
|
"eval_samples_per_second": 34.929, |
|
"eval_steps_per_second": 1.838, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 2.180811882019043, |
|
"learning_rate": 1.9843379368725978e-05, |
|
"loss": 0.3343, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"eval_loss": 0.7043299078941345, |
|
"eval_runtime": 3.2392, |
|
"eval_samples_per_second": 35.194, |
|
"eval_steps_per_second": 1.852, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.523076923076923, |
|
"grad_norm": 1.474899411201477, |
|
"learning_rate": 1.983377017170154e-05, |
|
"loss": 0.3601, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.523076923076923, |
|
"eval_loss": 0.6996614336967468, |
|
"eval_runtime": 3.2475, |
|
"eval_samples_per_second": 35.104, |
|
"eval_steps_per_second": 1.848, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.9230364561080933, |
|
"learning_rate": 1.9823877374156647e-05, |
|
"loss": 0.3752, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 0.6967916488647461, |
|
"eval_runtime": 3.2278, |
|
"eval_samples_per_second": 35.318, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5538461538461539, |
|
"grad_norm": 1.0019750595092773, |
|
"learning_rate": 1.9813701261394136e-05, |
|
"loss": 0.3406, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.5538461538461539, |
|
"eval_loss": 0.6963152885437012, |
|
"eval_runtime": 3.2442, |
|
"eval_samples_per_second": 35.14, |
|
"eval_steps_per_second": 1.849, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"grad_norm": 1.7724684476852417, |
|
"learning_rate": 1.9803242126887496e-05, |
|
"loss": 0.4573, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"eval_loss": 0.7392306327819824, |
|
"eval_runtime": 3.2664, |
|
"eval_samples_per_second": 34.901, |
|
"eval_steps_per_second": 1.837, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.5846153846153848, |
|
"grad_norm": 1.7095409631729126, |
|
"learning_rate": 1.979250027227241e-05, |
|
"loss": 0.3882, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.5846153846153848, |
|
"eval_loss": 0.708345890045166, |
|
"eval_runtime": 3.276, |
|
"eval_samples_per_second": 34.799, |
|
"eval_steps_per_second": 1.832, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.9319413900375366, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.3933, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.697462797164917, |
|
"eval_runtime": 3.243, |
|
"eval_samples_per_second": 35.153, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 1.8260385990142822, |
|
"learning_rate": 1.977016965001817e-05, |
|
"loss": 0.427, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"eval_loss": 0.6899636387825012, |
|
"eval_runtime": 3.2481, |
|
"eval_samples_per_second": 35.098, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"grad_norm": 1.6716266870498657, |
|
"learning_rate": 1.9758581526381878e-05, |
|
"loss": 0.3436, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"eval_loss": 0.6924691200256348, |
|
"eval_runtime": 3.2562, |
|
"eval_samples_per_second": 35.01, |
|
"eval_steps_per_second": 1.843, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.646153846153846, |
|
"grad_norm": 0.8898760080337524, |
|
"learning_rate": 1.9746711970624282e-05, |
|
"loss": 0.3802, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.646153846153846, |
|
"eval_loss": 0.7017173171043396, |
|
"eval_runtime": 3.2449, |
|
"eval_samples_per_second": 35.132, |
|
"eval_steps_per_second": 1.849, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"grad_norm": 1.6189157962799072, |
|
"learning_rate": 1.973456132505684e-05, |
|
"loss": 0.3668, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"eval_loss": 0.6917209029197693, |
|
"eval_runtime": 3.2318, |
|
"eval_samples_per_second": 35.275, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.676923076923077, |
|
"grad_norm": 1.77718985080719, |
|
"learning_rate": 1.972212994009749e-05, |
|
"loss": 0.3996, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.676923076923077, |
|
"eval_loss": 0.6930002570152283, |
|
"eval_runtime": 3.2419, |
|
"eval_samples_per_second": 35.165, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 1.6316412687301636, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 0.4447, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"eval_loss": 0.6898515820503235, |
|
"eval_runtime": 3.2294, |
|
"eval_samples_per_second": 35.3, |
|
"eval_steps_per_second": 1.858, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.7076923076923078, |
|
"grad_norm": 1.2730180025100708, |
|
"learning_rate": 1.9696426394146278e-05, |
|
"loss": 0.4221, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.7076923076923078, |
|
"eval_loss": 0.693137526512146, |
|
"eval_runtime": 3.23, |
|
"eval_samples_per_second": 35.294, |
|
"eval_steps_per_second": 1.858, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"grad_norm": 2.3808021545410156, |
|
"learning_rate": 1.9683154974430544e-05, |
|
"loss": 0.3271, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"eval_loss": 0.6987683773040771, |
|
"eval_runtime": 3.2399, |
|
"eval_samples_per_second": 35.186, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.7384615384615385, |
|
"grad_norm": 2.563812017440796, |
|
"learning_rate": 1.9669604297853766e-05, |
|
"loss": 0.3751, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.7384615384615385, |
|
"eval_loss": 0.6946467161178589, |
|
"eval_runtime": 3.2296, |
|
"eval_samples_per_second": 35.299, |
|
"eval_steps_per_second": 1.858, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"grad_norm": 1.6859829425811768, |
|
"learning_rate": 1.965577475520999e-05, |
|
"loss": 0.3433, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"eval_loss": 0.6889815926551819, |
|
"eval_runtime": 3.2391, |
|
"eval_samples_per_second": 35.195, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 1.8283382654190063, |
|
"learning_rate": 1.9641666745335626e-05, |
|
"loss": 0.419, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"eval_loss": 0.6913794875144958, |
|
"eval_runtime": 3.2283, |
|
"eval_samples_per_second": 35.313, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"grad_norm": 1.826407551765442, |
|
"learning_rate": 1.962728067509791e-05, |
|
"loss": 0.3423, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"eval_loss": 0.692046046257019, |
|
"eval_runtime": 3.2427, |
|
"eval_samples_per_second": 35.156, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.4921714067459106, |
|
"learning_rate": 1.961261695938319e-05, |
|
"loss": 0.3351, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.7031008005142212, |
|
"eval_runtime": 3.249, |
|
"eval_samples_per_second": 35.088, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"grad_norm": 2.1426408290863037, |
|
"learning_rate": 1.9597676021084962e-05, |
|
"loss": 0.3521, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"eval_loss": 0.6923142671585083, |
|
"eval_runtime": 3.2308, |
|
"eval_samples_per_second": 35.286, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.830769230769231, |
|
"grad_norm": 1.8514198064804077, |
|
"learning_rate": 1.9582458291091664e-05, |
|
"loss": 0.454, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.830769230769231, |
|
"eval_loss": 0.6877439022064209, |
|
"eval_runtime": 3.2342, |
|
"eval_samples_per_second": 35.248, |
|
"eval_steps_per_second": 1.855, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 2.015425205230713, |
|
"learning_rate": 1.9566964208274254e-05, |
|
"loss": 0.3908, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"eval_loss": 0.6943904161453247, |
|
"eval_runtime": 3.2272, |
|
"eval_samples_per_second": 35.324, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8615384615384616, |
|
"grad_norm": 1.4284974336624146, |
|
"learning_rate": 1.9551194219473552e-05, |
|
"loss": 0.3538, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.8615384615384616, |
|
"eval_loss": 0.6958539485931396, |
|
"eval_runtime": 3.2638, |
|
"eval_samples_per_second": 34.929, |
|
"eval_steps_per_second": 1.838, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"grad_norm": 2.42622447013855, |
|
"learning_rate": 1.9535148779487365e-05, |
|
"loss": 0.28, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"eval_loss": 0.7015026211738586, |
|
"eval_runtime": 3.2491, |
|
"eval_samples_per_second": 35.086, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8923076923076922, |
|
"grad_norm": 1.8617641925811768, |
|
"learning_rate": 1.9518828351057345e-05, |
|
"loss": 0.4324, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.8923076923076922, |
|
"eval_loss": 0.6908020377159119, |
|
"eval_runtime": 3.2376, |
|
"eval_samples_per_second": 35.211, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"grad_norm": 1.7121613025665283, |
|
"learning_rate": 1.9502233404855672e-05, |
|
"loss": 0.3713, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"eval_loss": 0.703855574131012, |
|
"eval_runtime": 3.2355, |
|
"eval_samples_per_second": 35.234, |
|
"eval_steps_per_second": 1.854, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 2.0595622062683105, |
|
"learning_rate": 1.9485364419471454e-05, |
|
"loss": 0.4456, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"eval_loss": 0.686195433139801, |
|
"eval_runtime": 3.2266, |
|
"eval_samples_per_second": 35.331, |
|
"eval_steps_per_second": 1.86, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"grad_norm": 1.9807814359664917, |
|
"learning_rate": 1.946822188139696e-05, |
|
"loss": 0.2958, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"eval_loss": 0.6912775039672852, |
|
"eval_runtime": 3.2509, |
|
"eval_samples_per_second": 35.067, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.953846153846154, |
|
"grad_norm": 1.8224427700042725, |
|
"learning_rate": 1.945080628501355e-05, |
|
"loss": 0.3876, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.953846153846154, |
|
"eval_loss": 0.6861458420753479, |
|
"eval_runtime": 3.2739, |
|
"eval_samples_per_second": 34.82, |
|
"eval_steps_per_second": 1.833, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 2.107452630996704, |
|
"learning_rate": 1.9433118132577432e-05, |
|
"loss": 0.3748, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"eval_loss": 0.6867597699165344, |
|
"eval_runtime": 3.2581, |
|
"eval_samples_per_second": 34.99, |
|
"eval_steps_per_second": 1.842, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9846153846153847, |
|
"grad_norm": 5.972170352935791, |
|
"learning_rate": 1.94151579342052e-05, |
|
"loss": 0.4297, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9846153846153847, |
|
"eval_loss": 1.595029354095459, |
|
"eval_runtime": 3.2512, |
|
"eval_samples_per_second": 35.064, |
|
"eval_steps_per_second": 1.845, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.460162878036499, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.6569, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7044022083282471, |
|
"eval_runtime": 3.2342, |
|
"eval_samples_per_second": 35.248, |
|
"eval_steps_per_second": 1.855, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.0153846153846153, |
|
"grad_norm": 1.5906578302383423, |
|
"learning_rate": 1.9378423479332045e-05, |
|
"loss": 0.2524, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.0153846153846153, |
|
"eval_loss": 0.7527978420257568, |
|
"eval_runtime": 3.2369, |
|
"eval_samples_per_second": 35.218, |
|
"eval_steps_per_second": 1.854, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"grad_norm": 1.2187044620513916, |
|
"learning_rate": 1.935965028223259e-05, |
|
"loss": 0.1678, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"eval_loss": 0.7567086219787598, |
|
"eval_runtime": 3.2521, |
|
"eval_samples_per_second": 35.054, |
|
"eval_steps_per_second": 1.845, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.046153846153846, |
|
"grad_norm": 0.984000563621521, |
|
"learning_rate": 1.9340607157969393e-05, |
|
"loss": 0.219, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.046153846153846, |
|
"eval_loss": 0.7520028948783875, |
|
"eval_runtime": 3.2487, |
|
"eval_samples_per_second": 35.091, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"grad_norm": 1.2751810550689697, |
|
"learning_rate": 1.932129465573568e-05, |
|
"loss": 0.2095, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"eval_loss": 0.739636242389679, |
|
"eval_runtime": 3.2269, |
|
"eval_samples_per_second": 35.328, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 1.4136130809783936, |
|
"learning_rate": 1.9301713332493386e-05, |
|
"loss": 0.1668, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"eval_loss": 0.7536832690238953, |
|
"eval_runtime": 3.2417, |
|
"eval_samples_per_second": 35.167, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"grad_norm": 1.7586925029754639, |
|
"learning_rate": 1.9281863752957095e-05, |
|
"loss": 0.2118, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"eval_loss": 0.7858787775039673, |
|
"eval_runtime": 3.2321, |
|
"eval_samples_per_second": 35.271, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.1076923076923078, |
|
"grad_norm": 1.9954192638397217, |
|
"learning_rate": 1.9261746489577767e-05, |
|
"loss": 0.1911, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.1076923076923078, |
|
"eval_loss": 0.7683790922164917, |
|
"eval_runtime": 3.2187, |
|
"eval_samples_per_second": 35.418, |
|
"eval_steps_per_second": 1.864, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"grad_norm": 1.311628818511963, |
|
"learning_rate": 1.92413621225262e-05, |
|
"loss": 0.1894, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"eval_loss": 0.8233888745307922, |
|
"eval_runtime": 3.2593, |
|
"eval_samples_per_second": 34.977, |
|
"eval_steps_per_second": 1.841, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.1384615384615384, |
|
"grad_norm": 1.3510133028030396, |
|
"learning_rate": 1.9220711239676325e-05, |
|
"loss": 0.2001, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.1384615384615384, |
|
"eval_loss": 0.7833328247070312, |
|
"eval_runtime": 3.2388, |
|
"eval_samples_per_second": 35.198, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 1.1844192743301392, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 0.2078, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"eval_loss": 0.7819744348526001, |
|
"eval_runtime": 3.2322, |
|
"eval_samples_per_second": 35.27, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.169230769230769, |
|
"grad_norm": 1.4540330171585083, |
|
"learning_rate": 1.917861231649104e-05, |
|
"loss": 0.2046, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.169230769230769, |
|
"eval_loss": 0.7777317762374878, |
|
"eval_runtime": 3.3497, |
|
"eval_samples_per_second": 34.033, |
|
"eval_steps_per_second": 1.791, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"grad_norm": 2.100379467010498, |
|
"learning_rate": 1.915716549026541e-05, |
|
"loss": 0.2192, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"eval_loss": 0.7560202479362488, |
|
"eval_runtime": 3.2536, |
|
"eval_samples_per_second": 35.038, |
|
"eval_steps_per_second": 1.844, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.1869295835494995, |
|
"learning_rate": 1.913545457642601e-05, |
|
"loss": 0.2055, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 0.7658796906471252, |
|
"eval_runtime": 3.2561, |
|
"eval_samples_per_second": 35.011, |
|
"eval_steps_per_second": 1.843, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"grad_norm": 1.143850564956665, |
|
"learning_rate": 1.9113480201103658e-05, |
|
"loss": 0.2003, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"eval_loss": 0.7692248821258545, |
|
"eval_runtime": 3.2573, |
|
"eval_samples_per_second": 34.998, |
|
"eval_steps_per_second": 1.842, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 1.4219717979431152, |
|
"learning_rate": 1.909124299802724e-05, |
|
"loss": 0.1882, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"eval_loss": 0.7854686975479126, |
|
"eval_runtime": 3.2376, |
|
"eval_samples_per_second": 35.211, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"grad_norm": 1.727668046951294, |
|
"learning_rate": 1.9068743608505454e-05, |
|
"loss": 0.2081, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"eval_loss": 0.7837368249893188, |
|
"eval_runtime": 3.2688, |
|
"eval_samples_per_second": 34.875, |
|
"eval_steps_per_second": 1.836, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.2615384615384615, |
|
"grad_norm": 1.9428791999816895, |
|
"learning_rate": 1.9045982681408324e-05, |
|
"loss": 0.2031, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.2615384615384615, |
|
"eval_loss": 0.7683539390563965, |
|
"eval_runtime": 3.2316, |
|
"eval_samples_per_second": 35.277, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"grad_norm": 1.5166252851486206, |
|
"learning_rate": 1.902296087314845e-05, |
|
"loss": 0.1919, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"eval_loss": 0.7894486784934998, |
|
"eval_runtime": 3.2275, |
|
"eval_samples_per_second": 35.321, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.292307692307692, |
|
"grad_norm": 1.368630290031433, |
|
"learning_rate": 1.8999678847662124e-05, |
|
"loss": 0.1998, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.292307692307692, |
|
"eval_loss": 0.7855644226074219, |
|
"eval_runtime": 3.2357, |
|
"eval_samples_per_second": 35.232, |
|
"eval_steps_per_second": 1.854, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.9620829820632935, |
|
"learning_rate": 1.8976137276390145e-05, |
|
"loss": 0.2139, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"eval_loss": 0.797519326210022, |
|
"eval_runtime": 3.2212, |
|
"eval_samples_per_second": 35.39, |
|
"eval_steps_per_second": 1.863, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.3230769230769233, |
|
"grad_norm": 1.0639945268630981, |
|
"learning_rate": 1.895233683825847e-05, |
|
"loss": 0.2164, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.3230769230769233, |
|
"eval_loss": 0.7683231234550476, |
|
"eval_runtime": 3.2416, |
|
"eval_samples_per_second": 35.168, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"grad_norm": 2.229300022125244, |
|
"learning_rate": 1.892827821965864e-05, |
|
"loss": 0.188, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"eval_loss": 0.7734756469726562, |
|
"eval_runtime": 3.239, |
|
"eval_samples_per_second": 35.196, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.353846153846154, |
|
"grad_norm": 1.2442930936813354, |
|
"learning_rate": 1.8903962114427985e-05, |
|
"loss": 0.1762, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.353846153846154, |
|
"eval_loss": 0.7807677984237671, |
|
"eval_runtime": 3.2321, |
|
"eval_samples_per_second": 35.272, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"grad_norm": 0.7546485066413879, |
|
"learning_rate": 1.8879389223829592e-05, |
|
"loss": 0.1933, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"eval_loss": 0.7788336277008057, |
|
"eval_runtime": 3.234, |
|
"eval_samples_per_second": 35.25, |
|
"eval_steps_per_second": 1.855, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 1.6050472259521484, |
|
"learning_rate": 1.8854560256532098e-05, |
|
"loss": 0.2, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"eval_loss": 0.7777507305145264, |
|
"eval_runtime": 3.2303, |
|
"eval_samples_per_second": 35.291, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.6613671779632568, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.1959, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.7840877175331116, |
|
"eval_runtime": 3.2313, |
|
"eval_samples_per_second": 35.28, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.4153846153846152, |
|
"grad_norm": 1.127969741821289, |
|
"learning_rate": 1.8804136963419316e-05, |
|
"loss": 0.1791, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.4153846153846152, |
|
"eval_loss": 0.787642776966095, |
|
"eval_runtime": 3.2339, |
|
"eval_samples_per_second": 35.251, |
|
"eval_steps_per_second": 1.855, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"grad_norm": 1.0740890502929688, |
|
"learning_rate": 1.8778544091784047e-05, |
|
"loss": 0.1952, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"eval_loss": 0.7895064949989319, |
|
"eval_runtime": 3.2507, |
|
"eval_samples_per_second": 35.069, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.4461538461538463, |
|
"grad_norm": 1.3111459016799927, |
|
"learning_rate": 1.87526980517678e-05, |
|
"loss": 0.2019, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.4461538461538463, |
|
"eval_loss": 0.7794804573059082, |
|
"eval_runtime": 3.2322, |
|
"eval_samples_per_second": 35.27, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 1.7549346685409546, |
|
"learning_rate": 1.8726599588756144e-05, |
|
"loss": 0.1857, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"eval_loss": 0.7962229251861572, |
|
"eval_runtime": 3.2309, |
|
"eval_samples_per_second": 35.285, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.476923076923077, |
|
"grad_norm": 1.6596492528915405, |
|
"learning_rate": 1.8700249455414394e-05, |
|
"loss": 0.2058, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.476923076923077, |
|
"eval_loss": 0.785554051399231, |
|
"eval_runtime": 3.2375, |
|
"eval_samples_per_second": 35.212, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"grad_norm": 1.5621322393417358, |
|
"learning_rate": 1.8673648411665895e-05, |
|
"loss": 0.1946, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"eval_loss": 0.7949020266532898, |
|
"eval_runtime": 3.2269, |
|
"eval_samples_per_second": 35.328, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.5076923076923077, |
|
"grad_norm": 2.000927686691284, |
|
"learning_rate": 1.864679722467011e-05, |
|
"loss": 0.1984, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.5076923076923077, |
|
"eval_loss": 0.791332483291626, |
|
"eval_runtime": 3.2389, |
|
"eval_samples_per_second": 35.197, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"grad_norm": 1.7056845426559448, |
|
"learning_rate": 1.8619696668800494e-05, |
|
"loss": 0.2212, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"eval_loss": 0.7772064805030823, |
|
"eval_runtime": 3.2441, |
|
"eval_samples_per_second": 35.14, |
|
"eval_steps_per_second": 1.849, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 1.5976656675338745, |
|
"learning_rate": 1.859234752562217e-05, |
|
"loss": 0.1901, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"eval_loss": 0.7850207686424255, |
|
"eval_runtime": 3.2221, |
|
"eval_samples_per_second": 35.38, |
|
"eval_steps_per_second": 1.862, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"grad_norm": 1.0322597026824951, |
|
"learning_rate": 1.8564750583869374e-05, |
|
"loss": 0.2185, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"eval_loss": 0.7930358648300171, |
|
"eval_runtime": 3.2427, |
|
"eval_samples_per_second": 35.156, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.569230769230769, |
|
"grad_norm": 1.1539405584335327, |
|
"learning_rate": 1.8536906639422724e-05, |
|
"loss": 0.2056, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.569230769230769, |
|
"eval_loss": 0.7705276012420654, |
|
"eval_runtime": 3.2511, |
|
"eval_samples_per_second": 35.065, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"grad_norm": 1.2852847576141357, |
|
"learning_rate": 1.850881649528625e-05, |
|
"loss": 0.2031, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"eval_loss": 0.7809199094772339, |
|
"eval_runtime": 3.2419, |
|
"eval_samples_per_second": 35.164, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.8470299243927, |
|
"learning_rate": 1.848048096156426e-05, |
|
"loss": 0.207, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 0.7837203145027161, |
|
"eval_runtime": 3.2437, |
|
"eval_samples_per_second": 35.145, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 1.169309139251709, |
|
"learning_rate": 1.845190085543795e-05, |
|
"loss": 0.1924, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"eval_loss": 0.8024268746376038, |
|
"eval_runtime": 3.2426, |
|
"eval_samples_per_second": 35.157, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.6307692307692307, |
|
"grad_norm": 1.3079050779342651, |
|
"learning_rate": 1.8423077001141848e-05, |
|
"loss": 0.2111, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.6307692307692307, |
|
"eval_loss": 0.7842855453491211, |
|
"eval_runtime": 3.2512, |
|
"eval_samples_per_second": 35.064, |
|
"eval_steps_per_second": 1.845, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"grad_norm": 1.5863689184188843, |
|
"learning_rate": 1.839401022994006e-05, |
|
"loss": 0.2039, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"eval_loss": 0.7856019735336304, |
|
"eval_runtime": 3.2563, |
|
"eval_samples_per_second": 35.009, |
|
"eval_steps_per_second": 1.843, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.6615384615384614, |
|
"grad_norm": 1.1605026721954346, |
|
"learning_rate": 1.8364701380102267e-05, |
|
"loss": 0.2183, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.6615384615384614, |
|
"eval_loss": 0.7758111953735352, |
|
"eval_runtime": 3.231, |
|
"eval_samples_per_second": 35.283, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"grad_norm": 1.4007433652877808, |
|
"learning_rate": 1.8335151296879576e-05, |
|
"loss": 0.2054, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"eval_loss": 0.7833234071731567, |
|
"eval_runtime": 3.2356, |
|
"eval_samples_per_second": 35.233, |
|
"eval_steps_per_second": 1.854, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 1.3966948986053467, |
|
"learning_rate": 1.8305360832480118e-05, |
|
"loss": 0.1974, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"eval_loss": 0.7975159287452698, |
|
"eval_runtime": 3.2293, |
|
"eval_samples_per_second": 35.302, |
|
"eval_steps_per_second": 1.858, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"grad_norm": 1.2282441854476929, |
|
"learning_rate": 1.82753308460445e-05, |
|
"loss": 0.2114, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"eval_loss": 0.8000977039337158, |
|
"eval_runtime": 3.226, |
|
"eval_samples_per_second": 35.337, |
|
"eval_steps_per_second": 1.86, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.723076923076923, |
|
"grad_norm": 1.7516143321990967, |
|
"learning_rate": 1.8245062203621003e-05, |
|
"loss": 0.2081, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.723076923076923, |
|
"eval_loss": 0.7978941202163696, |
|
"eval_runtime": 3.2435, |
|
"eval_samples_per_second": 35.147, |
|
"eval_steps_per_second": 1.85, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"grad_norm": 1.6751377582550049, |
|
"learning_rate": 1.821455577814062e-05, |
|
"loss": 0.2013, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"eval_loss": 0.7863066792488098, |
|
"eval_runtime": 3.2507, |
|
"eval_samples_per_second": 35.07, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.753846153846154, |
|
"grad_norm": 1.3899345397949219, |
|
"learning_rate": 1.818381244939187e-05, |
|
"loss": 0.206, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.753846153846154, |
|
"eval_loss": 0.7733153104782104, |
|
"eval_runtime": 3.2514, |
|
"eval_samples_per_second": 35.062, |
|
"eval_steps_per_second": 1.845, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.6483854055404663, |
|
"learning_rate": 1.8152833103995443e-05, |
|
"loss": 0.1979, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"eval_loss": 0.7838578224182129, |
|
"eval_runtime": 3.2273, |
|
"eval_samples_per_second": 35.323, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.7846153846153845, |
|
"grad_norm": 1.4585682153701782, |
|
"learning_rate": 1.8121618635378616e-05, |
|
"loss": 0.2093, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.7846153846153845, |
|
"eval_loss": 0.7677554488182068, |
|
"eval_runtime": 3.231, |
|
"eval_samples_per_second": 35.284, |
|
"eval_steps_per_second": 1.857, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.3923680782318115, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.2036, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.7729052901268005, |
|
"eval_runtime": 3.2487, |
|
"eval_samples_per_second": 35.091, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.815384615384615, |
|
"grad_norm": 1.233302354812622, |
|
"learning_rate": 1.8058487936070992e-05, |
|
"loss": 0.1931, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.815384615384615, |
|
"eval_loss": 0.7708905935287476, |
|
"eval_runtime": 3.2127, |
|
"eval_samples_per_second": 35.484, |
|
"eval_steps_per_second": 1.868, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"grad_norm": 1.4429056644439697, |
|
"learning_rate": 1.802657352603483e-05, |
|
"loss": 0.1929, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"eval_loss": 0.7802720069885254, |
|
"eval_runtime": 3.2503, |
|
"eval_samples_per_second": 35.074, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 2.0769877433776855, |
|
"learning_rate": 1.7994427634035016e-05, |
|
"loss": 0.226, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"eval_loss": 0.766547679901123, |
|
"eval_runtime": 3.2413, |
|
"eval_samples_per_second": 35.171, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"grad_norm": 1.845153570175171, |
|
"learning_rate": 1.7962051187141377e-05, |
|
"loss": 0.2257, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"eval_loss": 0.7760981321334839, |
|
"eval_runtime": 3.2416, |
|
"eval_samples_per_second": 35.168, |
|
"eval_steps_per_second": 1.851, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.876923076923077, |
|
"grad_norm": 1.8381919860839844, |
|
"learning_rate": 1.7929445119072837e-05, |
|
"loss": 0.2193, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.876923076923077, |
|
"eval_loss": 0.7926127314567566, |
|
"eval_runtime": 3.2889, |
|
"eval_samples_per_second": 34.662, |
|
"eval_steps_per_second": 1.824, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"grad_norm": 1.7760707139968872, |
|
"learning_rate": 1.7896610370170452e-05, |
|
"loss": 0.2085, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"eval_loss": 0.7870352268218994, |
|
"eval_runtime": 3.2379, |
|
"eval_samples_per_second": 35.208, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.9076923076923076, |
|
"grad_norm": 1.2421387434005737, |
|
"learning_rate": 1.786354788737031e-05, |
|
"loss": 0.2374, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.9076923076923076, |
|
"eval_loss": 0.7905800342559814, |
|
"eval_runtime": 3.2268, |
|
"eval_samples_per_second": 35.33, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.8296164274215698, |
|
"learning_rate": 1.7830258624176224e-05, |
|
"loss": 0.1788, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"eval_loss": 0.7861989736557007, |
|
"eval_runtime": 3.2405, |
|
"eval_samples_per_second": 35.18, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.9384615384615387, |
|
"grad_norm": 1.577077865600586, |
|
"learning_rate": 1.7796743540632226e-05, |
|
"loss": 0.2296, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.9384615384615387, |
|
"eval_loss": 0.775193452835083, |
|
"eval_runtime": 3.2468, |
|
"eval_samples_per_second": 35.111, |
|
"eval_steps_per_second": 1.848, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"grad_norm": 1.630001187324524, |
|
"learning_rate": 1.776300360329488e-05, |
|
"loss": 0.2115, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"eval_loss": 0.7809256911277771, |
|
"eval_runtime": 3.2326, |
|
"eval_samples_per_second": 35.266, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.9692307692307693, |
|
"grad_norm": 1.5851411819458008, |
|
"learning_rate": 1.772903978520542e-05, |
|
"loss": 0.1967, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.9692307692307693, |
|
"eval_loss": 0.7896639108657837, |
|
"eval_runtime": 3.2497, |
|
"eval_samples_per_second": 35.08, |
|
"eval_steps_per_second": 1.846, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"grad_norm": 1.3122369050979614, |
|
"learning_rate": 1.769485306586166e-05, |
|
"loss": 0.2159, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"eval_loss": 0.777010977268219, |
|
"eval_runtime": 3.2294, |
|
"eval_samples_per_second": 35.3, |
|
"eval_steps_per_second": 1.858, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.2341620922088623, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.1962, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7913311123847961, |
|
"eval_runtime": 3.226, |
|
"eval_samples_per_second": 35.338, |
|
"eval_steps_per_second": 1.86, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"grad_norm": 1.1695995330810547, |
|
"learning_rate": 1.762581487351587e-05, |
|
"loss": 0.1231, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"eval_loss": 0.8296219706535339, |
|
"eval_runtime": 3.2283, |
|
"eval_samples_per_second": 35.313, |
|
"eval_steps_per_second": 1.859, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.0307692307692307, |
|
"grad_norm": 1.0905882120132446, |
|
"learning_rate": 1.7590965391537316e-05, |
|
"loss": 0.1028, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.0307692307692307, |
|
"eval_loss": 0.8419015407562256, |
|
"eval_runtime": 3.2463, |
|
"eval_samples_per_second": 35.117, |
|
"eval_steps_per_second": 1.848, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"grad_norm": 0.9968711137771606, |
|
"learning_rate": 1.7555896990294003e-05, |
|
"loss": 0.116, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"eval_loss": 0.8519408106803894, |
|
"eval_runtime": 3.2373, |
|
"eval_samples_per_second": 35.214, |
|
"eval_steps_per_second": 1.853, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.0615384615384613, |
|
"grad_norm": 1.8941599130630493, |
|
"learning_rate": 1.7520610681139322e-05, |
|
"loss": 0.1195, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.0615384615384613, |
|
"eval_loss": 0.8432408571243286, |
|
"eval_runtime": 3.2477, |
|
"eval_samples_per_second": 35.102, |
|
"eval_steps_per_second": 1.847, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.5090588331222534, |
|
"learning_rate": 1.7485107481711014e-05, |
|
"loss": 0.1141, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 0.8407796025276184, |
|
"eval_runtime": 3.2492, |
|
"eval_samples_per_second": 35.085, |
|
"eval_steps_per_second": 1.847, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 1000, |
|
"total_flos": 1.3637570942048666e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|