“Sara
updating model weights after retraining
c742f69
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.076923076923077,
"eval_steps": 10,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015384615384615385,
"grad_norm": 10.082767486572266,
"learning_rate": 3.0769230769230774e-07,
"loss": 2.2325,
"step": 10
},
{
"epoch": 0.015384615384615385,
"eval_loss": 1.8899363279342651,
"eval_runtime": 3.2461,
"eval_samples_per_second": 35.119,
"eval_steps_per_second": 1.848,
"step": 10
},
{
"epoch": 0.03076923076923077,
"grad_norm": 11.988937377929688,
"learning_rate": 6.153846153846155e-07,
"loss": 1.8704,
"step": 20
},
{
"epoch": 0.03076923076923077,
"eval_loss": 1.7930843830108643,
"eval_runtime": 3.2368,
"eval_samples_per_second": 35.22,
"eval_steps_per_second": 1.854,
"step": 20
},
{
"epoch": 0.046153846153846156,
"grad_norm": 12.660626411437988,
"learning_rate": 9.230769230769232e-07,
"loss": 1.5485,
"step": 30
},
{
"epoch": 0.046153846153846156,
"eval_loss": 1.3351986408233643,
"eval_runtime": 3.2549,
"eval_samples_per_second": 35.024,
"eval_steps_per_second": 1.843,
"step": 30
},
{
"epoch": 0.06153846153846154,
"grad_norm": 5.869946002960205,
"learning_rate": 1.230769230769231e-06,
"loss": 1.1465,
"step": 40
},
{
"epoch": 0.06153846153846154,
"eval_loss": 0.9145882725715637,
"eval_runtime": 3.2573,
"eval_samples_per_second": 34.998,
"eval_steps_per_second": 1.842,
"step": 40
},
{
"epoch": 0.07692307692307693,
"grad_norm": 5.386349201202393,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.7239,
"step": 50
},
{
"epoch": 0.07692307692307693,
"eval_loss": 0.7560557126998901,
"eval_runtime": 3.2454,
"eval_samples_per_second": 35.126,
"eval_steps_per_second": 1.849,
"step": 50
},
{
"epoch": 0.09230769230769231,
"grad_norm": 13.984370231628418,
"learning_rate": 1.8461538461538465e-06,
"loss": 0.7261,
"step": 60
},
{
"epoch": 0.09230769230769231,
"eval_loss": 0.7075957655906677,
"eval_runtime": 3.2359,
"eval_samples_per_second": 35.229,
"eval_steps_per_second": 1.854,
"step": 60
},
{
"epoch": 0.1076923076923077,
"grad_norm": 12.205921173095703,
"learning_rate": 2.153846153846154e-06,
"loss": 0.6461,
"step": 70
},
{
"epoch": 0.1076923076923077,
"eval_loss": 0.6797388792037964,
"eval_runtime": 3.2439,
"eval_samples_per_second": 35.143,
"eval_steps_per_second": 1.85,
"step": 70
},
{
"epoch": 0.12307692307692308,
"grad_norm": 8.89425277709961,
"learning_rate": 2.461538461538462e-06,
"loss": 0.5958,
"step": 80
},
{
"epoch": 0.12307692307692308,
"eval_loss": 0.6671402454376221,
"eval_runtime": 3.2181,
"eval_samples_per_second": 35.425,
"eval_steps_per_second": 1.864,
"step": 80
},
{
"epoch": 0.13846153846153847,
"grad_norm": 19.930360794067383,
"learning_rate": 2.7692307692307697e-06,
"loss": 0.6231,
"step": 90
},
{
"epoch": 0.13846153846153847,
"eval_loss": 0.650134265422821,
"eval_runtime": 3.2427,
"eval_samples_per_second": 35.156,
"eval_steps_per_second": 1.85,
"step": 90
},
{
"epoch": 0.15384615384615385,
"grad_norm": 9.43468189239502,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.6271,
"step": 100
},
{
"epoch": 0.15384615384615385,
"eval_loss": 0.6546280384063721,
"eval_runtime": 3.2567,
"eval_samples_per_second": 35.005,
"eval_steps_per_second": 1.842,
"step": 100
},
{
"epoch": 0.16923076923076924,
"grad_norm": 9.599658012390137,
"learning_rate": 3.384615384615385e-06,
"loss": 0.5577,
"step": 110
},
{
"epoch": 0.16923076923076924,
"eval_loss": 0.6422574520111084,
"eval_runtime": 3.2521,
"eval_samples_per_second": 35.054,
"eval_steps_per_second": 1.845,
"step": 110
},
{
"epoch": 0.18461538461538463,
"grad_norm": 10.455623626708984,
"learning_rate": 3.692307692307693e-06,
"loss": 0.5193,
"step": 120
},
{
"epoch": 0.18461538461538463,
"eval_loss": 0.6282245516777039,
"eval_runtime": 3.2576,
"eval_samples_per_second": 34.996,
"eval_steps_per_second": 1.842,
"step": 120
},
{
"epoch": 0.2,
"grad_norm": 5.3797407150268555,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5225,
"step": 130
},
{
"epoch": 0.2,
"eval_loss": 0.6316070556640625,
"eval_runtime": 3.2288,
"eval_samples_per_second": 35.307,
"eval_steps_per_second": 1.858,
"step": 130
},
{
"epoch": 0.2153846153846154,
"grad_norm": 2.727918863296509,
"learning_rate": 4.307692307692308e-06,
"loss": 0.5738,
"step": 140
},
{
"epoch": 0.2153846153846154,
"eval_loss": 0.6229422688484192,
"eval_runtime": 3.2458,
"eval_samples_per_second": 35.122,
"eval_steps_per_second": 1.849,
"step": 140
},
{
"epoch": 0.23076923076923078,
"grad_norm": 43.989776611328125,
"learning_rate": 4.615384615384616e-06,
"loss": 0.5529,
"step": 150
},
{
"epoch": 0.23076923076923078,
"eval_loss": 0.6122006773948669,
"eval_runtime": 3.2647,
"eval_samples_per_second": 34.919,
"eval_steps_per_second": 1.838,
"step": 150
},
{
"epoch": 0.24615384615384617,
"grad_norm": 4.098404407501221,
"learning_rate": 4.923076923076924e-06,
"loss": 0.519,
"step": 160
},
{
"epoch": 0.24615384615384617,
"eval_loss": 0.6218054294586182,
"eval_runtime": 3.2453,
"eval_samples_per_second": 35.128,
"eval_steps_per_second": 1.849,
"step": 160
},
{
"epoch": 0.26153846153846155,
"grad_norm": 2.585014820098877,
"learning_rate": 5.230769230769232e-06,
"loss": 0.5713,
"step": 170
},
{
"epoch": 0.26153846153846155,
"eval_loss": 0.6205213069915771,
"eval_runtime": 3.2516,
"eval_samples_per_second": 35.059,
"eval_steps_per_second": 1.845,
"step": 170
},
{
"epoch": 0.27692307692307694,
"grad_norm": 2.8192200660705566,
"learning_rate": 5.538461538461539e-06,
"loss": 0.5355,
"step": 180
},
{
"epoch": 0.27692307692307694,
"eval_loss": 0.6130949258804321,
"eval_runtime": 3.2451,
"eval_samples_per_second": 35.13,
"eval_steps_per_second": 1.849,
"step": 180
},
{
"epoch": 0.2923076923076923,
"grad_norm": 4.156994819641113,
"learning_rate": 5.846153846153847e-06,
"loss": 0.4487,
"step": 190
},
{
"epoch": 0.2923076923076923,
"eval_loss": 0.6160268783569336,
"eval_runtime": 3.2251,
"eval_samples_per_second": 35.348,
"eval_steps_per_second": 1.86,
"step": 190
},
{
"epoch": 0.3076923076923077,
"grad_norm": 3.249791383743286,
"learning_rate": 6.153846153846155e-06,
"loss": 0.5708,
"step": 200
},
{
"epoch": 0.3076923076923077,
"eval_loss": 0.6122933626174927,
"eval_runtime": 3.2483,
"eval_samples_per_second": 35.095,
"eval_steps_per_second": 1.847,
"step": 200
},
{
"epoch": 0.3230769230769231,
"grad_norm": 3.2924177646636963,
"learning_rate": 6.461538461538463e-06,
"loss": 0.5607,
"step": 210
},
{
"epoch": 0.3230769230769231,
"eval_loss": 0.6078358888626099,
"eval_runtime": 3.2453,
"eval_samples_per_second": 35.128,
"eval_steps_per_second": 1.849,
"step": 210
},
{
"epoch": 0.3384615384615385,
"grad_norm": 3.444439649581909,
"learning_rate": 6.76923076923077e-06,
"loss": 0.5026,
"step": 220
},
{
"epoch": 0.3384615384615385,
"eval_loss": 0.6209812760353088,
"eval_runtime": 3.2344,
"eval_samples_per_second": 35.246,
"eval_steps_per_second": 1.855,
"step": 220
},
{
"epoch": 0.35384615384615387,
"grad_norm": 2.5516700744628906,
"learning_rate": 7.076923076923078e-06,
"loss": 0.4938,
"step": 230
},
{
"epoch": 0.35384615384615387,
"eval_loss": 0.6547431349754333,
"eval_runtime": 3.2496,
"eval_samples_per_second": 35.081,
"eval_steps_per_second": 1.846,
"step": 230
},
{
"epoch": 0.36923076923076925,
"grad_norm": 25.815881729125977,
"learning_rate": 7.384615384615386e-06,
"loss": 0.5766,
"step": 240
},
{
"epoch": 0.36923076923076925,
"eval_loss": 0.6063669919967651,
"eval_runtime": 3.2779,
"eval_samples_per_second": 34.778,
"eval_steps_per_second": 1.83,
"step": 240
},
{
"epoch": 0.38461538461538464,
"grad_norm": 2.2970821857452393,
"learning_rate": 7.692307692307694e-06,
"loss": 0.5924,
"step": 250
},
{
"epoch": 0.38461538461538464,
"eval_loss": 0.6134644746780396,
"eval_runtime": 3.2301,
"eval_samples_per_second": 35.293,
"eval_steps_per_second": 1.858,
"step": 250
},
{
"epoch": 0.4,
"grad_norm": 2.2378745079040527,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5715,
"step": 260
},
{
"epoch": 0.4,
"eval_loss": 0.6102377772331238,
"eval_runtime": 3.2286,
"eval_samples_per_second": 35.309,
"eval_steps_per_second": 1.858,
"step": 260
},
{
"epoch": 0.4153846153846154,
"grad_norm": 2.2380335330963135,
"learning_rate": 8.307692307692309e-06,
"loss": 0.5491,
"step": 270
},
{
"epoch": 0.4153846153846154,
"eval_loss": 0.6195328831672668,
"eval_runtime": 3.2393,
"eval_samples_per_second": 35.193,
"eval_steps_per_second": 1.852,
"step": 270
},
{
"epoch": 0.4307692307692308,
"grad_norm": 3.0623953342437744,
"learning_rate": 8.615384615384617e-06,
"loss": 0.6217,
"step": 280
},
{
"epoch": 0.4307692307692308,
"eval_loss": 0.6168036460876465,
"eval_runtime": 3.2516,
"eval_samples_per_second": 35.06,
"eval_steps_per_second": 1.845,
"step": 280
},
{
"epoch": 0.4461538461538462,
"grad_norm": 2.4642109870910645,
"learning_rate": 8.923076923076925e-06,
"loss": 0.4625,
"step": 290
},
{
"epoch": 0.4461538461538462,
"eval_loss": 0.6237349510192871,
"eval_runtime": 3.243,
"eval_samples_per_second": 35.153,
"eval_steps_per_second": 1.85,
"step": 290
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.919198751449585,
"learning_rate": 9.230769230769232e-06,
"loss": 0.554,
"step": 300
},
{
"epoch": 0.46153846153846156,
"eval_loss": 0.6107349991798401,
"eval_runtime": 3.2321,
"eval_samples_per_second": 35.271,
"eval_steps_per_second": 1.856,
"step": 300
},
{
"epoch": 0.47692307692307695,
"grad_norm": 2.186372756958008,
"learning_rate": 9.53846153846154e-06,
"loss": 0.4941,
"step": 310
},
{
"epoch": 0.47692307692307695,
"eval_loss": 0.614315390586853,
"eval_runtime": 3.2373,
"eval_samples_per_second": 35.214,
"eval_steps_per_second": 1.853,
"step": 310
},
{
"epoch": 0.49230769230769234,
"grad_norm": 2.5454864501953125,
"learning_rate": 9.846153846153848e-06,
"loss": 0.5021,
"step": 320
},
{
"epoch": 0.49230769230769234,
"eval_loss": 0.6134229302406311,
"eval_runtime": 3.2312,
"eval_samples_per_second": 35.281,
"eval_steps_per_second": 1.857,
"step": 320
},
{
"epoch": 0.5076923076923077,
"grad_norm": 2.518843412399292,
"learning_rate": 1.0153846153846154e-05,
"loss": 0.587,
"step": 330
},
{
"epoch": 0.5076923076923077,
"eval_loss": 0.6188045144081116,
"eval_runtime": 3.2447,
"eval_samples_per_second": 35.135,
"eval_steps_per_second": 1.849,
"step": 330
},
{
"epoch": 0.5230769230769231,
"grad_norm": 2.2720816135406494,
"learning_rate": 1.0461538461538463e-05,
"loss": 0.6655,
"step": 340
},
{
"epoch": 0.5230769230769231,
"eval_loss": 0.6142882108688354,
"eval_runtime": 3.2478,
"eval_samples_per_second": 35.101,
"eval_steps_per_second": 1.847,
"step": 340
},
{
"epoch": 0.5384615384615384,
"grad_norm": 2.510495662689209,
"learning_rate": 1.076923076923077e-05,
"loss": 0.6116,
"step": 350
},
{
"epoch": 0.5384615384615384,
"eval_loss": 0.615568995475769,
"eval_runtime": 3.2332,
"eval_samples_per_second": 35.259,
"eval_steps_per_second": 1.856,
"step": 350
},
{
"epoch": 0.5538461538461539,
"grad_norm": 3.122288942337036,
"learning_rate": 1.1076923076923079e-05,
"loss": 0.5906,
"step": 360
},
{
"epoch": 0.5538461538461539,
"eval_loss": 0.6134847402572632,
"eval_runtime": 3.23,
"eval_samples_per_second": 35.294,
"eval_steps_per_second": 1.858,
"step": 360
},
{
"epoch": 0.5692307692307692,
"grad_norm": 2.096451759338379,
"learning_rate": 1.1384615384615385e-05,
"loss": 0.4887,
"step": 370
},
{
"epoch": 0.5692307692307692,
"eval_loss": 0.6209902763366699,
"eval_runtime": 3.2877,
"eval_samples_per_second": 34.674,
"eval_steps_per_second": 1.825,
"step": 370
},
{
"epoch": 0.5846153846153846,
"grad_norm": 1.9950298070907593,
"learning_rate": 1.1692307692307694e-05,
"loss": 0.5759,
"step": 380
},
{
"epoch": 0.5846153846153846,
"eval_loss": 0.623406171798706,
"eval_runtime": 3.2604,
"eval_samples_per_second": 34.965,
"eval_steps_per_second": 1.84,
"step": 380
},
{
"epoch": 0.6,
"grad_norm": 1.6859853267669678,
"learning_rate": 1.2e-05,
"loss": 0.5436,
"step": 390
},
{
"epoch": 0.6,
"eval_loss": 0.622590184211731,
"eval_runtime": 3.2828,
"eval_samples_per_second": 34.726,
"eval_steps_per_second": 1.828,
"step": 390
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.1276960372924805,
"learning_rate": 1.230769230769231e-05,
"loss": 0.5281,
"step": 400
},
{
"epoch": 0.6153846153846154,
"eval_loss": 0.6532315611839294,
"eval_runtime": 3.2611,
"eval_samples_per_second": 34.958,
"eval_steps_per_second": 1.84,
"step": 400
},
{
"epoch": 0.6307692307692307,
"grad_norm": 2.362884998321533,
"learning_rate": 1.2615384615384616e-05,
"loss": 0.5469,
"step": 410
},
{
"epoch": 0.6307692307692307,
"eval_loss": 0.6342897415161133,
"eval_runtime": 3.2382,
"eval_samples_per_second": 35.205,
"eval_steps_per_second": 1.853,
"step": 410
},
{
"epoch": 0.6461538461538462,
"grad_norm": 2.6885156631469727,
"learning_rate": 1.2923076923076925e-05,
"loss": 0.5795,
"step": 420
},
{
"epoch": 0.6461538461538462,
"eval_loss": 0.6264632940292358,
"eval_runtime": 3.238,
"eval_samples_per_second": 35.207,
"eval_steps_per_second": 1.853,
"step": 420
},
{
"epoch": 0.6615384615384615,
"grad_norm": 2.579245090484619,
"learning_rate": 1.3230769230769231e-05,
"loss": 0.5347,
"step": 430
},
{
"epoch": 0.6615384615384615,
"eval_loss": 0.6273682117462158,
"eval_runtime": 3.2364,
"eval_samples_per_second": 35.224,
"eval_steps_per_second": 1.854,
"step": 430
},
{
"epoch": 0.676923076923077,
"grad_norm": 1.1545597314834595,
"learning_rate": 1.353846153846154e-05,
"loss": 0.4647,
"step": 440
},
{
"epoch": 0.676923076923077,
"eval_loss": 0.6299084424972534,
"eval_runtime": 3.2446,
"eval_samples_per_second": 35.136,
"eval_steps_per_second": 1.849,
"step": 440
},
{
"epoch": 0.6923076923076923,
"grad_norm": 2.961758613586426,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.461,
"step": 450
},
{
"epoch": 0.6923076923076923,
"eval_loss": 0.6325281858444214,
"eval_runtime": 3.2395,
"eval_samples_per_second": 35.191,
"eval_steps_per_second": 1.852,
"step": 450
},
{
"epoch": 0.7076923076923077,
"grad_norm": 1.9980833530426025,
"learning_rate": 1.4153846153846156e-05,
"loss": 0.5936,
"step": 460
},
{
"epoch": 0.7076923076923077,
"eval_loss": 0.6283787488937378,
"eval_runtime": 3.2222,
"eval_samples_per_second": 35.379,
"eval_steps_per_second": 1.862,
"step": 460
},
{
"epoch": 0.7230769230769231,
"grad_norm": 1.7074766159057617,
"learning_rate": 1.4461538461538462e-05,
"loss": 0.5754,
"step": 470
},
{
"epoch": 0.7230769230769231,
"eval_loss": 0.6299780011177063,
"eval_runtime": 3.4089,
"eval_samples_per_second": 33.442,
"eval_steps_per_second": 1.76,
"step": 470
},
{
"epoch": 0.7384615384615385,
"grad_norm": 3.0761687755584717,
"learning_rate": 1.4769230769230772e-05,
"loss": 0.5832,
"step": 480
},
{
"epoch": 0.7384615384615385,
"eval_loss": 0.6351837515830994,
"eval_runtime": 3.2325,
"eval_samples_per_second": 35.266,
"eval_steps_per_second": 1.856,
"step": 480
},
{
"epoch": 0.7538461538461538,
"grad_norm": 2.6451804637908936,
"learning_rate": 1.5076923076923078e-05,
"loss": 0.5678,
"step": 490
},
{
"epoch": 0.7538461538461538,
"eval_loss": 0.6302112340927124,
"eval_runtime": 3.2461,
"eval_samples_per_second": 35.119,
"eval_steps_per_second": 1.848,
"step": 490
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.2752690315246582,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.5067,
"step": 500
},
{
"epoch": 0.7692307692307693,
"eval_loss": 0.6335285902023315,
"eval_runtime": 3.233,
"eval_samples_per_second": 35.261,
"eval_steps_per_second": 1.856,
"step": 500
},
{
"epoch": 0.7846153846153846,
"grad_norm": 1.623413324356079,
"learning_rate": 1.5692307692307693e-05,
"loss": 0.565,
"step": 510
},
{
"epoch": 0.7846153846153846,
"eval_loss": 0.6364890933036804,
"eval_runtime": 3.2571,
"eval_samples_per_second": 35.001,
"eval_steps_per_second": 1.842,
"step": 510
},
{
"epoch": 0.8,
"grad_norm": 1.4268816709518433,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.6637,
"step": 520
},
{
"epoch": 0.8,
"eval_loss": 0.6888372898101807,
"eval_runtime": 3.2161,
"eval_samples_per_second": 35.446,
"eval_steps_per_second": 1.866,
"step": 520
},
{
"epoch": 0.8153846153846154,
"grad_norm": 2.0327444076538086,
"learning_rate": 1.630769230769231e-05,
"loss": 0.5415,
"step": 530
},
{
"epoch": 0.8153846153846154,
"eval_loss": 0.6402605175971985,
"eval_runtime": 3.249,
"eval_samples_per_second": 35.088,
"eval_steps_per_second": 1.847,
"step": 530
},
{
"epoch": 0.8307692307692308,
"grad_norm": 2.4058265686035156,
"learning_rate": 1.6615384615384618e-05,
"loss": 0.6001,
"step": 540
},
{
"epoch": 0.8307692307692308,
"eval_loss": 0.6381077170372009,
"eval_runtime": 3.2304,
"eval_samples_per_second": 35.29,
"eval_steps_per_second": 1.857,
"step": 540
},
{
"epoch": 0.8461538461538461,
"grad_norm": 3.1266651153564453,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.6938,
"step": 550
},
{
"epoch": 0.8461538461538461,
"eval_loss": 0.6406418681144714,
"eval_runtime": 3.2645,
"eval_samples_per_second": 34.921,
"eval_steps_per_second": 1.838,
"step": 550
},
{
"epoch": 0.8615384615384616,
"grad_norm": 2.09586501121521,
"learning_rate": 1.7230769230769234e-05,
"loss": 0.5671,
"step": 560
},
{
"epoch": 0.8615384615384616,
"eval_loss": 0.6451361775398254,
"eval_runtime": 3.3107,
"eval_samples_per_second": 34.434,
"eval_steps_per_second": 1.812,
"step": 560
},
{
"epoch": 0.8769230769230769,
"grad_norm": 3.3246548175811768,
"learning_rate": 1.753846153846154e-05,
"loss": 0.5231,
"step": 570
},
{
"epoch": 0.8769230769230769,
"eval_loss": 0.6457281708717346,
"eval_runtime": 3.2516,
"eval_samples_per_second": 35.059,
"eval_steps_per_second": 1.845,
"step": 570
},
{
"epoch": 0.8923076923076924,
"grad_norm": 1.8420376777648926,
"learning_rate": 1.784615384615385e-05,
"loss": 0.6566,
"step": 580
},
{
"epoch": 0.8923076923076924,
"eval_loss": 0.6426037549972534,
"eval_runtime": 3.2569,
"eval_samples_per_second": 35.003,
"eval_steps_per_second": 1.842,
"step": 580
},
{
"epoch": 0.9076923076923077,
"grad_norm": 1.7587623596191406,
"learning_rate": 1.8153846153846155e-05,
"loss": 0.5607,
"step": 590
},
{
"epoch": 0.9076923076923077,
"eval_loss": 0.6446419358253479,
"eval_runtime": 3.2735,
"eval_samples_per_second": 34.825,
"eval_steps_per_second": 1.833,
"step": 590
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.1630938053131104,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.7058,
"step": 600
},
{
"epoch": 0.9230769230769231,
"eval_loss": 0.6553606986999512,
"eval_runtime": 3.2562,
"eval_samples_per_second": 35.01,
"eval_steps_per_second": 1.843,
"step": 600
},
{
"epoch": 0.9384615384615385,
"grad_norm": 1.9333707094192505,
"learning_rate": 1.876923076923077e-05,
"loss": 0.6126,
"step": 610
},
{
"epoch": 0.9384615384615385,
"eval_loss": 0.6482510566711426,
"eval_runtime": 3.2568,
"eval_samples_per_second": 35.004,
"eval_steps_per_second": 1.842,
"step": 610
},
{
"epoch": 0.9538461538461539,
"grad_norm": 1.994057297706604,
"learning_rate": 1.907692307692308e-05,
"loss": 0.6155,
"step": 620
},
{
"epoch": 0.9538461538461539,
"eval_loss": 0.6493787169456482,
"eval_runtime": 3.2411,
"eval_samples_per_second": 35.174,
"eval_steps_per_second": 1.851,
"step": 620
},
{
"epoch": 0.9692307692307692,
"grad_norm": 2.5365986824035645,
"learning_rate": 1.9384615384615386e-05,
"loss": 0.5934,
"step": 630
},
{
"epoch": 0.9692307692307692,
"eval_loss": 0.6541261076927185,
"eval_runtime": 3.2414,
"eval_samples_per_second": 35.17,
"eval_steps_per_second": 1.851,
"step": 630
},
{
"epoch": 0.9846153846153847,
"grad_norm": 2.831939697265625,
"learning_rate": 1.9692307692307696e-05,
"loss": 0.4716,
"step": 640
},
{
"epoch": 0.9846153846153847,
"eval_loss": 0.6611928343772888,
"eval_runtime": 3.2346,
"eval_samples_per_second": 35.244,
"eval_steps_per_second": 1.855,
"step": 640
},
{
"epoch": 1.0,
"grad_norm": 2.3626530170440674,
"learning_rate": 2e-05,
"loss": 0.5363,
"step": 650
},
{
"epoch": 1.0,
"eval_loss": 0.6603513956069946,
"eval_runtime": 3.238,
"eval_samples_per_second": 35.206,
"eval_steps_per_second": 1.853,
"step": 650
},
{
"epoch": 1.0153846153846153,
"grad_norm": 1.5879381895065308,
"learning_rate": 1.9999855802751384e-05,
"loss": 0.3579,
"step": 660
},
{
"epoch": 1.0153846153846153,
"eval_loss": 0.6978694200515747,
"eval_runtime": 3.2447,
"eval_samples_per_second": 35.135,
"eval_steps_per_second": 1.849,
"step": 660
},
{
"epoch": 1.0307692307692307,
"grad_norm": 1.9470375776290894,
"learning_rate": 1.9999423215164105e-05,
"loss": 0.3559,
"step": 670
},
{
"epoch": 1.0307692307692307,
"eval_loss": 0.6810071468353271,
"eval_runtime": 3.2514,
"eval_samples_per_second": 35.062,
"eval_steps_per_second": 1.845,
"step": 670
},
{
"epoch": 1.0461538461538462,
"grad_norm": 1.9423273801803589,
"learning_rate": 1.9998702249713747e-05,
"loss": 0.3977,
"step": 680
},
{
"epoch": 1.0461538461538462,
"eval_loss": 0.6764042377471924,
"eval_runtime": 3.2336,
"eval_samples_per_second": 35.255,
"eval_steps_per_second": 1.856,
"step": 680
},
{
"epoch": 1.0615384615384615,
"grad_norm": 2.47997784614563,
"learning_rate": 1.9997692927192562e-05,
"loss": 0.3449,
"step": 690
},
{
"epoch": 1.0615384615384615,
"eval_loss": 0.6773045659065247,
"eval_runtime": 3.2495,
"eval_samples_per_second": 35.082,
"eval_steps_per_second": 1.846,
"step": 690
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.2528847455978394,
"learning_rate": 1.9996395276708856e-05,
"loss": 0.3426,
"step": 700
},
{
"epoch": 1.0769230769230769,
"eval_loss": 0.6868980526924133,
"eval_runtime": 3.2469,
"eval_samples_per_second": 35.111,
"eval_steps_per_second": 1.848,
"step": 700
},
{
"epoch": 1.0923076923076924,
"grad_norm": 1.9821579456329346,
"learning_rate": 1.9994809335686152e-05,
"loss": 0.4387,
"step": 710
},
{
"epoch": 1.0923076923076924,
"eval_loss": 0.6710843443870544,
"eval_runtime": 3.2766,
"eval_samples_per_second": 34.792,
"eval_steps_per_second": 1.831,
"step": 710
},
{
"epoch": 1.1076923076923078,
"grad_norm": 1.4338393211364746,
"learning_rate": 1.9992935149862116e-05,
"loss": 0.3443,
"step": 720
},
{
"epoch": 1.1076923076923078,
"eval_loss": 0.6952248215675354,
"eval_runtime": 3.2614,
"eval_samples_per_second": 34.954,
"eval_steps_per_second": 1.84,
"step": 720
},
{
"epoch": 1.123076923076923,
"grad_norm": 1.1517648696899414,
"learning_rate": 1.999077277328724e-05,
"loss": 0.3484,
"step": 730
},
{
"epoch": 1.123076923076923,
"eval_loss": 0.6964479684829712,
"eval_runtime": 3.2521,
"eval_samples_per_second": 35.054,
"eval_steps_per_second": 1.845,
"step": 730
},
{
"epoch": 1.1384615384615384,
"grad_norm": 1.650405764579773,
"learning_rate": 1.998832226832327e-05,
"loss": 0.4018,
"step": 740
},
{
"epoch": 1.1384615384615384,
"eval_loss": 0.6902267932891846,
"eval_runtime": 3.2586,
"eval_samples_per_second": 34.984,
"eval_steps_per_second": 1.841,
"step": 740
},
{
"epoch": 1.1538461538461537,
"grad_norm": 2.2939112186431885,
"learning_rate": 1.9985583705641418e-05,
"loss": 0.3984,
"step": 750
},
{
"epoch": 1.1538461538461537,
"eval_loss": 0.6953668594360352,
"eval_runtime": 3.2666,
"eval_samples_per_second": 34.899,
"eval_steps_per_second": 1.837,
"step": 750
},
{
"epoch": 1.1692307692307693,
"grad_norm": 1.588689923286438,
"learning_rate": 1.9982557164220335e-05,
"loss": 0.3423,
"step": 760
},
{
"epoch": 1.1692307692307693,
"eval_loss": 0.6961036324501038,
"eval_runtime": 3.2728,
"eval_samples_per_second": 34.832,
"eval_steps_per_second": 1.833,
"step": 760
},
{
"epoch": 1.1846153846153846,
"grad_norm": 2.06250262260437,
"learning_rate": 1.9979242731343803e-05,
"loss": 0.2843,
"step": 770
},
{
"epoch": 1.1846153846153846,
"eval_loss": 0.7108862400054932,
"eval_runtime": 3.2657,
"eval_samples_per_second": 34.908,
"eval_steps_per_second": 1.837,
"step": 770
},
{
"epoch": 1.2,
"grad_norm": 2.609130382537842,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.3172,
"step": 780
},
{
"epoch": 1.2,
"eval_loss": 0.6978670358657837,
"eval_runtime": 3.2647,
"eval_samples_per_second": 34.919,
"eval_steps_per_second": 1.838,
"step": 780
},
{
"epoch": 1.2153846153846155,
"grad_norm": 2.1612465381622314,
"learning_rate": 1.9971750581869955e-05,
"loss": 0.4031,
"step": 790
},
{
"epoch": 1.2153846153846155,
"eval_loss": 0.7043502330780029,
"eval_runtime": 3.2943,
"eval_samples_per_second": 34.605,
"eval_steps_per_second": 1.821,
"step": 790
},
{
"epoch": 1.2307692307692308,
"grad_norm": 2.465644121170044,
"learning_rate": 1.9967573081342103e-05,
"loss": 0.3869,
"step": 800
},
{
"epoch": 1.2307692307692308,
"eval_loss": 0.694877564907074,
"eval_runtime": 3.2465,
"eval_samples_per_second": 35.115,
"eval_steps_per_second": 1.848,
"step": 800
},
{
"epoch": 1.2461538461538462,
"grad_norm": 2.611905097961426,
"learning_rate": 1.9963108121491508e-05,
"loss": 0.3364,
"step": 810
},
{
"epoch": 1.2461538461538462,
"eval_loss": 0.6959603428840637,
"eval_runtime": 3.2287,
"eval_samples_per_second": 35.309,
"eval_steps_per_second": 1.858,
"step": 810
},
{
"epoch": 1.2615384615384615,
"grad_norm": 2.0148117542266846,
"learning_rate": 1.9958355831085155e-05,
"loss": 0.3699,
"step": 820
},
{
"epoch": 1.2615384615384615,
"eval_loss": 0.695041835308075,
"eval_runtime": 3.2511,
"eval_samples_per_second": 35.065,
"eval_steps_per_second": 1.846,
"step": 820
},
{
"epoch": 1.2769230769230768,
"grad_norm": 2.213994264602661,
"learning_rate": 1.995331634717649e-05,
"loss": 0.4101,
"step": 830
},
{
"epoch": 1.2769230769230768,
"eval_loss": 0.6806493997573853,
"eval_runtime": 3.2367,
"eval_samples_per_second": 35.221,
"eval_steps_per_second": 1.854,
"step": 830
},
{
"epoch": 1.2923076923076924,
"grad_norm": 1.7559289932250977,
"learning_rate": 1.9947989815101444e-05,
"loss": 0.4012,
"step": 840
},
{
"epoch": 1.2923076923076924,
"eval_loss": 0.6939857006072998,
"eval_runtime": 3.2502,
"eval_samples_per_second": 35.075,
"eval_steps_per_second": 1.846,
"step": 840
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.8075680136680603,
"learning_rate": 1.9942376388474282e-05,
"loss": 0.3202,
"step": 850
},
{
"epoch": 1.3076923076923077,
"eval_loss": 0.7051680684089661,
"eval_runtime": 3.2612,
"eval_samples_per_second": 34.956,
"eval_steps_per_second": 1.84,
"step": 850
},
{
"epoch": 1.323076923076923,
"grad_norm": 1.8492660522460938,
"learning_rate": 1.9936476229183133e-05,
"loss": 0.3985,
"step": 860
},
{
"epoch": 1.323076923076923,
"eval_loss": 0.6843434572219849,
"eval_runtime": 3.2739,
"eval_samples_per_second": 34.821,
"eval_steps_per_second": 1.833,
"step": 860
},
{
"epoch": 1.3384615384615386,
"grad_norm": 1.6737396717071533,
"learning_rate": 1.9930289507385344e-05,
"loss": 0.4244,
"step": 870
},
{
"epoch": 1.3384615384615386,
"eval_loss": 0.6972203254699707,
"eval_runtime": 3.2498,
"eval_samples_per_second": 35.079,
"eval_steps_per_second": 1.846,
"step": 870
},
{
"epoch": 1.353846153846154,
"grad_norm": 2.5821003913879395,
"learning_rate": 1.992381640150257e-05,
"loss": 0.3924,
"step": 880
},
{
"epoch": 1.353846153846154,
"eval_loss": 0.6854589581489563,
"eval_runtime": 3.2524,
"eval_samples_per_second": 35.051,
"eval_steps_per_second": 1.845,
"step": 880
},
{
"epoch": 1.3692307692307693,
"grad_norm": 1.248367428779602,
"learning_rate": 1.9917057098215624e-05,
"loss": 0.3659,
"step": 890
},
{
"epoch": 1.3692307692307693,
"eval_loss": 0.6994220614433289,
"eval_runtime": 3.2745,
"eval_samples_per_second": 34.814,
"eval_steps_per_second": 1.832,
"step": 890
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.9339770674705505,
"learning_rate": 1.9910011792459086e-05,
"loss": 0.309,
"step": 900
},
{
"epoch": 1.3846153846153846,
"eval_loss": 0.7046116590499878,
"eval_runtime": 3.2642,
"eval_samples_per_second": 34.924,
"eval_steps_per_second": 1.838,
"step": 900
},
{
"epoch": 1.4,
"grad_norm": 27.820077896118164,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.3788,
"step": 910
},
{
"epoch": 1.4,
"eval_loss": 0.6996982097625732,
"eval_runtime": 3.2777,
"eval_samples_per_second": 34.781,
"eval_steps_per_second": 1.831,
"step": 910
},
{
"epoch": 1.4153846153846155,
"grad_norm": 2.5190956592559814,
"learning_rate": 1.9895063994510512e-05,
"loss": 0.3372,
"step": 920
},
{
"epoch": 1.4153846153846155,
"eval_loss": 0.7020460963249207,
"eval_runtime": 3.242,
"eval_samples_per_second": 35.164,
"eval_steps_per_second": 1.851,
"step": 920
},
{
"epoch": 1.4307692307692308,
"grad_norm": 1.8096344470977783,
"learning_rate": 1.9887161933404743e-05,
"loss": 0.3812,
"step": 930
},
{
"epoch": 1.4307692307692308,
"eval_loss": 0.6968725323677063,
"eval_runtime": 3.3598,
"eval_samples_per_second": 33.93,
"eval_steps_per_second": 1.786,
"step": 930
},
{
"epoch": 1.4461538461538461,
"grad_norm": 1.8657336235046387,
"learning_rate": 1.9878974731989487e-05,
"loss": 0.414,
"step": 940
},
{
"epoch": 1.4461538461538461,
"eval_loss": 0.6909111142158508,
"eval_runtime": 3.2552,
"eval_samples_per_second": 35.021,
"eval_steps_per_second": 1.843,
"step": 940
},
{
"epoch": 1.4615384615384617,
"grad_norm": 2.4143567085266113,
"learning_rate": 1.9870502626379127e-05,
"loss": 0.3813,
"step": 950
},
{
"epoch": 1.4615384615384617,
"eval_loss": 0.6953186392784119,
"eval_runtime": 3.2929,
"eval_samples_per_second": 34.619,
"eval_steps_per_second": 1.822,
"step": 950
},
{
"epoch": 1.476923076923077,
"grad_norm": 2.3205642700195312,
"learning_rate": 1.9861745860904538e-05,
"loss": 0.3701,
"step": 960
},
{
"epoch": 1.476923076923077,
"eval_loss": 0.695374608039856,
"eval_runtime": 3.2455,
"eval_samples_per_second": 35.126,
"eval_steps_per_second": 1.849,
"step": 960
},
{
"epoch": 1.4923076923076923,
"grad_norm": 1.55659818649292,
"learning_rate": 1.9852704688106003e-05,
"loss": 0.3437,
"step": 970
},
{
"epoch": 1.4923076923076923,
"eval_loss": 0.7013147473335266,
"eval_runtime": 3.2638,
"eval_samples_per_second": 34.929,
"eval_steps_per_second": 1.838,
"step": 970
},
{
"epoch": 1.5076923076923077,
"grad_norm": 2.180811882019043,
"learning_rate": 1.9843379368725978e-05,
"loss": 0.3343,
"step": 980
},
{
"epoch": 1.5076923076923077,
"eval_loss": 0.7043299078941345,
"eval_runtime": 3.2392,
"eval_samples_per_second": 35.194,
"eval_steps_per_second": 1.852,
"step": 980
},
{
"epoch": 1.523076923076923,
"grad_norm": 1.474899411201477,
"learning_rate": 1.983377017170154e-05,
"loss": 0.3601,
"step": 990
},
{
"epoch": 1.523076923076923,
"eval_loss": 0.6996614336967468,
"eval_runtime": 3.2475,
"eval_samples_per_second": 35.104,
"eval_steps_per_second": 1.848,
"step": 990
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.9230364561080933,
"learning_rate": 1.9823877374156647e-05,
"loss": 0.3752,
"step": 1000
},
{
"epoch": 1.5384615384615383,
"eval_loss": 0.6967916488647461,
"eval_runtime": 3.2278,
"eval_samples_per_second": 35.318,
"eval_steps_per_second": 1.859,
"step": 1000
},
{
"epoch": 1.5538461538461539,
"grad_norm": 1.0019750595092773,
"learning_rate": 1.9813701261394136e-05,
"loss": 0.3406,
"step": 1010
},
{
"epoch": 1.5538461538461539,
"eval_loss": 0.6963152885437012,
"eval_runtime": 3.2442,
"eval_samples_per_second": 35.14,
"eval_steps_per_second": 1.849,
"step": 1010
},
{
"epoch": 1.5692307692307692,
"grad_norm": 1.7724684476852417,
"learning_rate": 1.9803242126887496e-05,
"loss": 0.4573,
"step": 1020
},
{
"epoch": 1.5692307692307692,
"eval_loss": 0.7392306327819824,
"eval_runtime": 3.2664,
"eval_samples_per_second": 34.901,
"eval_steps_per_second": 1.837,
"step": 1020
},
{
"epoch": 1.5846153846153848,
"grad_norm": 1.7095409631729126,
"learning_rate": 1.979250027227241e-05,
"loss": 0.3882,
"step": 1030
},
{
"epoch": 1.5846153846153848,
"eval_loss": 0.708345890045166,
"eval_runtime": 3.276,
"eval_samples_per_second": 34.799,
"eval_steps_per_second": 1.832,
"step": 1030
},
{
"epoch": 1.6,
"grad_norm": 1.9319413900375366,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.3933,
"step": 1040
},
{
"epoch": 1.6,
"eval_loss": 0.697462797164917,
"eval_runtime": 3.243,
"eval_samples_per_second": 35.153,
"eval_steps_per_second": 1.85,
"step": 1040
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.8260385990142822,
"learning_rate": 1.977016965001817e-05,
"loss": 0.427,
"step": 1050
},
{
"epoch": 1.6153846153846154,
"eval_loss": 0.6899636387825012,
"eval_runtime": 3.2481,
"eval_samples_per_second": 35.098,
"eval_steps_per_second": 1.847,
"step": 1050
},
{
"epoch": 1.6307692307692307,
"grad_norm": 1.6716266870498657,
"learning_rate": 1.9758581526381878e-05,
"loss": 0.3436,
"step": 1060
},
{
"epoch": 1.6307692307692307,
"eval_loss": 0.6924691200256348,
"eval_runtime": 3.2562,
"eval_samples_per_second": 35.01,
"eval_steps_per_second": 1.843,
"step": 1060
},
{
"epoch": 1.646153846153846,
"grad_norm": 0.8898760080337524,
"learning_rate": 1.9746711970624282e-05,
"loss": 0.3802,
"step": 1070
},
{
"epoch": 1.646153846153846,
"eval_loss": 0.7017173171043396,
"eval_runtime": 3.2449,
"eval_samples_per_second": 35.132,
"eval_steps_per_second": 1.849,
"step": 1070
},
{
"epoch": 1.6615384615384614,
"grad_norm": 1.6189157962799072,
"learning_rate": 1.973456132505684e-05,
"loss": 0.3668,
"step": 1080
},
{
"epoch": 1.6615384615384614,
"eval_loss": 0.6917209029197693,
"eval_runtime": 3.2318,
"eval_samples_per_second": 35.275,
"eval_steps_per_second": 1.857,
"step": 1080
},
{
"epoch": 1.676923076923077,
"grad_norm": 1.77718985080719,
"learning_rate": 1.972212994009749e-05,
"loss": 0.3996,
"step": 1090
},
{
"epoch": 1.676923076923077,
"eval_loss": 0.6930002570152283,
"eval_runtime": 3.2419,
"eval_samples_per_second": 35.165,
"eval_steps_per_second": 1.851,
"step": 1090
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.6316412687301636,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.4447,
"step": 1100
},
{
"epoch": 1.6923076923076923,
"eval_loss": 0.6898515820503235,
"eval_runtime": 3.2294,
"eval_samples_per_second": 35.3,
"eval_steps_per_second": 1.858,
"step": 1100
},
{
"epoch": 1.7076923076923078,
"grad_norm": 1.2730180025100708,
"learning_rate": 1.9696426394146278e-05,
"loss": 0.4221,
"step": 1110
},
{
"epoch": 1.7076923076923078,
"eval_loss": 0.693137526512146,
"eval_runtime": 3.23,
"eval_samples_per_second": 35.294,
"eval_steps_per_second": 1.858,
"step": 1110
},
{
"epoch": 1.7230769230769232,
"grad_norm": 2.3808021545410156,
"learning_rate": 1.9683154974430544e-05,
"loss": 0.3271,
"step": 1120
},
{
"epoch": 1.7230769230769232,
"eval_loss": 0.6987683773040771,
"eval_runtime": 3.2399,
"eval_samples_per_second": 35.186,
"eval_steps_per_second": 1.852,
"step": 1120
},
{
"epoch": 1.7384615384615385,
"grad_norm": 2.563812017440796,
"learning_rate": 1.9669604297853766e-05,
"loss": 0.3751,
"step": 1130
},
{
"epoch": 1.7384615384615385,
"eval_loss": 0.6946467161178589,
"eval_runtime": 3.2296,
"eval_samples_per_second": 35.299,
"eval_steps_per_second": 1.858,
"step": 1130
},
{
"epoch": 1.7538461538461538,
"grad_norm": 1.6859829425811768,
"learning_rate": 1.965577475520999e-05,
"loss": 0.3433,
"step": 1140
},
{
"epoch": 1.7538461538461538,
"eval_loss": 0.6889815926551819,
"eval_runtime": 3.2391,
"eval_samples_per_second": 35.195,
"eval_steps_per_second": 1.852,
"step": 1140
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.8283382654190063,
"learning_rate": 1.9641666745335626e-05,
"loss": 0.419,
"step": 1150
},
{
"epoch": 1.7692307692307692,
"eval_loss": 0.6913794875144958,
"eval_runtime": 3.2283,
"eval_samples_per_second": 35.313,
"eval_steps_per_second": 1.859,
"step": 1150
},
{
"epoch": 1.7846153846153845,
"grad_norm": 1.826407551765442,
"learning_rate": 1.962728067509791e-05,
"loss": 0.3423,
"step": 1160
},
{
"epoch": 1.7846153846153845,
"eval_loss": 0.692046046257019,
"eval_runtime": 3.2427,
"eval_samples_per_second": 35.156,
"eval_steps_per_second": 1.85,
"step": 1160
},
{
"epoch": 1.8,
"grad_norm": 1.4921714067459106,
"learning_rate": 1.961261695938319e-05,
"loss": 0.3351,
"step": 1170
},
{
"epoch": 1.8,
"eval_loss": 0.7031008005142212,
"eval_runtime": 3.249,
"eval_samples_per_second": 35.088,
"eval_steps_per_second": 1.847,
"step": 1170
},
{
"epoch": 1.8153846153846154,
"grad_norm": 2.1426408290863037,
"learning_rate": 1.9597676021084962e-05,
"loss": 0.3521,
"step": 1180
},
{
"epoch": 1.8153846153846154,
"eval_loss": 0.6923142671585083,
"eval_runtime": 3.2308,
"eval_samples_per_second": 35.286,
"eval_steps_per_second": 1.857,
"step": 1180
},
{
"epoch": 1.830769230769231,
"grad_norm": 1.8514198064804077,
"learning_rate": 1.9582458291091664e-05,
"loss": 0.454,
"step": 1190
},
{
"epoch": 1.830769230769231,
"eval_loss": 0.6877439022064209,
"eval_runtime": 3.2342,
"eval_samples_per_second": 35.248,
"eval_steps_per_second": 1.855,
"step": 1190
},
{
"epoch": 1.8461538461538463,
"grad_norm": 2.015425205230713,
"learning_rate": 1.9566964208274254e-05,
"loss": 0.3908,
"step": 1200
},
{
"epoch": 1.8461538461538463,
"eval_loss": 0.6943904161453247,
"eval_runtime": 3.2272,
"eval_samples_per_second": 35.324,
"eval_steps_per_second": 1.859,
"step": 1200
},
{
"epoch": 1.8615384615384616,
"grad_norm": 1.4284974336624146,
"learning_rate": 1.9551194219473552e-05,
"loss": 0.3538,
"step": 1210
},
{
"epoch": 1.8615384615384616,
"eval_loss": 0.6958539485931396,
"eval_runtime": 3.2638,
"eval_samples_per_second": 34.929,
"eval_steps_per_second": 1.838,
"step": 1210
},
{
"epoch": 1.876923076923077,
"grad_norm": 2.42622447013855,
"learning_rate": 1.9535148779487365e-05,
"loss": 0.28,
"step": 1220
},
{
"epoch": 1.876923076923077,
"eval_loss": 0.7015026211738586,
"eval_runtime": 3.2491,
"eval_samples_per_second": 35.086,
"eval_steps_per_second": 1.847,
"step": 1220
},
{
"epoch": 1.8923076923076922,
"grad_norm": 1.8617641925811768,
"learning_rate": 1.9518828351057345e-05,
"loss": 0.4324,
"step": 1230
},
{
"epoch": 1.8923076923076922,
"eval_loss": 0.6908020377159119,
"eval_runtime": 3.2376,
"eval_samples_per_second": 35.211,
"eval_steps_per_second": 1.853,
"step": 1230
},
{
"epoch": 1.9076923076923076,
"grad_norm": 1.7121613025665283,
"learning_rate": 1.9502233404855672e-05,
"loss": 0.3713,
"step": 1240
},
{
"epoch": 1.9076923076923076,
"eval_loss": 0.703855574131012,
"eval_runtime": 3.2355,
"eval_samples_per_second": 35.234,
"eval_steps_per_second": 1.854,
"step": 1240
},
{
"epoch": 1.9230769230769231,
"grad_norm": 2.0595622062683105,
"learning_rate": 1.9485364419471454e-05,
"loss": 0.4456,
"step": 1250
},
{
"epoch": 1.9230769230769231,
"eval_loss": 0.686195433139801,
"eval_runtime": 3.2266,
"eval_samples_per_second": 35.331,
"eval_steps_per_second": 1.86,
"step": 1250
},
{
"epoch": 1.9384615384615385,
"grad_norm": 1.9807814359664917,
"learning_rate": 1.946822188139696e-05,
"loss": 0.2958,
"step": 1260
},
{
"epoch": 1.9384615384615385,
"eval_loss": 0.6912775039672852,
"eval_runtime": 3.2509,
"eval_samples_per_second": 35.067,
"eval_steps_per_second": 1.846,
"step": 1260
},
{
"epoch": 1.953846153846154,
"grad_norm": 1.8224427700042725,
"learning_rate": 1.945080628501355e-05,
"loss": 0.3876,
"step": 1270
},
{
"epoch": 1.953846153846154,
"eval_loss": 0.6861458420753479,
"eval_runtime": 3.2739,
"eval_samples_per_second": 34.82,
"eval_steps_per_second": 1.833,
"step": 1270
},
{
"epoch": 1.9692307692307693,
"grad_norm": 2.107452630996704,
"learning_rate": 1.9433118132577432e-05,
"loss": 0.3748,
"step": 1280
},
{
"epoch": 1.9692307692307693,
"eval_loss": 0.6867597699165344,
"eval_runtime": 3.2581,
"eval_samples_per_second": 34.99,
"eval_steps_per_second": 1.842,
"step": 1280
},
{
"epoch": 1.9846153846153847,
"grad_norm": 5.972170352935791,
"learning_rate": 1.94151579342052e-05,
"loss": 0.4297,
"step": 1290
},
{
"epoch": 1.9846153846153847,
"eval_loss": 1.595029354095459,
"eval_runtime": 3.2512,
"eval_samples_per_second": 35.064,
"eval_steps_per_second": 1.845,
"step": 1290
},
{
"epoch": 2.0,
"grad_norm": 1.460162878036499,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.6569,
"step": 1300
},
{
"epoch": 2.0,
"eval_loss": 0.7044022083282471,
"eval_runtime": 3.2342,
"eval_samples_per_second": 35.248,
"eval_steps_per_second": 1.855,
"step": 1300
},
{
"epoch": 2.0153846153846153,
"grad_norm": 1.5906578302383423,
"learning_rate": 1.9378423479332045e-05,
"loss": 0.2524,
"step": 1310
},
{
"epoch": 2.0153846153846153,
"eval_loss": 0.7527978420257568,
"eval_runtime": 3.2369,
"eval_samples_per_second": 35.218,
"eval_steps_per_second": 1.854,
"step": 1310
},
{
"epoch": 2.0307692307692307,
"grad_norm": 1.2187044620513916,
"learning_rate": 1.935965028223259e-05,
"loss": 0.1678,
"step": 1320
},
{
"epoch": 2.0307692307692307,
"eval_loss": 0.7567086219787598,
"eval_runtime": 3.2521,
"eval_samples_per_second": 35.054,
"eval_steps_per_second": 1.845,
"step": 1320
},
{
"epoch": 2.046153846153846,
"grad_norm": 0.984000563621521,
"learning_rate": 1.9340607157969393e-05,
"loss": 0.219,
"step": 1330
},
{
"epoch": 2.046153846153846,
"eval_loss": 0.7520028948783875,
"eval_runtime": 3.2487,
"eval_samples_per_second": 35.091,
"eval_steps_per_second": 1.847,
"step": 1330
},
{
"epoch": 2.0615384615384613,
"grad_norm": 1.2751810550689697,
"learning_rate": 1.932129465573568e-05,
"loss": 0.2095,
"step": 1340
},
{
"epoch": 2.0615384615384613,
"eval_loss": 0.739636242389679,
"eval_runtime": 3.2269,
"eval_samples_per_second": 35.328,
"eval_steps_per_second": 1.859,
"step": 1340
},
{
"epoch": 2.076923076923077,
"grad_norm": 1.4136130809783936,
"learning_rate": 1.9301713332493386e-05,
"loss": 0.1668,
"step": 1350
},
{
"epoch": 2.076923076923077,
"eval_loss": 0.7536832690238953,
"eval_runtime": 3.2417,
"eval_samples_per_second": 35.167,
"eval_steps_per_second": 1.851,
"step": 1350
},
{
"epoch": 2.0923076923076924,
"grad_norm": 1.7586925029754639,
"learning_rate": 1.9281863752957095e-05,
"loss": 0.2118,
"step": 1360
},
{
"epoch": 2.0923076923076924,
"eval_loss": 0.7858787775039673,
"eval_runtime": 3.2321,
"eval_samples_per_second": 35.271,
"eval_steps_per_second": 1.856,
"step": 1360
},
{
"epoch": 2.1076923076923078,
"grad_norm": 1.9954192638397217,
"learning_rate": 1.9261746489577767e-05,
"loss": 0.1911,
"step": 1370
},
{
"epoch": 2.1076923076923078,
"eval_loss": 0.7683790922164917,
"eval_runtime": 3.2187,
"eval_samples_per_second": 35.418,
"eval_steps_per_second": 1.864,
"step": 1370
},
{
"epoch": 2.123076923076923,
"grad_norm": 1.311628818511963,
"learning_rate": 1.92413621225262e-05,
"loss": 0.1894,
"step": 1380
},
{
"epoch": 2.123076923076923,
"eval_loss": 0.8233888745307922,
"eval_runtime": 3.2593,
"eval_samples_per_second": 34.977,
"eval_steps_per_second": 1.841,
"step": 1380
},
{
"epoch": 2.1384615384615384,
"grad_norm": 1.3510133028030396,
"learning_rate": 1.9220711239676325e-05,
"loss": 0.2001,
"step": 1390
},
{
"epoch": 2.1384615384615384,
"eval_loss": 0.7833328247070312,
"eval_runtime": 3.2388,
"eval_samples_per_second": 35.198,
"eval_steps_per_second": 1.853,
"step": 1390
},
{
"epoch": 2.1538461538461537,
"grad_norm": 1.1844192743301392,
"learning_rate": 1.9199794436588244e-05,
"loss": 0.2078,
"step": 1400
},
{
"epoch": 2.1538461538461537,
"eval_loss": 0.7819744348526001,
"eval_runtime": 3.2322,
"eval_samples_per_second": 35.27,
"eval_steps_per_second": 1.856,
"step": 1400
},
{
"epoch": 2.169230769230769,
"grad_norm": 1.4540330171585083,
"learning_rate": 1.917861231649104e-05,
"loss": 0.2046,
"step": 1410
},
{
"epoch": 2.169230769230769,
"eval_loss": 0.7777317762374878,
"eval_runtime": 3.3497,
"eval_samples_per_second": 34.033,
"eval_steps_per_second": 1.791,
"step": 1410
},
{
"epoch": 2.184615384615385,
"grad_norm": 2.100379467010498,
"learning_rate": 1.915716549026541e-05,
"loss": 0.2192,
"step": 1420
},
{
"epoch": 2.184615384615385,
"eval_loss": 0.7560202479362488,
"eval_runtime": 3.2536,
"eval_samples_per_second": 35.038,
"eval_steps_per_second": 1.844,
"step": 1420
},
{
"epoch": 2.2,
"grad_norm": 1.1869295835494995,
"learning_rate": 1.913545457642601e-05,
"loss": 0.2055,
"step": 1430
},
{
"epoch": 2.2,
"eval_loss": 0.7658796906471252,
"eval_runtime": 3.2561,
"eval_samples_per_second": 35.011,
"eval_steps_per_second": 1.843,
"step": 1430
},
{
"epoch": 2.2153846153846155,
"grad_norm": 1.143850564956665,
"learning_rate": 1.9113480201103658e-05,
"loss": 0.2003,
"step": 1440
},
{
"epoch": 2.2153846153846155,
"eval_loss": 0.7692248821258545,
"eval_runtime": 3.2573,
"eval_samples_per_second": 34.998,
"eval_steps_per_second": 1.842,
"step": 1440
},
{
"epoch": 2.230769230769231,
"grad_norm": 1.4219717979431152,
"learning_rate": 1.909124299802724e-05,
"loss": 0.1882,
"step": 1450
},
{
"epoch": 2.230769230769231,
"eval_loss": 0.7854686975479126,
"eval_runtime": 3.2376,
"eval_samples_per_second": 35.211,
"eval_steps_per_second": 1.853,
"step": 1450
},
{
"epoch": 2.246153846153846,
"grad_norm": 1.727668046951294,
"learning_rate": 1.9068743608505454e-05,
"loss": 0.2081,
"step": 1460
},
{
"epoch": 2.246153846153846,
"eval_loss": 0.7837368249893188,
"eval_runtime": 3.2688,
"eval_samples_per_second": 34.875,
"eval_steps_per_second": 1.836,
"step": 1460
},
{
"epoch": 2.2615384615384615,
"grad_norm": 1.9428791999816895,
"learning_rate": 1.9045982681408324e-05,
"loss": 0.2031,
"step": 1470
},
{
"epoch": 2.2615384615384615,
"eval_loss": 0.7683539390563965,
"eval_runtime": 3.2316,
"eval_samples_per_second": 35.277,
"eval_steps_per_second": 1.857,
"step": 1470
},
{
"epoch": 2.276923076923077,
"grad_norm": 1.5166252851486206,
"learning_rate": 1.902296087314845e-05,
"loss": 0.1919,
"step": 1480
},
{
"epoch": 2.276923076923077,
"eval_loss": 0.7894486784934998,
"eval_runtime": 3.2275,
"eval_samples_per_second": 35.321,
"eval_steps_per_second": 1.859,
"step": 1480
},
{
"epoch": 2.292307692307692,
"grad_norm": 1.368630290031433,
"learning_rate": 1.8999678847662124e-05,
"loss": 0.1998,
"step": 1490
},
{
"epoch": 2.292307692307692,
"eval_loss": 0.7855644226074219,
"eval_runtime": 3.2357,
"eval_samples_per_second": 35.232,
"eval_steps_per_second": 1.854,
"step": 1490
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.9620829820632935,
"learning_rate": 1.8976137276390145e-05,
"loss": 0.2139,
"step": 1500
},
{
"epoch": 2.3076923076923075,
"eval_loss": 0.797519326210022,
"eval_runtime": 3.2212,
"eval_samples_per_second": 35.39,
"eval_steps_per_second": 1.863,
"step": 1500
},
{
"epoch": 2.3230769230769233,
"grad_norm": 1.0639945268630981,
"learning_rate": 1.895233683825847e-05,
"loss": 0.2164,
"step": 1510
},
{
"epoch": 2.3230769230769233,
"eval_loss": 0.7683231234550476,
"eval_runtime": 3.2416,
"eval_samples_per_second": 35.168,
"eval_steps_per_second": 1.851,
"step": 1510
},
{
"epoch": 2.3384615384615386,
"grad_norm": 2.229300022125244,
"learning_rate": 1.892827821965864e-05,
"loss": 0.188,
"step": 1520
},
{
"epoch": 2.3384615384615386,
"eval_loss": 0.7734756469726562,
"eval_runtime": 3.239,
"eval_samples_per_second": 35.196,
"eval_steps_per_second": 1.852,
"step": 1520
},
{
"epoch": 2.353846153846154,
"grad_norm": 1.2442930936813354,
"learning_rate": 1.8903962114427985e-05,
"loss": 0.1762,
"step": 1530
},
{
"epoch": 2.353846153846154,
"eval_loss": 0.7807677984237671,
"eval_runtime": 3.2321,
"eval_samples_per_second": 35.272,
"eval_steps_per_second": 1.856,
"step": 1530
},
{
"epoch": 2.3692307692307693,
"grad_norm": 0.7546485066413879,
"learning_rate": 1.8879389223829592e-05,
"loss": 0.1933,
"step": 1540
},
{
"epoch": 2.3692307692307693,
"eval_loss": 0.7788336277008057,
"eval_runtime": 3.234,
"eval_samples_per_second": 35.25,
"eval_steps_per_second": 1.855,
"step": 1540
},
{
"epoch": 2.3846153846153846,
"grad_norm": 1.6050472259521484,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.2,
"step": 1550
},
{
"epoch": 2.3846153846153846,
"eval_loss": 0.7777507305145264,
"eval_runtime": 3.2303,
"eval_samples_per_second": 35.291,
"eval_steps_per_second": 1.857,
"step": 1550
},
{
"epoch": 2.4,
"grad_norm": 1.6613671779632568,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.1959,
"step": 1560
},
{
"epoch": 2.4,
"eval_loss": 0.7840877175331116,
"eval_runtime": 3.2313,
"eval_samples_per_second": 35.28,
"eval_steps_per_second": 1.857,
"step": 1560
},
{
"epoch": 2.4153846153846152,
"grad_norm": 1.127969741821289,
"learning_rate": 1.8804136963419316e-05,
"loss": 0.1791,
"step": 1570
},
{
"epoch": 2.4153846153846152,
"eval_loss": 0.787642776966095,
"eval_runtime": 3.2339,
"eval_samples_per_second": 35.251,
"eval_steps_per_second": 1.855,
"step": 1570
},
{
"epoch": 2.430769230769231,
"grad_norm": 1.0740890502929688,
"learning_rate": 1.8778544091784047e-05,
"loss": 0.1952,
"step": 1580
},
{
"epoch": 2.430769230769231,
"eval_loss": 0.7895064949989319,
"eval_runtime": 3.2507,
"eval_samples_per_second": 35.069,
"eval_steps_per_second": 1.846,
"step": 1580
},
{
"epoch": 2.4461538461538463,
"grad_norm": 1.3111459016799927,
"learning_rate": 1.87526980517678e-05,
"loss": 0.2019,
"step": 1590
},
{
"epoch": 2.4461538461538463,
"eval_loss": 0.7794804573059082,
"eval_runtime": 3.2322,
"eval_samples_per_second": 35.27,
"eval_steps_per_second": 1.856,
"step": 1590
},
{
"epoch": 2.4615384615384617,
"grad_norm": 1.7549346685409546,
"learning_rate": 1.8726599588756144e-05,
"loss": 0.1857,
"step": 1600
},
{
"epoch": 2.4615384615384617,
"eval_loss": 0.7962229251861572,
"eval_runtime": 3.2309,
"eval_samples_per_second": 35.285,
"eval_steps_per_second": 1.857,
"step": 1600
},
{
"epoch": 2.476923076923077,
"grad_norm": 1.6596492528915405,
"learning_rate": 1.8700249455414394e-05,
"loss": 0.2058,
"step": 1610
},
{
"epoch": 2.476923076923077,
"eval_loss": 0.785554051399231,
"eval_runtime": 3.2375,
"eval_samples_per_second": 35.212,
"eval_steps_per_second": 1.853,
"step": 1610
},
{
"epoch": 2.4923076923076923,
"grad_norm": 1.5621322393417358,
"learning_rate": 1.8673648411665895e-05,
"loss": 0.1946,
"step": 1620
},
{
"epoch": 2.4923076923076923,
"eval_loss": 0.7949020266532898,
"eval_runtime": 3.2269,
"eval_samples_per_second": 35.328,
"eval_steps_per_second": 1.859,
"step": 1620
},
{
"epoch": 2.5076923076923077,
"grad_norm": 2.000927686691284,
"learning_rate": 1.864679722467011e-05,
"loss": 0.1984,
"step": 1630
},
{
"epoch": 2.5076923076923077,
"eval_loss": 0.791332483291626,
"eval_runtime": 3.2389,
"eval_samples_per_second": 35.197,
"eval_steps_per_second": 1.852,
"step": 1630
},
{
"epoch": 2.523076923076923,
"grad_norm": 1.7056845426559448,
"learning_rate": 1.8619696668800494e-05,
"loss": 0.2212,
"step": 1640
},
{
"epoch": 2.523076923076923,
"eval_loss": 0.7772064805030823,
"eval_runtime": 3.2441,
"eval_samples_per_second": 35.14,
"eval_steps_per_second": 1.849,
"step": 1640
},
{
"epoch": 2.5384615384615383,
"grad_norm": 1.5976656675338745,
"learning_rate": 1.859234752562217e-05,
"loss": 0.1901,
"step": 1650
},
{
"epoch": 2.5384615384615383,
"eval_loss": 0.7850207686424255,
"eval_runtime": 3.2221,
"eval_samples_per_second": 35.38,
"eval_steps_per_second": 1.862,
"step": 1650
},
{
"epoch": 2.5538461538461537,
"grad_norm": 1.0322597026824951,
"learning_rate": 1.8564750583869374e-05,
"loss": 0.2185,
"step": 1660
},
{
"epoch": 2.5538461538461537,
"eval_loss": 0.7930358648300171,
"eval_runtime": 3.2427,
"eval_samples_per_second": 35.156,
"eval_steps_per_second": 1.85,
"step": 1660
},
{
"epoch": 2.569230769230769,
"grad_norm": 1.1539405584335327,
"learning_rate": 1.8536906639422724e-05,
"loss": 0.2056,
"step": 1670
},
{
"epoch": 2.569230769230769,
"eval_loss": 0.7705276012420654,
"eval_runtime": 3.2511,
"eval_samples_per_second": 35.065,
"eval_steps_per_second": 1.846,
"step": 1670
},
{
"epoch": 2.5846153846153848,
"grad_norm": 1.2852847576141357,
"learning_rate": 1.850881649528625e-05,
"loss": 0.2031,
"step": 1680
},
{
"epoch": 2.5846153846153848,
"eval_loss": 0.7809199094772339,
"eval_runtime": 3.2419,
"eval_samples_per_second": 35.164,
"eval_steps_per_second": 1.851,
"step": 1680
},
{
"epoch": 2.6,
"grad_norm": 2.8470299243927,
"learning_rate": 1.848048096156426e-05,
"loss": 0.207,
"step": 1690
},
{
"epoch": 2.6,
"eval_loss": 0.7837203145027161,
"eval_runtime": 3.2437,
"eval_samples_per_second": 35.145,
"eval_steps_per_second": 1.85,
"step": 1690
},
{
"epoch": 2.6153846153846154,
"grad_norm": 1.169309139251709,
"learning_rate": 1.845190085543795e-05,
"loss": 0.1924,
"step": 1700
},
{
"epoch": 2.6153846153846154,
"eval_loss": 0.8024268746376038,
"eval_runtime": 3.2426,
"eval_samples_per_second": 35.157,
"eval_steps_per_second": 1.85,
"step": 1700
},
{
"epoch": 2.6307692307692307,
"grad_norm": 1.3079050779342651,
"learning_rate": 1.8423077001141848e-05,
"loss": 0.2111,
"step": 1710
},
{
"epoch": 2.6307692307692307,
"eval_loss": 0.7842855453491211,
"eval_runtime": 3.2512,
"eval_samples_per_second": 35.064,
"eval_steps_per_second": 1.845,
"step": 1710
},
{
"epoch": 2.646153846153846,
"grad_norm": 1.5863689184188843,
"learning_rate": 1.839401022994006e-05,
"loss": 0.2039,
"step": 1720
},
{
"epoch": 2.646153846153846,
"eval_loss": 0.7856019735336304,
"eval_runtime": 3.2563,
"eval_samples_per_second": 35.009,
"eval_steps_per_second": 1.843,
"step": 1720
},
{
"epoch": 2.6615384615384614,
"grad_norm": 1.1605026721954346,
"learning_rate": 1.8364701380102267e-05,
"loss": 0.2183,
"step": 1730
},
{
"epoch": 2.6615384615384614,
"eval_loss": 0.7758111953735352,
"eval_runtime": 3.231,
"eval_samples_per_second": 35.283,
"eval_steps_per_second": 1.857,
"step": 1730
},
{
"epoch": 2.676923076923077,
"grad_norm": 1.4007433652877808,
"learning_rate": 1.8335151296879576e-05,
"loss": 0.2054,
"step": 1740
},
{
"epoch": 2.676923076923077,
"eval_loss": 0.7833234071731567,
"eval_runtime": 3.2356,
"eval_samples_per_second": 35.233,
"eval_steps_per_second": 1.854,
"step": 1740
},
{
"epoch": 2.6923076923076925,
"grad_norm": 1.3966948986053467,
"learning_rate": 1.8305360832480118e-05,
"loss": 0.1974,
"step": 1750
},
{
"epoch": 2.6923076923076925,
"eval_loss": 0.7975159287452698,
"eval_runtime": 3.2293,
"eval_samples_per_second": 35.302,
"eval_steps_per_second": 1.858,
"step": 1750
},
{
"epoch": 2.707692307692308,
"grad_norm": 1.2282441854476929,
"learning_rate": 1.82753308460445e-05,
"loss": 0.2114,
"step": 1760
},
{
"epoch": 2.707692307692308,
"eval_loss": 0.8000977039337158,
"eval_runtime": 3.226,
"eval_samples_per_second": 35.337,
"eval_steps_per_second": 1.86,
"step": 1760
},
{
"epoch": 2.723076923076923,
"grad_norm": 1.7516143321990967,
"learning_rate": 1.8245062203621003e-05,
"loss": 0.2081,
"step": 1770
},
{
"epoch": 2.723076923076923,
"eval_loss": 0.7978941202163696,
"eval_runtime": 3.2435,
"eval_samples_per_second": 35.147,
"eval_steps_per_second": 1.85,
"step": 1770
},
{
"epoch": 2.7384615384615385,
"grad_norm": 1.6751377582550049,
"learning_rate": 1.821455577814062e-05,
"loss": 0.2013,
"step": 1780
},
{
"epoch": 2.7384615384615385,
"eval_loss": 0.7863066792488098,
"eval_runtime": 3.2507,
"eval_samples_per_second": 35.07,
"eval_steps_per_second": 1.846,
"step": 1780
},
{
"epoch": 2.753846153846154,
"grad_norm": 1.3899345397949219,
"learning_rate": 1.818381244939187e-05,
"loss": 0.206,
"step": 1790
},
{
"epoch": 2.753846153846154,
"eval_loss": 0.7733153104782104,
"eval_runtime": 3.2514,
"eval_samples_per_second": 35.062,
"eval_steps_per_second": 1.845,
"step": 1790
},
{
"epoch": 2.769230769230769,
"grad_norm": 1.6483854055404663,
"learning_rate": 1.8152833103995443e-05,
"loss": 0.1979,
"step": 1800
},
{
"epoch": 2.769230769230769,
"eval_loss": 0.7838578224182129,
"eval_runtime": 3.2273,
"eval_samples_per_second": 35.323,
"eval_steps_per_second": 1.859,
"step": 1800
},
{
"epoch": 2.7846153846153845,
"grad_norm": 1.4585682153701782,
"learning_rate": 1.8121618635378616e-05,
"loss": 0.2093,
"step": 1810
},
{
"epoch": 2.7846153846153845,
"eval_loss": 0.7677554488182068,
"eval_runtime": 3.231,
"eval_samples_per_second": 35.284,
"eval_steps_per_second": 1.857,
"step": 1810
},
{
"epoch": 2.8,
"grad_norm": 1.3923680782318115,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.2036,
"step": 1820
},
{
"epoch": 2.8,
"eval_loss": 0.7729052901268005,
"eval_runtime": 3.2487,
"eval_samples_per_second": 35.091,
"eval_steps_per_second": 1.847,
"step": 1820
},
{
"epoch": 2.815384615384615,
"grad_norm": 1.233302354812622,
"learning_rate": 1.8058487936070992e-05,
"loss": 0.1931,
"step": 1830
},
{
"epoch": 2.815384615384615,
"eval_loss": 0.7708905935287476,
"eval_runtime": 3.2127,
"eval_samples_per_second": 35.484,
"eval_steps_per_second": 1.868,
"step": 1830
},
{
"epoch": 2.830769230769231,
"grad_norm": 1.4429056644439697,
"learning_rate": 1.802657352603483e-05,
"loss": 0.1929,
"step": 1840
},
{
"epoch": 2.830769230769231,
"eval_loss": 0.7802720069885254,
"eval_runtime": 3.2503,
"eval_samples_per_second": 35.074,
"eval_steps_per_second": 1.846,
"step": 1840
},
{
"epoch": 2.8461538461538463,
"grad_norm": 2.0769877433776855,
"learning_rate": 1.7994427634035016e-05,
"loss": 0.226,
"step": 1850
},
{
"epoch": 2.8461538461538463,
"eval_loss": 0.766547679901123,
"eval_runtime": 3.2413,
"eval_samples_per_second": 35.171,
"eval_steps_per_second": 1.851,
"step": 1850
},
{
"epoch": 2.8615384615384616,
"grad_norm": 1.845153570175171,
"learning_rate": 1.7962051187141377e-05,
"loss": 0.2257,
"step": 1860
},
{
"epoch": 2.8615384615384616,
"eval_loss": 0.7760981321334839,
"eval_runtime": 3.2416,
"eval_samples_per_second": 35.168,
"eval_steps_per_second": 1.851,
"step": 1860
},
{
"epoch": 2.876923076923077,
"grad_norm": 1.8381919860839844,
"learning_rate": 1.7929445119072837e-05,
"loss": 0.2193,
"step": 1870
},
{
"epoch": 2.876923076923077,
"eval_loss": 0.7926127314567566,
"eval_runtime": 3.2889,
"eval_samples_per_second": 34.662,
"eval_steps_per_second": 1.824,
"step": 1870
},
{
"epoch": 2.8923076923076922,
"grad_norm": 1.7760707139968872,
"learning_rate": 1.7896610370170452e-05,
"loss": 0.2085,
"step": 1880
},
{
"epoch": 2.8923076923076922,
"eval_loss": 0.7870352268218994,
"eval_runtime": 3.2379,
"eval_samples_per_second": 35.208,
"eval_steps_per_second": 1.853,
"step": 1880
},
{
"epoch": 2.9076923076923076,
"grad_norm": 1.2421387434005737,
"learning_rate": 1.786354788737031e-05,
"loss": 0.2374,
"step": 1890
},
{
"epoch": 2.9076923076923076,
"eval_loss": 0.7905800342559814,
"eval_runtime": 3.2268,
"eval_samples_per_second": 35.33,
"eval_steps_per_second": 1.859,
"step": 1890
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.8296164274215698,
"learning_rate": 1.7830258624176224e-05,
"loss": 0.1788,
"step": 1900
},
{
"epoch": 2.9230769230769234,
"eval_loss": 0.7861989736557007,
"eval_runtime": 3.2405,
"eval_samples_per_second": 35.18,
"eval_steps_per_second": 1.852,
"step": 1900
},
{
"epoch": 2.9384615384615387,
"grad_norm": 1.577077865600586,
"learning_rate": 1.7796743540632226e-05,
"loss": 0.2296,
"step": 1910
},
{
"epoch": 2.9384615384615387,
"eval_loss": 0.775193452835083,
"eval_runtime": 3.2468,
"eval_samples_per_second": 35.111,
"eval_steps_per_second": 1.848,
"step": 1910
},
{
"epoch": 2.953846153846154,
"grad_norm": 1.630001187324524,
"learning_rate": 1.776300360329488e-05,
"loss": 0.2115,
"step": 1920
},
{
"epoch": 2.953846153846154,
"eval_loss": 0.7809256911277771,
"eval_runtime": 3.2326,
"eval_samples_per_second": 35.266,
"eval_steps_per_second": 1.856,
"step": 1920
},
{
"epoch": 2.9692307692307693,
"grad_norm": 1.5851411819458008,
"learning_rate": 1.772903978520542e-05,
"loss": 0.1967,
"step": 1930
},
{
"epoch": 2.9692307692307693,
"eval_loss": 0.7896639108657837,
"eval_runtime": 3.2497,
"eval_samples_per_second": 35.08,
"eval_steps_per_second": 1.846,
"step": 1930
},
{
"epoch": 2.9846153846153847,
"grad_norm": 1.3122369050979614,
"learning_rate": 1.769485306586166e-05,
"loss": 0.2159,
"step": 1940
},
{
"epoch": 2.9846153846153847,
"eval_loss": 0.777010977268219,
"eval_runtime": 3.2294,
"eval_samples_per_second": 35.3,
"eval_steps_per_second": 1.858,
"step": 1940
},
{
"epoch": 3.0,
"grad_norm": 1.2341620922088623,
"learning_rate": 1.766044443118978e-05,
"loss": 0.1962,
"step": 1950
},
{
"epoch": 3.0,
"eval_loss": 0.7913311123847961,
"eval_runtime": 3.226,
"eval_samples_per_second": 35.338,
"eval_steps_per_second": 1.86,
"step": 1950
},
{
"epoch": 3.0153846153846153,
"grad_norm": 1.1695995330810547,
"learning_rate": 1.762581487351587e-05,
"loss": 0.1231,
"step": 1960
},
{
"epoch": 3.0153846153846153,
"eval_loss": 0.8296219706535339,
"eval_runtime": 3.2283,
"eval_samples_per_second": 35.313,
"eval_steps_per_second": 1.859,
"step": 1960
},
{
"epoch": 3.0307692307692307,
"grad_norm": 1.0905882120132446,
"learning_rate": 1.7590965391537316e-05,
"loss": 0.1028,
"step": 1970
},
{
"epoch": 3.0307692307692307,
"eval_loss": 0.8419015407562256,
"eval_runtime": 3.2463,
"eval_samples_per_second": 35.117,
"eval_steps_per_second": 1.848,
"step": 1970
},
{
"epoch": 3.046153846153846,
"grad_norm": 0.9968711137771606,
"learning_rate": 1.7555896990294003e-05,
"loss": 0.116,
"step": 1980
},
{
"epoch": 3.046153846153846,
"eval_loss": 0.8519408106803894,
"eval_runtime": 3.2373,
"eval_samples_per_second": 35.214,
"eval_steps_per_second": 1.853,
"step": 1980
},
{
"epoch": 3.0615384615384613,
"grad_norm": 1.8941599130630493,
"learning_rate": 1.7520610681139322e-05,
"loss": 0.1195,
"step": 1990
},
{
"epoch": 3.0615384615384613,
"eval_loss": 0.8432408571243286,
"eval_runtime": 3.2477,
"eval_samples_per_second": 35.102,
"eval_steps_per_second": 1.847,
"step": 1990
},
{
"epoch": 3.076923076923077,
"grad_norm": 1.5090588331222534,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.1141,
"step": 2000
},
{
"epoch": 3.076923076923077,
"eval_loss": 0.8407796025276184,
"eval_runtime": 3.2492,
"eval_samples_per_second": 35.085,
"eval_steps_per_second": 1.847,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 6500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"total_flos": 1.3637570942048666e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}