llava-v1.6-vicuna-13b_anyres / trainer_state.json
Bleking's picture
llava-v1.6-vicuna-13b_anyres
55ad74c
{
"best_metric": 0.6575854420661926,
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256",
"epoch": 10.0,
"eval_steps": 1.0,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03125,
"grad_norm": 0.5230235555406132,
"learning_rate": 0.0,
"loss": 1.5809,
"step": 1
},
{
"epoch": 0.03125,
"eval_loss": 1.6275018453598022,
"eval_runtime": 82.059,
"eval_samples_per_second": 2.437,
"eval_steps_per_second": 0.305,
"step": 1
},
{
"epoch": 0.0625,
"grad_norm": 0.5095402010892089,
"learning_rate": 2e-05,
"loss": 1.4958,
"step": 2
},
{
"epoch": 0.0625,
"eval_loss": 1.6275018453598022,
"eval_runtime": 76.5747,
"eval_samples_per_second": 2.612,
"eval_steps_per_second": 0.326,
"step": 2
},
{
"epoch": 0.09375,
"grad_norm": 0.4998514282504938,
"learning_rate": 2e-05,
"loss": 1.5552,
"step": 3
},
{
"epoch": 0.09375,
"eval_loss": 1.5956931114196777,
"eval_runtime": 76.1563,
"eval_samples_per_second": 2.626,
"eval_steps_per_second": 0.328,
"step": 3
},
{
"epoch": 0.125,
"grad_norm": 0.4280580315108126,
"learning_rate": 2e-05,
"loss": 1.4846,
"step": 4
},
{
"epoch": 0.125,
"eval_loss": 1.5584176778793335,
"eval_runtime": 76.1235,
"eval_samples_per_second": 2.627,
"eval_steps_per_second": 0.328,
"step": 4
},
{
"epoch": 0.15625,
"grad_norm": 0.5678499435986384,
"learning_rate": 2e-05,
"loss": 1.5036,
"step": 5
},
{
"epoch": 0.15625,
"eval_loss": 1.5207562446594238,
"eval_runtime": 76.1514,
"eval_samples_per_second": 2.626,
"eval_steps_per_second": 0.328,
"step": 5
},
{
"epoch": 0.1875,
"grad_norm": 0.5368461657542534,
"learning_rate": 2e-05,
"loss": 1.476,
"step": 6
},
{
"epoch": 0.1875,
"eval_loss": 1.4807783365249634,
"eval_runtime": 77.3444,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 0.323,
"step": 6
},
{
"epoch": 0.21875,
"grad_norm": 0.5549950083087136,
"learning_rate": 2e-05,
"loss": 1.4358,
"step": 7
},
{
"epoch": 0.21875,
"eval_loss": 1.4411544799804688,
"eval_runtime": 77.066,
"eval_samples_per_second": 2.595,
"eval_steps_per_second": 0.324,
"step": 7
},
{
"epoch": 0.25,
"grad_norm": 0.5549950083087136,
"learning_rate": 2e-05,
"loss": 1.4369,
"step": 8
},
{
"epoch": 0.25,
"eval_loss": 1.4411544799804688,
"eval_runtime": 77.2807,
"eval_samples_per_second": 2.588,
"eval_steps_per_second": 0.323,
"step": 8
},
{
"epoch": 0.28125,
"grad_norm": 0.5292240951443854,
"learning_rate": 2e-05,
"loss": 1.4471,
"step": 9
},
{
"epoch": 0.28125,
"eval_loss": 1.4036556482315063,
"eval_runtime": 78.1562,
"eval_samples_per_second": 2.559,
"eval_steps_per_second": 0.32,
"step": 9
},
{
"epoch": 0.3125,
"grad_norm": 0.5292240951443854,
"learning_rate": 2e-05,
"loss": 1.3666,
"step": 10
},
{
"epoch": 0.3125,
"eval_loss": 1.4036556482315063,
"eval_runtime": 77.1645,
"eval_samples_per_second": 2.592,
"eval_steps_per_second": 0.324,
"step": 10
},
{
"epoch": 0.34375,
"grad_norm": 0.5292240951443854,
"learning_rate": 2e-05,
"loss": 1.4149,
"step": 11
},
{
"epoch": 0.34375,
"eval_loss": 1.4036556482315063,
"eval_runtime": 78.7627,
"eval_samples_per_second": 2.539,
"eval_steps_per_second": 0.317,
"step": 11
},
{
"epoch": 0.375,
"grad_norm": 0.684588966714067,
"learning_rate": 2e-05,
"loss": 1.3883,
"step": 12
},
{
"epoch": 0.375,
"eval_loss": 1.3679308891296387,
"eval_runtime": 78.4315,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 12
},
{
"epoch": 0.40625,
"grad_norm": 0.6261826769491422,
"learning_rate": 2e-05,
"loss": 1.4271,
"step": 13
},
{
"epoch": 0.40625,
"eval_loss": 1.3369851112365723,
"eval_runtime": 78.685,
"eval_samples_per_second": 2.542,
"eval_steps_per_second": 0.318,
"step": 13
},
{
"epoch": 0.4375,
"grad_norm": 0.6261826769491422,
"learning_rate": 2e-05,
"loss": 1.2495,
"step": 14
},
{
"epoch": 0.4375,
"eval_loss": 1.3369851112365723,
"eval_runtime": 78.0511,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 14
},
{
"epoch": 0.46875,
"grad_norm": 0.6028103951693778,
"learning_rate": 2e-05,
"loss": 1.3513,
"step": 15
},
{
"epoch": 0.46875,
"eval_loss": 1.3032653331756592,
"eval_runtime": 78.0271,
"eval_samples_per_second": 2.563,
"eval_steps_per_second": 0.32,
"step": 15
},
{
"epoch": 0.5,
"grad_norm": 0.769290402283396,
"learning_rate": 2e-05,
"loss": 1.3117,
"step": 16
},
{
"epoch": 0.5,
"eval_loss": 1.2661188840866089,
"eval_runtime": 78.1857,
"eval_samples_per_second": 2.558,
"eval_steps_per_second": 0.32,
"step": 16
},
{
"epoch": 0.53125,
"grad_norm": 1.3279338025863765,
"learning_rate": 2e-05,
"loss": 1.2768,
"step": 17
},
{
"epoch": 0.53125,
"eval_loss": 1.2299447059631348,
"eval_runtime": 78.2064,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 17
},
{
"epoch": 0.5625,
"grad_norm": 0.7410327159336384,
"learning_rate": 2e-05,
"loss": 1.256,
"step": 18
},
{
"epoch": 0.5625,
"eval_loss": 1.2044258117675781,
"eval_runtime": 78.072,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 18
},
{
"epoch": 0.59375,
"grad_norm": 0.44078820770408506,
"learning_rate": 2e-05,
"loss": 1.1252,
"step": 19
},
{
"epoch": 0.59375,
"eval_loss": 1.1826122999191284,
"eval_runtime": 78.7312,
"eval_samples_per_second": 2.54,
"eval_steps_per_second": 0.318,
"step": 19
},
{
"epoch": 0.625,
"grad_norm": 0.49020841613371097,
"learning_rate": 2e-05,
"loss": 1.2249,
"step": 20
},
{
"epoch": 0.625,
"eval_loss": 1.1616511344909668,
"eval_runtime": 78.2736,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 0.319,
"step": 20
},
{
"epoch": 0.65625,
"grad_norm": 0.43031322695269714,
"learning_rate": 2e-05,
"loss": 1.1466,
"step": 21
},
{
"epoch": 0.65625,
"eval_loss": 1.1410629749298096,
"eval_runtime": 79.6432,
"eval_samples_per_second": 2.511,
"eval_steps_per_second": 0.314,
"step": 21
},
{
"epoch": 0.6875,
"grad_norm": 0.45632085445955545,
"learning_rate": 2e-05,
"loss": 1.1951,
"step": 22
},
{
"epoch": 0.6875,
"eval_loss": 1.1204684972763062,
"eval_runtime": 79.0609,
"eval_samples_per_second": 2.53,
"eval_steps_per_second": 0.316,
"step": 22
},
{
"epoch": 0.71875,
"grad_norm": 0.40048586945364495,
"learning_rate": 2e-05,
"loss": 1.1826,
"step": 23
},
{
"epoch": 0.71875,
"eval_loss": 1.1002545356750488,
"eval_runtime": 82.8578,
"eval_samples_per_second": 2.414,
"eval_steps_per_second": 0.302,
"step": 23
},
{
"epoch": 0.75,
"grad_norm": 0.3703033261027938,
"learning_rate": 2e-05,
"loss": 1.1543,
"step": 24
},
{
"epoch": 0.75,
"eval_loss": 1.0805977582931519,
"eval_runtime": 76.1407,
"eval_samples_per_second": 2.627,
"eval_steps_per_second": 0.328,
"step": 24
},
{
"epoch": 0.78125,
"grad_norm": 0.3986313105418924,
"learning_rate": 2e-05,
"loss": 1.1046,
"step": 25
},
{
"epoch": 0.78125,
"eval_loss": 1.0610157251358032,
"eval_runtime": 76.3083,
"eval_samples_per_second": 2.621,
"eval_steps_per_second": 0.328,
"step": 25
},
{
"epoch": 0.8125,
"grad_norm": 0.36265027203577943,
"learning_rate": 2e-05,
"loss": 1.1048,
"step": 26
},
{
"epoch": 0.8125,
"eval_loss": 1.0421289205551147,
"eval_runtime": 77.2186,
"eval_samples_per_second": 2.59,
"eval_steps_per_second": 0.324,
"step": 26
},
{
"epoch": 0.84375,
"grad_norm": 0.3881748990218768,
"learning_rate": 2e-05,
"loss": 1.0425,
"step": 27
},
{
"epoch": 0.84375,
"eval_loss": 1.0240073204040527,
"eval_runtime": 77.8662,
"eval_samples_per_second": 2.569,
"eval_steps_per_second": 0.321,
"step": 27
},
{
"epoch": 0.875,
"grad_norm": 0.3734031294324286,
"learning_rate": 2e-05,
"loss": 1.0484,
"step": 28
},
{
"epoch": 0.875,
"eval_loss": 1.0066957473754883,
"eval_runtime": 77.269,
"eval_samples_per_second": 2.588,
"eval_steps_per_second": 0.324,
"step": 28
},
{
"epoch": 0.90625,
"grad_norm": 0.29695383079342563,
"learning_rate": 2e-05,
"loss": 1.0387,
"step": 29
},
{
"epoch": 0.90625,
"eval_loss": 0.9906074404716492,
"eval_runtime": 77.2245,
"eval_samples_per_second": 2.59,
"eval_steps_per_second": 0.324,
"step": 29
},
{
"epoch": 0.9375,
"grad_norm": 0.29273146875026623,
"learning_rate": 2e-05,
"loss": 1.0568,
"step": 30
},
{
"epoch": 0.9375,
"eval_loss": 0.975755512714386,
"eval_runtime": 78.0056,
"eval_samples_per_second": 2.564,
"eval_steps_per_second": 0.32,
"step": 30
},
{
"epoch": 0.96875,
"grad_norm": 0.35070440686850546,
"learning_rate": 2e-05,
"loss": 0.9114,
"step": 31
},
{
"epoch": 0.96875,
"eval_loss": 0.9615123271942139,
"eval_runtime": 77.9051,
"eval_samples_per_second": 2.567,
"eval_steps_per_second": 0.321,
"step": 31
},
{
"epoch": 1.0,
"grad_norm": 0.30846157140439384,
"learning_rate": 2e-05,
"loss": 0.9941,
"step": 32
},
{
"epoch": 1.0,
"eval_loss": 0.9480571150779724,
"eval_runtime": 77.2322,
"eval_samples_per_second": 2.59,
"eval_steps_per_second": 0.324,
"step": 32
},
{
"epoch": 1.03125,
"grad_norm": 0.2950381371932973,
"learning_rate": 2e-05,
"loss": 1.0297,
"step": 33
},
{
"epoch": 1.03125,
"eval_loss": 0.9356330037117004,
"eval_runtime": 81.8443,
"eval_samples_per_second": 2.444,
"eval_steps_per_second": 0.305,
"step": 33
},
{
"epoch": 1.0625,
"grad_norm": 0.27080038065834283,
"learning_rate": 2e-05,
"loss": 1.021,
"step": 34
},
{
"epoch": 1.0625,
"eval_loss": 0.9245791435241699,
"eval_runtime": 76.2071,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 34
},
{
"epoch": 1.09375,
"grad_norm": 0.23165081252649894,
"learning_rate": 2e-05,
"loss": 1.0366,
"step": 35
},
{
"epoch": 1.09375,
"eval_loss": 0.9151126146316528,
"eval_runtime": 77.0412,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.325,
"step": 35
},
{
"epoch": 1.125,
"grad_norm": 0.4033780922500775,
"learning_rate": 2e-05,
"loss": 1.0127,
"step": 36
},
{
"epoch": 1.125,
"eval_loss": 0.9063960313796997,
"eval_runtime": 76.9327,
"eval_samples_per_second": 2.6,
"eval_steps_per_second": 0.325,
"step": 36
},
{
"epoch": 1.15625,
"grad_norm": 0.2398039831439168,
"learning_rate": 2e-05,
"loss": 0.9418,
"step": 37
},
{
"epoch": 1.15625,
"eval_loss": 0.8982363939285278,
"eval_runtime": 76.1234,
"eval_samples_per_second": 2.627,
"eval_steps_per_second": 0.328,
"step": 37
},
{
"epoch": 1.1875,
"grad_norm": 0.28793451241246804,
"learning_rate": 2e-05,
"loss": 0.9643,
"step": 38
},
{
"epoch": 1.1875,
"eval_loss": 0.8908895254135132,
"eval_runtime": 76.2877,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 38
},
{
"epoch": 1.21875,
"grad_norm": 0.2927691606307197,
"learning_rate": 2e-05,
"loss": 1.0087,
"step": 39
},
{
"epoch": 1.21875,
"eval_loss": 0.8845618367195129,
"eval_runtime": 76.2282,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 39
},
{
"epoch": 1.25,
"grad_norm": 0.26410982001408806,
"learning_rate": 2e-05,
"loss": 0.986,
"step": 40
},
{
"epoch": 1.25,
"eval_loss": 0.8784474730491638,
"eval_runtime": 76.2512,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 40
},
{
"epoch": 1.28125,
"grad_norm": 0.29182630949665306,
"learning_rate": 2e-05,
"loss": 0.9711,
"step": 41
},
{
"epoch": 1.28125,
"eval_loss": 0.8725223541259766,
"eval_runtime": 77.1229,
"eval_samples_per_second": 2.593,
"eval_steps_per_second": 0.324,
"step": 41
},
{
"epoch": 1.3125,
"grad_norm": 0.36402838796832665,
"learning_rate": 2e-05,
"loss": 0.9263,
"step": 42
},
{
"epoch": 1.3125,
"eval_loss": 0.8662790060043335,
"eval_runtime": 77.2362,
"eval_samples_per_second": 2.589,
"eval_steps_per_second": 0.324,
"step": 42
},
{
"epoch": 1.34375,
"grad_norm": 0.29338184478895163,
"learning_rate": 2e-05,
"loss": 0.8947,
"step": 43
},
{
"epoch": 1.34375,
"eval_loss": 0.8600431680679321,
"eval_runtime": 77.1213,
"eval_samples_per_second": 2.593,
"eval_steps_per_second": 0.324,
"step": 43
},
{
"epoch": 1.375,
"grad_norm": 0.2201714229702277,
"learning_rate": 2e-05,
"loss": 0.9059,
"step": 44
},
{
"epoch": 1.375,
"eval_loss": 0.8545799255371094,
"eval_runtime": 77.991,
"eval_samples_per_second": 2.564,
"eval_steps_per_second": 0.321,
"step": 44
},
{
"epoch": 1.40625,
"grad_norm": 0.2254966625243654,
"learning_rate": 2e-05,
"loss": 0.8942,
"step": 45
},
{
"epoch": 1.40625,
"eval_loss": 0.8497399687767029,
"eval_runtime": 77.2698,
"eval_samples_per_second": 2.588,
"eval_steps_per_second": 0.324,
"step": 45
},
{
"epoch": 1.4375,
"grad_norm": 0.21753318432075458,
"learning_rate": 2e-05,
"loss": 0.9376,
"step": 46
},
{
"epoch": 1.4375,
"eval_loss": 0.8452473282814026,
"eval_runtime": 77.0568,
"eval_samples_per_second": 2.595,
"eval_steps_per_second": 0.324,
"step": 46
},
{
"epoch": 1.46875,
"grad_norm": 0.21449718265972945,
"learning_rate": 2e-05,
"loss": 0.9369,
"step": 47
},
{
"epoch": 1.46875,
"eval_loss": 0.841134786605835,
"eval_runtime": 77.225,
"eval_samples_per_second": 2.59,
"eval_steps_per_second": 0.324,
"step": 47
},
{
"epoch": 1.5,
"grad_norm": 0.2109063266748924,
"learning_rate": 2e-05,
"loss": 0.8511,
"step": 48
},
{
"epoch": 1.5,
"eval_loss": 0.8373770117759705,
"eval_runtime": 76.2309,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 48
},
{
"epoch": 1.53125,
"grad_norm": 0.232838633689838,
"learning_rate": 2e-05,
"loss": 0.8694,
"step": 49
},
{
"epoch": 1.53125,
"eval_loss": 0.8338289856910706,
"eval_runtime": 76.277,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 49
},
{
"epoch": 1.5625,
"grad_norm": 0.4189704940803984,
"learning_rate": 2e-05,
"loss": 0.8464,
"step": 50
},
{
"epoch": 1.5625,
"eval_loss": 0.8297132849693298,
"eval_runtime": 76.2872,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 50
},
{
"epoch": 1.59375,
"grad_norm": 0.2171618165123276,
"learning_rate": 2e-05,
"loss": 0.8785,
"step": 51
},
{
"epoch": 1.59375,
"eval_loss": 0.8257431983947754,
"eval_runtime": 76.2639,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 51
},
{
"epoch": 1.625,
"grad_norm": 0.21934651037670305,
"learning_rate": 2e-05,
"loss": 0.7645,
"step": 52
},
{
"epoch": 1.625,
"eval_loss": 0.8223557472229004,
"eval_runtime": 76.2383,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 52
},
{
"epoch": 1.65625,
"grad_norm": 0.24183530733164746,
"learning_rate": 2e-05,
"loss": 0.9218,
"step": 53
},
{
"epoch": 1.65625,
"eval_loss": 0.8189653158187866,
"eval_runtime": 76.9819,
"eval_samples_per_second": 2.598,
"eval_steps_per_second": 0.325,
"step": 53
},
{
"epoch": 1.6875,
"grad_norm": 0.23450930244279267,
"learning_rate": 2e-05,
"loss": 0.8896,
"step": 54
},
{
"epoch": 1.6875,
"eval_loss": 0.8152530193328857,
"eval_runtime": 76.2378,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 54
},
{
"epoch": 1.71875,
"grad_norm": 0.22081665899796085,
"learning_rate": 2e-05,
"loss": 0.8798,
"step": 55
},
{
"epoch": 1.71875,
"eval_loss": 0.8122122287750244,
"eval_runtime": 76.289,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 55
},
{
"epoch": 1.75,
"grad_norm": 0.21311746114111046,
"learning_rate": 2e-05,
"loss": 0.9482,
"step": 56
},
{
"epoch": 1.75,
"eval_loss": 0.8092318773269653,
"eval_runtime": 77.8321,
"eval_samples_per_second": 2.57,
"eval_steps_per_second": 0.321,
"step": 56
},
{
"epoch": 1.78125,
"grad_norm": 0.2496565307107556,
"learning_rate": 2e-05,
"loss": 0.8917,
"step": 57
},
{
"epoch": 1.78125,
"eval_loss": 0.8070546984672546,
"eval_runtime": 77.2651,
"eval_samples_per_second": 2.588,
"eval_steps_per_second": 0.324,
"step": 57
},
{
"epoch": 1.8125,
"grad_norm": 0.2137866456424736,
"learning_rate": 2e-05,
"loss": 0.909,
"step": 58
},
{
"epoch": 1.8125,
"eval_loss": 0.8049566745758057,
"eval_runtime": 78.0925,
"eval_samples_per_second": 2.561,
"eval_steps_per_second": 0.32,
"step": 58
},
{
"epoch": 1.84375,
"grad_norm": 0.22567502859345095,
"learning_rate": 2e-05,
"loss": 0.8611,
"step": 59
},
{
"epoch": 1.84375,
"eval_loss": 0.8028810024261475,
"eval_runtime": 78.0553,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 59
},
{
"epoch": 1.875,
"grad_norm": 0.23303796552302508,
"learning_rate": 2e-05,
"loss": 0.9209,
"step": 60
},
{
"epoch": 1.875,
"eval_loss": 0.800568699836731,
"eval_runtime": 78.052,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 60
},
{
"epoch": 1.90625,
"grad_norm": 0.24566727726974544,
"learning_rate": 2e-05,
"loss": 0.8239,
"step": 61
},
{
"epoch": 1.90625,
"eval_loss": 0.7976545691490173,
"eval_runtime": 77.3056,
"eval_samples_per_second": 2.587,
"eval_steps_per_second": 0.323,
"step": 61
},
{
"epoch": 1.9375,
"grad_norm": 0.23014192522354907,
"learning_rate": 2e-05,
"loss": 0.8814,
"step": 62
},
{
"epoch": 1.9375,
"eval_loss": 0.7945474982261658,
"eval_runtime": 77.3398,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 0.323,
"step": 62
},
{
"epoch": 1.96875,
"grad_norm": 0.23042819102671622,
"learning_rate": 2e-05,
"loss": 0.9064,
"step": 63
},
{
"epoch": 1.96875,
"eval_loss": 0.7918359637260437,
"eval_runtime": 77.4272,
"eval_samples_per_second": 2.583,
"eval_steps_per_second": 0.323,
"step": 63
},
{
"epoch": 2.0,
"grad_norm": 0.23940667173206315,
"learning_rate": 2e-05,
"loss": 0.8658,
"step": 64
},
{
"epoch": 2.0,
"eval_loss": 0.7891160845756531,
"eval_runtime": 77.3236,
"eval_samples_per_second": 2.587,
"eval_steps_per_second": 0.323,
"step": 64
},
{
"epoch": 2.03125,
"grad_norm": 0.22630342930143643,
"learning_rate": 2e-05,
"loss": 0.8403,
"step": 65
},
{
"epoch": 2.03125,
"eval_loss": 0.7859742641448975,
"eval_runtime": 77.2001,
"eval_samples_per_second": 2.591,
"eval_steps_per_second": 0.324,
"step": 65
},
{
"epoch": 2.0625,
"grad_norm": 0.20949240460260976,
"learning_rate": 2e-05,
"loss": 0.8472,
"step": 66
},
{
"epoch": 2.0625,
"eval_loss": 0.7834083437919617,
"eval_runtime": 78.9646,
"eval_samples_per_second": 2.533,
"eval_steps_per_second": 0.317,
"step": 66
},
{
"epoch": 2.09375,
"grad_norm": 0.22714400479820654,
"learning_rate": 2e-05,
"loss": 0.841,
"step": 67
},
{
"epoch": 2.09375,
"eval_loss": 0.7805308699607849,
"eval_runtime": 78.7552,
"eval_samples_per_second": 2.54,
"eval_steps_per_second": 0.317,
"step": 67
},
{
"epoch": 2.125,
"grad_norm": 0.23345123077006047,
"learning_rate": 2e-05,
"loss": 0.9028,
"step": 68
},
{
"epoch": 2.125,
"eval_loss": 0.7779514789581299,
"eval_runtime": 78.3387,
"eval_samples_per_second": 2.553,
"eval_steps_per_second": 0.319,
"step": 68
},
{
"epoch": 2.15625,
"grad_norm": 0.251841542575211,
"learning_rate": 2e-05,
"loss": 0.8381,
"step": 69
},
{
"epoch": 2.15625,
"eval_loss": 0.7756664752960205,
"eval_runtime": 78.3109,
"eval_samples_per_second": 2.554,
"eval_steps_per_second": 0.319,
"step": 69
},
{
"epoch": 2.1875,
"grad_norm": 0.23548386839773608,
"learning_rate": 2e-05,
"loss": 0.7914,
"step": 70
},
{
"epoch": 2.1875,
"eval_loss": 0.7733604907989502,
"eval_runtime": 78.9712,
"eval_samples_per_second": 2.533,
"eval_steps_per_second": 0.317,
"step": 70
},
{
"epoch": 2.21875,
"grad_norm": 0.23262740912668387,
"learning_rate": 2e-05,
"loss": 0.8778,
"step": 71
},
{
"epoch": 2.21875,
"eval_loss": 0.771755576133728,
"eval_runtime": 78.2633,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 0.319,
"step": 71
},
{
"epoch": 2.25,
"grad_norm": 0.22075289612357513,
"learning_rate": 2e-05,
"loss": 0.7945,
"step": 72
},
{
"epoch": 2.25,
"eval_loss": 0.7705450654029846,
"eval_runtime": 78.3151,
"eval_samples_per_second": 2.554,
"eval_steps_per_second": 0.319,
"step": 72
},
{
"epoch": 2.28125,
"grad_norm": 0.25520381955936466,
"learning_rate": 2e-05,
"loss": 0.8387,
"step": 73
},
{
"epoch": 2.28125,
"eval_loss": 0.7695029973983765,
"eval_runtime": 78.2901,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 0.319,
"step": 73
},
{
"epoch": 2.3125,
"grad_norm": 0.2047305385827267,
"learning_rate": 2e-05,
"loss": 0.8404,
"step": 74
},
{
"epoch": 2.3125,
"eval_loss": 0.7684457302093506,
"eval_runtime": 78.3875,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 0.319,
"step": 74
},
{
"epoch": 2.34375,
"grad_norm": 0.2262323045133288,
"learning_rate": 2e-05,
"loss": 0.8811,
"step": 75
},
{
"epoch": 2.34375,
"eval_loss": 0.7671162486076355,
"eval_runtime": 78.202,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 75
},
{
"epoch": 2.375,
"grad_norm": 0.21885464923925876,
"learning_rate": 2e-05,
"loss": 0.7942,
"step": 76
},
{
"epoch": 2.375,
"eval_loss": 0.7658494710922241,
"eval_runtime": 78.1746,
"eval_samples_per_second": 2.558,
"eval_steps_per_second": 0.32,
"step": 76
},
{
"epoch": 2.40625,
"grad_norm": 0.21717306953626966,
"learning_rate": 2e-05,
"loss": 0.8497,
"step": 77
},
{
"epoch": 2.40625,
"eval_loss": 0.7642120122909546,
"eval_runtime": 78.2026,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 77
},
{
"epoch": 2.4375,
"grad_norm": 0.2530725583748258,
"learning_rate": 2e-05,
"loss": 0.8584,
"step": 78
},
{
"epoch": 2.4375,
"eval_loss": 0.7625510692596436,
"eval_runtime": 78.1991,
"eval_samples_per_second": 2.558,
"eval_steps_per_second": 0.32,
"step": 78
},
{
"epoch": 2.46875,
"grad_norm": 0.25354787036627263,
"learning_rate": 2e-05,
"loss": 0.8569,
"step": 79
},
{
"epoch": 2.46875,
"eval_loss": 0.7616268396377563,
"eval_runtime": 78.2915,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 0.319,
"step": 79
},
{
"epoch": 2.5,
"grad_norm": 0.2800865746664007,
"learning_rate": 2e-05,
"loss": 0.9116,
"step": 80
},
{
"epoch": 2.5,
"eval_loss": 0.7603214979171753,
"eval_runtime": 78.2749,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 0.319,
"step": 80
},
{
"epoch": 2.53125,
"grad_norm": 0.268139688449618,
"learning_rate": 2e-05,
"loss": 0.8397,
"step": 81
},
{
"epoch": 2.53125,
"eval_loss": 0.7584869265556335,
"eval_runtime": 79.1445,
"eval_samples_per_second": 2.527,
"eval_steps_per_second": 0.316,
"step": 81
},
{
"epoch": 2.5625,
"grad_norm": 0.3128648654463789,
"learning_rate": 2e-05,
"loss": 0.8888,
"step": 82
},
{
"epoch": 2.5625,
"eval_loss": 0.7566561102867126,
"eval_runtime": 79.2089,
"eval_samples_per_second": 2.525,
"eval_steps_per_second": 0.316,
"step": 82
},
{
"epoch": 2.59375,
"grad_norm": 0.2502355211215609,
"learning_rate": 2e-05,
"loss": 0.8346,
"step": 83
},
{
"epoch": 2.59375,
"eval_loss": 0.7547345161437988,
"eval_runtime": 79.2691,
"eval_samples_per_second": 2.523,
"eval_steps_per_second": 0.315,
"step": 83
},
{
"epoch": 2.625,
"grad_norm": 0.25281184629018644,
"learning_rate": 2e-05,
"loss": 0.795,
"step": 84
},
{
"epoch": 2.625,
"eval_loss": 0.7527951598167419,
"eval_runtime": 79.4068,
"eval_samples_per_second": 2.519,
"eval_steps_per_second": 0.315,
"step": 84
},
{
"epoch": 2.65625,
"grad_norm": 0.24246729562645003,
"learning_rate": 2e-05,
"loss": 0.7649,
"step": 85
},
{
"epoch": 2.65625,
"eval_loss": 0.7509815096855164,
"eval_runtime": 79.1612,
"eval_samples_per_second": 2.526,
"eval_steps_per_second": 0.316,
"step": 85
},
{
"epoch": 2.6875,
"grad_norm": 0.27005475109453947,
"learning_rate": 2e-05,
"loss": 0.7964,
"step": 86
},
{
"epoch": 2.6875,
"eval_loss": 0.7485950589179993,
"eval_runtime": 80.0714,
"eval_samples_per_second": 2.498,
"eval_steps_per_second": 0.312,
"step": 86
},
{
"epoch": 2.71875,
"grad_norm": 0.2723492355800971,
"learning_rate": 2e-05,
"loss": 0.8117,
"step": 87
},
{
"epoch": 2.71875,
"eval_loss": 0.7459420561790466,
"eval_runtime": 79.4075,
"eval_samples_per_second": 2.519,
"eval_steps_per_second": 0.315,
"step": 87
},
{
"epoch": 2.75,
"grad_norm": 0.2946493898427159,
"learning_rate": 2e-05,
"loss": 0.8986,
"step": 88
},
{
"epoch": 2.75,
"eval_loss": 0.7436455488204956,
"eval_runtime": 79.3721,
"eval_samples_per_second": 2.52,
"eval_steps_per_second": 0.315,
"step": 88
},
{
"epoch": 2.78125,
"grad_norm": 0.26411214734213284,
"learning_rate": 2e-05,
"loss": 0.8145,
"step": 89
},
{
"epoch": 2.78125,
"eval_loss": 0.7424752712249756,
"eval_runtime": 79.2988,
"eval_samples_per_second": 2.522,
"eval_steps_per_second": 0.315,
"step": 89
},
{
"epoch": 2.8125,
"grad_norm": 0.27115747269014817,
"learning_rate": 2e-05,
"loss": 0.8457,
"step": 90
},
{
"epoch": 2.8125,
"eval_loss": 0.7416408658027649,
"eval_runtime": 79.4004,
"eval_samples_per_second": 2.519,
"eval_steps_per_second": 0.315,
"step": 90
},
{
"epoch": 2.84375,
"grad_norm": 0.25831877964821937,
"learning_rate": 2e-05,
"loss": 0.7568,
"step": 91
},
{
"epoch": 2.84375,
"eval_loss": 0.7404463291168213,
"eval_runtime": 81.7767,
"eval_samples_per_second": 2.446,
"eval_steps_per_second": 0.306,
"step": 91
},
{
"epoch": 2.875,
"grad_norm": 0.31273388454942935,
"learning_rate": 2e-05,
"loss": 0.8562,
"step": 92
},
{
"epoch": 2.875,
"eval_loss": 0.7384185791015625,
"eval_runtime": 82.3443,
"eval_samples_per_second": 2.429,
"eval_steps_per_second": 0.304,
"step": 92
},
{
"epoch": 2.90625,
"grad_norm": 0.2838267071008901,
"learning_rate": 2e-05,
"loss": 0.7869,
"step": 93
},
{
"epoch": 2.90625,
"eval_loss": 0.7366807460784912,
"eval_runtime": 82.2622,
"eval_samples_per_second": 2.431,
"eval_steps_per_second": 0.304,
"step": 93
},
{
"epoch": 2.9375,
"grad_norm": 0.28625827941831467,
"learning_rate": 2e-05,
"loss": 0.8618,
"step": 94
},
{
"epoch": 2.9375,
"eval_loss": 0.7357398867607117,
"eval_runtime": 81.9471,
"eval_samples_per_second": 2.441,
"eval_steps_per_second": 0.305,
"step": 94
},
{
"epoch": 2.96875,
"grad_norm": 0.25548002643954326,
"learning_rate": 2e-05,
"loss": 0.8085,
"step": 95
},
{
"epoch": 2.96875,
"eval_loss": 0.7356534004211426,
"eval_runtime": 82.1186,
"eval_samples_per_second": 2.436,
"eval_steps_per_second": 0.304,
"step": 95
},
{
"epoch": 3.0,
"grad_norm": 0.27081450830961107,
"learning_rate": 2e-05,
"loss": 0.7684,
"step": 96
},
{
"epoch": 3.0,
"eval_loss": 0.7346957921981812,
"eval_runtime": 81.5463,
"eval_samples_per_second": 2.453,
"eval_steps_per_second": 0.307,
"step": 96
},
{
"epoch": 3.03125,
"grad_norm": 0.2985486737236676,
"learning_rate": 2e-05,
"loss": 0.7274,
"step": 97
},
{
"epoch": 3.03125,
"eval_loss": 0.7325752377510071,
"eval_runtime": 81.7804,
"eval_samples_per_second": 2.446,
"eval_steps_per_second": 0.306,
"step": 97
},
{
"epoch": 3.0625,
"grad_norm": 0.29149719690624026,
"learning_rate": 2e-05,
"loss": 0.8119,
"step": 98
},
{
"epoch": 3.0625,
"eval_loss": 0.7298976182937622,
"eval_runtime": 76.2764,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 98
},
{
"epoch": 3.09375,
"grad_norm": 0.25227859825215865,
"learning_rate": 2e-05,
"loss": 0.7888,
"step": 99
},
{
"epoch": 3.09375,
"eval_loss": 0.727373480796814,
"eval_runtime": 76.2418,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 99
},
{
"epoch": 3.125,
"grad_norm": 0.27316954971752555,
"learning_rate": 2e-05,
"loss": 0.8224,
"step": 100
},
{
"epoch": 3.125,
"eval_loss": 0.7254325747489929,
"eval_runtime": 76.1474,
"eval_samples_per_second": 2.626,
"eval_steps_per_second": 0.328,
"step": 100
},
{
"epoch": 3.15625,
"grad_norm": 0.24239788607957785,
"learning_rate": 2e-05,
"loss": 0.7535,
"step": 101
},
{
"epoch": 3.15625,
"eval_loss": 0.724058985710144,
"eval_runtime": 76.2391,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 101
},
{
"epoch": 3.1875,
"grad_norm": 0.25648385925427025,
"learning_rate": 2e-05,
"loss": 0.8195,
"step": 102
},
{
"epoch": 3.1875,
"eval_loss": 0.7235870957374573,
"eval_runtime": 76.9134,
"eval_samples_per_second": 2.6,
"eval_steps_per_second": 0.325,
"step": 102
},
{
"epoch": 3.21875,
"grad_norm": 0.29620170789161204,
"learning_rate": 2e-05,
"loss": 0.8224,
"step": 103
},
{
"epoch": 3.21875,
"eval_loss": 0.7228152751922607,
"eval_runtime": 76.095,
"eval_samples_per_second": 2.628,
"eval_steps_per_second": 0.329,
"step": 103
},
{
"epoch": 3.25,
"grad_norm": 0.3484116181139593,
"learning_rate": 2e-05,
"loss": 0.7478,
"step": 104
},
{
"epoch": 3.25,
"eval_loss": 0.7209363579750061,
"eval_runtime": 76.9377,
"eval_samples_per_second": 2.6,
"eval_steps_per_second": 0.325,
"step": 104
},
{
"epoch": 3.28125,
"grad_norm": 0.25212350156184643,
"learning_rate": 2e-05,
"loss": 0.7885,
"step": 105
},
{
"epoch": 3.28125,
"eval_loss": 0.7197096347808838,
"eval_runtime": 76.2008,
"eval_samples_per_second": 2.625,
"eval_steps_per_second": 0.328,
"step": 105
},
{
"epoch": 3.3125,
"grad_norm": 0.264200147608962,
"learning_rate": 2e-05,
"loss": 0.8371,
"step": 106
},
{
"epoch": 3.3125,
"eval_loss": 0.7197055220603943,
"eval_runtime": 78.1542,
"eval_samples_per_second": 2.559,
"eval_steps_per_second": 0.32,
"step": 106
},
{
"epoch": 3.34375,
"grad_norm": 0.3309431084940201,
"learning_rate": 2e-05,
"loss": 0.6999,
"step": 107
},
{
"epoch": 3.34375,
"eval_loss": 0.7187016010284424,
"eval_runtime": 78.4259,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 107
},
{
"epoch": 3.375,
"grad_norm": 0.3131644456919823,
"learning_rate": 2e-05,
"loss": 0.7587,
"step": 108
},
{
"epoch": 3.375,
"eval_loss": 0.717018187046051,
"eval_runtime": 78.4558,
"eval_samples_per_second": 2.549,
"eval_steps_per_second": 0.319,
"step": 108
},
{
"epoch": 3.40625,
"grad_norm": 0.33527684120780293,
"learning_rate": 2e-05,
"loss": 0.7468,
"step": 109
},
{
"epoch": 3.40625,
"eval_loss": 0.7147062420845032,
"eval_runtime": 78.2334,
"eval_samples_per_second": 2.556,
"eval_steps_per_second": 0.32,
"step": 109
},
{
"epoch": 3.4375,
"grad_norm": 0.29542683956231724,
"learning_rate": 2e-05,
"loss": 0.7477,
"step": 110
},
{
"epoch": 3.4375,
"eval_loss": 0.7130224704742432,
"eval_runtime": 79.1179,
"eval_samples_per_second": 2.528,
"eval_steps_per_second": 0.316,
"step": 110
},
{
"epoch": 3.46875,
"grad_norm": 0.31128698002926114,
"learning_rate": 2e-05,
"loss": 0.8153,
"step": 111
},
{
"epoch": 3.46875,
"eval_loss": 0.7120551466941833,
"eval_runtime": 80.292,
"eval_samples_per_second": 2.491,
"eval_steps_per_second": 0.311,
"step": 111
},
{
"epoch": 3.5,
"grad_norm": 0.32502558864214215,
"learning_rate": 2e-05,
"loss": 0.8043,
"step": 112
},
{
"epoch": 3.5,
"eval_loss": 0.7117202877998352,
"eval_runtime": 79.7539,
"eval_samples_per_second": 2.508,
"eval_steps_per_second": 0.313,
"step": 112
},
{
"epoch": 3.53125,
"grad_norm": 0.34335720855758517,
"learning_rate": 2e-05,
"loss": 0.871,
"step": 113
},
{
"epoch": 3.53125,
"eval_loss": 0.7117029428482056,
"eval_runtime": 80.0281,
"eval_samples_per_second": 2.499,
"eval_steps_per_second": 0.312,
"step": 113
},
{
"epoch": 3.5625,
"grad_norm": 0.31951931695644,
"learning_rate": 2e-05,
"loss": 0.7453,
"step": 114
},
{
"epoch": 3.5625,
"eval_loss": 0.7116554379463196,
"eval_runtime": 79.7209,
"eval_samples_per_second": 2.509,
"eval_steps_per_second": 0.314,
"step": 114
},
{
"epoch": 3.59375,
"grad_norm": 0.28067192963874266,
"learning_rate": 2e-05,
"loss": 0.8045,
"step": 115
},
{
"epoch": 3.59375,
"eval_loss": 0.7118353843688965,
"eval_runtime": 80.0195,
"eval_samples_per_second": 2.499,
"eval_steps_per_second": 0.312,
"step": 115
},
{
"epoch": 3.625,
"grad_norm": 0.2739718257400276,
"learning_rate": 2e-05,
"loss": 0.775,
"step": 116
},
{
"epoch": 3.625,
"eval_loss": 0.7122579216957092,
"eval_runtime": 76.2052,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 116
},
{
"epoch": 3.65625,
"grad_norm": 0.31401723658881836,
"learning_rate": 2e-05,
"loss": 0.7826,
"step": 117
},
{
"epoch": 3.65625,
"eval_loss": 0.7118574380874634,
"eval_runtime": 76.1509,
"eval_samples_per_second": 2.626,
"eval_steps_per_second": 0.328,
"step": 117
},
{
"epoch": 3.6875,
"grad_norm": 0.36925964858634625,
"learning_rate": 2e-05,
"loss": 0.7884,
"step": 118
},
{
"epoch": 3.6875,
"eval_loss": 0.710691511631012,
"eval_runtime": 76.2305,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 118
},
{
"epoch": 3.71875,
"grad_norm": 0.3050583880654791,
"learning_rate": 2e-05,
"loss": 0.8402,
"step": 119
},
{
"epoch": 3.71875,
"eval_loss": 0.7096763849258423,
"eval_runtime": 77.0581,
"eval_samples_per_second": 2.595,
"eval_steps_per_second": 0.324,
"step": 119
},
{
"epoch": 3.75,
"grad_norm": 0.2648625651290031,
"learning_rate": 2e-05,
"loss": 0.7889,
"step": 120
},
{
"epoch": 3.75,
"eval_loss": 0.7094223499298096,
"eval_runtime": 76.1379,
"eval_samples_per_second": 2.627,
"eval_steps_per_second": 0.328,
"step": 120
},
{
"epoch": 3.78125,
"grad_norm": 0.3107221696449271,
"learning_rate": 2e-05,
"loss": 0.7615,
"step": 121
},
{
"epoch": 3.78125,
"eval_loss": 0.7081363201141357,
"eval_runtime": 76.626,
"eval_samples_per_second": 2.61,
"eval_steps_per_second": 0.326,
"step": 121
},
{
"epoch": 3.8125,
"grad_norm": 0.3455151299995048,
"learning_rate": 2e-05,
"loss": 0.8342,
"step": 122
},
{
"epoch": 3.8125,
"eval_loss": 0.7063001990318298,
"eval_runtime": 77.0293,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.325,
"step": 122
},
{
"epoch": 3.84375,
"grad_norm": 0.28847071926472523,
"learning_rate": 2e-05,
"loss": 0.7477,
"step": 123
},
{
"epoch": 3.84375,
"eval_loss": 0.7044610381126404,
"eval_runtime": 76.2385,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 123
},
{
"epoch": 3.875,
"grad_norm": 0.26753816515069856,
"learning_rate": 2e-05,
"loss": 0.7653,
"step": 124
},
{
"epoch": 3.875,
"eval_loss": 0.7033799886703491,
"eval_runtime": 76.1985,
"eval_samples_per_second": 2.625,
"eval_steps_per_second": 0.328,
"step": 124
},
{
"epoch": 3.90625,
"grad_norm": 0.3465046292893005,
"learning_rate": 2e-05,
"loss": 0.8144,
"step": 125
},
{
"epoch": 3.90625,
"eval_loss": 0.7021930813789368,
"eval_runtime": 76.2234,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 125
},
{
"epoch": 3.9375,
"grad_norm": 0.3451690427620698,
"learning_rate": 2e-05,
"loss": 0.7871,
"step": 126
},
{
"epoch": 3.9375,
"eval_loss": 0.7013542652130127,
"eval_runtime": 78.0752,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 126
},
{
"epoch": 3.96875,
"grad_norm": 0.31571858642673567,
"learning_rate": 2e-05,
"loss": 0.7568,
"step": 127
},
{
"epoch": 3.96875,
"eval_loss": 0.7007560729980469,
"eval_runtime": 78.3558,
"eval_samples_per_second": 2.552,
"eval_steps_per_second": 0.319,
"step": 127
},
{
"epoch": 4.0,
"grad_norm": 0.3247003540270338,
"learning_rate": 2e-05,
"loss": 0.6714,
"step": 128
},
{
"epoch": 4.0,
"eval_loss": 0.6999780535697937,
"eval_runtime": 78.9788,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.317,
"step": 128
},
{
"epoch": 4.03125,
"grad_norm": 0.2814983490019739,
"learning_rate": 2e-05,
"loss": 0.7797,
"step": 129
},
{
"epoch": 4.03125,
"eval_loss": 0.6998200416564941,
"eval_runtime": 78.3093,
"eval_samples_per_second": 2.554,
"eval_steps_per_second": 0.319,
"step": 129
},
{
"epoch": 4.0625,
"grad_norm": 0.31961631715145106,
"learning_rate": 2e-05,
"loss": 0.7993,
"step": 130
},
{
"epoch": 4.0625,
"eval_loss": 0.6995271444320679,
"eval_runtime": 78.2172,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 130
},
{
"epoch": 4.09375,
"grad_norm": 0.32333364662215863,
"learning_rate": 2e-05,
"loss": 0.7896,
"step": 131
},
{
"epoch": 4.09375,
"eval_loss": 0.6992727518081665,
"eval_runtime": 79.0125,
"eval_samples_per_second": 2.531,
"eval_steps_per_second": 0.316,
"step": 131
},
{
"epoch": 4.125,
"grad_norm": 0.3255859640449829,
"learning_rate": 2e-05,
"loss": 0.7542,
"step": 132
},
{
"epoch": 4.125,
"eval_loss": 0.6988572478294373,
"eval_runtime": 79.0,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.316,
"step": 132
},
{
"epoch": 4.15625,
"grad_norm": 0.3307068947429175,
"learning_rate": 2e-05,
"loss": 0.8416,
"step": 133
},
{
"epoch": 4.15625,
"eval_loss": 0.6981343030929565,
"eval_runtime": 78.3309,
"eval_samples_per_second": 2.553,
"eval_steps_per_second": 0.319,
"step": 133
},
{
"epoch": 4.1875,
"grad_norm": 0.3842303818116732,
"learning_rate": 2e-05,
"loss": 0.7605,
"step": 134
},
{
"epoch": 4.1875,
"eval_loss": 0.6968980431556702,
"eval_runtime": 78.5608,
"eval_samples_per_second": 2.546,
"eval_steps_per_second": 0.318,
"step": 134
},
{
"epoch": 4.21875,
"grad_norm": 0.331839472419003,
"learning_rate": 2e-05,
"loss": 0.7643,
"step": 135
},
{
"epoch": 4.21875,
"eval_loss": 0.6955949664115906,
"eval_runtime": 78.3566,
"eval_samples_per_second": 2.552,
"eval_steps_per_second": 0.319,
"step": 135
},
{
"epoch": 4.25,
"grad_norm": 0.31864813130499836,
"learning_rate": 2e-05,
"loss": 0.7369,
"step": 136
},
{
"epoch": 4.25,
"eval_loss": 0.6951528787612915,
"eval_runtime": 79.7802,
"eval_samples_per_second": 2.507,
"eval_steps_per_second": 0.313,
"step": 136
},
{
"epoch": 4.28125,
"grad_norm": 0.352549164434451,
"learning_rate": 2e-05,
"loss": 0.7332,
"step": 137
},
{
"epoch": 4.28125,
"eval_loss": 0.6947290897369385,
"eval_runtime": 79.8171,
"eval_samples_per_second": 2.506,
"eval_steps_per_second": 0.313,
"step": 137
},
{
"epoch": 4.3125,
"grad_norm": 0.37128812818896284,
"learning_rate": 2e-05,
"loss": 0.7542,
"step": 138
},
{
"epoch": 4.3125,
"eval_loss": 0.6937370300292969,
"eval_runtime": 79.7782,
"eval_samples_per_second": 2.507,
"eval_steps_per_second": 0.313,
"step": 138
},
{
"epoch": 4.34375,
"grad_norm": 0.3348014941412048,
"learning_rate": 2e-05,
"loss": 0.7079,
"step": 139
},
{
"epoch": 4.34375,
"eval_loss": 0.692456066608429,
"eval_runtime": 79.9308,
"eval_samples_per_second": 2.502,
"eval_steps_per_second": 0.313,
"step": 139
},
{
"epoch": 4.375,
"grad_norm": 0.34411051658527964,
"learning_rate": 2e-05,
"loss": 0.7465,
"step": 140
},
{
"epoch": 4.375,
"eval_loss": 0.6915809512138367,
"eval_runtime": 79.943,
"eval_samples_per_second": 2.502,
"eval_steps_per_second": 0.313,
"step": 140
},
{
"epoch": 4.40625,
"grad_norm": 0.3373909601921749,
"learning_rate": 2e-05,
"loss": 0.7648,
"step": 141
},
{
"epoch": 4.40625,
"eval_loss": 0.6912103295326233,
"eval_runtime": 79.8515,
"eval_samples_per_second": 2.505,
"eval_steps_per_second": 0.313,
"step": 141
},
{
"epoch": 4.4375,
"grad_norm": 0.33253827371305456,
"learning_rate": 2e-05,
"loss": 0.7224,
"step": 142
},
{
"epoch": 4.4375,
"eval_loss": 0.6912806630134583,
"eval_runtime": 80.6475,
"eval_samples_per_second": 2.48,
"eval_steps_per_second": 0.31,
"step": 142
},
{
"epoch": 4.46875,
"grad_norm": 0.38458075172588313,
"learning_rate": 2e-05,
"loss": 0.7261,
"step": 143
},
{
"epoch": 4.46875,
"eval_loss": 0.6905419230461121,
"eval_runtime": 80.2606,
"eval_samples_per_second": 2.492,
"eval_steps_per_second": 0.311,
"step": 143
},
{
"epoch": 4.5,
"grad_norm": 0.31351962640463144,
"learning_rate": 2e-05,
"loss": 0.6909,
"step": 144
},
{
"epoch": 4.5,
"eval_loss": 0.6898491382598877,
"eval_runtime": 79.9965,
"eval_samples_per_second": 2.5,
"eval_steps_per_second": 0.313,
"step": 144
},
{
"epoch": 4.53125,
"grad_norm": 0.35474372115704583,
"learning_rate": 2e-05,
"loss": 0.7605,
"step": 145
},
{
"epoch": 4.53125,
"eval_loss": 0.6893147230148315,
"eval_runtime": 1475.5758,
"eval_samples_per_second": 0.136,
"eval_steps_per_second": 0.017,
"step": 145
},
{
"epoch": 4.5625,
"grad_norm": 0.3479568917421202,
"learning_rate": 2e-05,
"loss": 0.6638,
"step": 146
},
{
"epoch": 4.5625,
"eval_loss": 0.6884538531303406,
"eval_runtime": 84.6835,
"eval_samples_per_second": 2.362,
"eval_steps_per_second": 0.295,
"step": 146
},
{
"epoch": 4.59375,
"grad_norm": 0.3421823344428645,
"learning_rate": 2e-05,
"loss": 0.7339,
"step": 147
},
{
"epoch": 4.59375,
"eval_loss": 0.6873475909233093,
"eval_runtime": 83.3138,
"eval_samples_per_second": 2.401,
"eval_steps_per_second": 0.3,
"step": 147
},
{
"epoch": 4.625,
"grad_norm": 0.3642187020830788,
"learning_rate": 2e-05,
"loss": 0.6825,
"step": 148
},
{
"epoch": 4.625,
"eval_loss": 0.6858401298522949,
"eval_runtime": 82.1066,
"eval_samples_per_second": 2.436,
"eval_steps_per_second": 0.304,
"step": 148
},
{
"epoch": 4.65625,
"grad_norm": 0.35097547901391785,
"learning_rate": 2e-05,
"loss": 0.7986,
"step": 149
},
{
"epoch": 4.65625,
"eval_loss": 0.6848779320716858,
"eval_runtime": 84.4076,
"eval_samples_per_second": 2.369,
"eval_steps_per_second": 0.296,
"step": 149
},
{
"epoch": 4.6875,
"grad_norm": 0.3568694843794629,
"learning_rate": 2e-05,
"loss": 0.7176,
"step": 150
},
{
"epoch": 4.6875,
"eval_loss": 0.6842290759086609,
"eval_runtime": 82.5945,
"eval_samples_per_second": 2.421,
"eval_steps_per_second": 0.303,
"step": 150
},
{
"epoch": 4.71875,
"grad_norm": 0.34258633585260334,
"learning_rate": 2e-05,
"loss": 0.7363,
"step": 151
},
{
"epoch": 4.71875,
"eval_loss": 0.6838659048080444,
"eval_runtime": 85.9626,
"eval_samples_per_second": 2.327,
"eval_steps_per_second": 0.291,
"step": 151
},
{
"epoch": 4.75,
"grad_norm": 0.42319523894659655,
"learning_rate": 2e-05,
"loss": 0.7675,
"step": 152
},
{
"epoch": 4.75,
"eval_loss": 0.6830299496650696,
"eval_runtime": 85.7189,
"eval_samples_per_second": 2.333,
"eval_steps_per_second": 0.292,
"step": 152
},
{
"epoch": 4.78125,
"grad_norm": 0.3632195533127194,
"learning_rate": 2e-05,
"loss": 0.715,
"step": 153
},
{
"epoch": 4.78125,
"eval_loss": 0.6826379895210266,
"eval_runtime": 87.8244,
"eval_samples_per_second": 2.277,
"eval_steps_per_second": 0.285,
"step": 153
},
{
"epoch": 4.8125,
"grad_norm": 0.3738308004604413,
"learning_rate": 2e-05,
"loss": 0.7344,
"step": 154
},
{
"epoch": 4.8125,
"eval_loss": 0.6826817393302917,
"eval_runtime": 86.5822,
"eval_samples_per_second": 2.31,
"eval_steps_per_second": 0.289,
"step": 154
},
{
"epoch": 4.84375,
"grad_norm": 0.3618696330632776,
"learning_rate": 2e-05,
"loss": 0.6632,
"step": 155
},
{
"epoch": 4.84375,
"eval_loss": 0.6827967166900635,
"eval_runtime": 82.1829,
"eval_samples_per_second": 2.434,
"eval_steps_per_second": 0.304,
"step": 155
},
{
"epoch": 4.875,
"grad_norm": 0.38901912569992203,
"learning_rate": 2e-05,
"loss": 0.7788,
"step": 156
},
{
"epoch": 4.875,
"eval_loss": 0.6821711659431458,
"eval_runtime": 84.4511,
"eval_samples_per_second": 2.368,
"eval_steps_per_second": 0.296,
"step": 156
},
{
"epoch": 4.90625,
"grad_norm": 0.3516096507348829,
"learning_rate": 2e-05,
"loss": 0.7794,
"step": 157
},
{
"epoch": 4.90625,
"eval_loss": 0.6819837689399719,
"eval_runtime": 84.1594,
"eval_samples_per_second": 2.376,
"eval_steps_per_second": 0.297,
"step": 157
},
{
"epoch": 4.9375,
"grad_norm": 0.36066902463794986,
"learning_rate": 2e-05,
"loss": 0.7674,
"step": 158
},
{
"epoch": 4.9375,
"eval_loss": 0.6817716956138611,
"eval_runtime": 83.8929,
"eval_samples_per_second": 2.384,
"eval_steps_per_second": 0.298,
"step": 158
},
{
"epoch": 4.96875,
"grad_norm": 0.36641784926154175,
"learning_rate": 2e-05,
"loss": 0.7116,
"step": 159
},
{
"epoch": 4.96875,
"eval_loss": 0.6816902160644531,
"eval_runtime": 84.4431,
"eval_samples_per_second": 2.368,
"eval_steps_per_second": 0.296,
"step": 159
},
{
"epoch": 5.0,
"grad_norm": 0.4020716293225933,
"learning_rate": 2e-05,
"loss": 0.7142,
"step": 160
},
{
"epoch": 5.0,
"eval_loss": 0.6811469793319702,
"eval_runtime": 86.0681,
"eval_samples_per_second": 2.324,
"eval_steps_per_second": 0.29,
"step": 160
},
{
"epoch": 5.03125,
"grad_norm": 0.38360882669254054,
"learning_rate": 2e-05,
"loss": 0.6756,
"step": 161
},
{
"epoch": 5.03125,
"eval_loss": 0.6798409223556519,
"eval_runtime": 81.9903,
"eval_samples_per_second": 2.439,
"eval_steps_per_second": 0.305,
"step": 161
},
{
"epoch": 5.0625,
"grad_norm": 0.34966156213066135,
"learning_rate": 2e-05,
"loss": 0.827,
"step": 162
},
{
"epoch": 5.0625,
"eval_loss": 0.6788859367370605,
"eval_runtime": 76.1753,
"eval_samples_per_second": 2.626,
"eval_steps_per_second": 0.328,
"step": 162
},
{
"epoch": 5.09375,
"grad_norm": 0.41140842939901384,
"learning_rate": 2e-05,
"loss": 0.6409,
"step": 163
},
{
"epoch": 5.09375,
"eval_loss": 0.6787077188491821,
"eval_runtime": 76.2239,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 163
},
{
"epoch": 5.125,
"grad_norm": 0.4222084070163774,
"learning_rate": 2e-05,
"loss": 0.7774,
"step": 164
},
{
"epoch": 5.125,
"eval_loss": 0.6796822547912598,
"eval_runtime": 76.2141,
"eval_samples_per_second": 2.624,
"eval_steps_per_second": 0.328,
"step": 164
},
{
"epoch": 5.15625,
"grad_norm": 0.4644454724424921,
"learning_rate": 2e-05,
"loss": 0.6057,
"step": 165
},
{
"epoch": 5.15625,
"eval_loss": 0.6794346570968628,
"eval_runtime": 76.3216,
"eval_samples_per_second": 2.62,
"eval_steps_per_second": 0.328,
"step": 165
},
{
"epoch": 5.1875,
"grad_norm": 0.46128725263272996,
"learning_rate": 2e-05,
"loss": 0.7158,
"step": 166
},
{
"epoch": 5.1875,
"eval_loss": 0.6791612505912781,
"eval_runtime": 78.4909,
"eval_samples_per_second": 2.548,
"eval_steps_per_second": 0.319,
"step": 166
},
{
"epoch": 5.21875,
"grad_norm": 0.37300666872025545,
"learning_rate": 2e-05,
"loss": 0.7363,
"step": 167
},
{
"epoch": 5.21875,
"eval_loss": 0.6788016557693481,
"eval_runtime": 78.5697,
"eval_samples_per_second": 2.546,
"eval_steps_per_second": 0.318,
"step": 167
},
{
"epoch": 5.25,
"grad_norm": 0.41454648576180214,
"learning_rate": 2e-05,
"loss": 0.7759,
"step": 168
},
{
"epoch": 5.25,
"eval_loss": 0.6787048578262329,
"eval_runtime": 78.5317,
"eval_samples_per_second": 2.547,
"eval_steps_per_second": 0.318,
"step": 168
},
{
"epoch": 5.28125,
"grad_norm": 0.40724665091386236,
"learning_rate": 2e-05,
"loss": 0.6944,
"step": 169
},
{
"epoch": 5.28125,
"eval_loss": 0.679679811000824,
"eval_runtime": 78.6899,
"eval_samples_per_second": 2.542,
"eval_steps_per_second": 0.318,
"step": 169
},
{
"epoch": 5.3125,
"grad_norm": 0.3875110486208986,
"learning_rate": 2e-05,
"loss": 0.6634,
"step": 170
},
{
"epoch": 5.3125,
"eval_loss": 0.6819935441017151,
"eval_runtime": 78.3617,
"eval_samples_per_second": 2.552,
"eval_steps_per_second": 0.319,
"step": 170
},
{
"epoch": 5.34375,
"grad_norm": 0.47956532155617193,
"learning_rate": 2e-05,
"loss": 0.687,
"step": 171
},
{
"epoch": 5.34375,
"eval_loss": 0.6825206875801086,
"eval_runtime": 78.4435,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 171
},
{
"epoch": 5.375,
"grad_norm": 0.4599359590587781,
"learning_rate": 2e-05,
"loss": 0.7718,
"step": 172
},
{
"epoch": 5.375,
"eval_loss": 0.6816768050193787,
"eval_runtime": 78.3005,
"eval_samples_per_second": 2.554,
"eval_steps_per_second": 0.319,
"step": 172
},
{
"epoch": 5.40625,
"grad_norm": 0.4057490487995386,
"learning_rate": 2e-05,
"loss": 0.7292,
"step": 173
},
{
"epoch": 5.40625,
"eval_loss": 0.6806090474128723,
"eval_runtime": 78.3313,
"eval_samples_per_second": 2.553,
"eval_steps_per_second": 0.319,
"step": 173
},
{
"epoch": 5.4375,
"grad_norm": 0.4143979315360467,
"learning_rate": 2e-05,
"loss": 0.7697,
"step": 174
},
{
"epoch": 5.4375,
"eval_loss": 0.6795693039894104,
"eval_runtime": 78.4526,
"eval_samples_per_second": 2.549,
"eval_steps_per_second": 0.319,
"step": 174
},
{
"epoch": 5.46875,
"grad_norm": 0.4219663662343445,
"learning_rate": 2e-05,
"loss": 0.7534,
"step": 175
},
{
"epoch": 5.46875,
"eval_loss": 0.6793847680091858,
"eval_runtime": 78.8009,
"eval_samples_per_second": 2.538,
"eval_steps_per_second": 0.317,
"step": 175
},
{
"epoch": 5.5,
"grad_norm": 0.4491811321927657,
"learning_rate": 2e-05,
"loss": 0.7004,
"step": 176
},
{
"epoch": 5.5,
"eval_loss": 0.6775352358818054,
"eval_runtime": 80.0685,
"eval_samples_per_second": 2.498,
"eval_steps_per_second": 0.312,
"step": 176
},
{
"epoch": 5.53125,
"grad_norm": 0.46366516532638885,
"learning_rate": 2e-05,
"loss": 0.7357,
"step": 177
},
{
"epoch": 5.53125,
"eval_loss": 0.6748698949813843,
"eval_runtime": 80.0487,
"eval_samples_per_second": 2.498,
"eval_steps_per_second": 0.312,
"step": 177
},
{
"epoch": 5.5625,
"grad_norm": 0.3815188640227797,
"learning_rate": 2e-05,
"loss": 0.7592,
"step": 178
},
{
"epoch": 5.5625,
"eval_loss": 0.6728273034095764,
"eval_runtime": 80.0318,
"eval_samples_per_second": 2.499,
"eval_steps_per_second": 0.312,
"step": 178
},
{
"epoch": 5.59375,
"grad_norm": 0.41025429416666304,
"learning_rate": 2e-05,
"loss": 0.6585,
"step": 179
},
{
"epoch": 5.59375,
"eval_loss": 0.6718859672546387,
"eval_runtime": 79.8801,
"eval_samples_per_second": 2.504,
"eval_steps_per_second": 0.313,
"step": 179
},
{
"epoch": 5.625,
"grad_norm": 0.40652817592240054,
"learning_rate": 2e-05,
"loss": 0.6611,
"step": 180
},
{
"epoch": 5.625,
"eval_loss": 0.6715708374977112,
"eval_runtime": 76.7261,
"eval_samples_per_second": 2.607,
"eval_steps_per_second": 0.326,
"step": 180
},
{
"epoch": 5.65625,
"grad_norm": 0.40753961326688415,
"learning_rate": 2e-05,
"loss": 0.6779,
"step": 181
},
{
"epoch": 5.65625,
"eval_loss": 0.6719761490821838,
"eval_runtime": 77.0136,
"eval_samples_per_second": 2.597,
"eval_steps_per_second": 0.325,
"step": 181
},
{
"epoch": 5.6875,
"grad_norm": 0.4232811980671673,
"learning_rate": 2e-05,
"loss": 0.6475,
"step": 182
},
{
"epoch": 5.6875,
"eval_loss": 0.6724664568901062,
"eval_runtime": 76.9731,
"eval_samples_per_second": 2.598,
"eval_steps_per_second": 0.325,
"step": 182
},
{
"epoch": 5.71875,
"grad_norm": 0.5132756318549849,
"learning_rate": 2e-05,
"loss": 0.6801,
"step": 183
},
{
"epoch": 5.71875,
"eval_loss": 0.6723365783691406,
"eval_runtime": 76.4132,
"eval_samples_per_second": 2.617,
"eval_steps_per_second": 0.327,
"step": 183
},
{
"epoch": 5.75,
"grad_norm": 0.43526879230161264,
"learning_rate": 2e-05,
"loss": 0.6673,
"step": 184
},
{
"epoch": 5.75,
"eval_loss": 0.672926664352417,
"eval_runtime": 76.1936,
"eval_samples_per_second": 2.625,
"eval_steps_per_second": 0.328,
"step": 184
},
{
"epoch": 5.78125,
"grad_norm": 0.46965560853038507,
"learning_rate": 2e-05,
"loss": 0.7074,
"step": 185
},
{
"epoch": 5.78125,
"eval_loss": 0.6731134057044983,
"eval_runtime": 76.2345,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 185
},
{
"epoch": 5.8125,
"grad_norm": 0.4733296318676217,
"learning_rate": 2e-05,
"loss": 0.6791,
"step": 186
},
{
"epoch": 5.8125,
"eval_loss": 0.6726363301277161,
"eval_runtime": 78.3939,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 0.319,
"step": 186
},
{
"epoch": 5.84375,
"grad_norm": 0.4662943253655961,
"learning_rate": 2e-05,
"loss": 0.7371,
"step": 187
},
{
"epoch": 5.84375,
"eval_loss": 0.6726526021957397,
"eval_runtime": 79.1834,
"eval_samples_per_second": 2.526,
"eval_steps_per_second": 0.316,
"step": 187
},
{
"epoch": 5.875,
"grad_norm": 0.4420962889993382,
"learning_rate": 2e-05,
"loss": 0.675,
"step": 188
},
{
"epoch": 5.875,
"eval_loss": 0.6727125644683838,
"eval_runtime": 78.252,
"eval_samples_per_second": 2.556,
"eval_steps_per_second": 0.319,
"step": 188
},
{
"epoch": 5.90625,
"grad_norm": 0.4345166976944551,
"learning_rate": 2e-05,
"loss": 0.6748,
"step": 189
},
{
"epoch": 5.90625,
"eval_loss": 0.6725904941558838,
"eval_runtime": 78.3914,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 0.319,
"step": 189
},
{
"epoch": 5.9375,
"grad_norm": 0.45109463315374526,
"learning_rate": 2e-05,
"loss": 0.7024,
"step": 190
},
{
"epoch": 5.9375,
"eval_loss": 0.6718384027481079,
"eval_runtime": 78.4361,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 190
},
{
"epoch": 5.96875,
"grad_norm": 0.42953871838795626,
"learning_rate": 2e-05,
"loss": 0.6904,
"step": 191
},
{
"epoch": 5.96875,
"eval_loss": 0.6703083515167236,
"eval_runtime": 78.3863,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 0.319,
"step": 191
},
{
"epoch": 6.0,
"grad_norm": 0.4248607379284984,
"learning_rate": 2e-05,
"loss": 0.6659,
"step": 192
},
{
"epoch": 6.0,
"eval_loss": 0.6693080067634583,
"eval_runtime": 78.4373,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 192
},
{
"epoch": 6.03125,
"grad_norm": 0.42839417453459494,
"learning_rate": 2e-05,
"loss": 0.7457,
"step": 193
},
{
"epoch": 6.03125,
"eval_loss": 0.6689594984054565,
"eval_runtime": 78.4169,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 193
},
{
"epoch": 6.0625,
"grad_norm": 0.4216922788166874,
"learning_rate": 2e-05,
"loss": 0.7189,
"step": 194
},
{
"epoch": 6.0625,
"eval_loss": 0.6689300537109375,
"eval_runtime": 78.9793,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.317,
"step": 194
},
{
"epoch": 6.09375,
"grad_norm": 0.45199575791858004,
"learning_rate": 2e-05,
"loss": 0.6438,
"step": 195
},
{
"epoch": 6.09375,
"eval_loss": 0.6690151691436768,
"eval_runtime": 78.5002,
"eval_samples_per_second": 2.548,
"eval_steps_per_second": 0.318,
"step": 195
},
{
"epoch": 6.125,
"grad_norm": 0.4166923177293841,
"learning_rate": 2e-05,
"loss": 0.6885,
"step": 196
},
{
"epoch": 6.125,
"eval_loss": 0.6688613891601562,
"eval_runtime": 80.5497,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 0.31,
"step": 196
},
{
"epoch": 6.15625,
"grad_norm": 0.45164281863366285,
"learning_rate": 2e-05,
"loss": 0.7197,
"step": 197
},
{
"epoch": 6.15625,
"eval_loss": 0.6687932014465332,
"eval_runtime": 80.1482,
"eval_samples_per_second": 2.495,
"eval_steps_per_second": 0.312,
"step": 197
},
{
"epoch": 6.1875,
"grad_norm": 0.45653924787504446,
"learning_rate": 2e-05,
"loss": 0.776,
"step": 198
},
{
"epoch": 6.1875,
"eval_loss": 0.6690963506698608,
"eval_runtime": 80.4464,
"eval_samples_per_second": 2.486,
"eval_steps_per_second": 0.311,
"step": 198
},
{
"epoch": 6.21875,
"grad_norm": 0.4966562341334706,
"learning_rate": 2e-05,
"loss": 0.6532,
"step": 199
},
{
"epoch": 6.21875,
"eval_loss": 0.669116735458374,
"eval_runtime": 79.8294,
"eval_samples_per_second": 2.505,
"eval_steps_per_second": 0.313,
"step": 199
},
{
"epoch": 6.25,
"grad_norm": 0.4838469303220975,
"learning_rate": 2e-05,
"loss": 0.6883,
"step": 200
},
{
"epoch": 6.25,
"eval_loss": 0.6693156957626343,
"eval_runtime": 80.25,
"eval_samples_per_second": 2.492,
"eval_steps_per_second": 0.312,
"step": 200
},
{
"epoch": 6.28125,
"grad_norm": 0.4836820906895964,
"learning_rate": 2e-05,
"loss": 0.7106,
"step": 201
},
{
"epoch": 6.28125,
"eval_loss": 0.6704170107841492,
"eval_runtime": 79.9636,
"eval_samples_per_second": 2.501,
"eval_steps_per_second": 0.313,
"step": 201
},
{
"epoch": 6.3125,
"grad_norm": 0.4945855983140219,
"learning_rate": 2e-05,
"loss": 0.6336,
"step": 202
},
{
"epoch": 6.3125,
"eval_loss": 0.6708824038505554,
"eval_runtime": 80.8044,
"eval_samples_per_second": 2.475,
"eval_steps_per_second": 0.309,
"step": 202
},
{
"epoch": 6.34375,
"grad_norm": 0.44587847230103017,
"learning_rate": 2e-05,
"loss": 0.7811,
"step": 203
},
{
"epoch": 6.34375,
"eval_loss": 0.6723968982696533,
"eval_runtime": 80.1715,
"eval_samples_per_second": 2.495,
"eval_steps_per_second": 0.312,
"step": 203
},
{
"epoch": 6.375,
"grad_norm": 0.5351063503195825,
"learning_rate": 2e-05,
"loss": 0.6222,
"step": 204
},
{
"epoch": 6.375,
"eval_loss": 0.672196626663208,
"eval_runtime": 79.927,
"eval_samples_per_second": 2.502,
"eval_steps_per_second": 0.313,
"step": 204
},
{
"epoch": 6.40625,
"grad_norm": 0.4742985088010474,
"learning_rate": 2e-05,
"loss": 0.6157,
"step": 205
},
{
"epoch": 6.40625,
"eval_loss": 0.671062171459198,
"eval_runtime": 80.1997,
"eval_samples_per_second": 2.494,
"eval_steps_per_second": 0.312,
"step": 205
},
{
"epoch": 6.4375,
"grad_norm": 0.5188882333349506,
"learning_rate": 2e-05,
"loss": 0.6462,
"step": 206
},
{
"epoch": 6.4375,
"eval_loss": 0.6701972484588623,
"eval_runtime": 81.6643,
"eval_samples_per_second": 2.449,
"eval_steps_per_second": 0.306,
"step": 206
},
{
"epoch": 6.46875,
"grad_norm": 0.45328063593983603,
"learning_rate": 2e-05,
"loss": 0.7058,
"step": 207
},
{
"epoch": 6.46875,
"eval_loss": 0.6699164509773254,
"eval_runtime": 81.2228,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 0.308,
"step": 207
},
{
"epoch": 6.5,
"grad_norm": 0.5197645538332801,
"learning_rate": 2e-05,
"loss": 0.6462,
"step": 208
},
{
"epoch": 6.5,
"eval_loss": 0.6702597141265869,
"eval_runtime": 81.1451,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 0.308,
"step": 208
},
{
"epoch": 6.53125,
"grad_norm": 0.5762528184834232,
"learning_rate": 2e-05,
"loss": 0.6259,
"step": 209
},
{
"epoch": 6.53125,
"eval_loss": 0.6696366667747498,
"eval_runtime": 81.1643,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 0.308,
"step": 209
},
{
"epoch": 6.5625,
"grad_norm": 0.5249503180293145,
"learning_rate": 2e-05,
"loss": 0.6045,
"step": 210
},
{
"epoch": 6.5625,
"eval_loss": 0.6688054800033569,
"eval_runtime": 80.9492,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 0.309,
"step": 210
},
{
"epoch": 6.59375,
"grad_norm": 0.543503888655844,
"learning_rate": 2e-05,
"loss": 0.6496,
"step": 211
},
{
"epoch": 6.59375,
"eval_loss": 0.6689916849136353,
"eval_runtime": 81.6473,
"eval_samples_per_second": 2.45,
"eval_steps_per_second": 0.306,
"step": 211
},
{
"epoch": 6.625,
"grad_norm": 0.48119553592193554,
"learning_rate": 2e-05,
"loss": 0.6211,
"step": 212
},
{
"epoch": 6.625,
"eval_loss": 0.6703050136566162,
"eval_runtime": 81.9207,
"eval_samples_per_second": 2.441,
"eval_steps_per_second": 0.305,
"step": 212
},
{
"epoch": 6.65625,
"grad_norm": 0.5153356086819314,
"learning_rate": 2e-05,
"loss": 0.7135,
"step": 213
},
{
"epoch": 6.65625,
"eval_loss": 0.6702842116355896,
"eval_runtime": 81.1503,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 0.308,
"step": 213
},
{
"epoch": 6.6875,
"grad_norm": 0.5249915042825578,
"learning_rate": 2e-05,
"loss": 0.6635,
"step": 214
},
{
"epoch": 6.6875,
"eval_loss": 0.6687333583831787,
"eval_runtime": 81.6743,
"eval_samples_per_second": 2.449,
"eval_steps_per_second": 0.306,
"step": 214
},
{
"epoch": 6.71875,
"grad_norm": 0.5204840219868723,
"learning_rate": 2e-05,
"loss": 0.6701,
"step": 215
},
{
"epoch": 6.71875,
"eval_loss": 0.6657728552818298,
"eval_runtime": 81.106,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 0.308,
"step": 215
},
{
"epoch": 6.75,
"grad_norm": 0.5266935225120133,
"learning_rate": 2e-05,
"loss": 0.6637,
"step": 216
},
{
"epoch": 6.75,
"eval_loss": 0.6641908884048462,
"eval_runtime": 82.2613,
"eval_samples_per_second": 2.431,
"eval_steps_per_second": 0.304,
"step": 216
},
{
"epoch": 6.78125,
"grad_norm": 0.5438859451742696,
"learning_rate": 2e-05,
"loss": 0.6168,
"step": 217
},
{
"epoch": 6.78125,
"eval_loss": 0.6652233600616455,
"eval_runtime": 82.042,
"eval_samples_per_second": 2.438,
"eval_steps_per_second": 0.305,
"step": 217
},
{
"epoch": 6.8125,
"grad_norm": 0.5716385253433929,
"learning_rate": 2e-05,
"loss": 0.6062,
"step": 218
},
{
"epoch": 6.8125,
"eval_loss": 0.6656240820884705,
"eval_runtime": 81.233,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 0.308,
"step": 218
},
{
"epoch": 6.84375,
"grad_norm": 1.0572787630142522,
"learning_rate": 2e-05,
"loss": 0.7037,
"step": 219
},
{
"epoch": 6.84375,
"eval_loss": 0.6645559072494507,
"eval_runtime": 81.2099,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 0.308,
"step": 219
},
{
"epoch": 6.875,
"grad_norm": 0.5924889323251107,
"learning_rate": 2e-05,
"loss": 0.712,
"step": 220
},
{
"epoch": 6.875,
"eval_loss": 0.6619111895561218,
"eval_runtime": 81.7826,
"eval_samples_per_second": 2.446,
"eval_steps_per_second": 0.306,
"step": 220
},
{
"epoch": 6.90625,
"grad_norm": 0.5290576915218269,
"learning_rate": 2e-05,
"loss": 0.6659,
"step": 221
},
{
"epoch": 6.90625,
"eval_loss": 0.6609540581703186,
"eval_runtime": 82.9922,
"eval_samples_per_second": 2.41,
"eval_steps_per_second": 0.301,
"step": 221
},
{
"epoch": 6.9375,
"grad_norm": 0.5831209517049147,
"learning_rate": 2e-05,
"loss": 0.6547,
"step": 222
},
{
"epoch": 6.9375,
"eval_loss": 0.660676896572113,
"eval_runtime": 83.6541,
"eval_samples_per_second": 2.391,
"eval_steps_per_second": 0.299,
"step": 222
},
{
"epoch": 6.96875,
"grad_norm": 0.5320966369511158,
"learning_rate": 2e-05,
"loss": 0.6968,
"step": 223
},
{
"epoch": 6.96875,
"eval_loss": 0.6618594527244568,
"eval_runtime": 83.1148,
"eval_samples_per_second": 2.406,
"eval_steps_per_second": 0.301,
"step": 223
},
{
"epoch": 7.0,
"grad_norm": 0.5829636446837394,
"learning_rate": 2e-05,
"loss": 0.7407,
"step": 224
},
{
"epoch": 7.0,
"eval_loss": 0.6635661125183105,
"eval_runtime": 82.8183,
"eval_samples_per_second": 2.415,
"eval_steps_per_second": 0.302,
"step": 224
},
{
"epoch": 7.03125,
"grad_norm": 0.4975095056459566,
"learning_rate": 2e-05,
"loss": 0.6535,
"step": 225
},
{
"epoch": 7.03125,
"eval_loss": 0.6641671657562256,
"eval_runtime": 83.0267,
"eval_samples_per_second": 2.409,
"eval_steps_per_second": 0.301,
"step": 225
},
{
"epoch": 7.0625,
"grad_norm": 0.5625698523064815,
"learning_rate": 2e-05,
"loss": 0.6012,
"step": 226
},
{
"epoch": 7.0625,
"eval_loss": 0.6639044880867004,
"eval_runtime": 83.3881,
"eval_samples_per_second": 2.398,
"eval_steps_per_second": 0.3,
"step": 226
},
{
"epoch": 7.09375,
"grad_norm": 0.5436196850683295,
"learning_rate": 2e-05,
"loss": 0.6485,
"step": 227
},
{
"epoch": 7.09375,
"eval_loss": 0.6651788353919983,
"eval_runtime": 82.7096,
"eval_samples_per_second": 2.418,
"eval_steps_per_second": 0.302,
"step": 227
},
{
"epoch": 7.125,
"grad_norm": 0.5598906287609361,
"learning_rate": 2e-05,
"loss": 0.6142,
"step": 228
},
{
"epoch": 7.125,
"eval_loss": 0.6688636541366577,
"eval_runtime": 82.601,
"eval_samples_per_second": 2.421,
"eval_steps_per_second": 0.303,
"step": 228
},
{
"epoch": 7.15625,
"grad_norm": 0.7572979310697923,
"learning_rate": 2e-05,
"loss": 0.6221,
"step": 229
},
{
"epoch": 7.15625,
"eval_loss": 0.6699694991111755,
"eval_runtime": 82.6032,
"eval_samples_per_second": 2.421,
"eval_steps_per_second": 0.303,
"step": 229
},
{
"epoch": 7.1875,
"grad_norm": 0.6173309690580897,
"learning_rate": 2e-05,
"loss": 0.5919,
"step": 230
},
{
"epoch": 7.1875,
"eval_loss": 0.6706527471542358,
"eval_runtime": 82.9732,
"eval_samples_per_second": 2.41,
"eval_steps_per_second": 0.301,
"step": 230
},
{
"epoch": 7.21875,
"grad_norm": 0.643241771517866,
"learning_rate": 2e-05,
"loss": 0.7081,
"step": 231
},
{
"epoch": 7.21875,
"eval_loss": 0.6700320243835449,
"eval_runtime": 84.5621,
"eval_samples_per_second": 2.365,
"eval_steps_per_second": 0.296,
"step": 231
},
{
"epoch": 7.25,
"grad_norm": 0.577638137570571,
"learning_rate": 2e-05,
"loss": 0.6873,
"step": 232
},
{
"epoch": 7.25,
"eval_loss": 0.669111430644989,
"eval_runtime": 84.5124,
"eval_samples_per_second": 2.367,
"eval_steps_per_second": 0.296,
"step": 232
},
{
"epoch": 7.28125,
"grad_norm": 0.7229488296023369,
"learning_rate": 2e-05,
"loss": 0.6301,
"step": 233
},
{
"epoch": 7.28125,
"eval_loss": 0.6664154529571533,
"eval_runtime": 84.6437,
"eval_samples_per_second": 2.363,
"eval_steps_per_second": 0.295,
"step": 233
},
{
"epoch": 7.3125,
"grad_norm": 0.5827815449039045,
"learning_rate": 2e-05,
"loss": 0.669,
"step": 234
},
{
"epoch": 7.3125,
"eval_loss": 0.6641202569007874,
"eval_runtime": 84.489,
"eval_samples_per_second": 2.367,
"eval_steps_per_second": 0.296,
"step": 234
},
{
"epoch": 7.34375,
"grad_norm": 0.57507354017269,
"learning_rate": 2e-05,
"loss": 0.6474,
"step": 235
},
{
"epoch": 7.34375,
"eval_loss": 0.6623325347900391,
"eval_runtime": 84.5536,
"eval_samples_per_second": 2.365,
"eval_steps_per_second": 0.296,
"step": 235
},
{
"epoch": 7.375,
"grad_norm": 0.5810844862533651,
"learning_rate": 2e-05,
"loss": 0.6048,
"step": 236
},
{
"epoch": 7.375,
"eval_loss": 0.6619194746017456,
"eval_runtime": 84.2296,
"eval_samples_per_second": 2.374,
"eval_steps_per_second": 0.297,
"step": 236
},
{
"epoch": 7.40625,
"grad_norm": 0.6075032415813726,
"learning_rate": 2e-05,
"loss": 0.6529,
"step": 237
},
{
"epoch": 7.40625,
"eval_loss": 0.6626202464103699,
"eval_runtime": 84.9703,
"eval_samples_per_second": 2.354,
"eval_steps_per_second": 0.294,
"step": 237
},
{
"epoch": 7.4375,
"grad_norm": 0.6402642234375245,
"learning_rate": 2e-05,
"loss": 0.6433,
"step": 238
},
{
"epoch": 7.4375,
"eval_loss": 0.663289487361908,
"eval_runtime": 84.8924,
"eval_samples_per_second": 2.356,
"eval_steps_per_second": 0.294,
"step": 238
},
{
"epoch": 7.46875,
"grad_norm": 0.6335996982657431,
"learning_rate": 2e-05,
"loss": 0.6815,
"step": 239
},
{
"epoch": 7.46875,
"eval_loss": 0.6636109948158264,
"eval_runtime": 85.0551,
"eval_samples_per_second": 2.351,
"eval_steps_per_second": 0.294,
"step": 239
},
{
"epoch": 7.5,
"grad_norm": 0.5796846795848909,
"learning_rate": 2e-05,
"loss": 0.6236,
"step": 240
},
{
"epoch": 7.5,
"eval_loss": 0.6652829051017761,
"eval_runtime": 84.7574,
"eval_samples_per_second": 2.36,
"eval_steps_per_second": 0.295,
"step": 240
},
{
"epoch": 7.53125,
"grad_norm": 0.5380402145760035,
"learning_rate": 2e-05,
"loss": 0.6564,
"step": 241
},
{
"epoch": 7.53125,
"eval_loss": 0.6676375865936279,
"eval_runtime": 86.2058,
"eval_samples_per_second": 2.32,
"eval_steps_per_second": 0.29,
"step": 241
},
{
"epoch": 7.5625,
"grad_norm": 0.5964298255824012,
"learning_rate": 2e-05,
"loss": 0.6475,
"step": 242
},
{
"epoch": 7.5625,
"eval_loss": 0.6698520183563232,
"eval_runtime": 85.8955,
"eval_samples_per_second": 2.328,
"eval_steps_per_second": 0.291,
"step": 242
},
{
"epoch": 7.59375,
"grad_norm": 0.561279296875,
"learning_rate": 2e-05,
"loss": 0.6395,
"step": 243
},
{
"epoch": 7.59375,
"eval_loss": 0.6705803871154785,
"eval_runtime": 86.0036,
"eval_samples_per_second": 2.325,
"eval_steps_per_second": 0.291,
"step": 243
},
{
"epoch": 7.625,
"grad_norm": 0.6757292755073548,
"learning_rate": 2e-05,
"loss": 0.7074,
"step": 244
},
{
"epoch": 7.625,
"eval_loss": 0.6679538488388062,
"eval_runtime": 85.5379,
"eval_samples_per_second": 2.338,
"eval_steps_per_second": 0.292,
"step": 244
},
{
"epoch": 7.65625,
"grad_norm": 0.659077163070129,
"learning_rate": 2e-05,
"loss": 0.6078,
"step": 245
},
{
"epoch": 7.65625,
"eval_loss": 0.6667564511299133,
"eval_runtime": 85.752,
"eval_samples_per_second": 2.332,
"eval_steps_per_second": 0.292,
"step": 245
},
{
"epoch": 7.6875,
"grad_norm": 0.6215405566454576,
"learning_rate": 2e-05,
"loss": 0.6603,
"step": 246
},
{
"epoch": 7.6875,
"eval_loss": 0.665945291519165,
"eval_runtime": 92.3086,
"eval_samples_per_second": 2.167,
"eval_steps_per_second": 0.271,
"step": 246
},
{
"epoch": 7.71875,
"grad_norm": 0.6130534921490498,
"learning_rate": 2e-05,
"loss": 0.6435,
"step": 247
},
{
"epoch": 7.71875,
"eval_loss": 0.6661685109138489,
"eval_runtime": 87.1917,
"eval_samples_per_second": 2.294,
"eval_steps_per_second": 0.287,
"step": 247
},
{
"epoch": 7.75,
"grad_norm": 0.6025415602868736,
"learning_rate": 2e-05,
"loss": 0.6308,
"step": 248
},
{
"epoch": 7.75,
"eval_loss": 0.6658704280853271,
"eval_runtime": 86.8233,
"eval_samples_per_second": 2.304,
"eval_steps_per_second": 0.288,
"step": 248
},
{
"epoch": 7.78125,
"grad_norm": 0.6901593792019413,
"learning_rate": 2e-05,
"loss": 0.6777,
"step": 249
},
{
"epoch": 7.78125,
"eval_loss": 0.6652414202690125,
"eval_runtime": 86.7625,
"eval_samples_per_second": 2.305,
"eval_steps_per_second": 0.288,
"step": 249
},
{
"epoch": 7.8125,
"grad_norm": 0.6436454697341579,
"learning_rate": 2e-05,
"loss": 0.6912,
"step": 250
},
{
"epoch": 7.8125,
"eval_loss": 0.6654212474822998,
"eval_runtime": 86.871,
"eval_samples_per_second": 2.302,
"eval_steps_per_second": 0.288,
"step": 250
},
{
"epoch": 7.84375,
"grad_norm": 0.649040103024529,
"learning_rate": 2e-05,
"loss": 0.6025,
"step": 251
},
{
"epoch": 7.84375,
"eval_loss": 0.6654068231582642,
"eval_runtime": 86.7458,
"eval_samples_per_second": 2.306,
"eval_steps_per_second": 0.288,
"step": 251
},
{
"epoch": 7.875,
"grad_norm": 0.6595522131680224,
"learning_rate": 2e-05,
"loss": 0.5973,
"step": 252
},
{
"epoch": 7.875,
"eval_loss": 0.6644830107688904,
"eval_runtime": 86.8739,
"eval_samples_per_second": 2.302,
"eval_steps_per_second": 0.288,
"step": 252
},
{
"epoch": 7.90625,
"grad_norm": 0.6689891717273936,
"learning_rate": 2e-05,
"loss": 0.687,
"step": 253
},
{
"epoch": 7.90625,
"eval_loss": 0.6616199612617493,
"eval_runtime": 86.8222,
"eval_samples_per_second": 2.304,
"eval_steps_per_second": 0.288,
"step": 253
},
{
"epoch": 7.9375,
"grad_norm": 0.6306846778314292,
"learning_rate": 2e-05,
"loss": 0.6599,
"step": 254
},
{
"epoch": 7.9375,
"eval_loss": 0.6592965126037598,
"eval_runtime": 86.8577,
"eval_samples_per_second": 2.303,
"eval_steps_per_second": 0.288,
"step": 254
},
{
"epoch": 7.96875,
"grad_norm": 0.6021327993890785,
"learning_rate": 2e-05,
"loss": 0.575,
"step": 255
},
{
"epoch": 7.96875,
"eval_loss": 0.6580593585968018,
"eval_runtime": 86.7582,
"eval_samples_per_second": 2.305,
"eval_steps_per_second": 0.288,
"step": 255
},
{
"epoch": 8.0,
"grad_norm": 0.6174712675568311,
"learning_rate": 2e-05,
"loss": 0.6341,
"step": 256
},
{
"epoch": 8.0,
"eval_loss": 0.6575854420661926,
"eval_runtime": 76.7634,
"eval_samples_per_second": 2.605,
"eval_steps_per_second": 0.326,
"step": 256
},
{
"epoch": 8.03125,
"grad_norm": 0.6551281786490154,
"learning_rate": 2e-05,
"loss": 0.6032,
"step": 257
},
{
"epoch": 8.03125,
"eval_loss": 0.6583926677703857,
"eval_runtime": 83.4222,
"eval_samples_per_second": 2.397,
"eval_steps_per_second": 0.3,
"step": 257
},
{
"epoch": 8.0625,
"grad_norm": 0.6033798361300539,
"learning_rate": 2e-05,
"loss": 0.6352,
"step": 258
},
{
"epoch": 8.0625,
"eval_loss": 0.6615632772445679,
"eval_runtime": 76.7227,
"eval_samples_per_second": 2.607,
"eval_steps_per_second": 0.326,
"step": 258
},
{
"epoch": 8.09375,
"grad_norm": 0.557538857110867,
"learning_rate": 2e-05,
"loss": 0.6472,
"step": 259
},
{
"epoch": 8.09375,
"eval_loss": 0.6674608588218689,
"eval_runtime": 76.6215,
"eval_samples_per_second": 2.61,
"eval_steps_per_second": 0.326,
"step": 259
},
{
"epoch": 8.125,
"grad_norm": 0.7828450894757938,
"learning_rate": 2e-05,
"loss": 0.6576,
"step": 260
},
{
"epoch": 8.125,
"eval_loss": 0.670245349407196,
"eval_runtime": 76.685,
"eval_samples_per_second": 2.608,
"eval_steps_per_second": 0.326,
"step": 260
},
{
"epoch": 8.15625,
"grad_norm": 0.7969830757603331,
"learning_rate": 2e-05,
"loss": 0.5809,
"step": 261
},
{
"epoch": 8.15625,
"eval_loss": 0.6711975336074829,
"eval_runtime": 78.0022,
"eval_samples_per_second": 2.564,
"eval_steps_per_second": 0.321,
"step": 261
},
{
"epoch": 8.1875,
"grad_norm": 0.6431174985709492,
"learning_rate": 2e-05,
"loss": 0.6971,
"step": 262
},
{
"epoch": 8.1875,
"eval_loss": 0.6719404458999634,
"eval_runtime": 78.7599,
"eval_samples_per_second": 2.539,
"eval_steps_per_second": 0.317,
"step": 262
},
{
"epoch": 8.21875,
"grad_norm": 0.7025583314944188,
"learning_rate": 2e-05,
"loss": 0.5751,
"step": 263
},
{
"epoch": 8.21875,
"eval_loss": 0.6719526648521423,
"eval_runtime": 78.0188,
"eval_samples_per_second": 2.563,
"eval_steps_per_second": 0.32,
"step": 263
},
{
"epoch": 8.25,
"grad_norm": 0.7114355417811269,
"learning_rate": 2e-05,
"loss": 0.623,
"step": 264
},
{
"epoch": 8.25,
"eval_loss": 0.6717848181724548,
"eval_runtime": 78.6366,
"eval_samples_per_second": 2.543,
"eval_steps_per_second": 0.318,
"step": 264
},
{
"epoch": 8.28125,
"grad_norm": 0.8272269435769467,
"learning_rate": 2e-05,
"loss": 0.6509,
"step": 265
},
{
"epoch": 8.28125,
"eval_loss": 0.6701865196228027,
"eval_runtime": 78.7279,
"eval_samples_per_second": 2.54,
"eval_steps_per_second": 0.318,
"step": 265
},
{
"epoch": 8.3125,
"grad_norm": 0.7215994453471393,
"learning_rate": 2e-05,
"loss": 0.6263,
"step": 266
},
{
"epoch": 8.3125,
"eval_loss": 0.6682087182998657,
"eval_runtime": 78.1433,
"eval_samples_per_second": 2.559,
"eval_steps_per_second": 0.32,
"step": 266
},
{
"epoch": 8.34375,
"grad_norm": 0.6425448006102333,
"learning_rate": 2e-05,
"loss": 0.5613,
"step": 267
},
{
"epoch": 8.34375,
"eval_loss": 0.6686681509017944,
"eval_runtime": 78.0964,
"eval_samples_per_second": 2.561,
"eval_steps_per_second": 0.32,
"step": 267
},
{
"epoch": 8.375,
"grad_norm": 0.7207053166384572,
"learning_rate": 2e-05,
"loss": 0.6239,
"step": 268
},
{
"epoch": 8.375,
"eval_loss": 0.6676305532455444,
"eval_runtime": 77.9986,
"eval_samples_per_second": 2.564,
"eval_steps_per_second": 0.321,
"step": 268
},
{
"epoch": 8.40625,
"grad_norm": 0.7459344743811905,
"learning_rate": 2e-05,
"loss": 0.6159,
"step": 269
},
{
"epoch": 8.40625,
"eval_loss": 0.6660167574882507,
"eval_runtime": 78.4159,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 0.319,
"step": 269
},
{
"epoch": 8.4375,
"grad_norm": 0.7179805119560739,
"learning_rate": 2e-05,
"loss": 0.6192,
"step": 270
},
{
"epoch": 8.4375,
"eval_loss": 0.6636325716972351,
"eval_runtime": 78.2224,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 270
},
{
"epoch": 8.46875,
"grad_norm": 0.724792498458059,
"learning_rate": 2e-05,
"loss": 0.5234,
"step": 271
},
{
"epoch": 8.46875,
"eval_loss": 0.6647288799285889,
"eval_runtime": 79.0573,
"eval_samples_per_second": 2.53,
"eval_steps_per_second": 0.316,
"step": 271
},
{
"epoch": 8.5,
"grad_norm": 0.6544107138826364,
"learning_rate": 2e-05,
"loss": 0.6067,
"step": 272
},
{
"epoch": 8.5,
"eval_loss": 0.6689667701721191,
"eval_runtime": 79.2898,
"eval_samples_per_second": 2.522,
"eval_steps_per_second": 0.315,
"step": 272
},
{
"epoch": 8.53125,
"grad_norm": 0.71580236810568,
"learning_rate": 2e-05,
"loss": 0.6215,
"step": 273
},
{
"epoch": 8.53125,
"eval_loss": 0.6723271012306213,
"eval_runtime": 79.0759,
"eval_samples_per_second": 2.529,
"eval_steps_per_second": 0.316,
"step": 273
},
{
"epoch": 8.5625,
"grad_norm": 0.7741383931390255,
"learning_rate": 2e-05,
"loss": 0.6012,
"step": 274
},
{
"epoch": 8.5625,
"eval_loss": 0.6743794083595276,
"eval_runtime": 79.0509,
"eval_samples_per_second": 2.53,
"eval_steps_per_second": 0.316,
"step": 274
},
{
"epoch": 8.59375,
"grad_norm": 0.7927343087738151,
"learning_rate": 2e-05,
"loss": 0.6241,
"step": 275
},
{
"epoch": 8.59375,
"eval_loss": 0.6728585958480835,
"eval_runtime": 79.2296,
"eval_samples_per_second": 2.524,
"eval_steps_per_second": 0.316,
"step": 275
},
{
"epoch": 8.625,
"grad_norm": 0.759468785526614,
"learning_rate": 2e-05,
"loss": 0.6209,
"step": 276
},
{
"epoch": 8.625,
"eval_loss": 0.6686221957206726,
"eval_runtime": 76.7494,
"eval_samples_per_second": 2.606,
"eval_steps_per_second": 0.326,
"step": 276
},
{
"epoch": 8.65625,
"grad_norm": 0.7345386079388437,
"learning_rate": 2e-05,
"loss": 0.5618,
"step": 277
},
{
"epoch": 8.65625,
"eval_loss": 0.6659188270568848,
"eval_runtime": 77.4511,
"eval_samples_per_second": 2.582,
"eval_steps_per_second": 0.323,
"step": 277
},
{
"epoch": 8.6875,
"grad_norm": 0.6822491965046279,
"learning_rate": 2e-05,
"loss": 0.6064,
"step": 278
},
{
"epoch": 8.6875,
"eval_loss": 0.664726734161377,
"eval_runtime": 76.7108,
"eval_samples_per_second": 2.607,
"eval_steps_per_second": 0.326,
"step": 278
},
{
"epoch": 8.71875,
"grad_norm": 0.7329120674082968,
"learning_rate": 2e-05,
"loss": 0.5843,
"step": 279
},
{
"epoch": 8.71875,
"eval_loss": 0.6635715961456299,
"eval_runtime": 76.7921,
"eval_samples_per_second": 2.604,
"eval_steps_per_second": 0.326,
"step": 279
},
{
"epoch": 8.75,
"grad_norm": 0.7950781591249908,
"learning_rate": 2e-05,
"loss": 0.6383,
"step": 280
},
{
"epoch": 8.75,
"eval_loss": 0.664521336555481,
"eval_runtime": 76.6952,
"eval_samples_per_second": 2.608,
"eval_steps_per_second": 0.326,
"step": 280
},
{
"epoch": 8.78125,
"grad_norm": 0.6791182798182671,
"learning_rate": 2e-05,
"loss": 0.5932,
"step": 281
},
{
"epoch": 8.78125,
"eval_loss": 0.6673008799552917,
"eval_runtime": 76.794,
"eval_samples_per_second": 2.604,
"eval_steps_per_second": 0.326,
"step": 281
},
{
"epoch": 8.8125,
"grad_norm": 0.7633434086832942,
"learning_rate": 2e-05,
"loss": 0.5754,
"step": 282
},
{
"epoch": 8.8125,
"eval_loss": 0.6692779064178467,
"eval_runtime": 76.7749,
"eval_samples_per_second": 2.605,
"eval_steps_per_second": 0.326,
"step": 282
},
{
"epoch": 8.84375,
"grad_norm": 0.6857090076317197,
"learning_rate": 2e-05,
"loss": 0.5585,
"step": 283
},
{
"epoch": 8.84375,
"eval_loss": 0.6702080368995667,
"eval_runtime": 76.6913,
"eval_samples_per_second": 2.608,
"eval_steps_per_second": 0.326,
"step": 283
},
{
"epoch": 8.875,
"grad_norm": 0.6961298007385132,
"learning_rate": 2e-05,
"loss": 0.5093,
"step": 284
},
{
"epoch": 8.875,
"eval_loss": 0.6708166599273682,
"eval_runtime": 76.7725,
"eval_samples_per_second": 2.605,
"eval_steps_per_second": 0.326,
"step": 284
},
{
"epoch": 8.90625,
"grad_norm": 0.7783752192295856,
"learning_rate": 2e-05,
"loss": 0.5656,
"step": 285
},
{
"epoch": 8.90625,
"eval_loss": 0.6697121262550354,
"eval_runtime": 76.7888,
"eval_samples_per_second": 2.605,
"eval_steps_per_second": 0.326,
"step": 285
},
{
"epoch": 8.9375,
"grad_norm": 0.7327581828795048,
"learning_rate": 2e-05,
"loss": 0.6984,
"step": 286
},
{
"epoch": 8.9375,
"eval_loss": 0.6684187054634094,
"eval_runtime": 78.6657,
"eval_samples_per_second": 2.542,
"eval_steps_per_second": 0.318,
"step": 286
},
{
"epoch": 8.96875,
"grad_norm": 0.689919829790507,
"learning_rate": 2e-05,
"loss": 0.6173,
"step": 287
},
{
"epoch": 8.96875,
"eval_loss": 0.6675245761871338,
"eval_runtime": 78.1275,
"eval_samples_per_second": 2.56,
"eval_steps_per_second": 0.32,
"step": 287
},
{
"epoch": 9.0,
"grad_norm": 0.6812947879732435,
"learning_rate": 2e-05,
"loss": 0.5499,
"step": 288
},
{
"epoch": 9.0,
"eval_loss": 0.6678825616836548,
"eval_runtime": 78.8588,
"eval_samples_per_second": 2.536,
"eval_steps_per_second": 0.317,
"step": 288
},
{
"epoch": 9.03125,
"grad_norm": 0.715716761740314,
"learning_rate": 2e-05,
"loss": 0.5699,
"step": 289
},
{
"epoch": 9.03125,
"eval_loss": 0.6692755222320557,
"eval_runtime": 83.098,
"eval_samples_per_second": 2.407,
"eval_steps_per_second": 0.301,
"step": 289
},
{
"epoch": 9.0625,
"grad_norm": 0.7438930389955494,
"learning_rate": 2e-05,
"loss": 0.5974,
"step": 290
},
{
"epoch": 9.0625,
"eval_loss": 0.6735746264457703,
"eval_runtime": 77.384,
"eval_samples_per_second": 2.585,
"eval_steps_per_second": 0.323,
"step": 290
},
{
"epoch": 9.09375,
"grad_norm": 0.7271043131369198,
"learning_rate": 2e-05,
"loss": 0.601,
"step": 291
},
{
"epoch": 9.09375,
"eval_loss": 0.6790977716445923,
"eval_runtime": 78.0312,
"eval_samples_per_second": 2.563,
"eval_steps_per_second": 0.32,
"step": 291
},
{
"epoch": 9.125,
"grad_norm": 0.851687675865168,
"learning_rate": 2e-05,
"loss": 0.5681,
"step": 292
},
{
"epoch": 9.125,
"eval_loss": 0.6834170818328857,
"eval_runtime": 77.8688,
"eval_samples_per_second": 2.568,
"eval_steps_per_second": 0.321,
"step": 292
},
{
"epoch": 9.15625,
"grad_norm": 0.7905287763218567,
"learning_rate": 2e-05,
"loss": 0.6222,
"step": 293
},
{
"epoch": 9.15625,
"eval_loss": 0.6843841671943665,
"eval_runtime": 77.985,
"eval_samples_per_second": 2.565,
"eval_steps_per_second": 0.321,
"step": 293
},
{
"epoch": 9.1875,
"grad_norm": 0.7301520002532459,
"learning_rate": 2e-05,
"loss": 0.5549,
"step": 294
},
{
"epoch": 9.1875,
"eval_loss": 0.6860540509223938,
"eval_runtime": 78.0163,
"eval_samples_per_second": 2.564,
"eval_steps_per_second": 0.32,
"step": 294
},
{
"epoch": 9.21875,
"grad_norm": 0.899999206595601,
"learning_rate": 2e-05,
"loss": 0.5128,
"step": 295
},
{
"epoch": 9.21875,
"eval_loss": 0.685759425163269,
"eval_runtime": 78.4339,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 0.319,
"step": 295
},
{
"epoch": 9.25,
"grad_norm": 0.8064287475451557,
"learning_rate": 2e-05,
"loss": 0.5261,
"step": 296
},
{
"epoch": 9.25,
"eval_loss": 0.6864770650863647,
"eval_runtime": 79.6129,
"eval_samples_per_second": 2.512,
"eval_steps_per_second": 0.314,
"step": 296
},
{
"epoch": 9.28125,
"grad_norm": 0.8837240795882767,
"learning_rate": 2e-05,
"loss": 0.621,
"step": 297
},
{
"epoch": 9.28125,
"eval_loss": 0.6871599555015564,
"eval_runtime": 78.9778,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.317,
"step": 297
},
{
"epoch": 9.3125,
"grad_norm": 0.9676184044078363,
"learning_rate": 2e-05,
"loss": 0.5655,
"step": 298
},
{
"epoch": 9.3125,
"eval_loss": 0.6881282329559326,
"eval_runtime": 78.9944,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.316,
"step": 298
},
{
"epoch": 9.34375,
"grad_norm": 0.8723474213941232,
"learning_rate": 2e-05,
"loss": 0.5449,
"step": 299
},
{
"epoch": 9.34375,
"eval_loss": 0.6879245638847351,
"eval_runtime": 79.0056,
"eval_samples_per_second": 2.531,
"eval_steps_per_second": 0.316,
"step": 299
},
{
"epoch": 9.375,
"grad_norm": 0.848833488380702,
"learning_rate": 2e-05,
"loss": 0.5683,
"step": 300
},
{
"epoch": 9.375,
"eval_loss": 0.6846978664398193,
"eval_runtime": 78.9003,
"eval_samples_per_second": 2.535,
"eval_steps_per_second": 0.317,
"step": 300
},
{
"epoch": 9.40625,
"grad_norm": 0.8586391766708288,
"learning_rate": 2e-05,
"loss": 0.5358,
"step": 301
},
{
"epoch": 9.40625,
"eval_loss": 0.6798649430274963,
"eval_runtime": 80.0404,
"eval_samples_per_second": 2.499,
"eval_steps_per_second": 0.312,
"step": 301
},
{
"epoch": 9.4375,
"grad_norm": 0.8007832596916474,
"learning_rate": 2e-05,
"loss": 0.5792,
"step": 302
},
{
"epoch": 9.4375,
"eval_loss": 0.6757382750511169,
"eval_runtime": 79.962,
"eval_samples_per_second": 2.501,
"eval_steps_per_second": 0.313,
"step": 302
},
{
"epoch": 9.46875,
"grad_norm": 0.7839805948862919,
"learning_rate": 2e-05,
"loss": 0.5917,
"step": 303
},
{
"epoch": 9.46875,
"eval_loss": 0.6754000782966614,
"eval_runtime": 80.738,
"eval_samples_per_second": 2.477,
"eval_steps_per_second": 0.31,
"step": 303
},
{
"epoch": 9.5,
"grad_norm": 0.7397772754102683,
"learning_rate": 2e-05,
"loss": 0.6249,
"step": 304
},
{
"epoch": 9.5,
"eval_loss": 0.6777495741844177,
"eval_runtime": 80.5144,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 0.311,
"step": 304
},
{
"epoch": 9.53125,
"grad_norm": 0.857390001265035,
"learning_rate": 2e-05,
"loss": 0.5932,
"step": 305
},
{
"epoch": 9.53125,
"eval_loss": 0.6778848171234131,
"eval_runtime": 80.1508,
"eval_samples_per_second": 2.495,
"eval_steps_per_second": 0.312,
"step": 305
},
{
"epoch": 9.5625,
"grad_norm": 0.9430180281536945,
"learning_rate": 2e-05,
"loss": 0.5793,
"step": 306
},
{
"epoch": 9.5625,
"eval_loss": 0.6771917939186096,
"eval_runtime": 76.7109,
"eval_samples_per_second": 2.607,
"eval_steps_per_second": 0.326,
"step": 306
},
{
"epoch": 9.59375,
"grad_norm": 0.8705050270903875,
"learning_rate": 2e-05,
"loss": 0.5601,
"step": 307
},
{
"epoch": 9.59375,
"eval_loss": 0.6808632016181946,
"eval_runtime": 76.6965,
"eval_samples_per_second": 2.608,
"eval_steps_per_second": 0.326,
"step": 307
},
{
"epoch": 9.625,
"grad_norm": 0.8611871513168323,
"learning_rate": 2e-05,
"loss": 0.5953,
"step": 308
},
{
"epoch": 9.625,
"eval_loss": 0.6875945329666138,
"eval_runtime": 76.6592,
"eval_samples_per_second": 2.609,
"eval_steps_per_second": 0.326,
"step": 308
},
{
"epoch": 9.65625,
"grad_norm": 0.9066952565245906,
"learning_rate": 2e-05,
"loss": 0.5815,
"step": 309
},
{
"epoch": 9.65625,
"eval_loss": 0.6910049319267273,
"eval_runtime": 76.7021,
"eval_samples_per_second": 2.607,
"eval_steps_per_second": 0.326,
"step": 309
},
{
"epoch": 9.6875,
"grad_norm": 1.0666864048105145,
"learning_rate": 2e-05,
"loss": 0.5663,
"step": 310
},
{
"epoch": 9.6875,
"eval_loss": 0.6869986057281494,
"eval_runtime": 76.6344,
"eval_samples_per_second": 2.61,
"eval_steps_per_second": 0.326,
"step": 310
},
{
"epoch": 9.71875,
"grad_norm": 0.9413311560347162,
"learning_rate": 2e-05,
"loss": 0.5106,
"step": 311
},
{
"epoch": 9.71875,
"eval_loss": 0.6825075745582581,
"eval_runtime": 78.7857,
"eval_samples_per_second": 2.539,
"eval_steps_per_second": 0.317,
"step": 311
},
{
"epoch": 9.75,
"grad_norm": 0.9175579044457436,
"learning_rate": 2e-05,
"loss": 0.5821,
"step": 312
},
{
"epoch": 9.75,
"eval_loss": 0.6794223189353943,
"eval_runtime": 78.0368,
"eval_samples_per_second": 2.563,
"eval_steps_per_second": 0.32,
"step": 312
},
{
"epoch": 9.78125,
"grad_norm": 0.7982785075945665,
"learning_rate": 2e-05,
"loss": 0.5781,
"step": 313
},
{
"epoch": 9.78125,
"eval_loss": 0.679649829864502,
"eval_runtime": 78.0513,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 313
},
{
"epoch": 9.8125,
"grad_norm": 0.9284642289974022,
"learning_rate": 2e-05,
"loss": 0.5394,
"step": 314
},
{
"epoch": 9.8125,
"eval_loss": 0.6805163025856018,
"eval_runtime": 78.2229,
"eval_samples_per_second": 2.557,
"eval_steps_per_second": 0.32,
"step": 314
},
{
"epoch": 9.84375,
"grad_norm": 0.8816568355396782,
"learning_rate": 2e-05,
"loss": 0.5722,
"step": 315
},
{
"epoch": 9.84375,
"eval_loss": 0.6801097393035889,
"eval_runtime": 78.9282,
"eval_samples_per_second": 2.534,
"eval_steps_per_second": 0.317,
"step": 315
},
{
"epoch": 9.875,
"grad_norm": 0.8137119863863306,
"learning_rate": 2e-05,
"loss": 0.5831,
"step": 316
},
{
"epoch": 9.875,
"eval_loss": 0.6792600750923157,
"eval_runtime": 78.8166,
"eval_samples_per_second": 2.538,
"eval_steps_per_second": 0.317,
"step": 316
},
{
"epoch": 9.90625,
"grad_norm": 0.9595174764400289,
"learning_rate": 2e-05,
"loss": 0.5489,
"step": 317
},
{
"epoch": 9.90625,
"eval_loss": 0.6755692958831787,
"eval_runtime": 78.1426,
"eval_samples_per_second": 2.559,
"eval_steps_per_second": 0.32,
"step": 317
},
{
"epoch": 9.9375,
"grad_norm": 0.8612490247878711,
"learning_rate": 2e-05,
"loss": 0.5508,
"step": 318
},
{
"epoch": 9.9375,
"eval_loss": 0.673053503036499,
"eval_runtime": 78.0565,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 318
},
{
"epoch": 9.96875,
"grad_norm": 0.9474068762478358,
"learning_rate": 2e-05,
"loss": 0.5859,
"step": 319
},
{
"epoch": 9.96875,
"eval_loss": 0.6695602536201477,
"eval_runtime": 78.051,
"eval_samples_per_second": 2.562,
"eval_steps_per_second": 0.32,
"step": 319
},
{
"epoch": 10.0,
"grad_norm": 0.8401643717683449,
"learning_rate": 2e-05,
"loss": 0.5277,
"step": 320
},
{
"epoch": 10.0,
"eval_loss": 0.6707890033721924,
"eval_runtime": 78.9959,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.316,
"step": 320
},
{
"epoch": 10.0,
"step": 320,
"total_flos": 613933061373952.0,
"train_loss": 0.056994458101689814,
"train_runtime": 3241.7031,
"train_samples_per_second": 3.085,
"train_steps_per_second": 0.099
}
],
"logging_steps": 1.0,
"max_steps": 320,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 613933061373952.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}