|
{
|
|
"best_metric": 0.08053209632635117,
|
|
"best_model_checkpoint": "saves/Breeze-7B-FC-v1_0-15-12-2024\\checkpoint-1700",
|
|
"epoch": 0.3381150088257962,
|
|
"eval_steps": 100,
|
|
"global_step": 1700,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0019889118166223304,
|
|
"grad_norm": 0.6233828067779541,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7108,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.003977823633244661,
|
|
"grad_norm": 0.4662277102470398,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.729,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.005966735449866992,
|
|
"grad_norm": 0.5307815670967102,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6936,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.007955647266489322,
|
|
"grad_norm": 0.5890870690345764,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7229,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.009944559083111653,
|
|
"grad_norm": 0.5515501499176025,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6669,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.011933470899733983,
|
|
"grad_norm": 0.5326377153396606,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6564,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.013922382716356313,
|
|
"grad_norm": 0.5636907815933228,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6773,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.015911294532978643,
|
|
"grad_norm": 0.49359801411628723,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6516,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.017900206349600975,
|
|
"grad_norm": 0.4739631116390228,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6594,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.019889118166223307,
|
|
"grad_norm": 0.5182624459266663,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.69,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.019889118166223307,
|
|
"eval_loss": 0.08401616662740707,
|
|
"eval_runtime": 30.1147,
|
|
"eval_samples_per_second": 2.69,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.021878029982845635,
|
|
"grad_norm": 0.5577639937400818,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6813,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.023866941799467967,
|
|
"grad_norm": 0.5308650732040405,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.689,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.025855853616090295,
|
|
"grad_norm": 0.705682635307312,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6846,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.027844765432712627,
|
|
"grad_norm": 0.5250112414360046,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6699,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.02983367724933496,
|
|
"grad_norm": 0.5885920524597168,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6733,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.03182258906595729,
|
|
"grad_norm": 0.5392662286758423,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.673,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.03381150088257962,
|
|
"grad_norm": 0.576032280921936,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6934,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.03580041269920195,
|
|
"grad_norm": 0.5046477317810059,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6535,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.03778932451582428,
|
|
"grad_norm": 0.628962516784668,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6758,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.03977823633244661,
|
|
"grad_norm": 0.5312318801879883,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6839,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.03977823633244661,
|
|
"eval_loss": 0.08386518061161041,
|
|
"eval_runtime": 30.1222,
|
|
"eval_samples_per_second": 2.689,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.04176714814906894,
|
|
"grad_norm": 0.5689460635185242,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6616,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.04375605996569127,
|
|
"grad_norm": 0.5760230422019958,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.67,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.045744971782313605,
|
|
"grad_norm": 0.6037033200263977,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6608,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.04773388359893593,
|
|
"grad_norm": 0.5196573138237,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.672,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.04972279541555826,
|
|
"grad_norm": 0.5766464471817017,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6651,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.05171170723218059,
|
|
"grad_norm": 0.5686795711517334,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6351,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.053700619048802925,
|
|
"grad_norm": 0.5607637763023376,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.659,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.05568953086542525,
|
|
"grad_norm": 0.545982837677002,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6297,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.05767844268204758,
|
|
"grad_norm": 0.6047331690788269,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6541,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.05966735449866992,
|
|
"grad_norm": 0.5864997506141663,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6831,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.05966735449866992,
|
|
"eval_loss": 0.0832422524690628,
|
|
"eval_runtime": 30.1361,
|
|
"eval_samples_per_second": 2.688,
|
|
"eval_steps_per_second": 1.36,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.061656266315292245,
|
|
"grad_norm": 0.6031671166419983,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6441,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.06364517813191457,
|
|
"grad_norm": 0.5433733463287354,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6836,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.06563408994853691,
|
|
"grad_norm": 0.5863742232322693,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6804,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.06762300176515924,
|
|
"grad_norm": 0.7768782377243042,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7014,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.06961191358178156,
|
|
"grad_norm": 0.548475444316864,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6545,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.0716008253984039,
|
|
"grad_norm": 0.7511247992515564,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6478,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.07358973721502624,
|
|
"grad_norm": 0.6464333534240723,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6762,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.07557864903164856,
|
|
"grad_norm": 0.6280458569526672,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6723,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.07756756084827089,
|
|
"grad_norm": 0.6138644218444824,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6834,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.07955647266489323,
|
|
"grad_norm": 0.6612856984138489,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.662,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.07955647266489323,
|
|
"eval_loss": 0.08379939943552017,
|
|
"eval_runtime": 30.1324,
|
|
"eval_samples_per_second": 2.688,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.08154538448151555,
|
|
"grad_norm": 0.5658541917800903,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6507,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.08353429629813788,
|
|
"grad_norm": 0.5861065983772278,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6565,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.08552320811476022,
|
|
"grad_norm": 0.6580057144165039,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6961,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.08751211993138254,
|
|
"grad_norm": 0.6456801295280457,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6667,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.08950103174800488,
|
|
"grad_norm": 0.6603415608406067,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6589,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.09148994356462721,
|
|
"grad_norm": 0.6744834184646606,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6791,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.09347885538124953,
|
|
"grad_norm": 0.6219160556793213,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6748,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.09546776719787187,
|
|
"grad_norm": 0.6373462677001953,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.654,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.09745667901449419,
|
|
"grad_norm": 0.7271533608436584,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.651,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.09944559083111652,
|
|
"grad_norm": 0.6483666300773621,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6728,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.09944559083111652,
|
|
"eval_loss": 0.08319716155529022,
|
|
"eval_runtime": 30.1252,
|
|
"eval_samples_per_second": 2.689,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.10143450264773886,
|
|
"grad_norm": 0.5817425847053528,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6571,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.10342341446436118,
|
|
"grad_norm": 0.6830428838729858,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6618,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.10541232628098351,
|
|
"grad_norm": 0.5775642395019531,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6181,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.10740123809760585,
|
|
"grad_norm": 0.6007582545280457,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6839,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.10939014991422817,
|
|
"grad_norm": 0.648262083530426,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6643,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.1113790617308505,
|
|
"grad_norm": 0.6632483601570129,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.652,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.11336797354747284,
|
|
"grad_norm": 0.5972626805305481,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6938,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.11535688536409516,
|
|
"grad_norm": 0.6052406430244446,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6301,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.1173457971807175,
|
|
"grad_norm": 0.5875466465950012,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6614,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.11933470899733983,
|
|
"grad_norm": 0.7067976593971252,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.647,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.11933470899733983,
|
|
"eval_loss": 0.08319241553544998,
|
|
"eval_runtime": 30.1031,
|
|
"eval_samples_per_second": 2.691,
|
|
"eval_steps_per_second": 1.362,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.12132362081396215,
|
|
"grad_norm": 0.66518634557724,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.693,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.12331253263058449,
|
|
"grad_norm": 0.6959813833236694,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6747,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.1253014444472068,
|
|
"grad_norm": 0.935105562210083,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6686,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.12729035626382915,
|
|
"grad_norm": 0.713768720626831,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6707,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.12927926808045148,
|
|
"grad_norm": 0.7059699296951294,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7255,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.13126817989707382,
|
|
"grad_norm": 0.588306725025177,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6689,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.13325709171369615,
|
|
"grad_norm": 0.6097111105918884,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6612,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.1352460035303185,
|
|
"grad_norm": 0.642393946647644,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6743,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.1372349153469408,
|
|
"grad_norm": 0.7600162625312805,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6768,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.13922382716356313,
|
|
"grad_norm": 0.7193499207496643,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6559,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.13922382716356313,
|
|
"eval_loss": 0.08392166346311569,
|
|
"eval_runtime": 30.1253,
|
|
"eval_samples_per_second": 2.689,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.14121273898018546,
|
|
"grad_norm": 0.6542356014251709,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6775,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.1432016507968078,
|
|
"grad_norm": 0.629941999912262,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6223,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.14519056261343014,
|
|
"grad_norm": 0.6493385434150696,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6894,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.14717947443005247,
|
|
"grad_norm": 0.7201417684555054,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6858,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.14916838624667478,
|
|
"grad_norm": 0.6775253415107727,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6628,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.1511572980632971,
|
|
"grad_norm": 0.6149548292160034,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6993,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.15314620987991945,
|
|
"grad_norm": 0.6627587080001831,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.646,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.15513512169654178,
|
|
"grad_norm": 0.6701797842979431,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6927,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.15712403351316412,
|
|
"grad_norm": 0.678193211555481,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6454,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.15911294532978645,
|
|
"grad_norm": 0.6337444186210632,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6723,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.15911294532978645,
|
|
"eval_loss": 0.08426591008901596,
|
|
"eval_runtime": 30.0445,
|
|
"eval_samples_per_second": 2.696,
|
|
"eval_steps_per_second": 1.365,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.16110185714640876,
|
|
"grad_norm": 0.654451310634613,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6799,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.1630907689630311,
|
|
"grad_norm": 0.6989086866378784,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6694,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.16507968077965343,
|
|
"grad_norm": 0.6176579594612122,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6225,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.16706859259627577,
|
|
"grad_norm": 0.6462605595588684,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6584,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.1690575044128981,
|
|
"grad_norm": 0.7809733748435974,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6601,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.17104641622952044,
|
|
"grad_norm": 0.7143774032592773,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6655,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.17303532804614274,
|
|
"grad_norm": 0.7137865424156189,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6849,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.17502423986276508,
|
|
"grad_norm": 0.715568482875824,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6408,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.17701315167938741,
|
|
"grad_norm": 0.59111088514328,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6772,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.17900206349600975,
|
|
"grad_norm": 0.7616696357727051,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6671,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.17900206349600975,
|
|
"eval_loss": 0.084267757833004,
|
|
"eval_runtime": 30.1275,
|
|
"eval_samples_per_second": 2.689,
|
|
"eval_steps_per_second": 1.361,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.18099097531263209,
|
|
"grad_norm": 0.6685693860054016,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6792,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.18297988712925442,
|
|
"grad_norm": 0.7320526838302612,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6435,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.18496879894587673,
|
|
"grad_norm": 0.6541480422019958,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6649,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.18695771076249906,
|
|
"grad_norm": 0.6433006525039673,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6677,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.1889466225791214,
|
|
"grad_norm": 0.6296941041946411,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6334,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.19093553439574373,
|
|
"grad_norm": 0.7856689691543579,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7039,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.19292444621236607,
|
|
"grad_norm": 0.6200475096702576,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6602,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.19491335802898838,
|
|
"grad_norm": 0.6970551609992981,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6704,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.1969022698456107,
|
|
"grad_norm": 0.6525449752807617,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6721,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.19889118166223305,
|
|
"grad_norm": 0.7507511377334595,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6829,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.19889118166223305,
|
|
"eval_loss": 0.0835462287068367,
|
|
"eval_runtime": 30.0507,
|
|
"eval_samples_per_second": 2.695,
|
|
"eval_steps_per_second": 1.364,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.20088009347885538,
|
|
"grad_norm": 0.7378696203231812,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6567,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.20286900529547772,
|
|
"grad_norm": 0.6451396346092224,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6502,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.20485791711210005,
|
|
"grad_norm": 0.6342566013336182,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6477,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.20684682892872236,
|
|
"grad_norm": 0.7209526896476746,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6661,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.2088357407453447,
|
|
"grad_norm": 0.6808329820632935,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6515,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.21082465256196703,
|
|
"grad_norm": 0.6738231182098389,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.626,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.21281356437858936,
|
|
"grad_norm": 0.6646963357925415,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6714,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.2148024761952117,
|
|
"grad_norm": 0.6372888088226318,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6768,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.21679138801183404,
|
|
"grad_norm": 0.7138890624046326,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6949,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.21878029982845634,
|
|
"grad_norm": 0.7249679565429688,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6928,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.21878029982845634,
|
|
"eval_loss": 0.08422956615686417,
|
|
"eval_runtime": 30.1032,
|
|
"eval_samples_per_second": 2.691,
|
|
"eval_steps_per_second": 1.362,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.22076921164507868,
|
|
"grad_norm": 0.6382346153259277,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6619,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.222758123461701,
|
|
"grad_norm": 0.6400596499443054,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7103,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.22474703527832335,
|
|
"grad_norm": 0.6994810700416565,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6647,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.22673594709494568,
|
|
"grad_norm": 0.76835036277771,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6923,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.22872485891156802,
|
|
"grad_norm": 0.6603644490242004,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.673,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.23071377072819033,
|
|
"grad_norm": 0.7264408469200134,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6828,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.23270268254481266,
|
|
"grad_norm": 0.7072731852531433,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6831,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.234691594361435,
|
|
"grad_norm": 0.6494096517562866,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6659,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.23668050617805733,
|
|
"grad_norm": 0.6463006734848022,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7155,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.23866941799467967,
|
|
"grad_norm": 0.6508920192718506,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6563,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.23866941799467967,
|
|
"eval_loss": 0.08387701213359833,
|
|
"eval_runtime": 30.0299,
|
|
"eval_samples_per_second": 2.697,
|
|
"eval_steps_per_second": 1.365,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.240658329811302,
|
|
"grad_norm": 0.6701735258102417,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6765,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.2426472416279243,
|
|
"grad_norm": 0.5798119902610779,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6501,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.24463615344454664,
|
|
"grad_norm": 0.7210298776626587,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6576,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.24662506526116898,
|
|
"grad_norm": 0.7448759078979492,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6918,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.24861397707779131,
|
|
"grad_norm": 0.6556337475776672,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6526,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.2506028888944136,
|
|
"grad_norm": 0.6584301590919495,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6736,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.252591800711036,
|
|
"grad_norm": 0.6725241541862488,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6772,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.2545807125276583,
|
|
"grad_norm": 0.7188987731933594,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6629,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.25656962434428066,
|
|
"grad_norm": 0.8247680068016052,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6844,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.25855853616090296,
|
|
"grad_norm": 0.6960418224334717,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6411,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.25855853616090296,
|
|
"eval_loss": 0.0839666873216629,
|
|
"eval_runtime": 30.0597,
|
|
"eval_samples_per_second": 2.695,
|
|
"eval_steps_per_second": 1.364,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.26054744797752527,
|
|
"grad_norm": 0.6796591877937317,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6707,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.26253635979414763,
|
|
"grad_norm": 0.6542907357215881,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6691,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.26452527161076994,
|
|
"grad_norm": 0.6826708912849426,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6696,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.2665141834273923,
|
|
"grad_norm": 0.7088764905929565,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6817,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.2685030952440146,
|
|
"grad_norm": 0.7242617607116699,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6759,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.270492007060637,
|
|
"grad_norm": 0.7008711099624634,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6592,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.2724809188772593,
|
|
"grad_norm": 0.6489241719245911,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6637,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.2744698306938816,
|
|
"grad_norm": 0.7217922806739807,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6539,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.27645874251050395,
|
|
"grad_norm": 0.8037365078926086,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6994,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.27844765432712626,
|
|
"grad_norm": 0.659654974937439,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6673,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.27844765432712626,
|
|
"eval_loss": 0.0840681791305542,
|
|
"eval_runtime": 30.0353,
|
|
"eval_samples_per_second": 2.697,
|
|
"eval_steps_per_second": 1.365,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.2804365661437486,
|
|
"grad_norm": 0.7238272428512573,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6974,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.28242547796037093,
|
|
"grad_norm": 0.6564947366714478,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6687,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.28441438977699324,
|
|
"grad_norm": 0.7392669916152954,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6546,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.2864033015936156,
|
|
"grad_norm": 0.7504440546035767,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6737,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.2883922134102379,
|
|
"grad_norm": 0.7336270213127136,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6803,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.29038112522686027,
|
|
"grad_norm": 1.862186312675476,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.658,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.2923700370434826,
|
|
"grad_norm": 0.7425276637077332,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6383,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.29435894886010494,
|
|
"grad_norm": 0.6604830622673035,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7077,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.29634786067672725,
|
|
"grad_norm": 0.7673712968826294,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6722,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.29833677249334956,
|
|
"grad_norm": 0.7889634370803833,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6909,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.29833677249334956,
|
|
"eval_loss": 0.08427305519580841,
|
|
"eval_runtime": 30.0826,
|
|
"eval_samples_per_second": 2.693,
|
|
"eval_steps_per_second": 1.363,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.3003256843099719,
|
|
"grad_norm": 0.8077505826950073,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6976,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.3023145961265942,
|
|
"grad_norm": 0.6837480068206787,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.675,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.3043035079432166,
|
|
"grad_norm": 0.6629063487052917,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6396,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.3062924197598389,
|
|
"grad_norm": 0.7187213897705078,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6898,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.3082813315764612,
|
|
"grad_norm": 0.7269571423530579,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7106,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.31027024339308357,
|
|
"grad_norm": 0.6767787337303162,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6836,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.3122591552097059,
|
|
"grad_norm": 0.7046016454696655,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7129,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.31424806702632824,
|
|
"grad_norm": 0.6218843460083008,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.7129,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.31623697884295054,
|
|
"grad_norm": 0.7410914897918701,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6616,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.3182258906595729,
|
|
"grad_norm": 0.5945529937744141,
|
|
"learning_rate": 0.0004,
|
|
"loss": 0.6878,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.3182258906595729,
|
|
"eval_loss": 0.08433911204338074,
|
|
"eval_runtime": 30.0269,
|
|
"eval_samples_per_second": 2.698,
|
|
"eval_steps_per_second": 1.365,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.3202148024761952,
|
|
"grad_norm": 0.6479379534721375,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6708,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.3222037142928175,
|
|
"grad_norm": 0.6011672616004944,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6511,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.3241926261094399,
|
|
"grad_norm": 0.6457736492156982,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6601,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.3261815379260622,
|
|
"grad_norm": 0.6549608707427979,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6643,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.32817044974268456,
|
|
"grad_norm": 0.6869723200798035,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6766,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.33015936155930686,
|
|
"grad_norm": 0.6526493430137634,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6625,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.33214827337592917,
|
|
"grad_norm": 0.6106629967689514,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6412,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.33413718519255153,
|
|
"grad_norm": 0.6620826125144958,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6314,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.33612609700917384,
|
|
"grad_norm": 0.6487278938293457,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.643,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.3381150088257962,
|
|
"grad_norm": 0.7703284621238708,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.6298,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.3381150088257962,
|
|
"eval_loss": 0.08053209632635117,
|
|
"eval_runtime": 30.0784,
|
|
"eval_samples_per_second": 2.693,
|
|
"eval_steps_per_second": 1.363,
|
|
"step": 1700
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 10000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 100,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.460398769660756e+18,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|