{
"best_metric": 0.08053209632635117,
"best_model_checkpoint": "saves/Breeze-7B-FC-v1_0-15-12-2024\\checkpoint-1700",
"epoch": 0.3381150088257962,
"eval_steps": 100,
"global_step": 1700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019889118166223304,
"grad_norm": 0.6233828067779541,
"learning_rate": 0.0004,
"loss": 0.7108,
"step": 10
},
{
"epoch": 0.003977823633244661,
"grad_norm": 0.4662277102470398,
"learning_rate": 0.0004,
"loss": 0.729,
"step": 20
},
{
"epoch": 0.005966735449866992,
"grad_norm": 0.5307815670967102,
"learning_rate": 0.0004,
"loss": 0.6936,
"step": 30
},
{
"epoch": 0.007955647266489322,
"grad_norm": 0.5890870690345764,
"learning_rate": 0.0004,
"loss": 0.7229,
"step": 40
},
{
"epoch": 0.009944559083111653,
"grad_norm": 0.5515501499176025,
"learning_rate": 0.0004,
"loss": 0.6669,
"step": 50
},
{
"epoch": 0.011933470899733983,
"grad_norm": 0.5326377153396606,
"learning_rate": 0.0004,
"loss": 0.6564,
"step": 60
},
{
"epoch": 0.013922382716356313,
"grad_norm": 0.5636907815933228,
"learning_rate": 0.0004,
"loss": 0.6773,
"step": 70
},
{
"epoch": 0.015911294532978643,
"grad_norm": 0.49359801411628723,
"learning_rate": 0.0004,
"loss": 0.6516,
"step": 80
},
{
"epoch": 0.017900206349600975,
"grad_norm": 0.4739631116390228,
"learning_rate": 0.0004,
"loss": 0.6594,
"step": 90
},
{
"epoch": 0.019889118166223307,
"grad_norm": 0.5182624459266663,
"learning_rate": 0.0004,
"loss": 0.69,
"step": 100
},
{
"epoch": 0.019889118166223307,
"eval_loss": 0.08401616662740707,
"eval_runtime": 30.1147,
"eval_samples_per_second": 2.69,
"eval_steps_per_second": 1.361,
"step": 100
},
{
"epoch": 0.021878029982845635,
"grad_norm": 0.5577639937400818,
"learning_rate": 0.0004,
"loss": 0.6813,
"step": 110
},
{
"epoch": 0.023866941799467967,
"grad_norm": 0.5308650732040405,
"learning_rate": 0.0004,
"loss": 0.689,
"step": 120
},
{
"epoch": 0.025855853616090295,
"grad_norm": 0.705682635307312,
"learning_rate": 0.0004,
"loss": 0.6846,
"step": 130
},
{
"epoch": 0.027844765432712627,
"grad_norm": 0.5250112414360046,
"learning_rate": 0.0004,
"loss": 0.6699,
"step": 140
},
{
"epoch": 0.02983367724933496,
"grad_norm": 0.5885920524597168,
"learning_rate": 0.0004,
"loss": 0.6733,
"step": 150
},
{
"epoch": 0.03182258906595729,
"grad_norm": 0.5392662286758423,
"learning_rate": 0.0004,
"loss": 0.673,
"step": 160
},
{
"epoch": 0.03381150088257962,
"grad_norm": 0.576032280921936,
"learning_rate": 0.0004,
"loss": 0.6934,
"step": 170
},
{
"epoch": 0.03580041269920195,
"grad_norm": 0.5046477317810059,
"learning_rate": 0.0004,
"loss": 0.6535,
"step": 180
},
{
"epoch": 0.03778932451582428,
"grad_norm": 0.628962516784668,
"learning_rate": 0.0004,
"loss": 0.6758,
"step": 190
},
{
"epoch": 0.03977823633244661,
"grad_norm": 0.5312318801879883,
"learning_rate": 0.0004,
"loss": 0.6839,
"step": 200
},
{
"epoch": 0.03977823633244661,
"eval_loss": 0.08386518061161041,
"eval_runtime": 30.1222,
"eval_samples_per_second": 2.689,
"eval_steps_per_second": 1.361,
"step": 200
},
{
"epoch": 0.04176714814906894,
"grad_norm": 0.5689460635185242,
"learning_rate": 0.0004,
"loss": 0.6616,
"step": 210
},
{
"epoch": 0.04375605996569127,
"grad_norm": 0.5760230422019958,
"learning_rate": 0.0004,
"loss": 0.67,
"step": 220
},
{
"epoch": 0.045744971782313605,
"grad_norm": 0.6037033200263977,
"learning_rate": 0.0004,
"loss": 0.6608,
"step": 230
},
{
"epoch": 0.04773388359893593,
"grad_norm": 0.5196573138237,
"learning_rate": 0.0004,
"loss": 0.672,
"step": 240
},
{
"epoch": 0.04972279541555826,
"grad_norm": 0.5766464471817017,
"learning_rate": 0.0004,
"loss": 0.6651,
"step": 250
},
{
"epoch": 0.05171170723218059,
"grad_norm": 0.5686795711517334,
"learning_rate": 0.0004,
"loss": 0.6351,
"step": 260
},
{
"epoch": 0.053700619048802925,
"grad_norm": 0.5607637763023376,
"learning_rate": 0.0004,
"loss": 0.659,
"step": 270
},
{
"epoch": 0.05568953086542525,
"grad_norm": 0.545982837677002,
"learning_rate": 0.0004,
"loss": 0.6297,
"step": 280
},
{
"epoch": 0.05767844268204758,
"grad_norm": 0.6047331690788269,
"learning_rate": 0.0004,
"loss": 0.6541,
"step": 290
},
{
"epoch": 0.05966735449866992,
"grad_norm": 0.5864997506141663,
"learning_rate": 0.0004,
"loss": 0.6831,
"step": 300
},
{
"epoch": 0.05966735449866992,
"eval_loss": 0.0832422524690628,
"eval_runtime": 30.1361,
"eval_samples_per_second": 2.688,
"eval_steps_per_second": 1.36,
"step": 300
},
{
"epoch": 0.061656266315292245,
"grad_norm": 0.6031671166419983,
"learning_rate": 0.0004,
"loss": 0.6441,
"step": 310
},
{
"epoch": 0.06364517813191457,
"grad_norm": 0.5433733463287354,
"learning_rate": 0.0004,
"loss": 0.6836,
"step": 320
},
{
"epoch": 0.06563408994853691,
"grad_norm": 0.5863742232322693,
"learning_rate": 0.0004,
"loss": 0.6804,
"step": 330
},
{
"epoch": 0.06762300176515924,
"grad_norm": 0.7768782377243042,
"learning_rate": 0.0004,
"loss": 0.7014,
"step": 340
},
{
"epoch": 0.06961191358178156,
"grad_norm": 0.548475444316864,
"learning_rate": 0.0004,
"loss": 0.6545,
"step": 350
},
{
"epoch": 0.0716008253984039,
"grad_norm": 0.7511247992515564,
"learning_rate": 0.0004,
"loss": 0.6478,
"step": 360
},
{
"epoch": 0.07358973721502624,
"grad_norm": 0.6464333534240723,
"learning_rate": 0.0004,
"loss": 0.6762,
"step": 370
},
{
"epoch": 0.07557864903164856,
"grad_norm": 0.6280458569526672,
"learning_rate": 0.0004,
"loss": 0.6723,
"step": 380
},
{
"epoch": 0.07756756084827089,
"grad_norm": 0.6138644218444824,
"learning_rate": 0.0004,
"loss": 0.6834,
"step": 390
},
{
"epoch": 0.07955647266489323,
"grad_norm": 0.6612856984138489,
"learning_rate": 0.0004,
"loss": 0.662,
"step": 400
},
{
"epoch": 0.07955647266489323,
"eval_loss": 0.08379939943552017,
"eval_runtime": 30.1324,
"eval_samples_per_second": 2.688,
"eval_steps_per_second": 1.361,
"step": 400
},
{
"epoch": 0.08154538448151555,
"grad_norm": 0.5658541917800903,
"learning_rate": 0.0004,
"loss": 0.6507,
"step": 410
},
{
"epoch": 0.08353429629813788,
"grad_norm": 0.5861065983772278,
"learning_rate": 0.0004,
"loss": 0.6565,
"step": 420
},
{
"epoch": 0.08552320811476022,
"grad_norm": 0.6580057144165039,
"learning_rate": 0.0004,
"loss": 0.6961,
"step": 430
},
{
"epoch": 0.08751211993138254,
"grad_norm": 0.6456801295280457,
"learning_rate": 0.0004,
"loss": 0.6667,
"step": 440
},
{
"epoch": 0.08950103174800488,
"grad_norm": 0.6603415608406067,
"learning_rate": 0.0004,
"loss": 0.6589,
"step": 450
},
{
"epoch": 0.09148994356462721,
"grad_norm": 0.6744834184646606,
"learning_rate": 0.0004,
"loss": 0.6791,
"step": 460
},
{
"epoch": 0.09347885538124953,
"grad_norm": 0.6219160556793213,
"learning_rate": 0.0004,
"loss": 0.6748,
"step": 470
},
{
"epoch": 0.09546776719787187,
"grad_norm": 0.6373462677001953,
"learning_rate": 0.0004,
"loss": 0.654,
"step": 480
},
{
"epoch": 0.09745667901449419,
"grad_norm": 0.7271533608436584,
"learning_rate": 0.0004,
"loss": 0.651,
"step": 490
},
{
"epoch": 0.09944559083111652,
"grad_norm": 0.6483666300773621,
"learning_rate": 0.0004,
"loss": 0.6728,
"step": 500
},
{
"epoch": 0.09944559083111652,
"eval_loss": 0.08319716155529022,
"eval_runtime": 30.1252,
"eval_samples_per_second": 2.689,
"eval_steps_per_second": 1.361,
"step": 500
},
{
"epoch": 0.10143450264773886,
"grad_norm": 0.5817425847053528,
"learning_rate": 0.0004,
"loss": 0.6571,
"step": 510
},
{
"epoch": 0.10342341446436118,
"grad_norm": 0.6830428838729858,
"learning_rate": 0.0004,
"loss": 0.6618,
"step": 520
},
{
"epoch": 0.10541232628098351,
"grad_norm": 0.5775642395019531,
"learning_rate": 0.0004,
"loss": 0.6181,
"step": 530
},
{
"epoch": 0.10740123809760585,
"grad_norm": 0.6007582545280457,
"learning_rate": 0.0004,
"loss": 0.6839,
"step": 540
},
{
"epoch": 0.10939014991422817,
"grad_norm": 0.648262083530426,
"learning_rate": 0.0004,
"loss": 0.6643,
"step": 550
},
{
"epoch": 0.1113790617308505,
"grad_norm": 0.6632483601570129,
"learning_rate": 0.0004,
"loss": 0.652,
"step": 560
},
{
"epoch": 0.11336797354747284,
"grad_norm": 0.5972626805305481,
"learning_rate": 0.0004,
"loss": 0.6938,
"step": 570
},
{
"epoch": 0.11535688536409516,
"grad_norm": 0.6052406430244446,
"learning_rate": 0.0004,
"loss": 0.6301,
"step": 580
},
{
"epoch": 0.1173457971807175,
"grad_norm": 0.5875466465950012,
"learning_rate": 0.0004,
"loss": 0.6614,
"step": 590
},
{
"epoch": 0.11933470899733983,
"grad_norm": 0.7067976593971252,
"learning_rate": 0.0004,
"loss": 0.647,
"step": 600
},
{
"epoch": 0.11933470899733983,
"eval_loss": 0.08319241553544998,
"eval_runtime": 30.1031,
"eval_samples_per_second": 2.691,
"eval_steps_per_second": 1.362,
"step": 600
},
{
"epoch": 0.12132362081396215,
"grad_norm": 0.66518634557724,
"learning_rate": 0.0004,
"loss": 0.693,
"step": 610
},
{
"epoch": 0.12331253263058449,
"grad_norm": 0.6959813833236694,
"learning_rate": 0.0004,
"loss": 0.6747,
"step": 620
},
{
"epoch": 0.1253014444472068,
"grad_norm": 0.935105562210083,
"learning_rate": 0.0004,
"loss": 0.6686,
"step": 630
},
{
"epoch": 0.12729035626382915,
"grad_norm": 0.713768720626831,
"learning_rate": 0.0004,
"loss": 0.6707,
"step": 640
},
{
"epoch": 0.12927926808045148,
"grad_norm": 0.7059699296951294,
"learning_rate": 0.0004,
"loss": 0.7255,
"step": 650
},
{
"epoch": 0.13126817989707382,
"grad_norm": 0.588306725025177,
"learning_rate": 0.0004,
"loss": 0.6689,
"step": 660
},
{
"epoch": 0.13325709171369615,
"grad_norm": 0.6097111105918884,
"learning_rate": 0.0004,
"loss": 0.6612,
"step": 670
},
{
"epoch": 0.1352460035303185,
"grad_norm": 0.642393946647644,
"learning_rate": 0.0004,
"loss": 0.6743,
"step": 680
},
{
"epoch": 0.1372349153469408,
"grad_norm": 0.7600162625312805,
"learning_rate": 0.0004,
"loss": 0.6768,
"step": 690
},
{
"epoch": 0.13922382716356313,
"grad_norm": 0.7193499207496643,
"learning_rate": 0.0004,
"loss": 0.6559,
"step": 700
},
{
"epoch": 0.13922382716356313,
"eval_loss": 0.08392166346311569,
"eval_runtime": 30.1253,
"eval_samples_per_second": 2.689,
"eval_steps_per_second": 1.361,
"step": 700
},
{
"epoch": 0.14121273898018546,
"grad_norm": 0.6542356014251709,
"learning_rate": 0.0004,
"loss": 0.6775,
"step": 710
},
{
"epoch": 0.1432016507968078,
"grad_norm": 0.629941999912262,
"learning_rate": 0.0004,
"loss": 0.6223,
"step": 720
},
{
"epoch": 0.14519056261343014,
"grad_norm": 0.6493385434150696,
"learning_rate": 0.0004,
"loss": 0.6894,
"step": 730
},
{
"epoch": 0.14717947443005247,
"grad_norm": 0.7201417684555054,
"learning_rate": 0.0004,
"loss": 0.6858,
"step": 740
},
{
"epoch": 0.14916838624667478,
"grad_norm": 0.6775253415107727,
"learning_rate": 0.0004,
"loss": 0.6628,
"step": 750
},
{
"epoch": 0.1511572980632971,
"grad_norm": 0.6149548292160034,
"learning_rate": 0.0004,
"loss": 0.6993,
"step": 760
},
{
"epoch": 0.15314620987991945,
"grad_norm": 0.6627587080001831,
"learning_rate": 0.0004,
"loss": 0.646,
"step": 770
},
{
"epoch": 0.15513512169654178,
"grad_norm": 0.6701797842979431,
"learning_rate": 0.0004,
"loss": 0.6927,
"step": 780
},
{
"epoch": 0.15712403351316412,
"grad_norm": 0.678193211555481,
"learning_rate": 0.0004,
"loss": 0.6454,
"step": 790
},
{
"epoch": 0.15911294532978645,
"grad_norm": 0.6337444186210632,
"learning_rate": 0.0004,
"loss": 0.6723,
"step": 800
},
{
"epoch": 0.15911294532978645,
"eval_loss": 0.08426591008901596,
"eval_runtime": 30.0445,
"eval_samples_per_second": 2.696,
"eval_steps_per_second": 1.365,
"step": 800
},
{
"epoch": 0.16110185714640876,
"grad_norm": 0.654451310634613,
"learning_rate": 0.0004,
"loss": 0.6799,
"step": 810
},
{
"epoch": 0.1630907689630311,
"grad_norm": 0.6989086866378784,
"learning_rate": 0.0004,
"loss": 0.6694,
"step": 820
},
{
"epoch": 0.16507968077965343,
"grad_norm": 0.6176579594612122,
"learning_rate": 0.0004,
"loss": 0.6225,
"step": 830
},
{
"epoch": 0.16706859259627577,
"grad_norm": 0.6462605595588684,
"learning_rate": 0.0004,
"loss": 0.6584,
"step": 840
},
{
"epoch": 0.1690575044128981,
"grad_norm": 0.7809733748435974,
"learning_rate": 0.0004,
"loss": 0.6601,
"step": 850
},
{
"epoch": 0.17104641622952044,
"grad_norm": 0.7143774032592773,
"learning_rate": 0.0004,
"loss": 0.6655,
"step": 860
},
{
"epoch": 0.17303532804614274,
"grad_norm": 0.7137865424156189,
"learning_rate": 0.0004,
"loss": 0.6849,
"step": 870
},
{
"epoch": 0.17502423986276508,
"grad_norm": 0.715568482875824,
"learning_rate": 0.0004,
"loss": 0.6408,
"step": 880
},
{
"epoch": 0.17701315167938741,
"grad_norm": 0.59111088514328,
"learning_rate": 0.0004,
"loss": 0.6772,
"step": 890
},
{
"epoch": 0.17900206349600975,
"grad_norm": 0.7616696357727051,
"learning_rate": 0.0004,
"loss": 0.6671,
"step": 900
},
{
"epoch": 0.17900206349600975,
"eval_loss": 0.084267757833004,
"eval_runtime": 30.1275,
"eval_samples_per_second": 2.689,
"eval_steps_per_second": 1.361,
"step": 900
},
{
"epoch": 0.18099097531263209,
"grad_norm": 0.6685693860054016,
"learning_rate": 0.0004,
"loss": 0.6792,
"step": 910
},
{
"epoch": 0.18297988712925442,
"grad_norm": 0.7320526838302612,
"learning_rate": 0.0004,
"loss": 0.6435,
"step": 920
},
{
"epoch": 0.18496879894587673,
"grad_norm": 0.6541480422019958,
"learning_rate": 0.0004,
"loss": 0.6649,
"step": 930
},
{
"epoch": 0.18695771076249906,
"grad_norm": 0.6433006525039673,
"learning_rate": 0.0004,
"loss": 0.6677,
"step": 940
},
{
"epoch": 0.1889466225791214,
"grad_norm": 0.6296941041946411,
"learning_rate": 0.0004,
"loss": 0.6334,
"step": 950
},
{
"epoch": 0.19093553439574373,
"grad_norm": 0.7856689691543579,
"learning_rate": 0.0004,
"loss": 0.7039,
"step": 960
},
{
"epoch": 0.19292444621236607,
"grad_norm": 0.6200475096702576,
"learning_rate": 0.0004,
"loss": 0.6602,
"step": 970
},
{
"epoch": 0.19491335802898838,
"grad_norm": 0.6970551609992981,
"learning_rate": 0.0004,
"loss": 0.6704,
"step": 980
},
{
"epoch": 0.1969022698456107,
"grad_norm": 0.6525449752807617,
"learning_rate": 0.0004,
"loss": 0.6721,
"step": 990
},
{
"epoch": 0.19889118166223305,
"grad_norm": 0.7507511377334595,
"learning_rate": 0.0004,
"loss": 0.6829,
"step": 1000
},
{
"epoch": 0.19889118166223305,
"eval_loss": 0.0835462287068367,
"eval_runtime": 30.0507,
"eval_samples_per_second": 2.695,
"eval_steps_per_second": 1.364,
"step": 1000
},
{
"epoch": 0.20088009347885538,
"grad_norm": 0.7378696203231812,
"learning_rate": 0.0004,
"loss": 0.6567,
"step": 1010
},
{
"epoch": 0.20286900529547772,
"grad_norm": 0.6451396346092224,
"learning_rate": 0.0004,
"loss": 0.6502,
"step": 1020
},
{
"epoch": 0.20485791711210005,
"grad_norm": 0.6342566013336182,
"learning_rate": 0.0004,
"loss": 0.6477,
"step": 1030
},
{
"epoch": 0.20684682892872236,
"grad_norm": 0.7209526896476746,
"learning_rate": 0.0004,
"loss": 0.6661,
"step": 1040
},
{
"epoch": 0.2088357407453447,
"grad_norm": 0.6808329820632935,
"learning_rate": 0.0004,
"loss": 0.6515,
"step": 1050
},
{
"epoch": 0.21082465256196703,
"grad_norm": 0.6738231182098389,
"learning_rate": 0.0004,
"loss": 0.626,
"step": 1060
},
{
"epoch": 0.21281356437858936,
"grad_norm": 0.6646963357925415,
"learning_rate": 0.0004,
"loss": 0.6714,
"step": 1070
},
{
"epoch": 0.2148024761952117,
"grad_norm": 0.6372888088226318,
"learning_rate": 0.0004,
"loss": 0.6768,
"step": 1080
},
{
"epoch": 0.21679138801183404,
"grad_norm": 0.7138890624046326,
"learning_rate": 0.0004,
"loss": 0.6949,
"step": 1090
},
{
"epoch": 0.21878029982845634,
"grad_norm": 0.7249679565429688,
"learning_rate": 0.0004,
"loss": 0.6928,
"step": 1100
},
{
"epoch": 0.21878029982845634,
"eval_loss": 0.08422956615686417,
"eval_runtime": 30.1032,
"eval_samples_per_second": 2.691,
"eval_steps_per_second": 1.362,
"step": 1100
},
{
"epoch": 0.22076921164507868,
"grad_norm": 0.6382346153259277,
"learning_rate": 0.0004,
"loss": 0.6619,
"step": 1110
},
{
"epoch": 0.222758123461701,
"grad_norm": 0.6400596499443054,
"learning_rate": 0.0004,
"loss": 0.7103,
"step": 1120
},
{
"epoch": 0.22474703527832335,
"grad_norm": 0.6994810700416565,
"learning_rate": 0.0004,
"loss": 0.6647,
"step": 1130
},
{
"epoch": 0.22673594709494568,
"grad_norm": 0.76835036277771,
"learning_rate": 0.0004,
"loss": 0.6923,
"step": 1140
},
{
"epoch": 0.22872485891156802,
"grad_norm": 0.6603644490242004,
"learning_rate": 0.0004,
"loss": 0.673,
"step": 1150
},
{
"epoch": 0.23071377072819033,
"grad_norm": 0.7264408469200134,
"learning_rate": 0.0004,
"loss": 0.6828,
"step": 1160
},
{
"epoch": 0.23270268254481266,
"grad_norm": 0.7072731852531433,
"learning_rate": 0.0004,
"loss": 0.6831,
"step": 1170
},
{
"epoch": 0.234691594361435,
"grad_norm": 0.6494096517562866,
"learning_rate": 0.0004,
"loss": 0.6659,
"step": 1180
},
{
"epoch": 0.23668050617805733,
"grad_norm": 0.6463006734848022,
"learning_rate": 0.0004,
"loss": 0.7155,
"step": 1190
},
{
"epoch": 0.23866941799467967,
"grad_norm": 0.6508920192718506,
"learning_rate": 0.0004,
"loss": 0.6563,
"step": 1200
},
{
"epoch": 0.23866941799467967,
"eval_loss": 0.08387701213359833,
"eval_runtime": 30.0299,
"eval_samples_per_second": 2.697,
"eval_steps_per_second": 1.365,
"step": 1200
},
{
"epoch": 0.240658329811302,
"grad_norm": 0.6701735258102417,
"learning_rate": 0.0004,
"loss": 0.6765,
"step": 1210
},
{
"epoch": 0.2426472416279243,
"grad_norm": 0.5798119902610779,
"learning_rate": 0.0004,
"loss": 0.6501,
"step": 1220
},
{
"epoch": 0.24463615344454664,
"grad_norm": 0.7210298776626587,
"learning_rate": 0.0004,
"loss": 0.6576,
"step": 1230
},
{
"epoch": 0.24662506526116898,
"grad_norm": 0.7448759078979492,
"learning_rate": 0.0004,
"loss": 0.6918,
"step": 1240
},
{
"epoch": 0.24861397707779131,
"grad_norm": 0.6556337475776672,
"learning_rate": 0.0004,
"loss": 0.6526,
"step": 1250
},
{
"epoch": 0.2506028888944136,
"grad_norm": 0.6584301590919495,
"learning_rate": 0.0004,
"loss": 0.6736,
"step": 1260
},
{
"epoch": 0.252591800711036,
"grad_norm": 0.6725241541862488,
"learning_rate": 0.0004,
"loss": 0.6772,
"step": 1270
},
{
"epoch": 0.2545807125276583,
"grad_norm": 0.7188987731933594,
"learning_rate": 0.0004,
"loss": 0.6629,
"step": 1280
},
{
"epoch": 0.25656962434428066,
"grad_norm": 0.8247680068016052,
"learning_rate": 0.0004,
"loss": 0.6844,
"step": 1290
},
{
"epoch": 0.25855853616090296,
"grad_norm": 0.6960418224334717,
"learning_rate": 0.0004,
"loss": 0.6411,
"step": 1300
},
{
"epoch": 0.25855853616090296,
"eval_loss": 0.0839666873216629,
"eval_runtime": 30.0597,
"eval_samples_per_second": 2.695,
"eval_steps_per_second": 1.364,
"step": 1300
},
{
"epoch": 0.26054744797752527,
"grad_norm": 0.6796591877937317,
"learning_rate": 0.0004,
"loss": 0.6707,
"step": 1310
},
{
"epoch": 0.26253635979414763,
"grad_norm": 0.6542907357215881,
"learning_rate": 0.0004,
"loss": 0.6691,
"step": 1320
},
{
"epoch": 0.26452527161076994,
"grad_norm": 0.6826708912849426,
"learning_rate": 0.0004,
"loss": 0.6696,
"step": 1330
},
{
"epoch": 0.2665141834273923,
"grad_norm": 0.7088764905929565,
"learning_rate": 0.0004,
"loss": 0.6817,
"step": 1340
},
{
"epoch": 0.2685030952440146,
"grad_norm": 0.7242617607116699,
"learning_rate": 0.0004,
"loss": 0.6759,
"step": 1350
},
{
"epoch": 0.270492007060637,
"grad_norm": 0.7008711099624634,
"learning_rate": 0.0004,
"loss": 0.6592,
"step": 1360
},
{
"epoch": 0.2724809188772593,
"grad_norm": 0.6489241719245911,
"learning_rate": 0.0004,
"loss": 0.6637,
"step": 1370
},
{
"epoch": 0.2744698306938816,
"grad_norm": 0.7217922806739807,
"learning_rate": 0.0004,
"loss": 0.6539,
"step": 1380
},
{
"epoch": 0.27645874251050395,
"grad_norm": 0.8037365078926086,
"learning_rate": 0.0004,
"loss": 0.6994,
"step": 1390
},
{
"epoch": 0.27844765432712626,
"grad_norm": 0.659654974937439,
"learning_rate": 0.0004,
"loss": 0.6673,
"step": 1400
},
{
"epoch": 0.27844765432712626,
"eval_loss": 0.0840681791305542,
"eval_runtime": 30.0353,
"eval_samples_per_second": 2.697,
"eval_steps_per_second": 1.365,
"step": 1400
},
{
"epoch": 0.2804365661437486,
"grad_norm": 0.7238272428512573,
"learning_rate": 0.0004,
"loss": 0.6974,
"step": 1410
},
{
"epoch": 0.28242547796037093,
"grad_norm": 0.6564947366714478,
"learning_rate": 0.0004,
"loss": 0.6687,
"step": 1420
},
{
"epoch": 0.28441438977699324,
"grad_norm": 0.7392669916152954,
"learning_rate": 0.0004,
"loss": 0.6546,
"step": 1430
},
{
"epoch": 0.2864033015936156,
"grad_norm": 0.7504440546035767,
"learning_rate": 0.0004,
"loss": 0.6737,
"step": 1440
},
{
"epoch": 0.2883922134102379,
"grad_norm": 0.7336270213127136,
"learning_rate": 0.0004,
"loss": 0.6803,
"step": 1450
},
{
"epoch": 0.29038112522686027,
"grad_norm": 1.862186312675476,
"learning_rate": 0.0004,
"loss": 0.658,
"step": 1460
},
{
"epoch": 0.2923700370434826,
"grad_norm": 0.7425276637077332,
"learning_rate": 0.0004,
"loss": 0.6383,
"step": 1470
},
{
"epoch": 0.29435894886010494,
"grad_norm": 0.6604830622673035,
"learning_rate": 0.0004,
"loss": 0.7077,
"step": 1480
},
{
"epoch": 0.29634786067672725,
"grad_norm": 0.7673712968826294,
"learning_rate": 0.0004,
"loss": 0.6722,
"step": 1490
},
{
"epoch": 0.29833677249334956,
"grad_norm": 0.7889634370803833,
"learning_rate": 0.0004,
"loss": 0.6909,
"step": 1500
},
{
"epoch": 0.29833677249334956,
"eval_loss": 0.08427305519580841,
"eval_runtime": 30.0826,
"eval_samples_per_second": 2.693,
"eval_steps_per_second": 1.363,
"step": 1500
},
{
"epoch": 0.3003256843099719,
"grad_norm": 0.8077505826950073,
"learning_rate": 0.0004,
"loss": 0.6976,
"step": 1510
},
{
"epoch": 0.3023145961265942,
"grad_norm": 0.6837480068206787,
"learning_rate": 0.0004,
"loss": 0.675,
"step": 1520
},
{
"epoch": 0.3043035079432166,
"grad_norm": 0.6629063487052917,
"learning_rate": 0.0004,
"loss": 0.6396,
"step": 1530
},
{
"epoch": 0.3062924197598389,
"grad_norm": 0.7187213897705078,
"learning_rate": 0.0004,
"loss": 0.6898,
"step": 1540
},
{
"epoch": 0.3082813315764612,
"grad_norm": 0.7269571423530579,
"learning_rate": 0.0004,
"loss": 0.7106,
"step": 1550
},
{
"epoch": 0.31027024339308357,
"grad_norm": 0.6767787337303162,
"learning_rate": 0.0004,
"loss": 0.6836,
"step": 1560
},
{
"epoch": 0.3122591552097059,
"grad_norm": 0.7046016454696655,
"learning_rate": 0.0004,
"loss": 0.7129,
"step": 1570
},
{
"epoch": 0.31424806702632824,
"grad_norm": 0.6218843460083008,
"learning_rate": 0.0004,
"loss": 0.7129,
"step": 1580
},
{
"epoch": 0.31623697884295054,
"grad_norm": 0.7410914897918701,
"learning_rate": 0.0004,
"loss": 0.6616,
"step": 1590
},
{
"epoch": 0.3182258906595729,
"grad_norm": 0.5945529937744141,
"learning_rate": 0.0004,
"loss": 0.6878,
"step": 1600
},
{
"epoch": 0.3182258906595729,
"eval_loss": 0.08433911204338074,
"eval_runtime": 30.0269,
"eval_samples_per_second": 2.698,
"eval_steps_per_second": 1.365,
"step": 1600
},
{
"epoch": 0.3202148024761952,
"grad_norm": 0.6479379534721375,
"learning_rate": 4e-05,
"loss": 0.6708,
"step": 1610
},
{
"epoch": 0.3222037142928175,
"grad_norm": 0.6011672616004944,
"learning_rate": 4e-05,
"loss": 0.6511,
"step": 1620
},
{
"epoch": 0.3241926261094399,
"grad_norm": 0.6457736492156982,
"learning_rate": 4e-05,
"loss": 0.6601,
"step": 1630
},
{
"epoch": 0.3261815379260622,
"grad_norm": 0.6549608707427979,
"learning_rate": 4e-05,
"loss": 0.6643,
"step": 1640
},
{
"epoch": 0.32817044974268456,
"grad_norm": 0.6869723200798035,
"learning_rate": 4e-05,
"loss": 0.6766,
"step": 1650
},
{
"epoch": 0.33015936155930686,
"grad_norm": 0.6526493430137634,
"learning_rate": 4e-05,
"loss": 0.6625,
"step": 1660
},
{
"epoch": 0.33214827337592917,
"grad_norm": 0.6106629967689514,
"learning_rate": 4e-05,
"loss": 0.6412,
"step": 1670
},
{
"epoch": 0.33413718519255153,
"grad_norm": 0.6620826125144958,
"learning_rate": 4e-05,
"loss": 0.6314,
"step": 1680
},
{
"epoch": 0.33612609700917384,
"grad_norm": 0.6487278938293457,
"learning_rate": 4e-05,
"loss": 0.643,
"step": 1690
},
{
"epoch": 0.3381150088257962,
"grad_norm": 0.7703284621238708,
"learning_rate": 4e-05,
"loss": 0.6298,
"step": 1700
},
{
"epoch": 0.3381150088257962,
"eval_loss": 0.08053209632635117,
"eval_runtime": 30.0784,
"eval_samples_per_second": 2.693,
"eval_steps_per_second": 1.363,
"step": 1700
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.460398769660756e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}