{
"best_metric": 11.866828918457031,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.006207613638127163,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.103806819063581e-05,
"grad_norm": 0.06145656853914261,
"learning_rate": 1.009e-05,
"loss": 11.9393,
"step": 1
},
{
"epoch": 3.103806819063581e-05,
"eval_loss": 11.937196731567383,
"eval_runtime": 414.1282,
"eval_samples_per_second": 32.758,
"eval_steps_per_second": 8.191,
"step": 1
},
{
"epoch": 6.207613638127163e-05,
"grad_norm": 0.10884089767932892,
"learning_rate": 2.018e-05,
"loss": 11.9275,
"step": 2
},
{
"epoch": 9.311420457190745e-05,
"grad_norm": 0.11498255282640457,
"learning_rate": 3.027e-05,
"loss": 11.9388,
"step": 3
},
{
"epoch": 0.00012415227276254325,
"grad_norm": 0.088991180062294,
"learning_rate": 4.036e-05,
"loss": 11.9282,
"step": 4
},
{
"epoch": 0.00015519034095317908,
"grad_norm": 0.07985679805278778,
"learning_rate": 5.045e-05,
"loss": 11.9416,
"step": 5
},
{
"epoch": 0.0001862284091438149,
"grad_norm": 0.07948625832796097,
"learning_rate": 6.054e-05,
"loss": 11.9386,
"step": 6
},
{
"epoch": 0.0002172664773344507,
"grad_norm": 0.08249641954898834,
"learning_rate": 7.062999999999999e-05,
"loss": 11.9342,
"step": 7
},
{
"epoch": 0.0002483045455250865,
"grad_norm": 0.06686617434024811,
"learning_rate": 8.072e-05,
"loss": 11.9395,
"step": 8
},
{
"epoch": 0.0002793426137157223,
"grad_norm": 0.07867742329835892,
"learning_rate": 9.081e-05,
"loss": 11.9405,
"step": 9
},
{
"epoch": 0.00031038068190635817,
"grad_norm": 0.09347719699144363,
"learning_rate": 0.0001009,
"loss": 11.9347,
"step": 10
},
{
"epoch": 0.000341418750096994,
"grad_norm": 0.09216347336769104,
"learning_rate": 0.00010036894736842106,
"loss": 11.9372,
"step": 11
},
{
"epoch": 0.0003724568182876298,
"grad_norm": 0.08478887379169464,
"learning_rate": 9.98378947368421e-05,
"loss": 11.9306,
"step": 12
},
{
"epoch": 0.0004034948864782656,
"grad_norm": 0.10052121430635452,
"learning_rate": 9.930684210526315e-05,
"loss": 11.9362,
"step": 13
},
{
"epoch": 0.0004345329546689014,
"grad_norm": 0.09549523890018463,
"learning_rate": 9.877578947368421e-05,
"loss": 11.933,
"step": 14
},
{
"epoch": 0.0004655710228595372,
"grad_norm": 0.10671835392713547,
"learning_rate": 9.824473684210527e-05,
"loss": 11.9354,
"step": 15
},
{
"epoch": 0.000496609091050173,
"grad_norm": 0.12740136682987213,
"learning_rate": 9.771368421052632e-05,
"loss": 11.9261,
"step": 16
},
{
"epoch": 0.0005276471592408088,
"grad_norm": 0.10865606367588043,
"learning_rate": 9.718263157894736e-05,
"loss": 11.9368,
"step": 17
},
{
"epoch": 0.0005586852274314446,
"grad_norm": 0.15083859860897064,
"learning_rate": 9.665157894736842e-05,
"loss": 11.9356,
"step": 18
},
{
"epoch": 0.0005897232956220805,
"grad_norm": 0.18175148963928223,
"learning_rate": 9.612052631578948e-05,
"loss": 11.9251,
"step": 19
},
{
"epoch": 0.0006207613638127163,
"grad_norm": 0.1715611070394516,
"learning_rate": 9.558947368421052e-05,
"loss": 11.927,
"step": 20
},
{
"epoch": 0.0006517994320033521,
"grad_norm": 0.17699895799160004,
"learning_rate": 9.505842105263159e-05,
"loss": 11.9289,
"step": 21
},
{
"epoch": 0.000682837500193988,
"grad_norm": 0.17426027357578278,
"learning_rate": 9.452736842105263e-05,
"loss": 11.9333,
"step": 22
},
{
"epoch": 0.0007138755683846238,
"grad_norm": 0.20959268510341644,
"learning_rate": 9.399631578947368e-05,
"loss": 11.9314,
"step": 23
},
{
"epoch": 0.0007449136365752596,
"grad_norm": 0.22871772944927216,
"learning_rate": 9.346526315789474e-05,
"loss": 11.9286,
"step": 24
},
{
"epoch": 0.0007759517047658954,
"grad_norm": 0.24363191425800323,
"learning_rate": 9.293421052631578e-05,
"loss": 11.9192,
"step": 25
},
{
"epoch": 0.0008069897729565312,
"grad_norm": 0.28882232308387756,
"learning_rate": 9.240315789473684e-05,
"loss": 11.9225,
"step": 26
},
{
"epoch": 0.000838027841147167,
"grad_norm": 0.3017212152481079,
"learning_rate": 9.18721052631579e-05,
"loss": 11.9349,
"step": 27
},
{
"epoch": 0.0008690659093378028,
"grad_norm": 0.36739078164100647,
"learning_rate": 9.134105263157895e-05,
"loss": 11.9307,
"step": 28
},
{
"epoch": 0.0009001039775284386,
"grad_norm": 0.35056230425834656,
"learning_rate": 9.081e-05,
"loss": 11.9151,
"step": 29
},
{
"epoch": 0.0009311420457190744,
"grad_norm": 0.3288112282752991,
"learning_rate": 9.027894736842105e-05,
"loss": 11.9204,
"step": 30
},
{
"epoch": 0.0009621801139097103,
"grad_norm": 0.43591663241386414,
"learning_rate": 8.97478947368421e-05,
"loss": 11.9073,
"step": 31
},
{
"epoch": 0.000993218182100346,
"grad_norm": 0.36357542872428894,
"learning_rate": 8.921684210526316e-05,
"loss": 11.9184,
"step": 32
},
{
"epoch": 0.0010242562502909818,
"grad_norm": 0.4034142792224884,
"learning_rate": 8.86857894736842e-05,
"loss": 11.9087,
"step": 33
},
{
"epoch": 0.0010552943184816176,
"grad_norm": 0.33256056904792786,
"learning_rate": 8.815473684210527e-05,
"loss": 11.9187,
"step": 34
},
{
"epoch": 0.0010863323866722534,
"grad_norm": 0.3923008143901825,
"learning_rate": 8.762368421052631e-05,
"loss": 11.9074,
"step": 35
},
{
"epoch": 0.0011173704548628892,
"grad_norm": 0.4441995322704315,
"learning_rate": 8.709263157894737e-05,
"loss": 11.9153,
"step": 36
},
{
"epoch": 0.0011484085230535253,
"grad_norm": 0.3605518639087677,
"learning_rate": 8.656157894736843e-05,
"loss": 11.912,
"step": 37
},
{
"epoch": 0.001179446591244161,
"grad_norm": 0.4256209135055542,
"learning_rate": 8.603052631578947e-05,
"loss": 11.9072,
"step": 38
},
{
"epoch": 0.0012104846594347969,
"grad_norm": 0.35672155022621155,
"learning_rate": 8.549947368421052e-05,
"loss": 11.9033,
"step": 39
},
{
"epoch": 0.0012415227276254327,
"grad_norm": 0.3789241909980774,
"learning_rate": 8.496842105263158e-05,
"loss": 11.8991,
"step": 40
},
{
"epoch": 0.0012725607958160685,
"grad_norm": 0.353971928358078,
"learning_rate": 8.443736842105264e-05,
"loss": 11.9023,
"step": 41
},
{
"epoch": 0.0013035988640067043,
"grad_norm": 0.36267516016960144,
"learning_rate": 8.390631578947369e-05,
"loss": 11.8962,
"step": 42
},
{
"epoch": 0.00133463693219734,
"grad_norm": 0.30361491441726685,
"learning_rate": 8.337526315789473e-05,
"loss": 11.8893,
"step": 43
},
{
"epoch": 0.001365675000387976,
"grad_norm": 0.3609680235385895,
"learning_rate": 8.284421052631579e-05,
"loss": 11.898,
"step": 44
},
{
"epoch": 0.0013967130685786117,
"grad_norm": 0.3087683618068695,
"learning_rate": 8.231315789473685e-05,
"loss": 11.89,
"step": 45
},
{
"epoch": 0.0014277511367692475,
"grad_norm": 0.3304439187049866,
"learning_rate": 8.178210526315789e-05,
"loss": 11.8939,
"step": 46
},
{
"epoch": 0.0014587892049598833,
"grad_norm": 0.29173874855041504,
"learning_rate": 8.125105263157894e-05,
"loss": 11.8919,
"step": 47
},
{
"epoch": 0.0014898272731505191,
"grad_norm": 0.23176749050617218,
"learning_rate": 8.072e-05,
"loss": 11.895,
"step": 48
},
{
"epoch": 0.001520865341341155,
"grad_norm": 0.29861122369766235,
"learning_rate": 8.018894736842106e-05,
"loss": 11.8894,
"step": 49
},
{
"epoch": 0.0015519034095317907,
"grad_norm": 0.22058968245983124,
"learning_rate": 7.965789473684211e-05,
"loss": 11.8917,
"step": 50
},
{
"epoch": 0.0015519034095317907,
"eval_loss": 11.89046573638916,
"eval_runtime": 412.729,
"eval_samples_per_second": 32.869,
"eval_steps_per_second": 8.218,
"step": 50
},
{
"epoch": 0.0015829414777224265,
"grad_norm": 0.28083565831184387,
"learning_rate": 7.912684210526315e-05,
"loss": 11.8962,
"step": 51
},
{
"epoch": 0.0016139795459130623,
"grad_norm": 0.40294113755226135,
"learning_rate": 7.859578947368421e-05,
"loss": 11.8997,
"step": 52
},
{
"epoch": 0.0016450176141036982,
"grad_norm": 0.3762059807777405,
"learning_rate": 7.806473684210527e-05,
"loss": 11.8922,
"step": 53
},
{
"epoch": 0.001676055682294334,
"grad_norm": 0.3565743863582611,
"learning_rate": 7.753368421052631e-05,
"loss": 11.8875,
"step": 54
},
{
"epoch": 0.0017070937504849698,
"grad_norm": 0.23867182433605194,
"learning_rate": 7.700263157894738e-05,
"loss": 11.8867,
"step": 55
},
{
"epoch": 0.0017381318186756056,
"grad_norm": 0.25642549991607666,
"learning_rate": 7.647157894736842e-05,
"loss": 11.8881,
"step": 56
},
{
"epoch": 0.0017691698868662414,
"grad_norm": 0.21004194021224976,
"learning_rate": 7.594052631578948e-05,
"loss": 11.8872,
"step": 57
},
{
"epoch": 0.0018002079550568772,
"grad_norm": 0.21973472833633423,
"learning_rate": 7.540947368421053e-05,
"loss": 11.8852,
"step": 58
},
{
"epoch": 0.001831246023247513,
"grad_norm": 0.17524276673793793,
"learning_rate": 7.487842105263157e-05,
"loss": 11.8839,
"step": 59
},
{
"epoch": 0.0018622840914381488,
"grad_norm": 0.18540893495082855,
"learning_rate": 7.434736842105263e-05,
"loss": 11.8891,
"step": 60
},
{
"epoch": 0.0018933221596287846,
"grad_norm": 0.19046209752559662,
"learning_rate": 7.381631578947368e-05,
"loss": 11.8908,
"step": 61
},
{
"epoch": 0.0019243602278194206,
"grad_norm": 0.1590886116027832,
"learning_rate": 7.328526315789474e-05,
"loss": 11.8905,
"step": 62
},
{
"epoch": 0.0019553982960100564,
"grad_norm": 0.188175231218338,
"learning_rate": 7.27542105263158e-05,
"loss": 11.8886,
"step": 63
},
{
"epoch": 0.001986436364200692,
"grad_norm": 0.1609676331281662,
"learning_rate": 7.222315789473684e-05,
"loss": 11.8729,
"step": 64
},
{
"epoch": 0.002017474432391328,
"grad_norm": 0.13807262480258942,
"learning_rate": 7.16921052631579e-05,
"loss": 11.8826,
"step": 65
},
{
"epoch": 0.0020485125005819636,
"grad_norm": 0.14472399652004242,
"learning_rate": 7.116105263157895e-05,
"loss": 11.8801,
"step": 66
},
{
"epoch": 0.0020795505687725997,
"grad_norm": 0.17055800557136536,
"learning_rate": 7.062999999999999e-05,
"loss": 11.8832,
"step": 67
},
{
"epoch": 0.0021105886369632352,
"grad_norm": 0.1315099149942398,
"learning_rate": 7.009894736842106e-05,
"loss": 11.8775,
"step": 68
},
{
"epoch": 0.0021416267051538713,
"grad_norm": 0.10270003229379654,
"learning_rate": 6.95678947368421e-05,
"loss": 11.8751,
"step": 69
},
{
"epoch": 0.002172664773344507,
"grad_norm": 0.13660825788974762,
"learning_rate": 6.903684210526316e-05,
"loss": 11.8779,
"step": 70
},
{
"epoch": 0.002203702841535143,
"grad_norm": 0.11236346513032913,
"learning_rate": 6.850578947368422e-05,
"loss": 11.8768,
"step": 71
},
{
"epoch": 0.0022347409097257785,
"grad_norm": 0.20397527515888214,
"learning_rate": 6.797473684210526e-05,
"loss": 11.8695,
"step": 72
},
{
"epoch": 0.0022657789779164145,
"grad_norm": 0.186505526304245,
"learning_rate": 6.744368421052631e-05,
"loss": 11.8669,
"step": 73
},
{
"epoch": 0.0022968170461070505,
"grad_norm": 0.12963062524795532,
"learning_rate": 6.691263157894736e-05,
"loss": 11.8862,
"step": 74
},
{
"epoch": 0.002327855114297686,
"grad_norm": 0.10962740331888199,
"learning_rate": 6.638157894736843e-05,
"loss": 11.8836,
"step": 75
},
{
"epoch": 0.002358893182488322,
"grad_norm": 0.13987334072589874,
"learning_rate": 6.585052631578948e-05,
"loss": 11.8744,
"step": 76
},
{
"epoch": 0.0023899312506789577,
"grad_norm": 0.19806340336799622,
"learning_rate": 6.531947368421052e-05,
"loss": 11.8869,
"step": 77
},
{
"epoch": 0.0024209693188695937,
"grad_norm": 0.11947502940893173,
"learning_rate": 6.478842105263158e-05,
"loss": 11.8842,
"step": 78
},
{
"epoch": 0.0024520073870602293,
"grad_norm": 0.17792446911334991,
"learning_rate": 6.425736842105264e-05,
"loss": 11.8844,
"step": 79
},
{
"epoch": 0.0024830454552508653,
"grad_norm": 0.13691994547843933,
"learning_rate": 6.372631578947368e-05,
"loss": 11.8785,
"step": 80
},
{
"epoch": 0.002514083523441501,
"grad_norm": 0.19070185720920563,
"learning_rate": 6.319526315789473e-05,
"loss": 11.8824,
"step": 81
},
{
"epoch": 0.002545121591632137,
"grad_norm": 0.13721631467342377,
"learning_rate": 6.266421052631579e-05,
"loss": 11.8779,
"step": 82
},
{
"epoch": 0.0025761596598227725,
"grad_norm": 0.11055205017328262,
"learning_rate": 6.213315789473685e-05,
"loss": 11.8808,
"step": 83
},
{
"epoch": 0.0026071977280134086,
"grad_norm": 0.1432357281446457,
"learning_rate": 6.16021052631579e-05,
"loss": 11.8835,
"step": 84
},
{
"epoch": 0.002638235796204044,
"grad_norm": 0.15524394810199738,
"learning_rate": 6.107105263157894e-05,
"loss": 11.8805,
"step": 85
},
{
"epoch": 0.00266927386439468,
"grad_norm": 0.12444309145212173,
"learning_rate": 6.054e-05,
"loss": 11.8804,
"step": 86
},
{
"epoch": 0.0027003119325853158,
"grad_norm": 0.19381719827651978,
"learning_rate": 6.000894736842105e-05,
"loss": 11.8824,
"step": 87
},
{
"epoch": 0.002731350000775952,
"grad_norm": 0.1044355109333992,
"learning_rate": 5.94778947368421e-05,
"loss": 11.8786,
"step": 88
},
{
"epoch": 0.0027623880689665874,
"grad_norm": 0.1657666563987732,
"learning_rate": 5.894684210526316e-05,
"loss": 11.8843,
"step": 89
},
{
"epoch": 0.0027934261371572234,
"grad_norm": 0.15279187262058258,
"learning_rate": 5.841578947368421e-05,
"loss": 11.8778,
"step": 90
},
{
"epoch": 0.002824464205347859,
"grad_norm": 0.13506744801998138,
"learning_rate": 5.7884736842105265e-05,
"loss": 11.8795,
"step": 91
},
{
"epoch": 0.002855502273538495,
"grad_norm": 0.1315164864063263,
"learning_rate": 5.7353684210526314e-05,
"loss": 11.8806,
"step": 92
},
{
"epoch": 0.0028865403417291306,
"grad_norm": 0.1755332350730896,
"learning_rate": 5.6822631578947364e-05,
"loss": 11.8803,
"step": 93
},
{
"epoch": 0.0029175784099197666,
"grad_norm": 0.11907346546649933,
"learning_rate": 5.629157894736842e-05,
"loss": 11.8779,
"step": 94
},
{
"epoch": 0.0029486164781104022,
"grad_norm": 0.1171933189034462,
"learning_rate": 5.576052631578948e-05,
"loss": 11.874,
"step": 95
},
{
"epoch": 0.0029796545463010382,
"grad_norm": 0.13140052556991577,
"learning_rate": 5.522947368421053e-05,
"loss": 11.8762,
"step": 96
},
{
"epoch": 0.003010692614491674,
"grad_norm": 0.12108182907104492,
"learning_rate": 5.469842105263158e-05,
"loss": 11.8796,
"step": 97
},
{
"epoch": 0.00304173068268231,
"grad_norm": 0.10808394849300385,
"learning_rate": 5.416736842105263e-05,
"loss": 11.8786,
"step": 98
},
{
"epoch": 0.003072768750872946,
"grad_norm": 0.08503346145153046,
"learning_rate": 5.3636315789473685e-05,
"loss": 11.8801,
"step": 99
},
{
"epoch": 0.0031038068190635815,
"grad_norm": 0.1425827592611313,
"learning_rate": 5.3105263157894734e-05,
"loss": 11.8832,
"step": 100
},
{
"epoch": 0.0031038068190635815,
"eval_loss": 11.877439498901367,
"eval_runtime": 413.0154,
"eval_samples_per_second": 32.846,
"eval_steps_per_second": 8.213,
"step": 100
},
{
"epoch": 0.0031348448872542175,
"grad_norm": 0.2032780945301056,
"learning_rate": 5.257421052631578e-05,
"loss": 11.8815,
"step": 101
},
{
"epoch": 0.003165882955444853,
"grad_norm": 0.234865203499794,
"learning_rate": 5.2043157894736846e-05,
"loss": 11.8732,
"step": 102
},
{
"epoch": 0.003196921023635489,
"grad_norm": 0.2010481357574463,
"learning_rate": 5.1512105263157895e-05,
"loss": 11.8762,
"step": 103
},
{
"epoch": 0.0032279590918261247,
"grad_norm": 0.30129870772361755,
"learning_rate": 5.098105263157895e-05,
"loss": 11.8719,
"step": 104
},
{
"epoch": 0.0032589971600167607,
"grad_norm": 0.1704847365617752,
"learning_rate": 5.045e-05,
"loss": 11.8783,
"step": 105
},
{
"epoch": 0.0032900352282073963,
"grad_norm": 0.125693678855896,
"learning_rate": 4.991894736842105e-05,
"loss": 11.8782,
"step": 106
},
{
"epoch": 0.0033210732963980323,
"grad_norm": 0.1328500360250473,
"learning_rate": 4.9387894736842105e-05,
"loss": 11.8734,
"step": 107
},
{
"epoch": 0.003352111364588668,
"grad_norm": 0.10017025470733643,
"learning_rate": 4.885684210526316e-05,
"loss": 11.8779,
"step": 108
},
{
"epoch": 0.003383149432779304,
"grad_norm": 0.1521725356578827,
"learning_rate": 4.832578947368421e-05,
"loss": 11.8802,
"step": 109
},
{
"epoch": 0.0034141875009699395,
"grad_norm": 0.12134622782468796,
"learning_rate": 4.779473684210526e-05,
"loss": 11.8718,
"step": 110
},
{
"epoch": 0.0034452255691605756,
"grad_norm": 0.13203729689121246,
"learning_rate": 4.7263684210526315e-05,
"loss": 11.8839,
"step": 111
},
{
"epoch": 0.003476263637351211,
"grad_norm": 0.14758096635341644,
"learning_rate": 4.673263157894737e-05,
"loss": 11.8757,
"step": 112
},
{
"epoch": 0.003507301705541847,
"grad_norm": 0.14375559985637665,
"learning_rate": 4.620157894736842e-05,
"loss": 11.8748,
"step": 113
},
{
"epoch": 0.0035383397737324828,
"grad_norm": 0.10907881706953049,
"learning_rate": 4.5670526315789475e-05,
"loss": 11.8673,
"step": 114
},
{
"epoch": 0.0035693778419231188,
"grad_norm": 0.11960776150226593,
"learning_rate": 4.5139473684210524e-05,
"loss": 11.8824,
"step": 115
},
{
"epoch": 0.0036004159101137544,
"grad_norm": 0.12177892029285431,
"learning_rate": 4.460842105263158e-05,
"loss": 11.8694,
"step": 116
},
{
"epoch": 0.0036314539783043904,
"grad_norm": 0.11586112529039383,
"learning_rate": 4.4077368421052636e-05,
"loss": 11.8777,
"step": 117
},
{
"epoch": 0.003662492046495026,
"grad_norm": 0.10951674729585648,
"learning_rate": 4.3546315789473685e-05,
"loss": 11.8792,
"step": 118
},
{
"epoch": 0.003693530114685662,
"grad_norm": 0.13929174840450287,
"learning_rate": 4.3015263157894734e-05,
"loss": 11.8713,
"step": 119
},
{
"epoch": 0.0037245681828762976,
"grad_norm": 0.14048022031784058,
"learning_rate": 4.248421052631579e-05,
"loss": 11.8762,
"step": 120
},
{
"epoch": 0.0037556062510669336,
"grad_norm": 0.1154230386018753,
"learning_rate": 4.1953157894736846e-05,
"loss": 11.8679,
"step": 121
},
{
"epoch": 0.003786644319257569,
"grad_norm": 0.11130271852016449,
"learning_rate": 4.1422105263157895e-05,
"loss": 11.8732,
"step": 122
},
{
"epoch": 0.0038176823874482052,
"grad_norm": 0.19456033408641815,
"learning_rate": 4.0891052631578944e-05,
"loss": 11.8726,
"step": 123
},
{
"epoch": 0.0038487204556388412,
"grad_norm": 0.17270055413246155,
"learning_rate": 4.036e-05,
"loss": 11.8599,
"step": 124
},
{
"epoch": 0.003879758523829477,
"grad_norm": 0.1157611832022667,
"learning_rate": 3.9828947368421056e-05,
"loss": 11.8741,
"step": 125
},
{
"epoch": 0.003910796592020113,
"grad_norm": 0.13942551612854004,
"learning_rate": 3.9297894736842105e-05,
"loss": 11.8738,
"step": 126
},
{
"epoch": 0.003941834660210749,
"grad_norm": 0.1275920271873474,
"learning_rate": 3.8766842105263154e-05,
"loss": 11.8615,
"step": 127
},
{
"epoch": 0.003972872728401384,
"grad_norm": 0.09613733738660812,
"learning_rate": 3.823578947368421e-05,
"loss": 11.8738,
"step": 128
},
{
"epoch": 0.00400391079659202,
"grad_norm": 0.16122451424598694,
"learning_rate": 3.7704736842105265e-05,
"loss": 11.8692,
"step": 129
},
{
"epoch": 0.004034948864782656,
"grad_norm": 0.13367600739002228,
"learning_rate": 3.7173684210526315e-05,
"loss": 11.8725,
"step": 130
},
{
"epoch": 0.004065986932973292,
"grad_norm": 0.11731020361185074,
"learning_rate": 3.664263157894737e-05,
"loss": 11.8773,
"step": 131
},
{
"epoch": 0.004097025001163927,
"grad_norm": 0.13289429247379303,
"learning_rate": 3.611157894736842e-05,
"loss": 11.8661,
"step": 132
},
{
"epoch": 0.004128063069354563,
"grad_norm": 0.13751353323459625,
"learning_rate": 3.5580526315789475e-05,
"loss": 11.8654,
"step": 133
},
{
"epoch": 0.004159101137545199,
"grad_norm": 0.13276422023773193,
"learning_rate": 3.504947368421053e-05,
"loss": 11.8733,
"step": 134
},
{
"epoch": 0.004190139205735835,
"grad_norm": 0.12328977137804031,
"learning_rate": 3.451842105263158e-05,
"loss": 11.8791,
"step": 135
},
{
"epoch": 0.0042211772739264705,
"grad_norm": 0.14081747829914093,
"learning_rate": 3.398736842105263e-05,
"loss": 11.8727,
"step": 136
},
{
"epoch": 0.0042522153421171065,
"grad_norm": 0.13730375468730927,
"learning_rate": 3.345631578947368e-05,
"loss": 11.8715,
"step": 137
},
{
"epoch": 0.0042832534103077425,
"grad_norm": 0.1453617960214615,
"learning_rate": 3.292526315789474e-05,
"loss": 11.8744,
"step": 138
},
{
"epoch": 0.0043142914784983786,
"grad_norm": 0.18549758195877075,
"learning_rate": 3.239421052631579e-05,
"loss": 11.8698,
"step": 139
},
{
"epoch": 0.004345329546689014,
"grad_norm": 0.13958542048931122,
"learning_rate": 3.186315789473684e-05,
"loss": 11.8703,
"step": 140
},
{
"epoch": 0.00437636761487965,
"grad_norm": 0.1116572916507721,
"learning_rate": 3.1332105263157895e-05,
"loss": 11.8733,
"step": 141
},
{
"epoch": 0.004407405683070286,
"grad_norm": 0.15264497697353363,
"learning_rate": 3.080105263157895e-05,
"loss": 11.8744,
"step": 142
},
{
"epoch": 0.004438443751260922,
"grad_norm": 0.16946198046207428,
"learning_rate": 3.027e-05,
"loss": 11.8748,
"step": 143
},
{
"epoch": 0.004469481819451557,
"grad_norm": 0.17827944457530975,
"learning_rate": 2.973894736842105e-05,
"loss": 11.8683,
"step": 144
},
{
"epoch": 0.004500519887642193,
"grad_norm": 0.15590260922908783,
"learning_rate": 2.9207894736842105e-05,
"loss": 11.8667,
"step": 145
},
{
"epoch": 0.004531557955832829,
"grad_norm": 0.17276710271835327,
"learning_rate": 2.8676842105263157e-05,
"loss": 11.8716,
"step": 146
},
{
"epoch": 0.004562596024023465,
"grad_norm": 0.15102654695510864,
"learning_rate": 2.814578947368421e-05,
"loss": 11.8671,
"step": 147
},
{
"epoch": 0.004593634092214101,
"grad_norm": 0.18881015479564667,
"learning_rate": 2.7614736842105266e-05,
"loss": 11.8761,
"step": 148
},
{
"epoch": 0.004624672160404736,
"grad_norm": 0.20375801622867584,
"learning_rate": 2.7083684210526315e-05,
"loss": 11.8732,
"step": 149
},
{
"epoch": 0.004655710228595372,
"grad_norm": 0.221470445394516,
"learning_rate": 2.6552631578947367e-05,
"loss": 11.8677,
"step": 150
},
{
"epoch": 0.004655710228595372,
"eval_loss": 11.868966102600098,
"eval_runtime": 414.3737,
"eval_samples_per_second": 32.739,
"eval_steps_per_second": 8.186,
"step": 150
},
{
"epoch": 0.004686748296786008,
"grad_norm": 0.1980566531419754,
"learning_rate": 2.6021578947368423e-05,
"loss": 11.8806,
"step": 151
},
{
"epoch": 0.004717786364976644,
"grad_norm": 0.19020256400108337,
"learning_rate": 2.5490526315789475e-05,
"loss": 11.8685,
"step": 152
},
{
"epoch": 0.004748824433167279,
"grad_norm": 0.24149803817272186,
"learning_rate": 2.4959473684210524e-05,
"loss": 11.857,
"step": 153
},
{
"epoch": 0.004779862501357915,
"grad_norm": 0.2381637543439865,
"learning_rate": 2.442842105263158e-05,
"loss": 11.8716,
"step": 154
},
{
"epoch": 0.0048109005695485514,
"grad_norm": 0.18365250527858734,
"learning_rate": 2.389736842105263e-05,
"loss": 11.8579,
"step": 155
},
{
"epoch": 0.0048419386377391875,
"grad_norm": 0.13402819633483887,
"learning_rate": 2.3366315789473685e-05,
"loss": 11.8672,
"step": 156
},
{
"epoch": 0.004872976705929823,
"grad_norm": 0.13781289756298065,
"learning_rate": 2.2835263157894738e-05,
"loss": 11.869,
"step": 157
},
{
"epoch": 0.004904014774120459,
"grad_norm": 0.1478467583656311,
"learning_rate": 2.230421052631579e-05,
"loss": 11.8676,
"step": 158
},
{
"epoch": 0.004935052842311095,
"grad_norm": 0.12892381846904755,
"learning_rate": 2.1773157894736843e-05,
"loss": 11.8707,
"step": 159
},
{
"epoch": 0.004966090910501731,
"grad_norm": 0.1606762856245041,
"learning_rate": 2.1242105263157895e-05,
"loss": 11.8716,
"step": 160
},
{
"epoch": 0.004997128978692366,
"grad_norm": 0.07333854585886002,
"learning_rate": 2.0711052631578947e-05,
"loss": 11.8637,
"step": 161
},
{
"epoch": 0.005028167046883002,
"grad_norm": 0.15114228427410126,
"learning_rate": 2.018e-05,
"loss": 11.8736,
"step": 162
},
{
"epoch": 0.005059205115073638,
"grad_norm": 0.0825541689991951,
"learning_rate": 1.9648947368421052e-05,
"loss": 11.859,
"step": 163
},
{
"epoch": 0.005090243183264274,
"grad_norm": 0.14535300433635712,
"learning_rate": 1.9117894736842105e-05,
"loss": 11.8684,
"step": 164
},
{
"epoch": 0.005121281251454909,
"grad_norm": 0.10561253130435944,
"learning_rate": 1.8586842105263157e-05,
"loss": 11.8702,
"step": 165
},
{
"epoch": 0.005152319319645545,
"grad_norm": 0.1263810098171234,
"learning_rate": 1.805578947368421e-05,
"loss": 11.8643,
"step": 166
},
{
"epoch": 0.005183357387836181,
"grad_norm": 0.07599103450775146,
"learning_rate": 1.7524736842105266e-05,
"loss": 11.8712,
"step": 167
},
{
"epoch": 0.005214395456026817,
"grad_norm": 0.10940805822610855,
"learning_rate": 1.6993684210526315e-05,
"loss": 11.8615,
"step": 168
},
{
"epoch": 0.005245433524217452,
"grad_norm": 0.07466328144073486,
"learning_rate": 1.646263157894737e-05,
"loss": 11.8694,
"step": 169
},
{
"epoch": 0.005276471592408088,
"grad_norm": 0.16310299932956696,
"learning_rate": 1.593157894736842e-05,
"loss": 11.8719,
"step": 170
},
{
"epoch": 0.005307509660598724,
"grad_norm": 0.09811323136091232,
"learning_rate": 1.5400526315789475e-05,
"loss": 11.8716,
"step": 171
},
{
"epoch": 0.00533854772878936,
"grad_norm": 0.07701898366212845,
"learning_rate": 1.4869473684210524e-05,
"loss": 11.8667,
"step": 172
},
{
"epoch": 0.005369585796979996,
"grad_norm": 0.09958688914775848,
"learning_rate": 1.4338421052631579e-05,
"loss": 11.869,
"step": 173
},
{
"epoch": 0.0054006238651706315,
"grad_norm": 0.10621592402458191,
"learning_rate": 1.3807368421052633e-05,
"loss": 11.8738,
"step": 174
},
{
"epoch": 0.005431661933361268,
"grad_norm": 0.09998718649148941,
"learning_rate": 1.3276315789473684e-05,
"loss": 11.8742,
"step": 175
},
{
"epoch": 0.005462700001551904,
"grad_norm": 0.08477786928415298,
"learning_rate": 1.2745263157894738e-05,
"loss": 11.8695,
"step": 176
},
{
"epoch": 0.00549373806974254,
"grad_norm": 0.0795200765132904,
"learning_rate": 1.221421052631579e-05,
"loss": 11.8666,
"step": 177
},
{
"epoch": 0.005524776137933175,
"grad_norm": 0.13940560817718506,
"learning_rate": 1.1683157894736843e-05,
"loss": 11.8669,
"step": 178
},
{
"epoch": 0.005555814206123811,
"grad_norm": 0.09820152819156647,
"learning_rate": 1.1152105263157895e-05,
"loss": 11.8619,
"step": 179
},
{
"epoch": 0.005586852274314447,
"grad_norm": 0.10361889004707336,
"learning_rate": 1.0621052631578948e-05,
"loss": 11.8694,
"step": 180
},
{
"epoch": 0.005617890342505083,
"grad_norm": 0.1097809374332428,
"learning_rate": 1.009e-05,
"loss": 11.8732,
"step": 181
},
{
"epoch": 0.005648928410695718,
"grad_norm": 0.18127013742923737,
"learning_rate": 9.558947368421052e-06,
"loss": 11.8678,
"step": 182
},
{
"epoch": 0.005679966478886354,
"grad_norm": 0.12430532276630402,
"learning_rate": 9.027894736842105e-06,
"loss": 11.8681,
"step": 183
},
{
"epoch": 0.00571100454707699,
"grad_norm": 0.17203256487846375,
"learning_rate": 8.496842105263157e-06,
"loss": 11.8631,
"step": 184
},
{
"epoch": 0.005742042615267626,
"grad_norm": 0.09748750180006027,
"learning_rate": 7.96578947368421e-06,
"loss": 11.8646,
"step": 185
},
{
"epoch": 0.005773080683458261,
"grad_norm": 0.14566555619239807,
"learning_rate": 7.434736842105262e-06,
"loss": 11.8662,
"step": 186
},
{
"epoch": 0.005804118751648897,
"grad_norm": 0.13249988853931427,
"learning_rate": 6.903684210526316e-06,
"loss": 11.8669,
"step": 187
},
{
"epoch": 0.005835156819839533,
"grad_norm": 0.14271654188632965,
"learning_rate": 6.372631578947369e-06,
"loss": 11.8662,
"step": 188
},
{
"epoch": 0.005866194888030169,
"grad_norm": 0.10937763005495071,
"learning_rate": 5.841578947368421e-06,
"loss": 11.8672,
"step": 189
},
{
"epoch": 0.0058972329562208044,
"grad_norm": 0.1206103041768074,
"learning_rate": 5.310526315789474e-06,
"loss": 11.862,
"step": 190
},
{
"epoch": 0.0059282710244114405,
"grad_norm": 0.12608934938907623,
"learning_rate": 4.779473684210526e-06,
"loss": 11.8674,
"step": 191
},
{
"epoch": 0.0059593090926020765,
"grad_norm": 0.1504194289445877,
"learning_rate": 4.248421052631579e-06,
"loss": 11.8704,
"step": 192
},
{
"epoch": 0.0059903471607927125,
"grad_norm": 0.19314518570899963,
"learning_rate": 3.717368421052631e-06,
"loss": 11.8675,
"step": 193
},
{
"epoch": 0.006021385228983348,
"grad_norm": 0.1470961570739746,
"learning_rate": 3.1863157894736844e-06,
"loss": 11.8726,
"step": 194
},
{
"epoch": 0.006052423297173984,
"grad_norm": 0.17263321578502655,
"learning_rate": 2.655263157894737e-06,
"loss": 11.8699,
"step": 195
},
{
"epoch": 0.00608346136536462,
"grad_norm": 0.18947991728782654,
"learning_rate": 2.1242105263157893e-06,
"loss": 11.8713,
"step": 196
},
{
"epoch": 0.006114499433555256,
"grad_norm": 0.19662337005138397,
"learning_rate": 1.5931578947368422e-06,
"loss": 11.873,
"step": 197
},
{
"epoch": 0.006145537501745892,
"grad_norm": 0.1542241871356964,
"learning_rate": 1.0621052631578947e-06,
"loss": 11.8692,
"step": 198
},
{
"epoch": 0.006176575569936527,
"grad_norm": 0.20754170417785645,
"learning_rate": 5.310526315789473e-07,
"loss": 11.8685,
"step": 199
},
{
"epoch": 0.006207613638127163,
"grad_norm": 0.20448267459869385,
"learning_rate": 0.0,
"loss": 11.8738,
"step": 200
},
{
"epoch": 0.006207613638127163,
"eval_loss": 11.866828918457031,
"eval_runtime": 412.6675,
"eval_samples_per_second": 32.874,
"eval_steps_per_second": 8.22,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 261529927680.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}