{
"best_metric": 10.55037784576416,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.020064205457463884,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010032102728731942,
"grad_norm": 1.5991774797439575,
"learning_rate": 1.009e-05,
"loss": 22.2622,
"step": 1
},
{
"epoch": 0.00010032102728731942,
"eval_loss": 11.123811721801758,
"eval_runtime": 70.5623,
"eval_samples_per_second": 59.493,
"eval_steps_per_second": 14.88,
"step": 1
},
{
"epoch": 0.00020064205457463884,
"grad_norm": 1.434697151184082,
"learning_rate": 2.018e-05,
"loss": 22.2601,
"step": 2
},
{
"epoch": 0.00030096308186195825,
"grad_norm": 1.767243504524231,
"learning_rate": 3.027e-05,
"loss": 22.2774,
"step": 3
},
{
"epoch": 0.0004012841091492777,
"grad_norm": 1.4239779710769653,
"learning_rate": 4.036e-05,
"loss": 22.2314,
"step": 4
},
{
"epoch": 0.0005016051364365971,
"grad_norm": 1.3848161697387695,
"learning_rate": 5.045e-05,
"loss": 22.2259,
"step": 5
},
{
"epoch": 0.0006019261637239165,
"grad_norm": 1.6321865320205688,
"learning_rate": 6.054e-05,
"loss": 22.2408,
"step": 6
},
{
"epoch": 0.0007022471910112359,
"grad_norm": 1.7302271127700806,
"learning_rate": 7.062999999999999e-05,
"loss": 22.2005,
"step": 7
},
{
"epoch": 0.0008025682182985554,
"grad_norm": 1.7554914951324463,
"learning_rate": 8.072e-05,
"loss": 22.1973,
"step": 8
},
{
"epoch": 0.0009028892455858748,
"grad_norm": 1.476845145225525,
"learning_rate": 9.081e-05,
"loss": 22.1782,
"step": 9
},
{
"epoch": 0.0010032102728731941,
"grad_norm": 1.6162638664245605,
"learning_rate": 0.0001009,
"loss": 22.1558,
"step": 10
},
{
"epoch": 0.0011035313001605137,
"grad_norm": 1.559161901473999,
"learning_rate": 0.00010036894736842106,
"loss": 22.1244,
"step": 11
},
{
"epoch": 0.001203852327447833,
"grad_norm": 1.6010931730270386,
"learning_rate": 9.98378947368421e-05,
"loss": 22.0848,
"step": 12
},
{
"epoch": 0.0013041733547351525,
"grad_norm": 1.7077960968017578,
"learning_rate": 9.930684210526315e-05,
"loss": 22.0494,
"step": 13
},
{
"epoch": 0.0014044943820224719,
"grad_norm": 1.5489342212677002,
"learning_rate": 9.877578947368421e-05,
"loss": 22.0367,
"step": 14
},
{
"epoch": 0.0015048154093097914,
"grad_norm": 1.1338545083999634,
"learning_rate": 9.824473684210527e-05,
"loss": 22.0596,
"step": 15
},
{
"epoch": 0.0016051364365971107,
"grad_norm": 1.444381833076477,
"learning_rate": 9.771368421052632e-05,
"loss": 21.9966,
"step": 16
},
{
"epoch": 0.0017054574638844303,
"grad_norm": 1.4157575368881226,
"learning_rate": 9.718263157894736e-05,
"loss": 21.9681,
"step": 17
},
{
"epoch": 0.0018057784911717496,
"grad_norm": 1.5229800939559937,
"learning_rate": 9.665157894736842e-05,
"loss": 21.9163,
"step": 18
},
{
"epoch": 0.001906099518459069,
"grad_norm": 1.5339312553405762,
"learning_rate": 9.612052631578948e-05,
"loss": 21.8712,
"step": 19
},
{
"epoch": 0.0020064205457463883,
"grad_norm": 1.4681302309036255,
"learning_rate": 9.558947368421052e-05,
"loss": 21.9278,
"step": 20
},
{
"epoch": 0.002106741573033708,
"grad_norm": 1.7636168003082275,
"learning_rate": 9.505842105263159e-05,
"loss": 21.7252,
"step": 21
},
{
"epoch": 0.0022070626003210273,
"grad_norm": 1.4351588487625122,
"learning_rate": 9.452736842105263e-05,
"loss": 21.7812,
"step": 22
},
{
"epoch": 0.002307383627608347,
"grad_norm": 1.4266163110733032,
"learning_rate": 9.399631578947368e-05,
"loss": 21.767,
"step": 23
},
{
"epoch": 0.002407704654895666,
"grad_norm": 1.3205598592758179,
"learning_rate": 9.346526315789474e-05,
"loss": 21.742,
"step": 24
},
{
"epoch": 0.0025080256821829855,
"grad_norm": 1.32651948928833,
"learning_rate": 9.293421052631578e-05,
"loss": 21.7368,
"step": 25
},
{
"epoch": 0.002608346709470305,
"grad_norm": 1.4364937543869019,
"learning_rate": 9.240315789473684e-05,
"loss": 21.647,
"step": 26
},
{
"epoch": 0.002708667736757624,
"grad_norm": 1.3655853271484375,
"learning_rate": 9.18721052631579e-05,
"loss": 21.644,
"step": 27
},
{
"epoch": 0.0028089887640449437,
"grad_norm": 1.3614681959152222,
"learning_rate": 9.134105263157895e-05,
"loss": 21.5887,
"step": 28
},
{
"epoch": 0.0029093097913322633,
"grad_norm": 1.360560655593872,
"learning_rate": 9.081e-05,
"loss": 21.555,
"step": 29
},
{
"epoch": 0.003009630818619583,
"grad_norm": 1.2450507879257202,
"learning_rate": 9.027894736842105e-05,
"loss": 21.5753,
"step": 30
},
{
"epoch": 0.003109951845906902,
"grad_norm": 1.2793240547180176,
"learning_rate": 8.97478947368421e-05,
"loss": 21.5722,
"step": 31
},
{
"epoch": 0.0032102728731942215,
"grad_norm": 1.0861470699310303,
"learning_rate": 8.921684210526316e-05,
"loss": 21.5705,
"step": 32
},
{
"epoch": 0.003310593900481541,
"grad_norm": 1.2511563301086426,
"learning_rate": 8.86857894736842e-05,
"loss": 21.4441,
"step": 33
},
{
"epoch": 0.0034109149277688606,
"grad_norm": 0.9954004287719727,
"learning_rate": 8.815473684210527e-05,
"loss": 21.6331,
"step": 34
},
{
"epoch": 0.0035112359550561797,
"grad_norm": 1.0669211149215698,
"learning_rate": 8.762368421052631e-05,
"loss": 21.4872,
"step": 35
},
{
"epoch": 0.0036115569823434992,
"grad_norm": 1.0056594610214233,
"learning_rate": 8.709263157894737e-05,
"loss": 21.4733,
"step": 36
},
{
"epoch": 0.0037118780096308188,
"grad_norm": 1.016147255897522,
"learning_rate": 8.656157894736843e-05,
"loss": 21.4347,
"step": 37
},
{
"epoch": 0.003812199036918138,
"grad_norm": 1.1539530754089355,
"learning_rate": 8.603052631578947e-05,
"loss": 21.3633,
"step": 38
},
{
"epoch": 0.003912520064205458,
"grad_norm": 0.9206962585449219,
"learning_rate": 8.549947368421052e-05,
"loss": 21.4637,
"step": 39
},
{
"epoch": 0.0040128410914927765,
"grad_norm": 0.833474338054657,
"learning_rate": 8.496842105263158e-05,
"loss": 21.4302,
"step": 40
},
{
"epoch": 0.004113162118780096,
"grad_norm": 0.8578913807868958,
"learning_rate": 8.443736842105264e-05,
"loss": 21.4463,
"step": 41
},
{
"epoch": 0.004213483146067416,
"grad_norm": 1.1002817153930664,
"learning_rate": 8.390631578947369e-05,
"loss": 21.2379,
"step": 42
},
{
"epoch": 0.004313804173354735,
"grad_norm": 0.831058144569397,
"learning_rate": 8.337526315789473e-05,
"loss": 21.4246,
"step": 43
},
{
"epoch": 0.004414125200642055,
"grad_norm": 0.7859891057014465,
"learning_rate": 8.284421052631579e-05,
"loss": 21.318,
"step": 44
},
{
"epoch": 0.004514446227929374,
"grad_norm": 0.7401735782623291,
"learning_rate": 8.231315789473685e-05,
"loss": 21.3669,
"step": 45
},
{
"epoch": 0.004614767255216694,
"grad_norm": 0.7849454879760742,
"learning_rate": 8.178210526315789e-05,
"loss": 21.3329,
"step": 46
},
{
"epoch": 0.0047150882825040125,
"grad_norm": 0.6906037330627441,
"learning_rate": 8.125105263157894e-05,
"loss": 21.6264,
"step": 47
},
{
"epoch": 0.004815409309791332,
"grad_norm": 0.7339415550231934,
"learning_rate": 8.072e-05,
"loss": 21.5479,
"step": 48
},
{
"epoch": 0.0049157303370786515,
"grad_norm": 0.7654588222503662,
"learning_rate": 8.018894736842106e-05,
"loss": 21.4293,
"step": 49
},
{
"epoch": 0.005016051364365971,
"grad_norm": 0.6686012148857117,
"learning_rate": 7.965789473684211e-05,
"loss": 21.401,
"step": 50
},
{
"epoch": 0.005016051364365971,
"eval_loss": 10.654853820800781,
"eval_runtime": 70.5622,
"eval_samples_per_second": 59.494,
"eval_steps_per_second": 14.88,
"step": 50
},
{
"epoch": 0.005116372391653291,
"grad_norm": 0.6304632425308228,
"learning_rate": 7.912684210526315e-05,
"loss": 21.4006,
"step": 51
},
{
"epoch": 0.00521669341894061,
"grad_norm": 0.8711794018745422,
"learning_rate": 7.859578947368421e-05,
"loss": 21.125,
"step": 52
},
{
"epoch": 0.00531701444622793,
"grad_norm": 0.6818077564239502,
"learning_rate": 7.806473684210527e-05,
"loss": 21.2897,
"step": 53
},
{
"epoch": 0.005417335473515248,
"grad_norm": 0.7950479388237,
"learning_rate": 7.753368421052631e-05,
"loss": 21.1913,
"step": 54
},
{
"epoch": 0.005517656500802568,
"grad_norm": 0.6777336597442627,
"learning_rate": 7.700263157894738e-05,
"loss": 21.3268,
"step": 55
},
{
"epoch": 0.0056179775280898875,
"grad_norm": 0.9595808386802673,
"learning_rate": 7.647157894736842e-05,
"loss": 21.0679,
"step": 56
},
{
"epoch": 0.005718298555377207,
"grad_norm": 0.7060173153877258,
"learning_rate": 7.594052631578948e-05,
"loss": 21.2321,
"step": 57
},
{
"epoch": 0.005818619582664527,
"grad_norm": 0.6887240409851074,
"learning_rate": 7.540947368421053e-05,
"loss": 21.1762,
"step": 58
},
{
"epoch": 0.005918940609951846,
"grad_norm": 0.6797646880149841,
"learning_rate": 7.487842105263157e-05,
"loss": 21.3383,
"step": 59
},
{
"epoch": 0.006019261637239166,
"grad_norm": 0.7469924092292786,
"learning_rate": 7.434736842105263e-05,
"loss": 21.1953,
"step": 60
},
{
"epoch": 0.006119582664526484,
"grad_norm": 0.8638947606086731,
"learning_rate": 7.381631578947368e-05,
"loss": 21.1223,
"step": 61
},
{
"epoch": 0.006219903691813804,
"grad_norm": 0.7506782412528992,
"learning_rate": 7.328526315789474e-05,
"loss": 21.1332,
"step": 62
},
{
"epoch": 0.006320224719101123,
"grad_norm": 0.7623918652534485,
"learning_rate": 7.27542105263158e-05,
"loss": 21.2479,
"step": 63
},
{
"epoch": 0.006420545746388443,
"grad_norm": 0.6180063486099243,
"learning_rate": 7.222315789473684e-05,
"loss": 21.2955,
"step": 64
},
{
"epoch": 0.0065208667736757625,
"grad_norm": 0.7025743722915649,
"learning_rate": 7.16921052631579e-05,
"loss": 21.0339,
"step": 65
},
{
"epoch": 0.006621187800963082,
"grad_norm": 0.6287188529968262,
"learning_rate": 7.116105263157895e-05,
"loss": 21.2094,
"step": 66
},
{
"epoch": 0.006721508828250402,
"grad_norm": 0.6974558234214783,
"learning_rate": 7.062999999999999e-05,
"loss": 21.4861,
"step": 67
},
{
"epoch": 0.006821829855537721,
"grad_norm": 0.6953954696655273,
"learning_rate": 7.009894736842106e-05,
"loss": 21.3575,
"step": 68
},
{
"epoch": 0.00692215088282504,
"grad_norm": 0.7836357951164246,
"learning_rate": 6.95678947368421e-05,
"loss": 21.5824,
"step": 69
},
{
"epoch": 0.007022471910112359,
"grad_norm": 0.5490475296974182,
"learning_rate": 6.903684210526316e-05,
"loss": 21.3933,
"step": 70
},
{
"epoch": 0.007122792937399679,
"grad_norm": 0.5662202835083008,
"learning_rate": 6.850578947368422e-05,
"loss": 21.1143,
"step": 71
},
{
"epoch": 0.0072231139646869984,
"grad_norm": 0.6373079419136047,
"learning_rate": 6.797473684210526e-05,
"loss": 21.1235,
"step": 72
},
{
"epoch": 0.007323434991974318,
"grad_norm": 0.7429666519165039,
"learning_rate": 6.744368421052631e-05,
"loss": 21.1772,
"step": 73
},
{
"epoch": 0.0074237560192616375,
"grad_norm": 0.5452073812484741,
"learning_rate": 6.691263157894736e-05,
"loss": 21.2562,
"step": 74
},
{
"epoch": 0.007524077046548957,
"grad_norm": 0.5880535244941711,
"learning_rate": 6.638157894736843e-05,
"loss": 21.2677,
"step": 75
},
{
"epoch": 0.007624398073836276,
"grad_norm": 1.1386655569076538,
"learning_rate": 6.585052631578948e-05,
"loss": 21.0751,
"step": 76
},
{
"epoch": 0.007724719101123595,
"grad_norm": 0.6015380024909973,
"learning_rate": 6.531947368421052e-05,
"loss": 21.2662,
"step": 77
},
{
"epoch": 0.007825040128410916,
"grad_norm": 0.6043453216552734,
"learning_rate": 6.478842105263158e-05,
"loss": 21.0845,
"step": 78
},
{
"epoch": 0.007925361155698234,
"grad_norm": 0.715646505355835,
"learning_rate": 6.425736842105264e-05,
"loss": 21.035,
"step": 79
},
{
"epoch": 0.008025682182985553,
"grad_norm": 0.4522017538547516,
"learning_rate": 6.372631578947368e-05,
"loss": 21.2421,
"step": 80
},
{
"epoch": 0.008126003210272873,
"grad_norm": 0.591495156288147,
"learning_rate": 6.319526315789473e-05,
"loss": 21.2218,
"step": 81
},
{
"epoch": 0.008226324237560192,
"grad_norm": 0.8266862034797668,
"learning_rate": 6.266421052631579e-05,
"loss": 21.3115,
"step": 82
},
{
"epoch": 0.008326645264847513,
"grad_norm": 0.5965524911880493,
"learning_rate": 6.213315789473685e-05,
"loss": 21.1308,
"step": 83
},
{
"epoch": 0.008426966292134831,
"grad_norm": 0.5218135118484497,
"learning_rate": 6.16021052631579e-05,
"loss": 21.2285,
"step": 84
},
{
"epoch": 0.008527287319422152,
"grad_norm": 0.6091591715812683,
"learning_rate": 6.107105263157894e-05,
"loss": 21.2382,
"step": 85
},
{
"epoch": 0.00862760834670947,
"grad_norm": 0.5717049241065979,
"learning_rate": 6.054e-05,
"loss": 21.3019,
"step": 86
},
{
"epoch": 0.008727929373996789,
"grad_norm": 0.49166449904441833,
"learning_rate": 6.000894736842105e-05,
"loss": 21.1254,
"step": 87
},
{
"epoch": 0.00882825040128411,
"grad_norm": 0.6972517967224121,
"learning_rate": 5.94778947368421e-05,
"loss": 21.1412,
"step": 88
},
{
"epoch": 0.008928571428571428,
"grad_norm": 0.508397102355957,
"learning_rate": 5.894684210526316e-05,
"loss": 21.1448,
"step": 89
},
{
"epoch": 0.009028892455858748,
"grad_norm": 0.5189328193664551,
"learning_rate": 5.841578947368421e-05,
"loss": 21.3218,
"step": 90
},
{
"epoch": 0.009129213483146067,
"grad_norm": 0.5766506195068359,
"learning_rate": 5.7884736842105265e-05,
"loss": 21.1138,
"step": 91
},
{
"epoch": 0.009229534510433388,
"grad_norm": 0.4988974630832672,
"learning_rate": 5.7353684210526314e-05,
"loss": 21.0883,
"step": 92
},
{
"epoch": 0.009329855537720706,
"grad_norm": 0.6053217053413391,
"learning_rate": 5.6822631578947364e-05,
"loss": 21.1423,
"step": 93
},
{
"epoch": 0.009430176565008025,
"grad_norm": 0.6934704184532166,
"learning_rate": 5.629157894736842e-05,
"loss": 21.2583,
"step": 94
},
{
"epoch": 0.009530497592295345,
"grad_norm": 0.5622691512107849,
"learning_rate": 5.576052631578948e-05,
"loss": 21.2342,
"step": 95
},
{
"epoch": 0.009630818619582664,
"grad_norm": 0.5352026224136353,
"learning_rate": 5.522947368421053e-05,
"loss": 21.1231,
"step": 96
},
{
"epoch": 0.009731139646869984,
"grad_norm": 0.4841010868549347,
"learning_rate": 5.469842105263158e-05,
"loss": 21.1777,
"step": 97
},
{
"epoch": 0.009831460674157303,
"grad_norm": 0.5576900839805603,
"learning_rate": 5.416736842105263e-05,
"loss": 21.1561,
"step": 98
},
{
"epoch": 0.009931781701444624,
"grad_norm": 0.5335264205932617,
"learning_rate": 5.3636315789473685e-05,
"loss": 21.2953,
"step": 99
},
{
"epoch": 0.010032102728731942,
"grad_norm": 0.5405234098434448,
"learning_rate": 5.3105263157894734e-05,
"loss": 21.3645,
"step": 100
},
{
"epoch": 0.010032102728731942,
"eval_loss": 10.584715843200684,
"eval_runtime": 70.5882,
"eval_samples_per_second": 59.472,
"eval_steps_per_second": 14.875,
"step": 100
},
{
"epoch": 0.01013242375601926,
"grad_norm": 0.6677088141441345,
"learning_rate": 5.257421052631578e-05,
"loss": 21.1299,
"step": 101
},
{
"epoch": 0.010232744783306581,
"grad_norm": 0.5807069540023804,
"learning_rate": 5.2043157894736846e-05,
"loss": 21.0427,
"step": 102
},
{
"epoch": 0.0103330658105939,
"grad_norm": 0.501835823059082,
"learning_rate": 5.1512105263157895e-05,
"loss": 21.1984,
"step": 103
},
{
"epoch": 0.01043338683788122,
"grad_norm": 0.7399368286132812,
"learning_rate": 5.098105263157895e-05,
"loss": 21.1552,
"step": 104
},
{
"epoch": 0.010533707865168539,
"grad_norm": 0.8803501725196838,
"learning_rate": 5.045e-05,
"loss": 20.9362,
"step": 105
},
{
"epoch": 0.01063402889245586,
"grad_norm": 0.6539490818977356,
"learning_rate": 4.991894736842105e-05,
"loss": 21.3907,
"step": 106
},
{
"epoch": 0.010734349919743178,
"grad_norm": 0.8279363512992859,
"learning_rate": 4.9387894736842105e-05,
"loss": 21.1683,
"step": 107
},
{
"epoch": 0.010834670947030497,
"grad_norm": 0.7174587845802307,
"learning_rate": 4.885684210526316e-05,
"loss": 21.0967,
"step": 108
},
{
"epoch": 0.010934991974317817,
"grad_norm": 0.5338941216468811,
"learning_rate": 4.832578947368421e-05,
"loss": 21.2763,
"step": 109
},
{
"epoch": 0.011035313001605136,
"grad_norm": 0.5613208413124084,
"learning_rate": 4.779473684210526e-05,
"loss": 21.3178,
"step": 110
},
{
"epoch": 0.011135634028892456,
"grad_norm": 0.5921754240989685,
"learning_rate": 4.7263684210526315e-05,
"loss": 21.1902,
"step": 111
},
{
"epoch": 0.011235955056179775,
"grad_norm": 0.6494777798652649,
"learning_rate": 4.673263157894737e-05,
"loss": 21.1112,
"step": 112
},
{
"epoch": 0.011336276083467095,
"grad_norm": 0.6433371901512146,
"learning_rate": 4.620157894736842e-05,
"loss": 21.1761,
"step": 113
},
{
"epoch": 0.011436597110754414,
"grad_norm": 0.915080726146698,
"learning_rate": 4.5670526315789475e-05,
"loss": 21.0194,
"step": 114
},
{
"epoch": 0.011536918138041733,
"grad_norm": 1.0387072563171387,
"learning_rate": 4.5139473684210524e-05,
"loss": 20.9303,
"step": 115
},
{
"epoch": 0.011637239165329053,
"grad_norm": 0.6426976919174194,
"learning_rate": 4.460842105263158e-05,
"loss": 21.3885,
"step": 116
},
{
"epoch": 0.011737560192616372,
"grad_norm": 0.923250138759613,
"learning_rate": 4.4077368421052636e-05,
"loss": 20.8614,
"step": 117
},
{
"epoch": 0.011837881219903692,
"grad_norm": 0.5725429058074951,
"learning_rate": 4.3546315789473685e-05,
"loss": 21.1852,
"step": 118
},
{
"epoch": 0.011938202247191011,
"grad_norm": 0.656847357749939,
"learning_rate": 4.3015263157894734e-05,
"loss": 21.3078,
"step": 119
},
{
"epoch": 0.012038523274478331,
"grad_norm": 0.5193572640419006,
"learning_rate": 4.248421052631579e-05,
"loss": 21.1464,
"step": 120
},
{
"epoch": 0.01213884430176565,
"grad_norm": 0.7125611901283264,
"learning_rate": 4.1953157894736846e-05,
"loss": 21.1635,
"step": 121
},
{
"epoch": 0.012239165329052969,
"grad_norm": 0.523034930229187,
"learning_rate": 4.1422105263157895e-05,
"loss": 21.1683,
"step": 122
},
{
"epoch": 0.012339486356340289,
"grad_norm": 0.540019154548645,
"learning_rate": 4.0891052631578944e-05,
"loss": 21.0405,
"step": 123
},
{
"epoch": 0.012439807383627608,
"grad_norm": 0.6048979759216309,
"learning_rate": 4.036e-05,
"loss": 20.9969,
"step": 124
},
{
"epoch": 0.012540128410914928,
"grad_norm": 0.6617588996887207,
"learning_rate": 3.9828947368421056e-05,
"loss": 20.9615,
"step": 125
},
{
"epoch": 0.012640449438202247,
"grad_norm": 0.5792971253395081,
"learning_rate": 3.9297894736842105e-05,
"loss": 21.0814,
"step": 126
},
{
"epoch": 0.012740770465489567,
"grad_norm": 0.8640639185905457,
"learning_rate": 3.8766842105263154e-05,
"loss": 20.9608,
"step": 127
},
{
"epoch": 0.012841091492776886,
"grad_norm": 0.6493314504623413,
"learning_rate": 3.823578947368421e-05,
"loss": 21.0183,
"step": 128
},
{
"epoch": 0.012941412520064205,
"grad_norm": 0.5835341811180115,
"learning_rate": 3.7704736842105265e-05,
"loss": 21.247,
"step": 129
},
{
"epoch": 0.013041733547351525,
"grad_norm": 0.5780165791511536,
"learning_rate": 3.7173684210526315e-05,
"loss": 21.1309,
"step": 130
},
{
"epoch": 0.013142054574638844,
"grad_norm": 0.603012204170227,
"learning_rate": 3.664263157894737e-05,
"loss": 21.105,
"step": 131
},
{
"epoch": 0.013242375601926164,
"grad_norm": 0.5957277417182922,
"learning_rate": 3.611157894736842e-05,
"loss": 21.1341,
"step": 132
},
{
"epoch": 0.013342696629213483,
"grad_norm": 0.5657469034194946,
"learning_rate": 3.5580526315789475e-05,
"loss": 21.2018,
"step": 133
},
{
"epoch": 0.013443017656500803,
"grad_norm": 0.5668490529060364,
"learning_rate": 3.504947368421053e-05,
"loss": 21.2215,
"step": 134
},
{
"epoch": 0.013543338683788122,
"grad_norm": 0.6400595903396606,
"learning_rate": 3.451842105263158e-05,
"loss": 21.1004,
"step": 135
},
{
"epoch": 0.013643659711075442,
"grad_norm": 0.5542194247245789,
"learning_rate": 3.398736842105263e-05,
"loss": 21.1221,
"step": 136
},
{
"epoch": 0.013743980738362761,
"grad_norm": 0.5827724933624268,
"learning_rate": 3.345631578947368e-05,
"loss": 21.1213,
"step": 137
},
{
"epoch": 0.01384430176565008,
"grad_norm": 0.5147905945777893,
"learning_rate": 3.292526315789474e-05,
"loss": 21.1524,
"step": 138
},
{
"epoch": 0.0139446227929374,
"grad_norm": 0.6220738291740417,
"learning_rate": 3.239421052631579e-05,
"loss": 21.0253,
"step": 139
},
{
"epoch": 0.014044943820224719,
"grad_norm": 0.6777515411376953,
"learning_rate": 3.186315789473684e-05,
"loss": 21.3317,
"step": 140
},
{
"epoch": 0.014145264847512039,
"grad_norm": 0.6565226912498474,
"learning_rate": 3.1332105263157895e-05,
"loss": 20.9974,
"step": 141
},
{
"epoch": 0.014245585874799358,
"grad_norm": 0.6572129130363464,
"learning_rate": 3.080105263157895e-05,
"loss": 21.1355,
"step": 142
},
{
"epoch": 0.014345906902086678,
"grad_norm": 0.6094454526901245,
"learning_rate": 3.027e-05,
"loss": 21.1024,
"step": 143
},
{
"epoch": 0.014446227929373997,
"grad_norm": 0.5109902024269104,
"learning_rate": 2.973894736842105e-05,
"loss": 21.224,
"step": 144
},
{
"epoch": 0.014546548956661316,
"grad_norm": 0.7738908529281616,
"learning_rate": 2.9207894736842105e-05,
"loss": 21.4256,
"step": 145
},
{
"epoch": 0.014646869983948636,
"grad_norm": 0.5986607670783997,
"learning_rate": 2.8676842105263157e-05,
"loss": 21.2798,
"step": 146
},
{
"epoch": 0.014747191011235955,
"grad_norm": 0.6626487970352173,
"learning_rate": 2.814578947368421e-05,
"loss": 21.1379,
"step": 147
},
{
"epoch": 0.014847512038523275,
"grad_norm": 0.6779626607894897,
"learning_rate": 2.7614736842105266e-05,
"loss": 21.0229,
"step": 148
},
{
"epoch": 0.014947833065810594,
"grad_norm": 0.6698882579803467,
"learning_rate": 2.7083684210526315e-05,
"loss": 21.0818,
"step": 149
},
{
"epoch": 0.015048154093097914,
"grad_norm": 0.8434391021728516,
"learning_rate": 2.6552631578947367e-05,
"loss": 21.5482,
"step": 150
},
{
"epoch": 0.015048154093097914,
"eval_loss": 10.558324813842773,
"eval_runtime": 70.5565,
"eval_samples_per_second": 59.498,
"eval_steps_per_second": 14.882,
"step": 150
},
{
"epoch": 0.015148475120385233,
"grad_norm": 0.8424636721611023,
"learning_rate": 2.6021578947368423e-05,
"loss": 20.936,
"step": 151
},
{
"epoch": 0.015248796147672551,
"grad_norm": 0.5448256731033325,
"learning_rate": 2.5490526315789475e-05,
"loss": 21.1742,
"step": 152
},
{
"epoch": 0.015349117174959872,
"grad_norm": 0.6612237095832825,
"learning_rate": 2.4959473684210524e-05,
"loss": 21.0709,
"step": 153
},
{
"epoch": 0.01544943820224719,
"grad_norm": 0.550390899181366,
"learning_rate": 2.442842105263158e-05,
"loss": 20.8933,
"step": 154
},
{
"epoch": 0.015549759229534511,
"grad_norm": 0.6066597700119019,
"learning_rate": 2.389736842105263e-05,
"loss": 20.9679,
"step": 155
},
{
"epoch": 0.01565008025682183,
"grad_norm": 0.595045804977417,
"learning_rate": 2.3366315789473685e-05,
"loss": 21.0887,
"step": 156
},
{
"epoch": 0.01575040128410915,
"grad_norm": 0.6363713145256042,
"learning_rate": 2.2835263157894738e-05,
"loss": 21.3898,
"step": 157
},
{
"epoch": 0.01585072231139647,
"grad_norm": 0.5982836484909058,
"learning_rate": 2.230421052631579e-05,
"loss": 21.1671,
"step": 158
},
{
"epoch": 0.015951043338683787,
"grad_norm": 0.7963234782218933,
"learning_rate": 2.1773157894736843e-05,
"loss": 20.9716,
"step": 159
},
{
"epoch": 0.016051364365971106,
"grad_norm": 0.647216796875,
"learning_rate": 2.1242105263157895e-05,
"loss": 21.1608,
"step": 160
},
{
"epoch": 0.016151685393258428,
"grad_norm": 0.5022075772285461,
"learning_rate": 2.0711052631578947e-05,
"loss": 21.1819,
"step": 161
},
{
"epoch": 0.016252006420545747,
"grad_norm": 0.5094108581542969,
"learning_rate": 2.018e-05,
"loss": 21.1568,
"step": 162
},
{
"epoch": 0.016352327447833066,
"grad_norm": 0.5434950590133667,
"learning_rate": 1.9648947368421052e-05,
"loss": 20.9586,
"step": 163
},
{
"epoch": 0.016452648475120384,
"grad_norm": 0.6874385476112366,
"learning_rate": 1.9117894736842105e-05,
"loss": 21.0375,
"step": 164
},
{
"epoch": 0.016552969502407703,
"grad_norm": 0.49630945920944214,
"learning_rate": 1.8586842105263157e-05,
"loss": 21.0352,
"step": 165
},
{
"epoch": 0.016653290529695025,
"grad_norm": 0.6111531257629395,
"learning_rate": 1.805578947368421e-05,
"loss": 21.2822,
"step": 166
},
{
"epoch": 0.016753611556982344,
"grad_norm": 0.5392615795135498,
"learning_rate": 1.7524736842105266e-05,
"loss": 21.1524,
"step": 167
},
{
"epoch": 0.016853932584269662,
"grad_norm": 0.5594942569732666,
"learning_rate": 1.6993684210526315e-05,
"loss": 21.1931,
"step": 168
},
{
"epoch": 0.01695425361155698,
"grad_norm": 0.5756310224533081,
"learning_rate": 1.646263157894737e-05,
"loss": 21.2089,
"step": 169
},
{
"epoch": 0.017054574638844303,
"grad_norm": 0.783043622970581,
"learning_rate": 1.593157894736842e-05,
"loss": 20.8715,
"step": 170
},
{
"epoch": 0.017154895666131622,
"grad_norm": 0.4898316562175751,
"learning_rate": 1.5400526315789475e-05,
"loss": 21.1136,
"step": 171
},
{
"epoch": 0.01725521669341894,
"grad_norm": 0.6354514360427856,
"learning_rate": 1.4869473684210524e-05,
"loss": 21.089,
"step": 172
},
{
"epoch": 0.01735553772070626,
"grad_norm": 0.6032927632331848,
"learning_rate": 1.4338421052631579e-05,
"loss": 21.0936,
"step": 173
},
{
"epoch": 0.017455858747993578,
"grad_norm": 0.6571072936058044,
"learning_rate": 1.3807368421052633e-05,
"loss": 20.9723,
"step": 174
},
{
"epoch": 0.0175561797752809,
"grad_norm": 0.5904538035392761,
"learning_rate": 1.3276315789473684e-05,
"loss": 21.287,
"step": 175
},
{
"epoch": 0.01765650080256822,
"grad_norm": 0.5847862362861633,
"learning_rate": 1.2745263157894738e-05,
"loss": 21.1966,
"step": 176
},
{
"epoch": 0.017756821829855537,
"grad_norm": 0.5706862211227417,
"learning_rate": 1.221421052631579e-05,
"loss": 20.9706,
"step": 177
},
{
"epoch": 0.017857142857142856,
"grad_norm": 0.5410795211791992,
"learning_rate": 1.1683157894736843e-05,
"loss": 21.2563,
"step": 178
},
{
"epoch": 0.01795746388443018,
"grad_norm": 0.5394900441169739,
"learning_rate": 1.1152105263157895e-05,
"loss": 20.9992,
"step": 179
},
{
"epoch": 0.018057784911717497,
"grad_norm": 0.45912498235702515,
"learning_rate": 1.0621052631578948e-05,
"loss": 21.2137,
"step": 180
},
{
"epoch": 0.018158105939004816,
"grad_norm": 0.5860676765441895,
"learning_rate": 1.009e-05,
"loss": 21.1744,
"step": 181
},
{
"epoch": 0.018258426966292134,
"grad_norm": 0.7394751310348511,
"learning_rate": 9.558947368421052e-06,
"loss": 20.9416,
"step": 182
},
{
"epoch": 0.018358747993579453,
"grad_norm": 0.6703020334243774,
"learning_rate": 9.027894736842105e-06,
"loss": 20.9801,
"step": 183
},
{
"epoch": 0.018459069020866775,
"grad_norm": 0.5259845852851868,
"learning_rate": 8.496842105263157e-06,
"loss": 21.0672,
"step": 184
},
{
"epoch": 0.018559390048154094,
"grad_norm": 0.47938376665115356,
"learning_rate": 7.96578947368421e-06,
"loss": 21.0379,
"step": 185
},
{
"epoch": 0.018659711075441412,
"grad_norm": 0.6665632128715515,
"learning_rate": 7.434736842105262e-06,
"loss": 21.3934,
"step": 186
},
{
"epoch": 0.01876003210272873,
"grad_norm": 0.6356412172317505,
"learning_rate": 6.903684210526316e-06,
"loss": 21.3995,
"step": 187
},
{
"epoch": 0.01886035313001605,
"grad_norm": 0.5531170964241028,
"learning_rate": 6.372631578947369e-06,
"loss": 21.177,
"step": 188
},
{
"epoch": 0.018960674157303372,
"grad_norm": 0.5264145731925964,
"learning_rate": 5.841578947368421e-06,
"loss": 21.2467,
"step": 189
},
{
"epoch": 0.01906099518459069,
"grad_norm": 0.5184823870658875,
"learning_rate": 5.310526315789474e-06,
"loss": 21.0768,
"step": 190
},
{
"epoch": 0.01916131621187801,
"grad_norm": 0.601334810256958,
"learning_rate": 4.779473684210526e-06,
"loss": 21.0318,
"step": 191
},
{
"epoch": 0.019261637239165328,
"grad_norm": 0.6639525890350342,
"learning_rate": 4.248421052631579e-06,
"loss": 20.9996,
"step": 192
},
{
"epoch": 0.01936195826645265,
"grad_norm": 0.47377097606658936,
"learning_rate": 3.717368421052631e-06,
"loss": 21.0164,
"step": 193
},
{
"epoch": 0.01946227929373997,
"grad_norm": 0.6908702254295349,
"learning_rate": 3.1863157894736844e-06,
"loss": 20.9247,
"step": 194
},
{
"epoch": 0.019562600321027288,
"grad_norm": 0.5369330048561096,
"learning_rate": 2.655263157894737e-06,
"loss": 21.3861,
"step": 195
},
{
"epoch": 0.019662921348314606,
"grad_norm": 0.6818935871124268,
"learning_rate": 2.1242105263157893e-06,
"loss": 21.0936,
"step": 196
},
{
"epoch": 0.019763242375601925,
"grad_norm": 0.6107151508331299,
"learning_rate": 1.5931578947368422e-06,
"loss": 21.0526,
"step": 197
},
{
"epoch": 0.019863563402889247,
"grad_norm": 0.6597663760185242,
"learning_rate": 1.0621052631578947e-06,
"loss": 21.1064,
"step": 198
},
{
"epoch": 0.019963884430176566,
"grad_norm": 0.6686668992042542,
"learning_rate": 5.310526315789473e-07,
"loss": 21.0729,
"step": 199
},
{
"epoch": 0.020064205457463884,
"grad_norm": 0.7321626543998718,
"learning_rate": 0.0,
"loss": 21.4709,
"step": 200
},
{
"epoch": 0.020064205457463884,
"eval_loss": 10.55037784576416,
"eval_runtime": 70.588,
"eval_samples_per_second": 59.472,
"eval_steps_per_second": 14.875,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 11943700070400.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}