lombardata's picture
Evaluation on the test set completed on 2024_11_15.
6259946 verified
raw
history blame
41.5 kB
{
"best_metric": 0.46414923667907715,
"best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-probs-large-2024_11_15-batch-size32_freeze_probs/checkpoint-15987",
"epoch": 83.0,
"eval_steps": 500,
"global_step": 18177,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_explained_variance": 0.3432542085647583,
"eval_kl_divergence": 0.31011611223220825,
"eval_loss": 0.4855400025844574,
"eval_mae": 0.1364378184080124,
"eval_rmse": 0.17712123692035675,
"eval_runtime": 55.3387,
"eval_samples_per_second": 42.538,
"eval_steps_per_second": 1.337,
"learning_rate": 0.001,
"step": 219
},
{
"epoch": 2.0,
"eval_explained_variance": 0.38912513852119446,
"eval_kl_divergence": 0.5077245235443115,
"eval_loss": 0.47601452469825745,
"eval_mae": 0.12465938925743103,
"eval_rmse": 0.16875195503234863,
"eval_runtime": 54.843,
"eval_samples_per_second": 42.923,
"eval_steps_per_second": 1.349,
"learning_rate": 0.001,
"step": 438
},
{
"epoch": 2.2831050228310503,
"grad_norm": 0.35450002551078796,
"learning_rate": 0.001,
"loss": 0.5195,
"step": 500
},
{
"epoch": 3.0,
"eval_explained_variance": 0.3848476707935333,
"eval_kl_divergence": 0.7895973324775696,
"eval_loss": 0.4776814579963684,
"eval_mae": 0.12300346046686172,
"eval_rmse": 0.17065072059631348,
"eval_runtime": 56.2195,
"eval_samples_per_second": 41.872,
"eval_steps_per_second": 1.316,
"learning_rate": 0.001,
"step": 657
},
{
"epoch": 4.0,
"eval_explained_variance": 0.403704434633255,
"eval_kl_divergence": 0.49319207668304443,
"eval_loss": 0.47429159283638,
"eval_mae": 0.12376764416694641,
"eval_rmse": 0.1672389954328537,
"eval_runtime": 54.7793,
"eval_samples_per_second": 42.972,
"eval_steps_per_second": 1.351,
"learning_rate": 0.001,
"step": 876
},
{
"epoch": 4.566210045662101,
"grad_norm": 0.2313629388809204,
"learning_rate": 0.001,
"loss": 0.4742,
"step": 1000
},
{
"epoch": 5.0,
"eval_explained_variance": 0.41316938400268555,
"eval_kl_divergence": 0.2900688648223877,
"eval_loss": 0.47457176446914673,
"eval_mae": 0.12771284580230713,
"eval_rmse": 0.16687722504138947,
"eval_runtime": 55.1273,
"eval_samples_per_second": 42.701,
"eval_steps_per_second": 1.342,
"learning_rate": 0.001,
"step": 1095
},
{
"epoch": 6.0,
"eval_explained_variance": 0.40222811698913574,
"eval_kl_divergence": 0.43988940119743347,
"eval_loss": 0.4749792814254761,
"eval_mae": 0.1252531260251999,
"eval_rmse": 0.16735166311264038,
"eval_runtime": 53.136,
"eval_samples_per_second": 44.301,
"eval_steps_per_second": 1.393,
"learning_rate": 0.001,
"step": 1314
},
{
"epoch": 6.8493150684931505,
"grad_norm": 0.18959695100784302,
"learning_rate": 0.001,
"loss": 0.4706,
"step": 1500
},
{
"epoch": 7.0,
"eval_explained_variance": 0.4019981324672699,
"eval_kl_divergence": 0.48684099316596985,
"eval_loss": 0.4744807779788971,
"eval_mae": 0.12594138085842133,
"eval_rmse": 0.16705705225467682,
"eval_runtime": 53.367,
"eval_samples_per_second": 44.11,
"eval_steps_per_second": 1.387,
"learning_rate": 0.001,
"step": 1533
},
{
"epoch": 8.0,
"eval_explained_variance": 0.41111621260643005,
"eval_kl_divergence": 0.324148029088974,
"eval_loss": 0.47424906492233276,
"eval_mae": 0.12568950653076172,
"eval_rmse": 0.16722555458545685,
"eval_runtime": 55.5084,
"eval_samples_per_second": 42.408,
"eval_steps_per_second": 1.333,
"learning_rate": 0.001,
"step": 1752
},
{
"epoch": 9.0,
"eval_explained_variance": 0.4107116162776947,
"eval_kl_divergence": 0.4560392200946808,
"eval_loss": 0.4729686379432678,
"eval_mae": 0.12355945259332657,
"eval_rmse": 0.16584673523902893,
"eval_runtime": 55.1596,
"eval_samples_per_second": 42.676,
"eval_steps_per_second": 1.342,
"learning_rate": 0.001,
"step": 1971
},
{
"epoch": 9.132420091324201,
"grad_norm": 0.18577350676059723,
"learning_rate": 0.001,
"loss": 0.4678,
"step": 2000
},
{
"epoch": 10.0,
"eval_explained_variance": 0.4190339744091034,
"eval_kl_divergence": 0.2140849530696869,
"eval_loss": 0.4750550389289856,
"eval_mae": 0.12685616314411163,
"eval_rmse": 0.1679263859987259,
"eval_runtime": 56.0284,
"eval_samples_per_second": 42.014,
"eval_steps_per_second": 1.321,
"learning_rate": 0.001,
"step": 2190
},
{
"epoch": 11.0,
"eval_explained_variance": 0.41887199878692627,
"eval_kl_divergence": 0.2529982030391693,
"eval_loss": 0.4733181595802307,
"eval_mae": 0.12647458910942078,
"eval_rmse": 0.16627688705921173,
"eval_runtime": 55.5532,
"eval_samples_per_second": 42.374,
"eval_steps_per_second": 1.332,
"learning_rate": 0.001,
"step": 2409
},
{
"epoch": 11.415525114155251,
"grad_norm": 0.14618106186389923,
"learning_rate": 0.001,
"loss": 0.4674,
"step": 2500
},
{
"epoch": 12.0,
"eval_explained_variance": 0.4073503315448761,
"eval_kl_divergence": 0.3965540826320648,
"eval_loss": 0.4758349061012268,
"eval_mae": 0.1263781040906906,
"eval_rmse": 0.1683548092842102,
"eval_runtime": 53.8367,
"eval_samples_per_second": 43.725,
"eval_steps_per_second": 1.375,
"learning_rate": 0.001,
"step": 2628
},
{
"epoch": 13.0,
"eval_explained_variance": 0.41419240832328796,
"eval_kl_divergence": 0.6054547429084778,
"eval_loss": 0.4722050428390503,
"eval_mae": 0.12233959883451462,
"eval_rmse": 0.16495703160762787,
"eval_runtime": 54.7322,
"eval_samples_per_second": 43.009,
"eval_steps_per_second": 1.352,
"learning_rate": 0.001,
"step": 2847
},
{
"epoch": 13.698630136986301,
"grad_norm": 0.15461835265159607,
"learning_rate": 0.001,
"loss": 0.4676,
"step": 3000
},
{
"epoch": 14.0,
"eval_explained_variance": 0.40708938241004944,
"eval_kl_divergence": 0.4203389585018158,
"eval_loss": 0.4747372567653656,
"eval_mae": 0.12501581013202667,
"eval_rmse": 0.16655980050563812,
"eval_runtime": 55.2289,
"eval_samples_per_second": 42.623,
"eval_steps_per_second": 1.34,
"learning_rate": 0.001,
"step": 3066
},
{
"epoch": 15.0,
"eval_explained_variance": 0.41527059674263,
"eval_kl_divergence": 0.6553499102592468,
"eval_loss": 0.47325292229652405,
"eval_mae": 0.12266030162572861,
"eval_rmse": 0.16621644794940948,
"eval_runtime": 54.2502,
"eval_samples_per_second": 43.392,
"eval_steps_per_second": 1.364,
"learning_rate": 0.001,
"step": 3285
},
{
"epoch": 15.981735159817351,
"grad_norm": 0.10063416510820389,
"learning_rate": 0.001,
"loss": 0.4663,
"step": 3500
},
{
"epoch": 16.0,
"eval_explained_variance": 0.4175969660282135,
"eval_kl_divergence": 0.35757607221603394,
"eval_loss": 0.4734710156917572,
"eval_mae": 0.12411689758300781,
"eval_rmse": 0.16558559238910675,
"eval_runtime": 53.6921,
"eval_samples_per_second": 43.843,
"eval_steps_per_second": 1.378,
"learning_rate": 0.001,
"step": 3504
},
{
"epoch": 17.0,
"eval_explained_variance": 0.4231180250644684,
"eval_kl_divergence": 0.4545155465602875,
"eval_loss": 0.4721581041812897,
"eval_mae": 0.12205825001001358,
"eval_rmse": 0.16431300342082977,
"eval_runtime": 54.0719,
"eval_samples_per_second": 43.535,
"eval_steps_per_second": 1.369,
"learning_rate": 0.001,
"step": 3723
},
{
"epoch": 18.0,
"eval_explained_variance": 0.42092254757881165,
"eval_kl_divergence": 0.49019381403923035,
"eval_loss": 0.4723944365978241,
"eval_mae": 0.12245010584592819,
"eval_rmse": 0.16473934054374695,
"eval_runtime": 53.2446,
"eval_samples_per_second": 44.211,
"eval_steps_per_second": 1.39,
"learning_rate": 0.001,
"step": 3942
},
{
"epoch": 18.264840182648403,
"grad_norm": 0.11052733659744263,
"learning_rate": 0.001,
"loss": 0.4655,
"step": 4000
},
{
"epoch": 19.0,
"eval_explained_variance": 0.42237523198127747,
"eval_kl_divergence": 0.3157788813114166,
"eval_loss": 0.47289156913757324,
"eval_mae": 0.12610264122486115,
"eval_rmse": 0.164999321103096,
"eval_runtime": 54.353,
"eval_samples_per_second": 43.309,
"eval_steps_per_second": 1.361,
"learning_rate": 0.001,
"step": 4161
},
{
"epoch": 20.0,
"eval_explained_variance": 0.43422555923461914,
"eval_kl_divergence": 0.45738106966018677,
"eval_loss": 0.4697262644767761,
"eval_mae": 0.12028751522302628,
"eval_rmse": 0.16227416694164276,
"eval_runtime": 52.1033,
"eval_samples_per_second": 45.179,
"eval_steps_per_second": 1.42,
"learning_rate": 0.0001,
"step": 4380
},
{
"epoch": 20.54794520547945,
"grad_norm": 0.10903308540582657,
"learning_rate": 0.0001,
"loss": 0.4635,
"step": 4500
},
{
"epoch": 21.0,
"eval_explained_variance": 0.43825283646583557,
"eval_kl_divergence": 0.45688703656196594,
"eval_loss": 0.46890661120414734,
"eval_mae": 0.11968808621168137,
"eval_rmse": 0.16127373278141022,
"eval_runtime": 52.3325,
"eval_samples_per_second": 44.982,
"eval_steps_per_second": 1.414,
"learning_rate": 0.0001,
"step": 4599
},
{
"epoch": 22.0,
"eval_explained_variance": 0.4373685419559479,
"eval_kl_divergence": 0.45346954464912415,
"eval_loss": 0.46905258297920227,
"eval_mae": 0.12017489224672318,
"eval_rmse": 0.16165030002593994,
"eval_runtime": 51.1815,
"eval_samples_per_second": 45.993,
"eval_steps_per_second": 1.446,
"learning_rate": 0.0001,
"step": 4818
},
{
"epoch": 22.831050228310502,
"grad_norm": 0.09725002944469452,
"learning_rate": 0.0001,
"loss": 0.4615,
"step": 5000
},
{
"epoch": 23.0,
"eval_explained_variance": 0.4442131519317627,
"eval_kl_divergence": 0.2970678508281708,
"eval_loss": 0.4691086411476135,
"eval_mae": 0.1210075318813324,
"eval_rmse": 0.1613779515028,
"eval_runtime": 50.785,
"eval_samples_per_second": 46.352,
"eval_steps_per_second": 1.457,
"learning_rate": 0.0001,
"step": 5037
},
{
"epoch": 24.0,
"eval_explained_variance": 0.4405536353588104,
"eval_kl_divergence": 0.39161574840545654,
"eval_loss": 0.46915334463119507,
"eval_mae": 0.11959254741668701,
"eval_rmse": 0.16161170601844788,
"eval_runtime": 50.8712,
"eval_samples_per_second": 46.274,
"eval_steps_per_second": 1.455,
"learning_rate": 0.0001,
"step": 5256
},
{
"epoch": 25.0,
"eval_explained_variance": 0.4465361535549164,
"eval_kl_divergence": 0.4515945613384247,
"eval_loss": 0.4676876664161682,
"eval_mae": 0.11813607066869736,
"eval_rmse": 0.16005758941173553,
"eval_runtime": 50.537,
"eval_samples_per_second": 46.58,
"eval_steps_per_second": 1.464,
"learning_rate": 0.0001,
"step": 5475
},
{
"epoch": 25.114155251141554,
"grad_norm": 0.10921537131071091,
"learning_rate": 0.0001,
"loss": 0.4601,
"step": 5500
},
{
"epoch": 26.0,
"eval_explained_variance": 0.4434172809123993,
"eval_kl_divergence": 0.6089490652084351,
"eval_loss": 0.4679708480834961,
"eval_mae": 0.11711684614419937,
"eval_rmse": 0.1605486422777176,
"eval_runtime": 49.8832,
"eval_samples_per_second": 47.19,
"eval_steps_per_second": 1.483,
"learning_rate": 0.0001,
"step": 5694
},
{
"epoch": 27.0,
"eval_explained_variance": 0.4460805654525757,
"eval_kl_divergence": 0.4741028845310211,
"eval_loss": 0.4674595892429352,
"eval_mae": 0.11824781447649002,
"eval_rmse": 0.16004686057567596,
"eval_runtime": 49.7793,
"eval_samples_per_second": 47.289,
"eval_steps_per_second": 1.487,
"learning_rate": 0.0001,
"step": 5913
},
{
"epoch": 27.397260273972602,
"grad_norm": 0.11422494053840637,
"learning_rate": 0.0001,
"loss": 0.4585,
"step": 6000
},
{
"epoch": 28.0,
"eval_explained_variance": 0.4489245116710663,
"eval_kl_divergence": 0.3355759084224701,
"eval_loss": 0.46810340881347656,
"eval_mae": 0.11996418237686157,
"eval_rmse": 0.16060088574886322,
"eval_runtime": 52.9491,
"eval_samples_per_second": 44.458,
"eval_steps_per_second": 1.398,
"learning_rate": 0.0001,
"step": 6132
},
{
"epoch": 29.0,
"eval_explained_variance": 0.4459850490093231,
"eval_kl_divergence": 0.43302619457244873,
"eval_loss": 0.4678303897380829,
"eval_mae": 0.11808297038078308,
"eval_rmse": 0.16026519238948822,
"eval_runtime": 50.5506,
"eval_samples_per_second": 46.567,
"eval_steps_per_second": 1.464,
"learning_rate": 0.0001,
"step": 6351
},
{
"epoch": 29.680365296803654,
"grad_norm": 0.11833047866821289,
"learning_rate": 0.0001,
"loss": 0.4578,
"step": 6500
},
{
"epoch": 30.0,
"eval_explained_variance": 0.4503695070743561,
"eval_kl_divergence": 0.3159695267677307,
"eval_loss": 0.46800243854522705,
"eval_mae": 0.11937135457992554,
"eval_rmse": 0.160204216837883,
"eval_runtime": 50.0689,
"eval_samples_per_second": 47.015,
"eval_steps_per_second": 1.478,
"learning_rate": 0.0001,
"step": 6570
},
{
"epoch": 31.0,
"eval_explained_variance": 0.4467611014842987,
"eval_kl_divergence": 0.419010728597641,
"eval_loss": 0.4676785469055176,
"eval_mae": 0.11789224296808243,
"eval_rmse": 0.1599912792444229,
"eval_runtime": 50.2573,
"eval_samples_per_second": 46.839,
"eval_steps_per_second": 1.472,
"learning_rate": 0.0001,
"step": 6789
},
{
"epoch": 31.963470319634702,
"grad_norm": 0.1234586164355278,
"learning_rate": 0.0001,
"loss": 0.4579,
"step": 7000
},
{
"epoch": 32.0,
"eval_explained_variance": 0.4503757953643799,
"eval_kl_divergence": 0.3705631494522095,
"eval_loss": 0.46752873063087463,
"eval_mae": 0.11878199130296707,
"eval_rmse": 0.159804567694664,
"eval_runtime": 50.3085,
"eval_samples_per_second": 46.791,
"eval_steps_per_second": 1.471,
"learning_rate": 0.0001,
"step": 7008
},
{
"epoch": 33.0,
"eval_explained_variance": 0.4545632600784302,
"eval_kl_divergence": 0.35043853521347046,
"eval_loss": 0.46710190176963806,
"eval_mae": 0.1181415393948555,
"eval_rmse": 0.1593446284532547,
"eval_runtime": 50.4199,
"eval_samples_per_second": 46.688,
"eval_steps_per_second": 1.468,
"learning_rate": 0.0001,
"step": 7227
},
{
"epoch": 34.0,
"eval_explained_variance": 0.4532606303691864,
"eval_kl_divergence": 0.3881392180919647,
"eval_loss": 0.4670344293117523,
"eval_mae": 0.11804797500371933,
"eval_rmse": 0.15942266583442688,
"eval_runtime": 50.088,
"eval_samples_per_second": 46.997,
"eval_steps_per_second": 1.477,
"learning_rate": 0.0001,
"step": 7446
},
{
"epoch": 34.24657534246575,
"grad_norm": 0.14323526620864868,
"learning_rate": 0.0001,
"loss": 0.4569,
"step": 7500
},
{
"epoch": 35.0,
"eval_explained_variance": 0.4555685818195343,
"eval_kl_divergence": 0.43976902961730957,
"eval_loss": 0.4662601053714752,
"eval_mae": 0.11664538830518723,
"eval_rmse": 0.1586536318063736,
"eval_runtime": 49.8708,
"eval_samples_per_second": 47.202,
"eval_steps_per_second": 1.484,
"learning_rate": 0.0001,
"step": 7665
},
{
"epoch": 36.0,
"eval_explained_variance": 0.4544428884983063,
"eval_kl_divergence": 0.4382496476173401,
"eval_loss": 0.46657058596611023,
"eval_mae": 0.11700741201639175,
"eval_rmse": 0.15874631702899933,
"eval_runtime": 49.7975,
"eval_samples_per_second": 47.271,
"eval_steps_per_second": 1.486,
"learning_rate": 0.0001,
"step": 7884
},
{
"epoch": 36.529680365296805,
"grad_norm": 0.17629703879356384,
"learning_rate": 0.0001,
"loss": 0.4572,
"step": 8000
},
{
"epoch": 37.0,
"eval_explained_variance": 0.45941635966300964,
"eval_kl_divergence": 0.4330490827560425,
"eval_loss": 0.4657588005065918,
"eval_mae": 0.11633748561143875,
"eval_rmse": 0.15810036659240723,
"eval_runtime": 51.4251,
"eval_samples_per_second": 45.775,
"eval_steps_per_second": 1.439,
"learning_rate": 0.0001,
"step": 8103
},
{
"epoch": 38.0,
"eval_explained_variance": 0.4566784203052521,
"eval_kl_divergence": 0.4877949357032776,
"eval_loss": 0.4659184217453003,
"eval_mae": 0.11623784899711609,
"eval_rmse": 0.15832678973674774,
"eval_runtime": 49.7333,
"eval_samples_per_second": 47.332,
"eval_steps_per_second": 1.488,
"learning_rate": 0.0001,
"step": 8322
},
{
"epoch": 38.81278538812786,
"grad_norm": 0.1781003624200821,
"learning_rate": 0.0001,
"loss": 0.4572,
"step": 8500
},
{
"epoch": 39.0,
"eval_explained_variance": 0.45519956946372986,
"eval_kl_divergence": 0.3790707290172577,
"eval_loss": 0.46703553199768066,
"eval_mae": 0.11782807856798172,
"eval_rmse": 0.15946339070796967,
"eval_runtime": 52.4,
"eval_samples_per_second": 44.924,
"eval_steps_per_second": 1.412,
"learning_rate": 0.0001,
"step": 8541
},
{
"epoch": 40.0,
"eval_explained_variance": 0.45683178305625916,
"eval_kl_divergence": 0.38892972469329834,
"eval_loss": 0.4664987027645111,
"eval_mae": 0.11783644556999207,
"eval_rmse": 0.15876977145671844,
"eval_runtime": 50.7398,
"eval_samples_per_second": 46.394,
"eval_steps_per_second": 1.458,
"learning_rate": 0.0001,
"step": 8760
},
{
"epoch": 41.0,
"eval_explained_variance": 0.4591364860534668,
"eval_kl_divergence": 0.3222128450870514,
"eval_loss": 0.46659526228904724,
"eval_mae": 0.11838778108358383,
"eval_rmse": 0.15888933837413788,
"eval_runtime": 50.0159,
"eval_samples_per_second": 47.065,
"eval_steps_per_second": 1.48,
"learning_rate": 0.0001,
"step": 8979
},
{
"epoch": 41.0958904109589,
"grad_norm": 0.13085126876831055,
"learning_rate": 0.0001,
"loss": 0.4559,
"step": 9000
},
{
"epoch": 42.0,
"eval_explained_variance": 0.4606964886188507,
"eval_kl_divergence": 0.426244854927063,
"eval_loss": 0.4655005633831024,
"eval_mae": 0.11635158210992813,
"eval_rmse": 0.15787668526172638,
"eval_runtime": 49.9099,
"eval_samples_per_second": 47.165,
"eval_steps_per_second": 1.483,
"learning_rate": 0.0001,
"step": 9198
},
{
"epoch": 43.0,
"eval_explained_variance": 0.46034756302833557,
"eval_kl_divergence": 0.4611224830150604,
"eval_loss": 0.4656265676021576,
"eval_mae": 0.11616652458906174,
"eval_rmse": 0.1579464077949524,
"eval_runtime": 50.0123,
"eval_samples_per_second": 47.068,
"eval_steps_per_second": 1.48,
"learning_rate": 0.0001,
"step": 9417
},
{
"epoch": 43.37899543378995,
"grad_norm": 0.17523790895938873,
"learning_rate": 0.0001,
"loss": 0.4554,
"step": 9500
},
{
"epoch": 44.0,
"eval_explained_variance": 0.4616149961948395,
"eval_kl_divergence": 0.45858410000801086,
"eval_loss": 0.4655725955963135,
"eval_mae": 0.11644264310598373,
"eval_rmse": 0.15800905227661133,
"eval_runtime": 50.6284,
"eval_samples_per_second": 46.496,
"eval_steps_per_second": 1.462,
"learning_rate": 0.0001,
"step": 9636
},
{
"epoch": 45.0,
"eval_explained_variance": 0.45969870686531067,
"eval_kl_divergence": 0.4367772340774536,
"eval_loss": 0.46600833535194397,
"eval_mae": 0.11579249054193497,
"eval_rmse": 0.15833592414855957,
"eval_runtime": 50.629,
"eval_samples_per_second": 46.495,
"eval_steps_per_second": 1.462,
"learning_rate": 0.0001,
"step": 9855
},
{
"epoch": 45.662100456621005,
"grad_norm": 0.1231347844004631,
"learning_rate": 0.0001,
"loss": 0.4557,
"step": 10000
},
{
"epoch": 46.0,
"eval_explained_variance": 0.4603704512119293,
"eval_kl_divergence": 0.41175922751426697,
"eval_loss": 0.4660418927669525,
"eval_mae": 0.11639311909675598,
"eval_rmse": 0.1581837385892868,
"eval_runtime": 50.1537,
"eval_samples_per_second": 46.936,
"eval_steps_per_second": 1.475,
"learning_rate": 0.0001,
"step": 10074
},
{
"epoch": 47.0,
"eval_explained_variance": 0.4613979756832123,
"eval_kl_divergence": 0.5424114465713501,
"eval_loss": 0.46521857380867004,
"eval_mae": 0.11542114615440369,
"eval_rmse": 0.15771377086639404,
"eval_runtime": 49.6928,
"eval_samples_per_second": 47.371,
"eval_steps_per_second": 1.489,
"learning_rate": 0.0001,
"step": 10293
},
{
"epoch": 47.945205479452056,
"grad_norm": 0.46352267265319824,
"learning_rate": 0.0001,
"loss": 0.4551,
"step": 10500
},
{
"epoch": 48.0,
"eval_explained_variance": 0.45960724353790283,
"eval_kl_divergence": 0.525124728679657,
"eval_loss": 0.46598610281944275,
"eval_mae": 0.1159835234284401,
"eval_rmse": 0.15856431424617767,
"eval_runtime": 49.9974,
"eval_samples_per_second": 47.082,
"eval_steps_per_second": 1.48,
"learning_rate": 0.0001,
"step": 10512
},
{
"epoch": 49.0,
"eval_explained_variance": 0.4572352468967438,
"eval_kl_divergence": 0.5006867051124573,
"eval_loss": 0.46604350209236145,
"eval_mae": 0.11609696596860886,
"eval_rmse": 0.15853044390678406,
"eval_runtime": 50.2446,
"eval_samples_per_second": 46.851,
"eval_steps_per_second": 1.473,
"learning_rate": 0.0001,
"step": 10731
},
{
"epoch": 50.0,
"eval_explained_variance": 0.4658548831939697,
"eval_kl_divergence": 0.24239596724510193,
"eval_loss": 0.46660009026527405,
"eval_mae": 0.11854288727045059,
"eval_rmse": 0.15863054990768433,
"eval_runtime": 50.1897,
"eval_samples_per_second": 46.902,
"eval_steps_per_second": 1.474,
"learning_rate": 0.0001,
"step": 10950
},
{
"epoch": 50.22831050228311,
"grad_norm": 0.1688494235277176,
"learning_rate": 0.0001,
"loss": 0.4545,
"step": 11000
},
{
"epoch": 51.0,
"eval_explained_variance": 0.45888975262641907,
"eval_kl_divergence": 0.4170607030391693,
"eval_loss": 0.4660661220550537,
"eval_mae": 0.11618483066558838,
"eval_rmse": 0.15835459530353546,
"eval_runtime": 49.5535,
"eval_samples_per_second": 47.504,
"eval_steps_per_second": 1.493,
"learning_rate": 0.0001,
"step": 11169
},
{
"epoch": 52.0,
"eval_explained_variance": 0.46297597885131836,
"eval_kl_divergence": 0.49118655920028687,
"eval_loss": 0.4649689793586731,
"eval_mae": 0.11549883335828781,
"eval_rmse": 0.1575259119272232,
"eval_runtime": 50.3774,
"eval_samples_per_second": 46.727,
"eval_steps_per_second": 1.469,
"learning_rate": 0.0001,
"step": 11388
},
{
"epoch": 52.51141552511415,
"grad_norm": 0.2805333137512207,
"learning_rate": 0.0001,
"loss": 0.4548,
"step": 11500
},
{
"epoch": 53.0,
"eval_explained_variance": 0.46440085768699646,
"eval_kl_divergence": 0.4030352830886841,
"eval_loss": 0.4653578996658325,
"eval_mae": 0.11687562614679337,
"eval_rmse": 0.15780305862426758,
"eval_runtime": 51.1877,
"eval_samples_per_second": 45.988,
"eval_steps_per_second": 1.446,
"learning_rate": 0.0001,
"step": 11607
},
{
"epoch": 54.0,
"eval_explained_variance": 0.4594965875148773,
"eval_kl_divergence": 0.4810858964920044,
"eval_loss": 0.4660585820674896,
"eval_mae": 0.11529505252838135,
"eval_rmse": 0.15853293240070343,
"eval_runtime": 51.2952,
"eval_samples_per_second": 45.891,
"eval_steps_per_second": 1.443,
"learning_rate": 0.0001,
"step": 11826
},
{
"epoch": 54.794520547945204,
"grad_norm": 0.22778521478176117,
"learning_rate": 0.0001,
"loss": 0.455,
"step": 12000
},
{
"epoch": 55.0,
"eval_explained_variance": 0.46380600333213806,
"eval_kl_divergence": 0.3773800730705261,
"eval_loss": 0.46527624130249023,
"eval_mae": 0.11668615788221359,
"eval_rmse": 0.1576414853334427,
"eval_runtime": 50.6825,
"eval_samples_per_second": 46.446,
"eval_steps_per_second": 1.46,
"learning_rate": 0.0001,
"step": 12045
},
{
"epoch": 56.0,
"eval_explained_variance": 0.4669934809207916,
"eval_kl_divergence": 0.32541513442993164,
"eval_loss": 0.4654240906238556,
"eval_mae": 0.11757931858301163,
"eval_rmse": 0.1575363427400589,
"eval_runtime": 50.538,
"eval_samples_per_second": 46.579,
"eval_steps_per_second": 1.464,
"learning_rate": 0.0001,
"step": 12264
},
{
"epoch": 57.0,
"eval_explained_variance": 0.4661710560321808,
"eval_kl_divergence": 0.3648814857006073,
"eval_loss": 0.4654492139816284,
"eval_mae": 0.11615876108407974,
"eval_rmse": 0.15751774609088898,
"eval_runtime": 51.1673,
"eval_samples_per_second": 46.006,
"eval_steps_per_second": 1.446,
"learning_rate": 0.0001,
"step": 12483
},
{
"epoch": 57.077625570776256,
"grad_norm": 0.16715611517429352,
"learning_rate": 0.0001,
"loss": 0.4531,
"step": 12500
},
{
"epoch": 58.0,
"eval_explained_variance": 0.4606919586658478,
"eval_kl_divergence": 0.40749335289001465,
"eval_loss": 0.46654412150382996,
"eval_mae": 0.1166309341788292,
"eval_rmse": 0.15835203230381012,
"eval_runtime": 50.603,
"eval_samples_per_second": 46.519,
"eval_steps_per_second": 1.462,
"learning_rate": 0.0001,
"step": 12702
},
{
"epoch": 59.0,
"eval_explained_variance": 0.4653950035572052,
"eval_kl_divergence": 0.42019784450531006,
"eval_loss": 0.465238481760025,
"eval_mae": 0.11570876836776733,
"eval_rmse": 0.15746039152145386,
"eval_runtime": 50.3267,
"eval_samples_per_second": 46.774,
"eval_steps_per_second": 1.47,
"learning_rate": 1e-05,
"step": 12921
},
{
"epoch": 59.36073059360731,
"grad_norm": 0.19701753556728363,
"learning_rate": 1e-05,
"loss": 0.4538,
"step": 13000
},
{
"epoch": 60.0,
"eval_explained_variance": 0.4668855369091034,
"eval_kl_divergence": 0.4084234833717346,
"eval_loss": 0.46530231833457947,
"eval_mae": 0.11569295078516006,
"eval_rmse": 0.15709955990314484,
"eval_runtime": 51.1174,
"eval_samples_per_second": 46.051,
"eval_steps_per_second": 1.448,
"learning_rate": 1e-05,
"step": 13140
},
{
"epoch": 61.0,
"eval_explained_variance": 0.4661245346069336,
"eval_kl_divergence": 0.4496937096118927,
"eval_loss": 0.4653523564338684,
"eval_mae": 0.11528477817773819,
"eval_rmse": 0.15729330480098724,
"eval_runtime": 50.8416,
"eval_samples_per_second": 46.301,
"eval_steps_per_second": 1.456,
"learning_rate": 1e-05,
"step": 13359
},
{
"epoch": 61.64383561643836,
"grad_norm": 0.1874207705259323,
"learning_rate": 1e-05,
"loss": 0.4529,
"step": 13500
},
{
"epoch": 62.0,
"eval_explained_variance": 0.4681651294231415,
"eval_kl_divergence": 0.411173015832901,
"eval_loss": 0.46477487683296204,
"eval_mae": 0.11529665440320969,
"eval_rmse": 0.15684308111667633,
"eval_runtime": 52.6214,
"eval_samples_per_second": 44.735,
"eval_steps_per_second": 1.406,
"learning_rate": 1e-05,
"step": 13578
},
{
"epoch": 63.0,
"eval_explained_variance": 0.47016242146492004,
"eval_kl_divergence": 0.3748082220554352,
"eval_loss": 0.46481335163116455,
"eval_mae": 0.11518841236829758,
"eval_rmse": 0.15671293437480927,
"eval_runtime": 53.2469,
"eval_samples_per_second": 44.209,
"eval_steps_per_second": 1.39,
"learning_rate": 1e-05,
"step": 13797
},
{
"epoch": 63.926940639269404,
"grad_norm": 0.22562281787395477,
"learning_rate": 1e-05,
"loss": 0.4527,
"step": 14000
},
{
"epoch": 64.0,
"eval_explained_variance": 0.4721170663833618,
"eval_kl_divergence": 0.3044198155403137,
"eval_loss": 0.46523070335388184,
"eval_mae": 0.11618036776781082,
"eval_rmse": 0.15709933638572693,
"eval_runtime": 53.3051,
"eval_samples_per_second": 44.161,
"eval_steps_per_second": 1.388,
"learning_rate": 1e-05,
"step": 14016
},
{
"epoch": 65.0,
"eval_explained_variance": 0.46695852279663086,
"eval_kl_divergence": 0.46853822469711304,
"eval_loss": 0.46484872698783875,
"eval_mae": 0.11532068997621536,
"eval_rmse": 0.1568661779165268,
"eval_runtime": 52.7599,
"eval_samples_per_second": 44.617,
"eval_steps_per_second": 1.403,
"learning_rate": 1e-05,
"step": 14235
},
{
"epoch": 66.0,
"eval_explained_variance": 0.46712610125541687,
"eval_kl_divergence": 0.508738100528717,
"eval_loss": 0.46500927209854126,
"eval_mae": 0.11475471407175064,
"eval_rmse": 0.15729309618473053,
"eval_runtime": 54.0149,
"eval_samples_per_second": 43.581,
"eval_steps_per_second": 1.37,
"learning_rate": 1e-05,
"step": 14454
},
{
"epoch": 66.21004566210046,
"grad_norm": 0.18448679149150848,
"learning_rate": 1e-05,
"loss": 0.4531,
"step": 14500
},
{
"epoch": 67.0,
"eval_explained_variance": 0.4690088927745819,
"eval_kl_divergence": 0.42743220925331116,
"eval_loss": 0.4645930230617523,
"eval_mae": 0.1155417189002037,
"eval_rmse": 0.1567572057247162,
"eval_runtime": 52.5655,
"eval_samples_per_second": 44.782,
"eval_steps_per_second": 1.408,
"learning_rate": 1e-05,
"step": 14673
},
{
"epoch": 68.0,
"eval_explained_variance": 0.4680323302745819,
"eval_kl_divergence": 0.49686378240585327,
"eval_loss": 0.46456360816955566,
"eval_mae": 0.11437365412712097,
"eval_rmse": 0.1566230058670044,
"eval_runtime": 50.8799,
"eval_samples_per_second": 46.266,
"eval_steps_per_second": 1.454,
"learning_rate": 1e-05,
"step": 14892
},
{
"epoch": 68.4931506849315,
"grad_norm": 0.21752646565437317,
"learning_rate": 1e-05,
"loss": 0.452,
"step": 15000
},
{
"epoch": 69.0,
"eval_explained_variance": 0.4696376323699951,
"eval_kl_divergence": 0.44800856709480286,
"eval_loss": 0.464430034160614,
"eval_mae": 0.11452987045049667,
"eval_rmse": 0.15642575919628143,
"eval_runtime": 61.8405,
"eval_samples_per_second": 38.066,
"eval_steps_per_second": 1.197,
"learning_rate": 1e-05,
"step": 15111
},
{
"epoch": 70.0,
"eval_explained_variance": 0.4692017734050751,
"eval_kl_divergence": 0.42908576130867004,
"eval_loss": 0.4648461937904358,
"eval_mae": 0.11500384658575058,
"eval_rmse": 0.15674862265586853,
"eval_runtime": 60.5787,
"eval_samples_per_second": 38.859,
"eval_steps_per_second": 1.222,
"learning_rate": 1e-05,
"step": 15330
},
{
"epoch": 70.77625570776256,
"grad_norm": 0.23285503685474396,
"learning_rate": 1e-05,
"loss": 0.4524,
"step": 15500
},
{
"epoch": 71.0,
"eval_explained_variance": 0.4711233675479889,
"eval_kl_divergence": 0.37966692447662354,
"eval_loss": 0.4645022749900818,
"eval_mae": 0.11555531620979309,
"eval_rmse": 0.15646833181381226,
"eval_runtime": 61.2584,
"eval_samples_per_second": 38.427,
"eval_steps_per_second": 1.208,
"learning_rate": 1e-05,
"step": 15549
},
{
"epoch": 72.0,
"eval_explained_variance": 0.4690466821193695,
"eval_kl_divergence": 0.42796915769577026,
"eval_loss": 0.46473589539527893,
"eval_mae": 0.11497951298952103,
"eval_rmse": 0.15693025290966034,
"eval_runtime": 61.782,
"eval_samples_per_second": 38.102,
"eval_steps_per_second": 1.198,
"learning_rate": 1e-05,
"step": 15768
},
{
"epoch": 73.0,
"eval_explained_variance": 0.4707035720348358,
"eval_kl_divergence": 0.4591566324234009,
"eval_loss": 0.46414923667907715,
"eval_mae": 0.11423368006944656,
"eval_rmse": 0.15631103515625,
"eval_runtime": 62.9115,
"eval_samples_per_second": 37.418,
"eval_steps_per_second": 1.176,
"learning_rate": 1e-05,
"step": 15987
},
{
"epoch": 73.05936073059361,
"grad_norm": 0.1904192417860031,
"learning_rate": 1e-05,
"loss": 0.4515,
"step": 16000
},
{
"epoch": 74.0,
"eval_explained_variance": 0.4705829620361328,
"eval_kl_divergence": 0.43208685517311096,
"eval_loss": 0.4641610085964203,
"eval_mae": 0.11505597829818726,
"eval_rmse": 0.1563975065946579,
"eval_runtime": 61.932,
"eval_samples_per_second": 38.009,
"eval_steps_per_second": 1.195,
"learning_rate": 1e-05,
"step": 16206
},
{
"epoch": 75.0,
"eval_explained_variance": 0.47077181935310364,
"eval_kl_divergence": 0.3843104839324951,
"eval_loss": 0.4644509255886078,
"eval_mae": 0.11519055813550949,
"eval_rmse": 0.15653057396411896,
"eval_runtime": 62.3182,
"eval_samples_per_second": 37.774,
"eval_steps_per_second": 1.187,
"learning_rate": 1e-05,
"step": 16425
},
{
"epoch": 75.34246575342466,
"grad_norm": 0.2563965618610382,
"learning_rate": 1e-05,
"loss": 0.4521,
"step": 16500
},
{
"epoch": 76.0,
"eval_explained_variance": 0.4675123989582062,
"eval_kl_divergence": 0.5215911269187927,
"eval_loss": 0.4646488130092621,
"eval_mae": 0.1146780475974083,
"eval_rmse": 0.1569206565618515,
"eval_runtime": 66.0488,
"eval_samples_per_second": 35.64,
"eval_steps_per_second": 1.12,
"learning_rate": 1e-05,
"step": 16644
},
{
"epoch": 77.0,
"eval_explained_variance": 0.46909868717193604,
"eval_kl_divergence": 0.4094104468822479,
"eval_loss": 0.46475714445114136,
"eval_mae": 0.11523856967687607,
"eval_rmse": 0.15687990188598633,
"eval_runtime": 62.1685,
"eval_samples_per_second": 37.865,
"eval_steps_per_second": 1.19,
"learning_rate": 1e-05,
"step": 16863
},
{
"epoch": 77.62557077625571,
"grad_norm": 0.16491472721099854,
"learning_rate": 1e-05,
"loss": 0.4519,
"step": 17000
},
{
"epoch": 78.0,
"eval_explained_variance": 0.47086599469184875,
"eval_kl_divergence": 0.43988528847694397,
"eval_loss": 0.46428272128105164,
"eval_mae": 0.11493176966905594,
"eval_rmse": 0.15638257563114166,
"eval_runtime": 61.9923,
"eval_samples_per_second": 37.972,
"eval_steps_per_second": 1.194,
"learning_rate": 1e-05,
"step": 17082
},
{
"epoch": 79.0,
"eval_explained_variance": 0.4697439670562744,
"eval_kl_divergence": 0.4178011417388916,
"eval_loss": 0.4645934998989105,
"eval_mae": 0.11465150117874146,
"eval_rmse": 0.15666015446186066,
"eval_runtime": 63.0404,
"eval_samples_per_second": 37.341,
"eval_steps_per_second": 1.174,
"learning_rate": 1e-05,
"step": 17301
},
{
"epoch": 79.90867579908675,
"grad_norm": 0.1647184044122696,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4517,
"step": 17500
},
{
"epoch": 80.0,
"eval_explained_variance": 0.4699563980102539,
"eval_kl_divergence": 0.43727052211761475,
"eval_loss": 0.46436014771461487,
"eval_mae": 0.11501001566648483,
"eval_rmse": 0.15643416345119476,
"eval_runtime": 61.5606,
"eval_samples_per_second": 38.239,
"eval_steps_per_second": 1.202,
"learning_rate": 1.0000000000000002e-06,
"step": 17520
},
{
"epoch": 81.0,
"eval_explained_variance": 0.468768835067749,
"eval_kl_divergence": 0.47009941935539246,
"eval_loss": 0.46448636054992676,
"eval_mae": 0.11508657783269882,
"eval_rmse": 0.15673168003559113,
"eval_runtime": 62.9178,
"eval_samples_per_second": 37.414,
"eval_steps_per_second": 1.176,
"learning_rate": 1.0000000000000002e-06,
"step": 17739
},
{
"epoch": 82.0,
"eval_explained_variance": 0.470253586769104,
"eval_kl_divergence": 0.4601159989833832,
"eval_loss": 0.4644375145435333,
"eval_mae": 0.11455937474966049,
"eval_rmse": 0.15652652084827423,
"eval_runtime": 62.6023,
"eval_samples_per_second": 37.602,
"eval_steps_per_second": 1.182,
"learning_rate": 1.0000000000000002e-06,
"step": 17958
},
{
"epoch": 82.1917808219178,
"grad_norm": 0.2432813197374344,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4514,
"step": 18000
},
{
"epoch": 83.0,
"eval_explained_variance": 0.468420147895813,
"eval_kl_divergence": 0.4510715901851654,
"eval_loss": 0.46457409858703613,
"eval_mae": 0.11468392610549927,
"eval_rmse": 0.15669189393520355,
"eval_runtime": 62.7877,
"eval_samples_per_second": 37.491,
"eval_steps_per_second": 1.179,
"learning_rate": 1.0000000000000002e-06,
"step": 18177
},
{
"epoch": 83.0,
"learning_rate": 1.0000000000000002e-06,
"step": 18177,
"total_flos": 8.603009036605255e+19,
"train_loss": 0.45949580130708517,
"train_runtime": 19431.3015,
"train_samples_per_second": 54.06,
"train_steps_per_second": 1.691
}
],
"logging_steps": 500,
"max_steps": 32850,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.603009036605255e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}