{ "best_metric": 0.46414923667907715, "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-probs-large-2024_11_15-batch-size32_freeze_probs/checkpoint-15987", "epoch": 83.0, "eval_steps": 500, "global_step": 18177, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_explained_variance": 0.3432542085647583, "eval_kl_divergence": 0.31011611223220825, "eval_loss": 0.4855400025844574, "eval_mae": 0.1364378184080124, "eval_rmse": 0.17712123692035675, "eval_runtime": 55.3387, "eval_samples_per_second": 42.538, "eval_steps_per_second": 1.337, "learning_rate": 0.001, "step": 219 }, { "epoch": 2.0, "eval_explained_variance": 0.38912513852119446, "eval_kl_divergence": 0.5077245235443115, "eval_loss": 0.47601452469825745, "eval_mae": 0.12465938925743103, "eval_rmse": 0.16875195503234863, "eval_runtime": 54.843, "eval_samples_per_second": 42.923, "eval_steps_per_second": 1.349, "learning_rate": 0.001, "step": 438 }, { "epoch": 2.2831050228310503, "grad_norm": 0.35450002551078796, "learning_rate": 0.001, "loss": 0.5195, "step": 500 }, { "epoch": 3.0, "eval_explained_variance": 0.3848476707935333, "eval_kl_divergence": 0.7895973324775696, "eval_loss": 0.4776814579963684, "eval_mae": 0.12300346046686172, "eval_rmse": 0.17065072059631348, "eval_runtime": 56.2195, "eval_samples_per_second": 41.872, "eval_steps_per_second": 1.316, "learning_rate": 0.001, "step": 657 }, { "epoch": 4.0, "eval_explained_variance": 0.403704434633255, "eval_kl_divergence": 0.49319207668304443, "eval_loss": 0.47429159283638, "eval_mae": 0.12376764416694641, "eval_rmse": 0.1672389954328537, "eval_runtime": 54.7793, "eval_samples_per_second": 42.972, "eval_steps_per_second": 1.351, "learning_rate": 0.001, "step": 876 }, { "epoch": 4.566210045662101, "grad_norm": 0.2313629388809204, "learning_rate": 0.001, "loss": 0.4742, "step": 1000 }, { "epoch": 5.0, "eval_explained_variance": 0.41316938400268555, "eval_kl_divergence": 0.2900688648223877, "eval_loss": 0.47457176446914673, "eval_mae": 0.12771284580230713, "eval_rmse": 0.16687722504138947, "eval_runtime": 55.1273, "eval_samples_per_second": 42.701, "eval_steps_per_second": 1.342, "learning_rate": 0.001, "step": 1095 }, { "epoch": 6.0, "eval_explained_variance": 0.40222811698913574, "eval_kl_divergence": 0.43988940119743347, "eval_loss": 0.4749792814254761, "eval_mae": 0.1252531260251999, "eval_rmse": 0.16735166311264038, "eval_runtime": 53.136, "eval_samples_per_second": 44.301, "eval_steps_per_second": 1.393, "learning_rate": 0.001, "step": 1314 }, { "epoch": 6.8493150684931505, "grad_norm": 0.18959695100784302, "learning_rate": 0.001, "loss": 0.4706, "step": 1500 }, { "epoch": 7.0, "eval_explained_variance": 0.4019981324672699, "eval_kl_divergence": 0.48684099316596985, "eval_loss": 0.4744807779788971, "eval_mae": 0.12594138085842133, "eval_rmse": 0.16705705225467682, "eval_runtime": 53.367, "eval_samples_per_second": 44.11, "eval_steps_per_second": 1.387, "learning_rate": 0.001, "step": 1533 }, { "epoch": 8.0, "eval_explained_variance": 0.41111621260643005, "eval_kl_divergence": 0.324148029088974, "eval_loss": 0.47424906492233276, "eval_mae": 0.12568950653076172, "eval_rmse": 0.16722555458545685, "eval_runtime": 55.5084, "eval_samples_per_second": 42.408, "eval_steps_per_second": 1.333, "learning_rate": 0.001, "step": 1752 }, { "epoch": 9.0, "eval_explained_variance": 0.4107116162776947, "eval_kl_divergence": 0.4560392200946808, "eval_loss": 0.4729686379432678, "eval_mae": 0.12355945259332657, "eval_rmse": 0.16584673523902893, "eval_runtime": 55.1596, "eval_samples_per_second": 42.676, "eval_steps_per_second": 1.342, "learning_rate": 0.001, "step": 1971 }, { "epoch": 9.132420091324201, "grad_norm": 0.18577350676059723, "learning_rate": 0.001, "loss": 0.4678, "step": 2000 }, { "epoch": 10.0, "eval_explained_variance": 0.4190339744091034, "eval_kl_divergence": 0.2140849530696869, "eval_loss": 0.4750550389289856, "eval_mae": 0.12685616314411163, "eval_rmse": 0.1679263859987259, "eval_runtime": 56.0284, "eval_samples_per_second": 42.014, "eval_steps_per_second": 1.321, "learning_rate": 0.001, "step": 2190 }, { "epoch": 11.0, "eval_explained_variance": 0.41887199878692627, "eval_kl_divergence": 0.2529982030391693, "eval_loss": 0.4733181595802307, "eval_mae": 0.12647458910942078, "eval_rmse": 0.16627688705921173, "eval_runtime": 55.5532, "eval_samples_per_second": 42.374, "eval_steps_per_second": 1.332, "learning_rate": 0.001, "step": 2409 }, { "epoch": 11.415525114155251, "grad_norm": 0.14618106186389923, "learning_rate": 0.001, "loss": 0.4674, "step": 2500 }, { "epoch": 12.0, "eval_explained_variance": 0.4073503315448761, "eval_kl_divergence": 0.3965540826320648, "eval_loss": 0.4758349061012268, "eval_mae": 0.1263781040906906, "eval_rmse": 0.1683548092842102, "eval_runtime": 53.8367, "eval_samples_per_second": 43.725, "eval_steps_per_second": 1.375, "learning_rate": 0.001, "step": 2628 }, { "epoch": 13.0, "eval_explained_variance": 0.41419240832328796, "eval_kl_divergence": 0.6054547429084778, "eval_loss": 0.4722050428390503, "eval_mae": 0.12233959883451462, "eval_rmse": 0.16495703160762787, "eval_runtime": 54.7322, "eval_samples_per_second": 43.009, "eval_steps_per_second": 1.352, "learning_rate": 0.001, "step": 2847 }, { "epoch": 13.698630136986301, "grad_norm": 0.15461835265159607, "learning_rate": 0.001, "loss": 0.4676, "step": 3000 }, { "epoch": 14.0, "eval_explained_variance": 0.40708938241004944, "eval_kl_divergence": 0.4203389585018158, "eval_loss": 0.4747372567653656, "eval_mae": 0.12501581013202667, "eval_rmse": 0.16655980050563812, "eval_runtime": 55.2289, "eval_samples_per_second": 42.623, "eval_steps_per_second": 1.34, "learning_rate": 0.001, "step": 3066 }, { "epoch": 15.0, "eval_explained_variance": 0.41527059674263, "eval_kl_divergence": 0.6553499102592468, "eval_loss": 0.47325292229652405, "eval_mae": 0.12266030162572861, "eval_rmse": 0.16621644794940948, "eval_runtime": 54.2502, "eval_samples_per_second": 43.392, "eval_steps_per_second": 1.364, "learning_rate": 0.001, "step": 3285 }, { "epoch": 15.981735159817351, "grad_norm": 0.10063416510820389, "learning_rate": 0.001, "loss": 0.4663, "step": 3500 }, { "epoch": 16.0, "eval_explained_variance": 0.4175969660282135, "eval_kl_divergence": 0.35757607221603394, "eval_loss": 0.4734710156917572, "eval_mae": 0.12411689758300781, "eval_rmse": 0.16558559238910675, "eval_runtime": 53.6921, "eval_samples_per_second": 43.843, "eval_steps_per_second": 1.378, "learning_rate": 0.001, "step": 3504 }, { "epoch": 17.0, "eval_explained_variance": 0.4231180250644684, "eval_kl_divergence": 0.4545155465602875, "eval_loss": 0.4721581041812897, "eval_mae": 0.12205825001001358, "eval_rmse": 0.16431300342082977, "eval_runtime": 54.0719, "eval_samples_per_second": 43.535, "eval_steps_per_second": 1.369, "learning_rate": 0.001, "step": 3723 }, { "epoch": 18.0, "eval_explained_variance": 0.42092254757881165, "eval_kl_divergence": 0.49019381403923035, "eval_loss": 0.4723944365978241, "eval_mae": 0.12245010584592819, "eval_rmse": 0.16473934054374695, "eval_runtime": 53.2446, "eval_samples_per_second": 44.211, "eval_steps_per_second": 1.39, "learning_rate": 0.001, "step": 3942 }, { "epoch": 18.264840182648403, "grad_norm": 0.11052733659744263, "learning_rate": 0.001, "loss": 0.4655, "step": 4000 }, { "epoch": 19.0, "eval_explained_variance": 0.42237523198127747, "eval_kl_divergence": 0.3157788813114166, "eval_loss": 0.47289156913757324, "eval_mae": 0.12610264122486115, "eval_rmse": 0.164999321103096, "eval_runtime": 54.353, "eval_samples_per_second": 43.309, "eval_steps_per_second": 1.361, "learning_rate": 0.001, "step": 4161 }, { "epoch": 20.0, "eval_explained_variance": 0.43422555923461914, "eval_kl_divergence": 0.45738106966018677, "eval_loss": 0.4697262644767761, "eval_mae": 0.12028751522302628, "eval_rmse": 0.16227416694164276, "eval_runtime": 52.1033, "eval_samples_per_second": 45.179, "eval_steps_per_second": 1.42, "learning_rate": 0.0001, "step": 4380 }, { "epoch": 20.54794520547945, "grad_norm": 0.10903308540582657, "learning_rate": 0.0001, "loss": 0.4635, "step": 4500 }, { "epoch": 21.0, "eval_explained_variance": 0.43825283646583557, "eval_kl_divergence": 0.45688703656196594, "eval_loss": 0.46890661120414734, "eval_mae": 0.11968808621168137, "eval_rmse": 0.16127373278141022, "eval_runtime": 52.3325, "eval_samples_per_second": 44.982, "eval_steps_per_second": 1.414, "learning_rate": 0.0001, "step": 4599 }, { "epoch": 22.0, "eval_explained_variance": 0.4373685419559479, "eval_kl_divergence": 0.45346954464912415, "eval_loss": 0.46905258297920227, "eval_mae": 0.12017489224672318, "eval_rmse": 0.16165030002593994, "eval_runtime": 51.1815, "eval_samples_per_second": 45.993, "eval_steps_per_second": 1.446, "learning_rate": 0.0001, "step": 4818 }, { "epoch": 22.831050228310502, "grad_norm": 0.09725002944469452, "learning_rate": 0.0001, "loss": 0.4615, "step": 5000 }, { "epoch": 23.0, "eval_explained_variance": 0.4442131519317627, "eval_kl_divergence": 0.2970678508281708, "eval_loss": 0.4691086411476135, "eval_mae": 0.1210075318813324, "eval_rmse": 0.1613779515028, "eval_runtime": 50.785, "eval_samples_per_second": 46.352, "eval_steps_per_second": 1.457, "learning_rate": 0.0001, "step": 5037 }, { "epoch": 24.0, "eval_explained_variance": 0.4405536353588104, "eval_kl_divergence": 0.39161574840545654, "eval_loss": 0.46915334463119507, "eval_mae": 0.11959254741668701, "eval_rmse": 0.16161170601844788, "eval_runtime": 50.8712, "eval_samples_per_second": 46.274, "eval_steps_per_second": 1.455, "learning_rate": 0.0001, "step": 5256 }, { "epoch": 25.0, "eval_explained_variance": 0.4465361535549164, "eval_kl_divergence": 0.4515945613384247, "eval_loss": 0.4676876664161682, "eval_mae": 0.11813607066869736, "eval_rmse": 0.16005758941173553, "eval_runtime": 50.537, "eval_samples_per_second": 46.58, "eval_steps_per_second": 1.464, "learning_rate": 0.0001, "step": 5475 }, { "epoch": 25.114155251141554, "grad_norm": 0.10921537131071091, "learning_rate": 0.0001, "loss": 0.4601, "step": 5500 }, { "epoch": 26.0, "eval_explained_variance": 0.4434172809123993, "eval_kl_divergence": 0.6089490652084351, "eval_loss": 0.4679708480834961, "eval_mae": 0.11711684614419937, "eval_rmse": 0.1605486422777176, "eval_runtime": 49.8832, "eval_samples_per_second": 47.19, "eval_steps_per_second": 1.483, "learning_rate": 0.0001, "step": 5694 }, { "epoch": 27.0, "eval_explained_variance": 0.4460805654525757, "eval_kl_divergence": 0.4741028845310211, "eval_loss": 0.4674595892429352, "eval_mae": 0.11824781447649002, "eval_rmse": 0.16004686057567596, "eval_runtime": 49.7793, "eval_samples_per_second": 47.289, "eval_steps_per_second": 1.487, "learning_rate": 0.0001, "step": 5913 }, { "epoch": 27.397260273972602, "grad_norm": 0.11422494053840637, "learning_rate": 0.0001, "loss": 0.4585, "step": 6000 }, { "epoch": 28.0, "eval_explained_variance": 0.4489245116710663, "eval_kl_divergence": 0.3355759084224701, "eval_loss": 0.46810340881347656, "eval_mae": 0.11996418237686157, "eval_rmse": 0.16060088574886322, "eval_runtime": 52.9491, "eval_samples_per_second": 44.458, "eval_steps_per_second": 1.398, "learning_rate": 0.0001, "step": 6132 }, { "epoch": 29.0, "eval_explained_variance": 0.4459850490093231, "eval_kl_divergence": 0.43302619457244873, "eval_loss": 0.4678303897380829, "eval_mae": 0.11808297038078308, "eval_rmse": 0.16026519238948822, "eval_runtime": 50.5506, "eval_samples_per_second": 46.567, "eval_steps_per_second": 1.464, "learning_rate": 0.0001, "step": 6351 }, { "epoch": 29.680365296803654, "grad_norm": 0.11833047866821289, "learning_rate": 0.0001, "loss": 0.4578, "step": 6500 }, { "epoch": 30.0, "eval_explained_variance": 0.4503695070743561, "eval_kl_divergence": 0.3159695267677307, "eval_loss": 0.46800243854522705, "eval_mae": 0.11937135457992554, "eval_rmse": 0.160204216837883, "eval_runtime": 50.0689, "eval_samples_per_second": 47.015, "eval_steps_per_second": 1.478, "learning_rate": 0.0001, "step": 6570 }, { "epoch": 31.0, "eval_explained_variance": 0.4467611014842987, "eval_kl_divergence": 0.419010728597641, "eval_loss": 0.4676785469055176, "eval_mae": 0.11789224296808243, "eval_rmse": 0.1599912792444229, "eval_runtime": 50.2573, "eval_samples_per_second": 46.839, "eval_steps_per_second": 1.472, "learning_rate": 0.0001, "step": 6789 }, { "epoch": 31.963470319634702, "grad_norm": 0.1234586164355278, "learning_rate": 0.0001, "loss": 0.4579, "step": 7000 }, { "epoch": 32.0, "eval_explained_variance": 0.4503757953643799, "eval_kl_divergence": 0.3705631494522095, "eval_loss": 0.46752873063087463, "eval_mae": 0.11878199130296707, "eval_rmse": 0.159804567694664, "eval_runtime": 50.3085, "eval_samples_per_second": 46.791, "eval_steps_per_second": 1.471, "learning_rate": 0.0001, "step": 7008 }, { "epoch": 33.0, "eval_explained_variance": 0.4545632600784302, "eval_kl_divergence": 0.35043853521347046, "eval_loss": 0.46710190176963806, "eval_mae": 0.1181415393948555, "eval_rmse": 0.1593446284532547, "eval_runtime": 50.4199, "eval_samples_per_second": 46.688, "eval_steps_per_second": 1.468, "learning_rate": 0.0001, "step": 7227 }, { "epoch": 34.0, "eval_explained_variance": 0.4532606303691864, "eval_kl_divergence": 0.3881392180919647, "eval_loss": 0.4670344293117523, "eval_mae": 0.11804797500371933, "eval_rmse": 0.15942266583442688, "eval_runtime": 50.088, "eval_samples_per_second": 46.997, "eval_steps_per_second": 1.477, "learning_rate": 0.0001, "step": 7446 }, { "epoch": 34.24657534246575, "grad_norm": 0.14323526620864868, "learning_rate": 0.0001, "loss": 0.4569, "step": 7500 }, { "epoch": 35.0, "eval_explained_variance": 0.4555685818195343, "eval_kl_divergence": 0.43976902961730957, "eval_loss": 0.4662601053714752, "eval_mae": 0.11664538830518723, "eval_rmse": 0.1586536318063736, "eval_runtime": 49.8708, "eval_samples_per_second": 47.202, "eval_steps_per_second": 1.484, "learning_rate": 0.0001, "step": 7665 }, { "epoch": 36.0, "eval_explained_variance": 0.4544428884983063, "eval_kl_divergence": 0.4382496476173401, "eval_loss": 0.46657058596611023, "eval_mae": 0.11700741201639175, "eval_rmse": 0.15874631702899933, "eval_runtime": 49.7975, "eval_samples_per_second": 47.271, "eval_steps_per_second": 1.486, "learning_rate": 0.0001, "step": 7884 }, { "epoch": 36.529680365296805, "grad_norm": 0.17629703879356384, "learning_rate": 0.0001, "loss": 0.4572, "step": 8000 }, { "epoch": 37.0, "eval_explained_variance": 0.45941635966300964, "eval_kl_divergence": 0.4330490827560425, "eval_loss": 0.4657588005065918, "eval_mae": 0.11633748561143875, "eval_rmse": 0.15810036659240723, "eval_runtime": 51.4251, "eval_samples_per_second": 45.775, "eval_steps_per_second": 1.439, "learning_rate": 0.0001, "step": 8103 }, { "epoch": 38.0, "eval_explained_variance": 0.4566784203052521, "eval_kl_divergence": 0.4877949357032776, "eval_loss": 0.4659184217453003, "eval_mae": 0.11623784899711609, "eval_rmse": 0.15832678973674774, "eval_runtime": 49.7333, "eval_samples_per_second": 47.332, "eval_steps_per_second": 1.488, "learning_rate": 0.0001, "step": 8322 }, { "epoch": 38.81278538812786, "grad_norm": 0.1781003624200821, "learning_rate": 0.0001, "loss": 0.4572, "step": 8500 }, { "epoch": 39.0, "eval_explained_variance": 0.45519956946372986, "eval_kl_divergence": 0.3790707290172577, "eval_loss": 0.46703553199768066, "eval_mae": 0.11782807856798172, "eval_rmse": 0.15946339070796967, "eval_runtime": 52.4, "eval_samples_per_second": 44.924, "eval_steps_per_second": 1.412, "learning_rate": 0.0001, "step": 8541 }, { "epoch": 40.0, "eval_explained_variance": 0.45683178305625916, "eval_kl_divergence": 0.38892972469329834, "eval_loss": 0.4664987027645111, "eval_mae": 0.11783644556999207, "eval_rmse": 0.15876977145671844, "eval_runtime": 50.7398, "eval_samples_per_second": 46.394, "eval_steps_per_second": 1.458, "learning_rate": 0.0001, "step": 8760 }, { "epoch": 41.0, "eval_explained_variance": 0.4591364860534668, "eval_kl_divergence": 0.3222128450870514, "eval_loss": 0.46659526228904724, "eval_mae": 0.11838778108358383, "eval_rmse": 0.15888933837413788, "eval_runtime": 50.0159, "eval_samples_per_second": 47.065, "eval_steps_per_second": 1.48, "learning_rate": 0.0001, "step": 8979 }, { "epoch": 41.0958904109589, "grad_norm": 0.13085126876831055, "learning_rate": 0.0001, "loss": 0.4559, "step": 9000 }, { "epoch": 42.0, "eval_explained_variance": 0.4606964886188507, "eval_kl_divergence": 0.426244854927063, "eval_loss": 0.4655005633831024, "eval_mae": 0.11635158210992813, "eval_rmse": 0.15787668526172638, "eval_runtime": 49.9099, "eval_samples_per_second": 47.165, "eval_steps_per_second": 1.483, "learning_rate": 0.0001, "step": 9198 }, { "epoch": 43.0, "eval_explained_variance": 0.46034756302833557, "eval_kl_divergence": 0.4611224830150604, "eval_loss": 0.4656265676021576, "eval_mae": 0.11616652458906174, "eval_rmse": 0.1579464077949524, "eval_runtime": 50.0123, "eval_samples_per_second": 47.068, "eval_steps_per_second": 1.48, "learning_rate": 0.0001, "step": 9417 }, { "epoch": 43.37899543378995, "grad_norm": 0.17523790895938873, "learning_rate": 0.0001, "loss": 0.4554, "step": 9500 }, { "epoch": 44.0, "eval_explained_variance": 0.4616149961948395, "eval_kl_divergence": 0.45858410000801086, "eval_loss": 0.4655725955963135, "eval_mae": 0.11644264310598373, "eval_rmse": 0.15800905227661133, "eval_runtime": 50.6284, "eval_samples_per_second": 46.496, "eval_steps_per_second": 1.462, "learning_rate": 0.0001, "step": 9636 }, { "epoch": 45.0, "eval_explained_variance": 0.45969870686531067, "eval_kl_divergence": 0.4367772340774536, "eval_loss": 0.46600833535194397, "eval_mae": 0.11579249054193497, "eval_rmse": 0.15833592414855957, "eval_runtime": 50.629, "eval_samples_per_second": 46.495, "eval_steps_per_second": 1.462, "learning_rate": 0.0001, "step": 9855 }, { "epoch": 45.662100456621005, "grad_norm": 0.1231347844004631, "learning_rate": 0.0001, "loss": 0.4557, "step": 10000 }, { "epoch": 46.0, "eval_explained_variance": 0.4603704512119293, "eval_kl_divergence": 0.41175922751426697, "eval_loss": 0.4660418927669525, "eval_mae": 0.11639311909675598, "eval_rmse": 0.1581837385892868, "eval_runtime": 50.1537, "eval_samples_per_second": 46.936, "eval_steps_per_second": 1.475, "learning_rate": 0.0001, "step": 10074 }, { "epoch": 47.0, "eval_explained_variance": 0.4613979756832123, "eval_kl_divergence": 0.5424114465713501, "eval_loss": 0.46521857380867004, "eval_mae": 0.11542114615440369, "eval_rmse": 0.15771377086639404, "eval_runtime": 49.6928, "eval_samples_per_second": 47.371, "eval_steps_per_second": 1.489, "learning_rate": 0.0001, "step": 10293 }, { "epoch": 47.945205479452056, "grad_norm": 0.46352267265319824, "learning_rate": 0.0001, "loss": 0.4551, "step": 10500 }, { "epoch": 48.0, "eval_explained_variance": 0.45960724353790283, "eval_kl_divergence": 0.525124728679657, "eval_loss": 0.46598610281944275, "eval_mae": 0.1159835234284401, "eval_rmse": 0.15856431424617767, "eval_runtime": 49.9974, "eval_samples_per_second": 47.082, "eval_steps_per_second": 1.48, "learning_rate": 0.0001, "step": 10512 }, { "epoch": 49.0, "eval_explained_variance": 0.4572352468967438, "eval_kl_divergence": 0.5006867051124573, "eval_loss": 0.46604350209236145, "eval_mae": 0.11609696596860886, "eval_rmse": 0.15853044390678406, "eval_runtime": 50.2446, "eval_samples_per_second": 46.851, "eval_steps_per_second": 1.473, "learning_rate": 0.0001, "step": 10731 }, { "epoch": 50.0, "eval_explained_variance": 0.4658548831939697, "eval_kl_divergence": 0.24239596724510193, "eval_loss": 0.46660009026527405, "eval_mae": 0.11854288727045059, "eval_rmse": 0.15863054990768433, "eval_runtime": 50.1897, "eval_samples_per_second": 46.902, "eval_steps_per_second": 1.474, "learning_rate": 0.0001, "step": 10950 }, { "epoch": 50.22831050228311, "grad_norm": 0.1688494235277176, "learning_rate": 0.0001, "loss": 0.4545, "step": 11000 }, { "epoch": 51.0, "eval_explained_variance": 0.45888975262641907, "eval_kl_divergence": 0.4170607030391693, "eval_loss": 0.4660661220550537, "eval_mae": 0.11618483066558838, "eval_rmse": 0.15835459530353546, "eval_runtime": 49.5535, "eval_samples_per_second": 47.504, "eval_steps_per_second": 1.493, "learning_rate": 0.0001, "step": 11169 }, { "epoch": 52.0, "eval_explained_variance": 0.46297597885131836, "eval_kl_divergence": 0.49118655920028687, "eval_loss": 0.4649689793586731, "eval_mae": 0.11549883335828781, "eval_rmse": 0.1575259119272232, "eval_runtime": 50.3774, "eval_samples_per_second": 46.727, "eval_steps_per_second": 1.469, "learning_rate": 0.0001, "step": 11388 }, { "epoch": 52.51141552511415, "grad_norm": 0.2805333137512207, "learning_rate": 0.0001, "loss": 0.4548, "step": 11500 }, { "epoch": 53.0, "eval_explained_variance": 0.46440085768699646, "eval_kl_divergence": 0.4030352830886841, "eval_loss": 0.4653578996658325, "eval_mae": 0.11687562614679337, "eval_rmse": 0.15780305862426758, "eval_runtime": 51.1877, "eval_samples_per_second": 45.988, "eval_steps_per_second": 1.446, "learning_rate": 0.0001, "step": 11607 }, { "epoch": 54.0, "eval_explained_variance": 0.4594965875148773, "eval_kl_divergence": 0.4810858964920044, "eval_loss": 0.4660585820674896, "eval_mae": 0.11529505252838135, "eval_rmse": 0.15853293240070343, "eval_runtime": 51.2952, "eval_samples_per_second": 45.891, "eval_steps_per_second": 1.443, "learning_rate": 0.0001, "step": 11826 }, { "epoch": 54.794520547945204, "grad_norm": 0.22778521478176117, "learning_rate": 0.0001, "loss": 0.455, "step": 12000 }, { "epoch": 55.0, "eval_explained_variance": 0.46380600333213806, "eval_kl_divergence": 0.3773800730705261, "eval_loss": 0.46527624130249023, "eval_mae": 0.11668615788221359, "eval_rmse": 0.1576414853334427, "eval_runtime": 50.6825, "eval_samples_per_second": 46.446, "eval_steps_per_second": 1.46, "learning_rate": 0.0001, "step": 12045 }, { "epoch": 56.0, "eval_explained_variance": 0.4669934809207916, "eval_kl_divergence": 0.32541513442993164, "eval_loss": 0.4654240906238556, "eval_mae": 0.11757931858301163, "eval_rmse": 0.1575363427400589, "eval_runtime": 50.538, "eval_samples_per_second": 46.579, "eval_steps_per_second": 1.464, "learning_rate": 0.0001, "step": 12264 }, { "epoch": 57.0, "eval_explained_variance": 0.4661710560321808, "eval_kl_divergence": 0.3648814857006073, "eval_loss": 0.4654492139816284, "eval_mae": 0.11615876108407974, "eval_rmse": 0.15751774609088898, "eval_runtime": 51.1673, "eval_samples_per_second": 46.006, "eval_steps_per_second": 1.446, "learning_rate": 0.0001, "step": 12483 }, { "epoch": 57.077625570776256, "grad_norm": 0.16715611517429352, "learning_rate": 0.0001, "loss": 0.4531, "step": 12500 }, { "epoch": 58.0, "eval_explained_variance": 0.4606919586658478, "eval_kl_divergence": 0.40749335289001465, "eval_loss": 0.46654412150382996, "eval_mae": 0.1166309341788292, "eval_rmse": 0.15835203230381012, "eval_runtime": 50.603, "eval_samples_per_second": 46.519, "eval_steps_per_second": 1.462, "learning_rate": 0.0001, "step": 12702 }, { "epoch": 59.0, "eval_explained_variance": 0.4653950035572052, "eval_kl_divergence": 0.42019784450531006, "eval_loss": 0.465238481760025, "eval_mae": 0.11570876836776733, "eval_rmse": 0.15746039152145386, "eval_runtime": 50.3267, "eval_samples_per_second": 46.774, "eval_steps_per_second": 1.47, "learning_rate": 1e-05, "step": 12921 }, { "epoch": 59.36073059360731, "grad_norm": 0.19701753556728363, "learning_rate": 1e-05, "loss": 0.4538, "step": 13000 }, { "epoch": 60.0, "eval_explained_variance": 0.4668855369091034, "eval_kl_divergence": 0.4084234833717346, "eval_loss": 0.46530231833457947, "eval_mae": 0.11569295078516006, "eval_rmse": 0.15709955990314484, "eval_runtime": 51.1174, "eval_samples_per_second": 46.051, "eval_steps_per_second": 1.448, "learning_rate": 1e-05, "step": 13140 }, { "epoch": 61.0, "eval_explained_variance": 0.4661245346069336, "eval_kl_divergence": 0.4496937096118927, "eval_loss": 0.4653523564338684, "eval_mae": 0.11528477817773819, "eval_rmse": 0.15729330480098724, "eval_runtime": 50.8416, "eval_samples_per_second": 46.301, "eval_steps_per_second": 1.456, "learning_rate": 1e-05, "step": 13359 }, { "epoch": 61.64383561643836, "grad_norm": 0.1874207705259323, "learning_rate": 1e-05, "loss": 0.4529, "step": 13500 }, { "epoch": 62.0, "eval_explained_variance": 0.4681651294231415, "eval_kl_divergence": 0.411173015832901, "eval_loss": 0.46477487683296204, "eval_mae": 0.11529665440320969, "eval_rmse": 0.15684308111667633, "eval_runtime": 52.6214, "eval_samples_per_second": 44.735, "eval_steps_per_second": 1.406, "learning_rate": 1e-05, "step": 13578 }, { "epoch": 63.0, "eval_explained_variance": 0.47016242146492004, "eval_kl_divergence": 0.3748082220554352, "eval_loss": 0.46481335163116455, "eval_mae": 0.11518841236829758, "eval_rmse": 0.15671293437480927, "eval_runtime": 53.2469, "eval_samples_per_second": 44.209, "eval_steps_per_second": 1.39, "learning_rate": 1e-05, "step": 13797 }, { "epoch": 63.926940639269404, "grad_norm": 0.22562281787395477, "learning_rate": 1e-05, "loss": 0.4527, "step": 14000 }, { "epoch": 64.0, "eval_explained_variance": 0.4721170663833618, "eval_kl_divergence": 0.3044198155403137, "eval_loss": 0.46523070335388184, "eval_mae": 0.11618036776781082, "eval_rmse": 0.15709933638572693, "eval_runtime": 53.3051, "eval_samples_per_second": 44.161, "eval_steps_per_second": 1.388, "learning_rate": 1e-05, "step": 14016 }, { "epoch": 65.0, "eval_explained_variance": 0.46695852279663086, "eval_kl_divergence": 0.46853822469711304, "eval_loss": 0.46484872698783875, "eval_mae": 0.11532068997621536, "eval_rmse": 0.1568661779165268, "eval_runtime": 52.7599, "eval_samples_per_second": 44.617, "eval_steps_per_second": 1.403, "learning_rate": 1e-05, "step": 14235 }, { "epoch": 66.0, "eval_explained_variance": 0.46712610125541687, "eval_kl_divergence": 0.508738100528717, "eval_loss": 0.46500927209854126, "eval_mae": 0.11475471407175064, "eval_rmse": 0.15729309618473053, "eval_runtime": 54.0149, "eval_samples_per_second": 43.581, "eval_steps_per_second": 1.37, "learning_rate": 1e-05, "step": 14454 }, { "epoch": 66.21004566210046, "grad_norm": 0.18448679149150848, "learning_rate": 1e-05, "loss": 0.4531, "step": 14500 }, { "epoch": 67.0, "eval_explained_variance": 0.4690088927745819, "eval_kl_divergence": 0.42743220925331116, "eval_loss": 0.4645930230617523, "eval_mae": 0.1155417189002037, "eval_rmse": 0.1567572057247162, "eval_runtime": 52.5655, "eval_samples_per_second": 44.782, "eval_steps_per_second": 1.408, "learning_rate": 1e-05, "step": 14673 }, { "epoch": 68.0, "eval_explained_variance": 0.4680323302745819, "eval_kl_divergence": 0.49686378240585327, "eval_loss": 0.46456360816955566, "eval_mae": 0.11437365412712097, "eval_rmse": 0.1566230058670044, "eval_runtime": 50.8799, "eval_samples_per_second": 46.266, "eval_steps_per_second": 1.454, "learning_rate": 1e-05, "step": 14892 }, { "epoch": 68.4931506849315, "grad_norm": 0.21752646565437317, "learning_rate": 1e-05, "loss": 0.452, "step": 15000 }, { "epoch": 69.0, "eval_explained_variance": 0.4696376323699951, "eval_kl_divergence": 0.44800856709480286, "eval_loss": 0.464430034160614, "eval_mae": 0.11452987045049667, "eval_rmse": 0.15642575919628143, "eval_runtime": 61.8405, "eval_samples_per_second": 38.066, "eval_steps_per_second": 1.197, "learning_rate": 1e-05, "step": 15111 }, { "epoch": 70.0, "eval_explained_variance": 0.4692017734050751, "eval_kl_divergence": 0.42908576130867004, "eval_loss": 0.4648461937904358, "eval_mae": 0.11500384658575058, "eval_rmse": 0.15674862265586853, "eval_runtime": 60.5787, "eval_samples_per_second": 38.859, "eval_steps_per_second": 1.222, "learning_rate": 1e-05, "step": 15330 }, { "epoch": 70.77625570776256, "grad_norm": 0.23285503685474396, "learning_rate": 1e-05, "loss": 0.4524, "step": 15500 }, { "epoch": 71.0, "eval_explained_variance": 0.4711233675479889, "eval_kl_divergence": 0.37966692447662354, "eval_loss": 0.4645022749900818, "eval_mae": 0.11555531620979309, "eval_rmse": 0.15646833181381226, "eval_runtime": 61.2584, "eval_samples_per_second": 38.427, "eval_steps_per_second": 1.208, "learning_rate": 1e-05, "step": 15549 }, { "epoch": 72.0, "eval_explained_variance": 0.4690466821193695, "eval_kl_divergence": 0.42796915769577026, "eval_loss": 0.46473589539527893, "eval_mae": 0.11497951298952103, "eval_rmse": 0.15693025290966034, "eval_runtime": 61.782, "eval_samples_per_second": 38.102, "eval_steps_per_second": 1.198, "learning_rate": 1e-05, "step": 15768 }, { "epoch": 73.0, "eval_explained_variance": 0.4707035720348358, "eval_kl_divergence": 0.4591566324234009, "eval_loss": 0.46414923667907715, "eval_mae": 0.11423368006944656, "eval_rmse": 0.15631103515625, "eval_runtime": 62.9115, "eval_samples_per_second": 37.418, "eval_steps_per_second": 1.176, "learning_rate": 1e-05, "step": 15987 }, { "epoch": 73.05936073059361, "grad_norm": 0.1904192417860031, "learning_rate": 1e-05, "loss": 0.4515, "step": 16000 }, { "epoch": 74.0, "eval_explained_variance": 0.4705829620361328, "eval_kl_divergence": 0.43208685517311096, "eval_loss": 0.4641610085964203, "eval_mae": 0.11505597829818726, "eval_rmse": 0.1563975065946579, "eval_runtime": 61.932, "eval_samples_per_second": 38.009, "eval_steps_per_second": 1.195, "learning_rate": 1e-05, "step": 16206 }, { "epoch": 75.0, "eval_explained_variance": 0.47077181935310364, "eval_kl_divergence": 0.3843104839324951, "eval_loss": 0.4644509255886078, "eval_mae": 0.11519055813550949, "eval_rmse": 0.15653057396411896, "eval_runtime": 62.3182, "eval_samples_per_second": 37.774, "eval_steps_per_second": 1.187, "learning_rate": 1e-05, "step": 16425 }, { "epoch": 75.34246575342466, "grad_norm": 0.2563965618610382, "learning_rate": 1e-05, "loss": 0.4521, "step": 16500 }, { "epoch": 76.0, "eval_explained_variance": 0.4675123989582062, "eval_kl_divergence": 0.5215911269187927, "eval_loss": 0.4646488130092621, "eval_mae": 0.1146780475974083, "eval_rmse": 0.1569206565618515, "eval_runtime": 66.0488, "eval_samples_per_second": 35.64, "eval_steps_per_second": 1.12, "learning_rate": 1e-05, "step": 16644 }, { "epoch": 77.0, "eval_explained_variance": 0.46909868717193604, "eval_kl_divergence": 0.4094104468822479, "eval_loss": 0.46475714445114136, "eval_mae": 0.11523856967687607, "eval_rmse": 0.15687990188598633, "eval_runtime": 62.1685, "eval_samples_per_second": 37.865, "eval_steps_per_second": 1.19, "learning_rate": 1e-05, "step": 16863 }, { "epoch": 77.62557077625571, "grad_norm": 0.16491472721099854, "learning_rate": 1e-05, "loss": 0.4519, "step": 17000 }, { "epoch": 78.0, "eval_explained_variance": 0.47086599469184875, "eval_kl_divergence": 0.43988528847694397, "eval_loss": 0.46428272128105164, "eval_mae": 0.11493176966905594, "eval_rmse": 0.15638257563114166, "eval_runtime": 61.9923, "eval_samples_per_second": 37.972, "eval_steps_per_second": 1.194, "learning_rate": 1e-05, "step": 17082 }, { "epoch": 79.0, "eval_explained_variance": 0.4697439670562744, "eval_kl_divergence": 0.4178011417388916, "eval_loss": 0.4645934998989105, "eval_mae": 0.11465150117874146, "eval_rmse": 0.15666015446186066, "eval_runtime": 63.0404, "eval_samples_per_second": 37.341, "eval_steps_per_second": 1.174, "learning_rate": 1e-05, "step": 17301 }, { "epoch": 79.90867579908675, "grad_norm": 0.1647184044122696, "learning_rate": 1.0000000000000002e-06, "loss": 0.4517, "step": 17500 }, { "epoch": 80.0, "eval_explained_variance": 0.4699563980102539, "eval_kl_divergence": 0.43727052211761475, "eval_loss": 0.46436014771461487, "eval_mae": 0.11501001566648483, "eval_rmse": 0.15643416345119476, "eval_runtime": 61.5606, "eval_samples_per_second": 38.239, "eval_steps_per_second": 1.202, "learning_rate": 1.0000000000000002e-06, "step": 17520 }, { "epoch": 81.0, "eval_explained_variance": 0.468768835067749, "eval_kl_divergence": 0.47009941935539246, "eval_loss": 0.46448636054992676, "eval_mae": 0.11508657783269882, "eval_rmse": 0.15673168003559113, "eval_runtime": 62.9178, "eval_samples_per_second": 37.414, "eval_steps_per_second": 1.176, "learning_rate": 1.0000000000000002e-06, "step": 17739 }, { "epoch": 82.0, "eval_explained_variance": 0.470253586769104, "eval_kl_divergence": 0.4601159989833832, "eval_loss": 0.4644375145435333, "eval_mae": 0.11455937474966049, "eval_rmse": 0.15652652084827423, "eval_runtime": 62.6023, "eval_samples_per_second": 37.602, "eval_steps_per_second": 1.182, "learning_rate": 1.0000000000000002e-06, "step": 17958 }, { "epoch": 82.1917808219178, "grad_norm": 0.2432813197374344, "learning_rate": 1.0000000000000002e-06, "loss": 0.4514, "step": 18000 }, { "epoch": 83.0, "eval_explained_variance": 0.468420147895813, "eval_kl_divergence": 0.4510715901851654, "eval_loss": 0.46457409858703613, "eval_mae": 0.11468392610549927, "eval_rmse": 0.15669189393520355, "eval_runtime": 62.7877, "eval_samples_per_second": 37.491, "eval_steps_per_second": 1.179, "learning_rate": 1.0000000000000002e-06, "step": 18177 }, { "epoch": 83.0, "learning_rate": 1.0000000000000002e-06, "step": 18177, "total_flos": 8.603009036605255e+19, "train_loss": 0.45949580130708517, "train_runtime": 19431.3015, "train_samples_per_second": 54.06, "train_steps_per_second": 1.691 } ], "logging_steps": 500, "max_steps": 32850, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.603009036605255e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }