{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984871406959153, "eval_steps": 83, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030257186081694403, "grad_norm": 28.705169927796575, "learning_rate": 2.0000000000000003e-06, "loss": 1.1946, "step": 1 }, { "epoch": 0.006051437216338881, "grad_norm": 46.50769527739581, "learning_rate": 4.000000000000001e-06, "loss": 1.2123, "step": 2 }, { "epoch": 0.009077155824508321, "grad_norm": 25.39574491403161, "learning_rate": 6e-06, "loss": 1.156, "step": 3 }, { "epoch": 0.012102874432677761, "grad_norm": 45.24843842377614, "learning_rate": 8.000000000000001e-06, "loss": 1.0308, "step": 4 }, { "epoch": 0.015128593040847202, "grad_norm": 22.433510797009237, "learning_rate": 1e-05, "loss": 0.8752, "step": 5 }, { "epoch": 0.018154311649016642, "grad_norm": 13.241172853966125, "learning_rate": 1.2e-05, "loss": 0.8675, "step": 6 }, { "epoch": 0.02118003025718608, "grad_norm": 22.26903307955315, "learning_rate": 1.4e-05, "loss": 0.8845, "step": 7 }, { "epoch": 0.024205748865355523, "grad_norm": 9.822357864256519, "learning_rate": 1.6000000000000003e-05, "loss": 0.8197, "step": 8 }, { "epoch": 0.02723146747352496, "grad_norm": 8.694636509708216, "learning_rate": 1.8e-05, "loss": 0.8479, "step": 9 }, { "epoch": 0.030257186081694403, "grad_norm": 11.16214976570948, "learning_rate": 2e-05, "loss": 0.8287, "step": 10 }, { "epoch": 0.03328290468986384, "grad_norm": 5.911685823891796, "learning_rate": 1.999951808959328e-05, "loss": 0.8226, "step": 11 }, { "epoch": 0.036308623298033284, "grad_norm": 21.36728206664348, "learning_rate": 1.9998072404820648e-05, "loss": 0.8188, "step": 12 }, { "epoch": 0.039334341906202726, "grad_norm": 6.095716493647628, "learning_rate": 1.9995663085020215e-05, "loss": 0.8025, "step": 13 }, { "epoch": 0.04236006051437216, "grad_norm": 3.99657658054729, "learning_rate": 1.9992290362407232e-05, "loss": 0.7603, "step": 14 }, { "epoch": 0.0453857791225416, "grad_norm": 2.9561276705682618, "learning_rate": 1.9987954562051724e-05, "loss": 0.7753, "step": 15 }, { "epoch": 0.048411497730711045, "grad_norm": 3.1432043759551167, "learning_rate": 1.998265610184716e-05, "loss": 0.7652, "step": 16 }, { "epoch": 0.05143721633888049, "grad_norm": 3.4729320368253522, "learning_rate": 1.997639549247016e-05, "loss": 0.7687, "step": 17 }, { "epoch": 0.05446293494704992, "grad_norm": 2.7415235461015532, "learning_rate": 1.9969173337331283e-05, "loss": 0.7359, "step": 18 }, { "epoch": 0.057488653555219364, "grad_norm": 3.3156999565141376, "learning_rate": 1.9960990332516875e-05, "loss": 0.7636, "step": 19 }, { "epoch": 0.060514372163388806, "grad_norm": 3.920778052156621, "learning_rate": 1.995184726672197e-05, "loss": 0.7772, "step": 20 }, { "epoch": 0.06354009077155824, "grad_norm": 2.8732391212646093, "learning_rate": 1.9941745021174284e-05, "loss": 0.7555, "step": 21 }, { "epoch": 0.06656580937972768, "grad_norm": 3.365576404016739, "learning_rate": 1.9930684569549265e-05, "loss": 0.7404, "step": 22 }, { "epoch": 0.06959152798789713, "grad_norm": 2.930471826033828, "learning_rate": 1.991866697787626e-05, "loss": 0.7522, "step": 23 }, { "epoch": 0.07261724659606657, "grad_norm": 3.1830530819926204, "learning_rate": 1.990569340443577e-05, "loss": 0.7345, "step": 24 }, { "epoch": 0.07564296520423601, "grad_norm": 2.8871857831062453, "learning_rate": 1.989176509964781e-05, "loss": 0.7031, "step": 25 }, { "epoch": 0.07866868381240545, "grad_norm": 2.786912583824214, "learning_rate": 1.9876883405951378e-05, "loss": 0.7216, "step": 26 }, { "epoch": 0.08169440242057488, "grad_norm": 2.6929192382433915, "learning_rate": 1.9861049757675087e-05, "loss": 0.7216, "step": 27 }, { "epoch": 0.08472012102874432, "grad_norm": 2.6063726645303413, "learning_rate": 1.9844265680898917e-05, "loss": 0.7295, "step": 28 }, { "epoch": 0.08774583963691376, "grad_norm": 3.0782649912185365, "learning_rate": 1.982653279330712e-05, "loss": 0.7075, "step": 29 }, { "epoch": 0.0907715582450832, "grad_norm": 47.39648659137784, "learning_rate": 1.9807852804032306e-05, "loss": 0.7247, "step": 30 }, { "epoch": 0.09379727685325265, "grad_norm": 2.576758058922432, "learning_rate": 1.9788227513490724e-05, "loss": 0.7332, "step": 31 }, { "epoch": 0.09682299546142209, "grad_norm": 2.4299981611439834, "learning_rate": 1.9767658813208725e-05, "loss": 0.7215, "step": 32 }, { "epoch": 0.09984871406959153, "grad_norm": 2.693034366034686, "learning_rate": 1.974614868564045e-05, "loss": 0.6484, "step": 33 }, { "epoch": 0.10287443267776097, "grad_norm": 2.802485927477001, "learning_rate": 1.9723699203976768e-05, "loss": 0.7217, "step": 34 }, { "epoch": 0.1059001512859304, "grad_norm": 2.344053987952208, "learning_rate": 1.9700312531945444e-05, "loss": 0.7304, "step": 35 }, { "epoch": 0.10892586989409984, "grad_norm": 2.6568421428735873, "learning_rate": 1.96759909236026e-05, "loss": 0.7058, "step": 36 }, { "epoch": 0.11195158850226929, "grad_norm": 2.1838166261289094, "learning_rate": 1.9650736723115476e-05, "loss": 0.678, "step": 37 }, { "epoch": 0.11497730711043873, "grad_norm": 4.459380746633169, "learning_rate": 1.9624552364536472e-05, "loss": 0.7375, "step": 38 }, { "epoch": 0.11800302571860817, "grad_norm": 2.194060418634913, "learning_rate": 1.9597440371568576e-05, "loss": 0.7119, "step": 39 }, { "epoch": 0.12102874432677761, "grad_norm": 2.4998393239432053, "learning_rate": 1.956940335732209e-05, "loss": 0.729, "step": 40 }, { "epoch": 0.12405446293494705, "grad_norm": 2.315910880599856, "learning_rate": 1.9540444024062807e-05, "loss": 0.7038, "step": 41 }, { "epoch": 0.12708018154311648, "grad_norm": 2.2599221877365006, "learning_rate": 1.9510565162951538e-05, "loss": 0.738, "step": 42 }, { "epoch": 0.13010590015128592, "grad_norm": 2.523504960053174, "learning_rate": 1.9479769653775107e-05, "loss": 0.6821, "step": 43 }, { "epoch": 0.13313161875945537, "grad_norm": 2.5848519687661615, "learning_rate": 1.944806046466878e-05, "loss": 0.7491, "step": 44 }, { "epoch": 0.1361573373676248, "grad_norm": 2.580757184651181, "learning_rate": 1.941544065183021e-05, "loss": 0.698, "step": 45 }, { "epoch": 0.13918305597579425, "grad_norm": 2.428509105247681, "learning_rate": 1.9381913359224844e-05, "loss": 0.678, "step": 46 }, { "epoch": 0.1422087745839637, "grad_norm": 2.175415211354414, "learning_rate": 1.9347481818282927e-05, "loss": 0.6631, "step": 47 }, { "epoch": 0.14523449319213314, "grad_norm": 2.3270447497446916, "learning_rate": 1.9312149347588035e-05, "loss": 0.7061, "step": 48 }, { "epoch": 0.14826021180030258, "grad_norm": 2.4287476749314836, "learning_rate": 1.9275919352557242e-05, "loss": 0.7237, "step": 49 }, { "epoch": 0.15128593040847202, "grad_norm": 2.2410089350217213, "learning_rate": 1.9238795325112867e-05, "loss": 0.7193, "step": 50 }, { "epoch": 0.15431164901664146, "grad_norm": 2.3787409817474536, "learning_rate": 1.920078084334595e-05, "loss": 0.6697, "step": 51 }, { "epoch": 0.1573373676248109, "grad_norm": 2.2713457923537934, "learning_rate": 1.916187957117136e-05, "loss": 0.7033, "step": 52 }, { "epoch": 0.16036308623298035, "grad_norm": 2.581980330545954, "learning_rate": 1.9122095257974676e-05, "loss": 0.6939, "step": 53 }, { "epoch": 0.16338880484114976, "grad_norm": 2.432198331476182, "learning_rate": 1.9081431738250815e-05, "loss": 0.6892, "step": 54 }, { "epoch": 0.1664145234493192, "grad_norm": 2.099821368018975, "learning_rate": 1.9039892931234434e-05, "loss": 0.6577, "step": 55 }, { "epoch": 0.16944024205748864, "grad_norm": 2.2823357611877424, "learning_rate": 1.8997482840522218e-05, "loss": 0.6716, "step": 56 }, { "epoch": 0.17246596066565809, "grad_norm": 11.790926761539435, "learning_rate": 1.895420555368697e-05, "loss": 0.6863, "step": 57 }, { "epoch": 0.17549167927382753, "grad_norm": 2.1763784091818312, "learning_rate": 1.891006524188368e-05, "loss": 0.6779, "step": 58 }, { "epoch": 0.17851739788199697, "grad_norm": 2.1869852327736554, "learning_rate": 1.8865066159447468e-05, "loss": 0.7007, "step": 59 }, { "epoch": 0.1815431164901664, "grad_norm": 3.7000458856461753, "learning_rate": 1.881921264348355e-05, "loss": 0.696, "step": 60 }, { "epoch": 0.18456883509833585, "grad_norm": 2.1345158830729445, "learning_rate": 1.8772509113449243e-05, "loss": 0.6296, "step": 61 }, { "epoch": 0.1875945537065053, "grad_norm": 5.324784255219841, "learning_rate": 1.8724960070727974e-05, "loss": 0.7144, "step": 62 }, { "epoch": 0.19062027231467474, "grad_norm": 2.2854948381160733, "learning_rate": 1.8676570098195443e-05, "loss": 0.6876, "step": 63 }, { "epoch": 0.19364599092284418, "grad_norm": 2.2451810023966274, "learning_rate": 1.862734385977792e-05, "loss": 0.6923, "step": 64 }, { "epoch": 0.19667170953101362, "grad_norm": 2.4986096326582397, "learning_rate": 1.8577286100002723e-05, "loss": 0.6974, "step": 65 }, { "epoch": 0.19969742813918306, "grad_norm": 2.5519641214317117, "learning_rate": 1.8526401643540924e-05, "loss": 0.7081, "step": 66 }, { "epoch": 0.2027231467473525, "grad_norm": 2.269549566002934, "learning_rate": 1.8474695394742345e-05, "loss": 0.7218, "step": 67 }, { "epoch": 0.20574886535552195, "grad_norm": 1.9838611913406103, "learning_rate": 1.8422172337162865e-05, "loss": 0.7032, "step": 68 }, { "epoch": 0.2087745839636914, "grad_norm": 2.0276028298466064, "learning_rate": 1.8368837533084092e-05, "loss": 0.6917, "step": 69 }, { "epoch": 0.2118003025718608, "grad_norm": 1.8464725712525754, "learning_rate": 1.8314696123025456e-05, "loss": 0.6734, "step": 70 }, { "epoch": 0.21482602118003025, "grad_norm": 2.1527856581976623, "learning_rate": 1.825975332524873e-05, "loss": 0.698, "step": 71 }, { "epoch": 0.2178517397881997, "grad_norm": 2.016545413670171, "learning_rate": 1.8204014435255136e-05, "loss": 0.6818, "step": 72 }, { "epoch": 0.22087745839636913, "grad_norm": 2.3166505526604015, "learning_rate": 1.8147484825274895e-05, "loss": 0.6432, "step": 73 }, { "epoch": 0.22390317700453857, "grad_norm": 2.020726634051282, "learning_rate": 1.8090169943749477e-05, "loss": 0.6813, "step": 74 }, { "epoch": 0.22692889561270801, "grad_norm": 2.271648285326035, "learning_rate": 1.803207531480645e-05, "loss": 0.7153, "step": 75 }, { "epoch": 0.22995461422087746, "grad_norm": 2.0267783730128848, "learning_rate": 1.797320653772707e-05, "loss": 0.668, "step": 76 }, { "epoch": 0.2329803328290469, "grad_norm": 2.060993406421187, "learning_rate": 1.7913569286406606e-05, "loss": 0.7068, "step": 77 }, { "epoch": 0.23600605143721634, "grad_norm": 1.8795048223398712, "learning_rate": 1.785316930880745e-05, "loss": 0.718, "step": 78 }, { "epoch": 0.23903177004538578, "grad_norm": 2.0498135335289707, "learning_rate": 1.779201242640517e-05, "loss": 0.6744, "step": 79 }, { "epoch": 0.24205748865355523, "grad_norm": 1.9785152837196573, "learning_rate": 1.773010453362737e-05, "loss": 0.6739, "step": 80 }, { "epoch": 0.24508320726172467, "grad_norm": 2.0275695322358867, "learning_rate": 1.7667451597285617e-05, "loss": 0.6866, "step": 81 }, { "epoch": 0.2481089258698941, "grad_norm": 2.0024823958969464, "learning_rate": 1.7604059656000313e-05, "loss": 0.6949, "step": 82 }, { "epoch": 0.25113464447806355, "grad_norm": 2.0089305685455527, "learning_rate": 1.7539934819618696e-05, "loss": 0.6622, "step": 83 }, { "epoch": 0.25113464447806355, "eval_loss": 0.6706682443618774, "eval_runtime": 103.3626, "eval_samples_per_second": 40.924, "eval_steps_per_second": 0.648, "step": 83 }, { "epoch": 0.25416036308623297, "grad_norm": 2.00524741649147, "learning_rate": 1.747508326862597e-05, "loss": 0.6773, "step": 84 }, { "epoch": 0.25718608169440244, "grad_norm": 1.8886708832788808, "learning_rate": 1.7409511253549592e-05, "loss": 0.6931, "step": 85 }, { "epoch": 0.26021180030257185, "grad_norm": 2.0256246365229686, "learning_rate": 1.7343225094356857e-05, "loss": 0.6902, "step": 86 }, { "epoch": 0.2632375189107413, "grad_norm": 1.8431512625044615, "learning_rate": 1.727623117984575e-05, "loss": 0.6556, "step": 87 }, { "epoch": 0.26626323751891073, "grad_norm": 1.8944691010632577, "learning_rate": 1.720853596702919e-05, "loss": 0.6856, "step": 88 }, { "epoch": 0.2692889561270802, "grad_norm": 1.9923243591653923, "learning_rate": 1.7140145980512684e-05, "loss": 0.6628, "step": 89 }, { "epoch": 0.2723146747352496, "grad_norm": 2.2829966970709688, "learning_rate": 1.7071067811865477e-05, "loss": 0.7071, "step": 90 }, { "epoch": 0.2753403933434191, "grad_norm": 1.9768171039604017, "learning_rate": 1.7001308118985237e-05, "loss": 0.6874, "step": 91 }, { "epoch": 0.2783661119515885, "grad_norm": 1.8663920170577142, "learning_rate": 1.6930873625456362e-05, "loss": 0.6844, "step": 92 }, { "epoch": 0.2813918305597579, "grad_norm": 1.9299544891190301, "learning_rate": 1.685977111990193e-05, "loss": 0.6853, "step": 93 }, { "epoch": 0.2844175491679274, "grad_norm": 2.2225214208045183, "learning_rate": 1.678800745532942e-05, "loss": 0.6729, "step": 94 }, { "epoch": 0.2874432677760968, "grad_norm": 2.0661555212699905, "learning_rate": 1.6715589548470187e-05, "loss": 0.6844, "step": 95 }, { "epoch": 0.29046898638426627, "grad_norm": 2.007818883998332, "learning_rate": 1.664252437911282e-05, "loss": 0.7108, "step": 96 }, { "epoch": 0.2934947049924357, "grad_norm": 2.0125558122393, "learning_rate": 1.6568818989430416e-05, "loss": 0.6667, "step": 97 }, { "epoch": 0.29652042360060515, "grad_norm": 2.1343504218654012, "learning_rate": 1.6494480483301836e-05, "loss": 0.6732, "step": 98 }, { "epoch": 0.29954614220877457, "grad_norm": 1.9813823609834715, "learning_rate": 1.641951602562703e-05, "loss": 0.6593, "step": 99 }, { "epoch": 0.30257186081694404, "grad_norm": 1.975056311713501, "learning_rate": 1.6343932841636455e-05, "loss": 0.683, "step": 100 }, { "epoch": 0.30559757942511345, "grad_norm": 1.854491404440836, "learning_rate": 1.6267738216194698e-05, "loss": 0.6826, "step": 101 }, { "epoch": 0.3086232980332829, "grad_norm": 1.891220184424636, "learning_rate": 1.6190939493098344e-05, "loss": 0.665, "step": 102 }, { "epoch": 0.31164901664145234, "grad_norm": 2.735004116658562, "learning_rate": 1.6113544074368166e-05, "loss": 0.6818, "step": 103 }, { "epoch": 0.3146747352496218, "grad_norm": 1.844699616410939, "learning_rate": 1.6035559419535714e-05, "loss": 0.6771, "step": 104 }, { "epoch": 0.3177004538577912, "grad_norm": 2.0074416503342536, "learning_rate": 1.5956993044924334e-05, "loss": 0.664, "step": 105 }, { "epoch": 0.3207261724659607, "grad_norm": 1.9077963205419504, "learning_rate": 1.5877852522924733e-05, "loss": 0.6387, "step": 106 }, { "epoch": 0.3237518910741301, "grad_norm": 2.5814428860557315, "learning_rate": 1.579814548126514e-05, "loss": 0.6463, "step": 107 }, { "epoch": 0.3267776096822995, "grad_norm": 2.02923331561202, "learning_rate": 1.5717879602276123e-05, "loss": 0.6408, "step": 108 }, { "epoch": 0.329803328290469, "grad_norm": 1.8232061934697785, "learning_rate": 1.5637062622150168e-05, "loss": 0.6328, "step": 109 }, { "epoch": 0.3328290468986384, "grad_norm": 1.944530680893889, "learning_rate": 1.5555702330196024e-05, "loss": 0.6692, "step": 110 }, { "epoch": 0.3358547655068079, "grad_norm": 2.2591890489972735, "learning_rate": 1.547380656808797e-05, "loss": 0.6503, "step": 111 }, { "epoch": 0.3388804841149773, "grad_norm": 1.9297499891414, "learning_rate": 1.5391383229110005e-05, "loss": 0.6871, "step": 112 }, { "epoch": 0.34190620272314676, "grad_norm": 1.9218912909376045, "learning_rate": 1.5308440257395095e-05, "loss": 0.679, "step": 113 }, { "epoch": 0.34493192133131617, "grad_norm": 1.9228101408821987, "learning_rate": 1.5224985647159489e-05, "loss": 0.6788, "step": 114 }, { "epoch": 0.34795763993948564, "grad_norm": 1.9477527079521044, "learning_rate": 1.5141027441932217e-05, "loss": 0.6478, "step": 115 }, { "epoch": 0.35098335854765506, "grad_norm": 1.8524671597698312, "learning_rate": 1.5056573733779848e-05, "loss": 0.671, "step": 116 }, { "epoch": 0.3540090771558245, "grad_norm": 1.8754782332500444, "learning_rate": 1.4971632662526545e-05, "loss": 0.6422, "step": 117 }, { "epoch": 0.35703479576399394, "grad_norm": 1.9451177781938065, "learning_rate": 1.4886212414969551e-05, "loss": 0.6618, "step": 118 }, { "epoch": 0.3600605143721634, "grad_norm": 1.9806857849814412, "learning_rate": 1.4800321224090114e-05, "loss": 0.6624, "step": 119 }, { "epoch": 0.3630862329803328, "grad_norm": 1.9170879932814107, "learning_rate": 1.4713967368259981e-05, "loss": 0.6688, "step": 120 }, { "epoch": 0.3661119515885023, "grad_norm": 8.790756396410332, "learning_rate": 1.4627159170443504e-05, "loss": 0.6665, "step": 121 }, { "epoch": 0.3691376701966717, "grad_norm": 2.0362111479618834, "learning_rate": 1.4539904997395468e-05, "loss": 0.6584, "step": 122 }, { "epoch": 0.3721633888048411, "grad_norm": 1.8915219393485387, "learning_rate": 1.4452213258854684e-05, "loss": 0.6566, "step": 123 }, { "epoch": 0.3751891074130106, "grad_norm": 1.8144407540201268, "learning_rate": 1.436409240673342e-05, "loss": 0.6554, "step": 124 }, { "epoch": 0.37821482602118, "grad_norm": 1.967650978948985, "learning_rate": 1.4275550934302822e-05, "loss": 0.6933, "step": 125 }, { "epoch": 0.3812405446293495, "grad_norm": 1.9983937915711714, "learning_rate": 1.4186597375374283e-05, "loss": 0.6587, "step": 126 }, { "epoch": 0.3842662632375189, "grad_norm": 1.8623756601102601, "learning_rate": 1.4097240303476955e-05, "loss": 0.663, "step": 127 }, { "epoch": 0.38729198184568836, "grad_norm": 1.8157773118402531, "learning_rate": 1.4007488331031409e-05, "loss": 0.6355, "step": 128 }, { "epoch": 0.3903177004538578, "grad_norm": 2.0094412370616337, "learning_rate": 1.391735010851956e-05, "loss": 0.6984, "step": 129 }, { "epoch": 0.39334341906202724, "grad_norm": 1.8684281968245415, "learning_rate": 1.3826834323650899e-05, "loss": 0.6868, "step": 130 }, { "epoch": 0.39636913767019666, "grad_norm": 1.7815078230285688, "learning_rate": 1.3735949700525164e-05, "loss": 0.6395, "step": 131 }, { "epoch": 0.39939485627836613, "grad_norm": 1.8484611828643538, "learning_rate": 1.3644704998791501e-05, "loss": 0.6638, "step": 132 }, { "epoch": 0.40242057488653554, "grad_norm": 1.8795543889555775, "learning_rate": 1.3553109012804162e-05, "loss": 0.6584, "step": 133 }, { "epoch": 0.405446293494705, "grad_norm": 2.0272538642472466, "learning_rate": 1.346117057077493e-05, "loss": 0.6541, "step": 134 }, { "epoch": 0.4084720121028744, "grad_norm": 1.9101405219462362, "learning_rate": 1.3368898533922202e-05, "loss": 0.6565, "step": 135 }, { "epoch": 0.4114977307110439, "grad_norm": 2.0088784211383848, "learning_rate": 1.3276301795616937e-05, "loss": 0.6563, "step": 136 }, { "epoch": 0.4145234493192133, "grad_norm": 1.8914928243192555, "learning_rate": 1.3183389280525497e-05, "loss": 0.6901, "step": 137 }, { "epoch": 0.4175491679273828, "grad_norm": 1.7906713381258323, "learning_rate": 1.3090169943749475e-05, "loss": 0.6428, "step": 138 }, { "epoch": 0.4205748865355522, "grad_norm": 1.9342457965175344, "learning_rate": 1.2996652769962567e-05, "loss": 0.6846, "step": 139 }, { "epoch": 0.4236006051437216, "grad_norm": 1.9699269585190082, "learning_rate": 1.2902846772544625e-05, "loss": 0.65, "step": 140 }, { "epoch": 0.4266263237518911, "grad_norm": 1.9809903638292505, "learning_rate": 1.2808760992712923e-05, "loss": 0.653, "step": 141 }, { "epoch": 0.4296520423600605, "grad_norm": 2.032945686435021, "learning_rate": 1.2714404498650743e-05, "loss": 0.6289, "step": 142 }, { "epoch": 0.43267776096822996, "grad_norm": 2.0794369135009787, "learning_rate": 1.2619786384633374e-05, "loss": 0.6617, "step": 143 }, { "epoch": 0.4357034795763994, "grad_norm": 1.866924916352073, "learning_rate": 1.252491577015158e-05, "loss": 0.6546, "step": 144 }, { "epoch": 0.43872919818456885, "grad_norm": 1.8634818795040904, "learning_rate": 1.242980179903264e-05, "loss": 0.6547, "step": 145 }, { "epoch": 0.44175491679273826, "grad_norm": 2.027288842597497, "learning_rate": 1.2334453638559057e-05, "loss": 0.6716, "step": 146 }, { "epoch": 0.44478063540090773, "grad_norm": 1.8143983839498472, "learning_rate": 1.2238880478584987e-05, "loss": 0.63, "step": 147 }, { "epoch": 0.44780635400907715, "grad_norm": 1.9479084908694164, "learning_rate": 1.2143091530650508e-05, "loss": 0.666, "step": 148 }, { "epoch": 0.4508320726172466, "grad_norm": 5.864872587821412, "learning_rate": 1.2047096027093798e-05, "loss": 0.6403, "step": 149 }, { "epoch": 0.45385779122541603, "grad_norm": 1.8796985531477728, "learning_rate": 1.1950903220161286e-05, "loss": 0.647, "step": 150 }, { "epoch": 0.4568835098335855, "grad_norm": 1.8656381112298428, "learning_rate": 1.185452238111591e-05, "loss": 0.6434, "step": 151 }, { "epoch": 0.4599092284417549, "grad_norm": 1.8201079356405698, "learning_rate": 1.1757962799343548e-05, "loss": 0.6222, "step": 152 }, { "epoch": 0.4629349470499244, "grad_norm": 1.8114819922272898, "learning_rate": 1.1661233781457655e-05, "loss": 0.6276, "step": 153 }, { "epoch": 0.4659606656580938, "grad_norm": 1.7724243161132796, "learning_rate": 1.156434465040231e-05, "loss": 0.6102, "step": 154 }, { "epoch": 0.4689863842662632, "grad_norm": 1.8396658090661742, "learning_rate": 1.1467304744553618e-05, "loss": 0.633, "step": 155 }, { "epoch": 0.4720121028744327, "grad_norm": 1.9044000192773716, "learning_rate": 1.1370123416819683e-05, "loss": 0.6881, "step": 156 }, { "epoch": 0.4750378214826021, "grad_norm": 1.9801492797746596, "learning_rate": 1.1272810033739134e-05, "loss": 0.6737, "step": 157 }, { "epoch": 0.47806354009077157, "grad_norm": 1.8321494861233507, "learning_rate": 1.1175373974578378e-05, "loss": 0.634, "step": 158 }, { "epoch": 0.481089258698941, "grad_norm": 1.7385897891185031, "learning_rate": 1.1077824630427593e-05, "loss": 0.6601, "step": 159 }, { "epoch": 0.48411497730711045, "grad_norm": 1.8872311240237092, "learning_rate": 1.098017140329561e-05, "loss": 0.62, "step": 160 }, { "epoch": 0.48714069591527986, "grad_norm": 1.9327771087303642, "learning_rate": 1.0882423705203698e-05, "loss": 0.6417, "step": 161 }, { "epoch": 0.49016641452344933, "grad_norm": 1.854145978232897, "learning_rate": 1.0784590957278452e-05, "loss": 0.6638, "step": 162 }, { "epoch": 0.49319213313161875, "grad_norm": 1.7958642023452327, "learning_rate": 1.0686682588843737e-05, "loss": 0.6329, "step": 163 }, { "epoch": 0.4962178517397882, "grad_norm": 1.8646316242847085, "learning_rate": 1.058870803651189e-05, "loss": 0.628, "step": 164 }, { "epoch": 0.49924357034795763, "grad_norm": 2.034057025942459, "learning_rate": 1.0490676743274181e-05, "loss": 0.6602, "step": 165 }, { "epoch": 0.5022692889561271, "grad_norm": 2.075987391319436, "learning_rate": 1.0392598157590687e-05, "loss": 0.6705, "step": 166 }, { "epoch": 0.5022692889561271, "eval_loss": 0.6367672681808472, "eval_runtime": 98.7198, "eval_samples_per_second": 42.849, "eval_steps_per_second": 0.679, "step": 166 }, { "epoch": 0.5052950075642966, "grad_norm": 1.8209280670369759, "learning_rate": 1.0294481732479635e-05, "loss": 0.6437, "step": 167 }, { "epoch": 0.5083207261724659, "grad_norm": 2.01862600963615, "learning_rate": 1.0196336924606282e-05, "loss": 0.6908, "step": 168 }, { "epoch": 0.5113464447806354, "grad_norm": 1.8600737637708504, "learning_rate": 1.0098173193371498e-05, "loss": 0.6394, "step": 169 }, { "epoch": 0.5143721633888049, "grad_norm": 1.752154066184476, "learning_rate": 1e-05, "loss": 0.641, "step": 170 }, { "epoch": 0.5173978819969742, "grad_norm": 1.7382915526611122, "learning_rate": 9.901826806628505e-06, "loss": 0.6231, "step": 171 }, { "epoch": 0.5204236006051437, "grad_norm": 1.831957458509604, "learning_rate": 9.80366307539372e-06, "loss": 0.6153, "step": 172 }, { "epoch": 0.5234493192133132, "grad_norm": 1.9814775835155989, "learning_rate": 9.705518267520369e-06, "loss": 0.6342, "step": 173 }, { "epoch": 0.5264750378214826, "grad_norm": 1.7928530986876814, "learning_rate": 9.607401842409318e-06, "loss": 0.642, "step": 174 }, { "epoch": 0.529500756429652, "grad_norm": 1.8372327601480345, "learning_rate": 9.50932325672582e-06, "loss": 0.6323, "step": 175 }, { "epoch": 0.5325264750378215, "grad_norm": 1.9570960995420552, "learning_rate": 9.41129196348811e-06, "loss": 0.6446, "step": 176 }, { "epoch": 0.5355521936459909, "grad_norm": 1.9572579231094382, "learning_rate": 9.313317411156265e-06, "loss": 0.6869, "step": 177 }, { "epoch": 0.5385779122541604, "grad_norm": 1.8108192928711597, "learning_rate": 9.215409042721553e-06, "loss": 0.6278, "step": 178 }, { "epoch": 0.5416036308623298, "grad_norm": 1.821987784514361, "learning_rate": 9.117576294796307e-06, "loss": 0.6115, "step": 179 }, { "epoch": 0.5446293494704992, "grad_norm": 1.987620896891723, "learning_rate": 9.019828596704394e-06, "loss": 0.6209, "step": 180 }, { "epoch": 0.5476550680786687, "grad_norm": 2.028602066146976, "learning_rate": 8.922175369572407e-06, "loss": 0.6654, "step": 181 }, { "epoch": 0.5506807866868382, "grad_norm": 1.8580079260447575, "learning_rate": 8.824626025421625e-06, "loss": 0.6006, "step": 182 }, { "epoch": 0.5537065052950075, "grad_norm": 1.8986231751877851, "learning_rate": 8.72718996626087e-06, "loss": 0.6214, "step": 183 }, { "epoch": 0.556732223903177, "grad_norm": 1.8492480820880255, "learning_rate": 8.629876583180322e-06, "loss": 0.6186, "step": 184 }, { "epoch": 0.5597579425113465, "grad_norm": 1.922805209593131, "learning_rate": 8.532695255446384e-06, "loss": 0.6115, "step": 185 }, { "epoch": 0.5627836611195158, "grad_norm": 2.6417020206583954, "learning_rate": 8.43565534959769e-06, "loss": 0.6238, "step": 186 }, { "epoch": 0.5658093797276853, "grad_norm": 1.6531736881779153, "learning_rate": 8.338766218542348e-06, "loss": 0.5963, "step": 187 }, { "epoch": 0.5688350983358548, "grad_norm": 1.9058816333699906, "learning_rate": 8.242037200656455e-06, "loss": 0.657, "step": 188 }, { "epoch": 0.5718608169440242, "grad_norm": 1.844821663246108, "learning_rate": 8.145477618884092e-06, "loss": 0.6198, "step": 189 }, { "epoch": 0.5748865355521936, "grad_norm": 1.7545017629558766, "learning_rate": 8.04909677983872e-06, "loss": 0.6333, "step": 190 }, { "epoch": 0.5779122541603631, "grad_norm": 10.188950064582118, "learning_rate": 7.952903972906205e-06, "loss": 0.6019, "step": 191 }, { "epoch": 0.5809379727685325, "grad_norm": 1.926798592832203, "learning_rate": 7.856908469349495e-06, "loss": 0.6352, "step": 192 }, { "epoch": 0.583963691376702, "grad_norm": 1.8924205088850037, "learning_rate": 7.761119521415017e-06, "loss": 0.6337, "step": 193 }, { "epoch": 0.5869894099848714, "grad_norm": 1.8120023994489047, "learning_rate": 7.66554636144095e-06, "loss": 0.6432, "step": 194 }, { "epoch": 0.5900151285930408, "grad_norm": 1.963933061903905, "learning_rate": 7.570198200967363e-06, "loss": 0.6471, "step": 195 }, { "epoch": 0.5930408472012103, "grad_norm": 1.936190947316387, "learning_rate": 7.4750842298484205e-06, "loss": 0.6318, "step": 196 }, { "epoch": 0.5960665658093798, "grad_norm": 1.7686124345129755, "learning_rate": 7.380213615366627e-06, "loss": 0.6511, "step": 197 }, { "epoch": 0.5990922844175491, "grad_norm": 1.714431451377099, "learning_rate": 7.285595501349259e-06, "loss": 0.619, "step": 198 }, { "epoch": 0.6021180030257186, "grad_norm": 1.794288134133105, "learning_rate": 7.191239007287082e-06, "loss": 0.6368, "step": 199 }, { "epoch": 0.6051437216338881, "grad_norm": 1.9009974921888555, "learning_rate": 7.097153227455379e-06, "loss": 0.6724, "step": 200 }, { "epoch": 0.6081694402420574, "grad_norm": 1.8815723068802426, "learning_rate": 7.003347230037434e-06, "loss": 0.6479, "step": 201 }, { "epoch": 0.6111951588502269, "grad_norm": 2.0481718228938295, "learning_rate": 6.909830056250527e-06, "loss": 0.624, "step": 202 }, { "epoch": 0.6142208774583964, "grad_norm": 1.8660751503866284, "learning_rate": 6.816610719474503e-06, "loss": 0.6066, "step": 203 }, { "epoch": 0.6172465960665658, "grad_norm": 117.96240960174059, "learning_rate": 6.723698204383067e-06, "loss": 0.6306, "step": 204 }, { "epoch": 0.6202723146747352, "grad_norm": 2.076181536031468, "learning_rate": 6.631101466077801e-06, "loss": 0.6078, "step": 205 }, { "epoch": 0.6232980332829047, "grad_norm": 1.7578318607797734, "learning_rate": 6.538829429225068e-06, "loss": 0.6177, "step": 206 }, { "epoch": 0.6263237518910741, "grad_norm": 1.9981314247674604, "learning_rate": 6.446890987195842e-06, "loss": 0.6271, "step": 207 }, { "epoch": 0.6293494704992436, "grad_norm": 1.8418024833537472, "learning_rate": 6.355295001208504e-06, "loss": 0.6564, "step": 208 }, { "epoch": 0.632375189107413, "grad_norm": 1.7839428525822674, "learning_rate": 6.2640502994748375e-06, "loss": 0.6028, "step": 209 }, { "epoch": 0.6354009077155824, "grad_norm": 1.9280913987327355, "learning_rate": 6.173165676349103e-06, "loss": 0.6325, "step": 210 }, { "epoch": 0.6384266263237519, "grad_norm": 1.7624114926890098, "learning_rate": 6.082649891480441e-06, "loss": 0.6039, "step": 211 }, { "epoch": 0.6414523449319214, "grad_norm": 1.7978369158197076, "learning_rate": 5.9925116689685925e-06, "loss": 0.6116, "step": 212 }, { "epoch": 0.6444780635400907, "grad_norm": 1.7943900816934446, "learning_rate": 5.902759696523046e-06, "loss": 0.6142, "step": 213 }, { "epoch": 0.6475037821482602, "grad_norm": 1.7168302969493772, "learning_rate": 5.813402624625722e-06, "loss": 0.6029, "step": 214 }, { "epoch": 0.6505295007564297, "grad_norm": 1.8628109691773338, "learning_rate": 5.724449065697182e-06, "loss": 0.6307, "step": 215 }, { "epoch": 0.653555219364599, "grad_norm": 1.6997400415930484, "learning_rate": 5.635907593266578e-06, "loss": 0.6015, "step": 216 }, { "epoch": 0.6565809379727685, "grad_norm": 1.7490867060597552, "learning_rate": 5.54778674114532e-06, "loss": 0.6284, "step": 217 }, { "epoch": 0.659606656580938, "grad_norm": 1.76429241360846, "learning_rate": 5.460095002604533e-06, "loss": 0.6263, "step": 218 }, { "epoch": 0.6626323751891074, "grad_norm": 2.0251369838234896, "learning_rate": 5.3728408295565e-06, "loss": 0.6419, "step": 219 }, { "epoch": 0.6656580937972768, "grad_norm": 1.7306081116333085, "learning_rate": 5.286032631740023e-06, "loss": 0.6137, "step": 220 }, { "epoch": 0.6686838124054463, "grad_norm": 1.6542505700640842, "learning_rate": 5.199678775909889e-06, "loss": 0.5967, "step": 221 }, { "epoch": 0.6717095310136157, "grad_norm": 1.7919744473172632, "learning_rate": 5.1137875850304545e-06, "loss": 0.6091, "step": 222 }, { "epoch": 0.6747352496217852, "grad_norm": 1.766603281831805, "learning_rate": 5.0283673374734546e-06, "loss": 0.6317, "step": 223 }, { "epoch": 0.6777609682299546, "grad_norm": 1.7421115340862197, "learning_rate": 4.943426266220156e-06, "loss": 0.6092, "step": 224 }, { "epoch": 0.680786686838124, "grad_norm": 1.7745552222394632, "learning_rate": 4.858972558067784e-06, "loss": 0.599, "step": 225 }, { "epoch": 0.6838124054462935, "grad_norm": 1.6972184081839272, "learning_rate": 4.775014352840512e-06, "loss": 0.6135, "step": 226 }, { "epoch": 0.686838124054463, "grad_norm": 1.698395088856387, "learning_rate": 4.691559742604906e-06, "loss": 0.6221, "step": 227 }, { "epoch": 0.6898638426626323, "grad_norm": 1.7994471135722945, "learning_rate": 4.608616770889998e-06, "loss": 0.6055, "step": 228 }, { "epoch": 0.6928895612708018, "grad_norm": 1.759126619915089, "learning_rate": 4.526193431912038e-06, "loss": 0.6309, "step": 229 }, { "epoch": 0.6959152798789713, "grad_norm": 2.100213961777996, "learning_rate": 4.444297669803981e-06, "loss": 0.6635, "step": 230 }, { "epoch": 0.6989409984871406, "grad_norm": 1.7258768310459378, "learning_rate": 4.362937377849832e-06, "loss": 0.6062, "step": 231 }, { "epoch": 0.7019667170953101, "grad_norm": 1.7559400870524307, "learning_rate": 4.282120397723879e-06, "loss": 0.6062, "step": 232 }, { "epoch": 0.7049924357034796, "grad_norm": 2.2309488171255847, "learning_rate": 4.2018545187348645e-06, "loss": 0.61, "step": 233 }, { "epoch": 0.708018154311649, "grad_norm": 1.77038154751904, "learning_rate": 4.12214747707527e-06, "loss": 0.6239, "step": 234 }, { "epoch": 0.7110438729198184, "grad_norm": 2.81931851928973, "learning_rate": 4.043006955075667e-06, "loss": 0.623, "step": 235 }, { "epoch": 0.7140695915279879, "grad_norm": 1.8647484853899023, "learning_rate": 3.964440580464286e-06, "loss": 0.6378, "step": 236 }, { "epoch": 0.7170953101361573, "grad_norm": 1.7704652348097096, "learning_rate": 3.8864559256318375e-06, "loss": 0.6091, "step": 237 }, { "epoch": 0.7201210287443268, "grad_norm": 1.8492700632395556, "learning_rate": 3.8090605069016596e-06, "loss": 0.6534, "step": 238 }, { "epoch": 0.7231467473524962, "grad_norm": 1.6859140363387968, "learning_rate": 3.7322617838053066e-06, "loss": 0.6218, "step": 239 }, { "epoch": 0.7261724659606656, "grad_norm": 1.78360745200291, "learning_rate": 3.6560671583635467e-06, "loss": 0.6533, "step": 240 }, { "epoch": 0.7291981845688351, "grad_norm": 1.8821672849391728, "learning_rate": 3.58048397437297e-06, "loss": 0.6577, "step": 241 }, { "epoch": 0.7322239031770046, "grad_norm": 1.7018559651879825, "learning_rate": 3.505519516698165e-06, "loss": 0.6077, "step": 242 }, { "epoch": 0.735249621785174, "grad_norm": 1.6816021527136724, "learning_rate": 3.4311810105695875e-06, "loss": 0.5895, "step": 243 }, { "epoch": 0.7382753403933434, "grad_norm": 1.7460874029262543, "learning_rate": 3.3574756208871862e-06, "loss": 0.6224, "step": 244 }, { "epoch": 0.7413010590015129, "grad_norm": 1.7057523984488685, "learning_rate": 3.284410451529816e-06, "loss": 0.6099, "step": 245 }, { "epoch": 0.7443267776096822, "grad_norm": 1.7270983246178484, "learning_rate": 3.2119925446705824e-06, "loss": 0.6192, "step": 246 }, { "epoch": 0.7473524962178517, "grad_norm": 1.7719385527125833, "learning_rate": 3.140228880098074e-06, "loss": 0.619, "step": 247 }, { "epoch": 0.7503782148260212, "grad_norm": 1.8025142996003436, "learning_rate": 3.069126374543643e-06, "loss": 0.6098, "step": 248 }, { "epoch": 0.7534039334341907, "grad_norm": 1.6955751524948601, "learning_rate": 2.998691881014765e-06, "loss": 0.5871, "step": 249 }, { "epoch": 0.7534039334341907, "eval_loss": 0.6061348915100098, "eval_runtime": 98.7005, "eval_samples_per_second": 42.857, "eval_steps_per_second": 0.679, "step": 249 }, { "epoch": 0.75642965204236, "grad_norm": 1.7209631334708135, "learning_rate": 2.9289321881345257e-06, "loss": 0.6151, "step": 250 }, { "epoch": 0.7594553706505295, "grad_norm": 1.6344767691678022, "learning_rate": 2.859854019487318e-06, "loss": 0.6141, "step": 251 }, { "epoch": 0.762481089258699, "grad_norm": 1.869803493501466, "learning_rate": 2.791464032970812e-06, "loss": 0.6133, "step": 252 }, { "epoch": 0.7655068078668684, "grad_norm": 1.7100751850539497, "learning_rate": 2.723768820154251e-06, "loss": 0.6049, "step": 253 }, { "epoch": 0.7685325264750378, "grad_norm": 1.7234424171319058, "learning_rate": 2.656774905643147e-06, "loss": 0.599, "step": 254 }, { "epoch": 0.7715582450832073, "grad_norm": 1.6878132662269478, "learning_rate": 2.5904887464504115e-06, "loss": 0.6117, "step": 255 }, { "epoch": 0.7745839636913767, "grad_norm": 1.7093233245692183, "learning_rate": 2.5249167313740307e-06, "loss": 0.6198, "step": 256 }, { "epoch": 0.7776096822995462, "grad_norm": 1.912398596023919, "learning_rate": 2.4600651803813057e-06, "loss": 0.6191, "step": 257 }, { "epoch": 0.7806354009077155, "grad_norm": 1.789468500561203, "learning_rate": 2.395940343999691e-06, "loss": 0.6267, "step": 258 }, { "epoch": 0.783661119515885, "grad_norm": 2.075497889597922, "learning_rate": 2.332548402714385e-06, "loss": 0.6137, "step": 259 }, { "epoch": 0.7866868381240545, "grad_norm": 1.7618569590652433, "learning_rate": 2.26989546637263e-06, "loss": 0.6299, "step": 260 }, { "epoch": 0.789712556732224, "grad_norm": 2.2622555895319194, "learning_rate": 2.207987573594833e-06, "loss": 0.6329, "step": 261 }, { "epoch": 0.7927382753403933, "grad_norm": 1.7894533644126578, "learning_rate": 2.146830691192553e-06, "loss": 0.5947, "step": 262 }, { "epoch": 0.7957639939485628, "grad_norm": 2.6196101274062054, "learning_rate": 2.086430713593397e-06, "loss": 0.6329, "step": 263 }, { "epoch": 0.7987897125567323, "grad_norm": 1.9123478372822518, "learning_rate": 2.02679346227293e-06, "loss": 0.6241, "step": 264 }, { "epoch": 0.8018154311649016, "grad_norm": 1.7263173456003025, "learning_rate": 1.967924685193552e-06, "loss": 0.6017, "step": 265 }, { "epoch": 0.8048411497730711, "grad_norm": 1.8467856227350854, "learning_rate": 1.9098300562505266e-06, "loss": 0.647, "step": 266 }, { "epoch": 0.8078668683812406, "grad_norm": 1.6630727621724897, "learning_rate": 1.8525151747251058e-06, "loss": 0.5739, "step": 267 }, { "epoch": 0.81089258698941, "grad_norm": 1.7285904006134964, "learning_rate": 1.7959855647448642e-06, "loss": 0.6135, "step": 268 }, { "epoch": 0.8139183055975794, "grad_norm": 1.7269298338392545, "learning_rate": 1.7402466747512704e-06, "loss": 0.6082, "step": 269 }, { "epoch": 0.8169440242057489, "grad_norm": 1.7111586205844402, "learning_rate": 1.6853038769745466e-06, "loss": 0.6155, "step": 270 }, { "epoch": 0.8199697428139183, "grad_norm": 1.674017529420373, "learning_rate": 1.6311624669159064e-06, "loss": 0.598, "step": 271 }, { "epoch": 0.8229954614220878, "grad_norm": 1.7805315596940765, "learning_rate": 1.577827662837136e-06, "loss": 0.59, "step": 272 }, { "epoch": 0.8260211800302572, "grad_norm": 1.7422765737032646, "learning_rate": 1.5253046052576559e-06, "loss": 0.625, "step": 273 }, { "epoch": 0.8290468986384266, "grad_norm": 1.7242936339101036, "learning_rate": 1.4735983564590784e-06, "loss": 0.5911, "step": 274 }, { "epoch": 0.8320726172465961, "grad_norm": 1.7272091925820903, "learning_rate": 1.4227138999972801e-06, "loss": 0.6151, "step": 275 }, { "epoch": 0.8350983358547656, "grad_norm": 1.7250197363496207, "learning_rate": 1.3726561402220818e-06, "loss": 0.6215, "step": 276 }, { "epoch": 0.8381240544629349, "grad_norm": 1.6485340400230004, "learning_rate": 1.3234299018045615e-06, "loss": 0.5766, "step": 277 }, { "epoch": 0.8411497730711044, "grad_norm": 1.8678393181143942, "learning_rate": 1.2750399292720284e-06, "loss": 0.6286, "step": 278 }, { "epoch": 0.8441754916792739, "grad_norm": 1.6737731054502747, "learning_rate": 1.2274908865507595e-06, "loss": 0.588, "step": 279 }, { "epoch": 0.8472012102874432, "grad_norm": 1.7799288792849057, "learning_rate": 1.1807873565164507e-06, "loss": 0.5964, "step": 280 }, { "epoch": 0.8502269288956127, "grad_norm": 1.747150097774928, "learning_rate": 1.1349338405525368e-06, "loss": 0.6129, "step": 281 }, { "epoch": 0.8532526475037822, "grad_norm": 1.8392197300038893, "learning_rate": 1.0899347581163222e-06, "loss": 0.5917, "step": 282 }, { "epoch": 0.8562783661119516, "grad_norm": 2.033909314856892, "learning_rate": 1.045794446313031e-06, "loss": 0.6101, "step": 283 }, { "epoch": 0.859304084720121, "grad_norm": 1.8665347961123104, "learning_rate": 1.0025171594777872e-06, "loss": 0.6011, "step": 284 }, { "epoch": 0.8623298033282905, "grad_norm": 2.0264847769487515, "learning_rate": 9.601070687655667e-07, "loss": 0.6194, "step": 285 }, { "epoch": 0.8653555219364599, "grad_norm": 1.7878198612078795, "learning_rate": 9.185682617491865e-07, "loss": 0.6341, "step": 286 }, { "epoch": 0.8683812405446294, "grad_norm": 1.7213821794488278, "learning_rate": 8.779047420253239e-07, "loss": 0.6063, "step": 287 }, { "epoch": 0.8714069591527988, "grad_norm": 1.7529253914449372, "learning_rate": 8.381204288286415e-07, "loss": 0.6237, "step": 288 }, { "epoch": 0.8744326777609682, "grad_norm": 1.739234395593092, "learning_rate": 7.992191566540519e-07, "loss": 0.5866, "step": 289 }, { "epoch": 0.8774583963691377, "grad_norm": 1.7913343867359512, "learning_rate": 7.612046748871327e-07, "loss": 0.6245, "step": 290 }, { "epoch": 0.8804841149773072, "grad_norm": 1.7520857029251033, "learning_rate": 7.240806474427598e-07, "loss": 0.6088, "step": 291 }, { "epoch": 0.8835098335854765, "grad_norm": 1.8107427903884512, "learning_rate": 6.878506524119644e-07, "loss": 0.5917, "step": 292 }, { "epoch": 0.886535552193646, "grad_norm": 2.7889166090017823, "learning_rate": 6.525181817170756e-07, "loss": 0.6033, "step": 293 }, { "epoch": 0.8895612708018155, "grad_norm": 1.746936796122282, "learning_rate": 6.180866407751595e-07, "loss": 0.6504, "step": 294 }, { "epoch": 0.8925869894099848, "grad_norm": 1.7783458626371988, "learning_rate": 5.845593481697931e-07, "loss": 0.5897, "step": 295 }, { "epoch": 0.8956127080181543, "grad_norm": 1.7474879736728264, "learning_rate": 5.519395353312195e-07, "loss": 0.5996, "step": 296 }, { "epoch": 0.8986384266263238, "grad_norm": 1.738779253442242, "learning_rate": 5.20230346224897e-07, "loss": 0.6437, "step": 297 }, { "epoch": 0.9016641452344932, "grad_norm": 1.7492041972741343, "learning_rate": 4.894348370484648e-07, "loss": 0.6112, "step": 298 }, { "epoch": 0.9046898638426626, "grad_norm": 1.7353938238847695, "learning_rate": 4.5955597593719593e-07, "loss": 0.6131, "step": 299 }, { "epoch": 0.9077155824508321, "grad_norm": 1.6794144202567616, "learning_rate": 4.305966426779118e-07, "loss": 0.6217, "step": 300 }, { "epoch": 0.9107413010590015, "grad_norm": 1.6476681925670142, "learning_rate": 4.025596284314259e-07, "loss": 0.5749, "step": 301 }, { "epoch": 0.913767019667171, "grad_norm": 1.779793559081367, "learning_rate": 3.7544763546352834e-07, "loss": 0.5983, "step": 302 }, { "epoch": 0.9167927382753404, "grad_norm": 1.7898354786144914, "learning_rate": 3.492632768845261e-07, "loss": 0.6224, "step": 303 }, { "epoch": 0.9198184568835098, "grad_norm": 1.6850170985387, "learning_rate": 3.2400907639740243e-07, "loss": 0.5852, "step": 304 }, { "epoch": 0.9228441754916793, "grad_norm": 1.7003389450383504, "learning_rate": 2.996874680545603e-07, "loss": 0.604, "step": 305 }, { "epoch": 0.9258698940998488, "grad_norm": 1.8138684808267453, "learning_rate": 2.7630079602323447e-07, "loss": 0.606, "step": 306 }, { "epoch": 0.9288956127080181, "grad_norm": 1.631401355733875, "learning_rate": 2.5385131435955e-07, "loss": 0.5893, "step": 307 }, { "epoch": 0.9319213313161876, "grad_norm": 1.6639411975053835, "learning_rate": 2.3234118679127615e-07, "loss": 0.5991, "step": 308 }, { "epoch": 0.9349470499243571, "grad_norm": 1.8650796449881923, "learning_rate": 2.117724865092774e-07, "loss": 0.6116, "step": 309 }, { "epoch": 0.9379727685325264, "grad_norm": 1.8481908824589233, "learning_rate": 1.921471959676957e-07, "loss": 0.6319, "step": 310 }, { "epoch": 0.9409984871406959, "grad_norm": 1.8047500663011353, "learning_rate": 1.734672066928822e-07, "loss": 0.5994, "step": 311 }, { "epoch": 0.9440242057488654, "grad_norm": 1.8464143580018735, "learning_rate": 1.5573431910108404e-07, "loss": 0.5968, "step": 312 }, { "epoch": 0.9470499243570348, "grad_norm": 1.5943324787903639, "learning_rate": 1.3895024232491338e-07, "loss": 0.5795, "step": 313 }, { "epoch": 0.9500756429652042, "grad_norm": 2.1710266033161147, "learning_rate": 1.231165940486234e-07, "loss": 0.6345, "step": 314 }, { "epoch": 0.9531013615733737, "grad_norm": 1.716465292208872, "learning_rate": 1.0823490035218986e-07, "loss": 0.6059, "step": 315 }, { "epoch": 0.9561270801815431, "grad_norm": 1.7863152621038756, "learning_rate": 9.43065955642275e-08, "loss": 0.6023, "step": 316 }, { "epoch": 0.9591527987897126, "grad_norm": 1.8244440730769216, "learning_rate": 8.133302212373961e-08, "loss": 0.6157, "step": 317 }, { "epoch": 0.962178517397882, "grad_norm": 1.6881545392698598, "learning_rate": 6.931543045073708e-08, "loss": 0.5915, "step": 318 }, { "epoch": 0.9652042360060514, "grad_norm": 1.7445865304504102, "learning_rate": 5.8254978825718065e-08, "loss": 0.5973, "step": 319 }, { "epoch": 0.9682299546142209, "grad_norm": 1.7698582738520474, "learning_rate": 4.815273327803183e-08, "loss": 0.5974, "step": 320 }, { "epoch": 0.9712556732223904, "grad_norm": 1.6641052228223427, "learning_rate": 3.900966748312862e-08, "loss": 0.618, "step": 321 }, { "epoch": 0.9742813918305597, "grad_norm": 1.743456317223566, "learning_rate": 3.082666266872036e-08, "loss": 0.6038, "step": 322 }, { "epoch": 0.9773071104387292, "grad_norm": 1.709368453532382, "learning_rate": 2.3604507529843e-08, "loss": 0.6199, "step": 323 }, { "epoch": 0.9803328290468987, "grad_norm": 1.770311456826043, "learning_rate": 1.7343898152841765e-08, "loss": 0.593, "step": 324 }, { "epoch": 0.983358547655068, "grad_norm": 1.6607806329342598, "learning_rate": 1.2045437948275952e-08, "loss": 0.5876, "step": 325 }, { "epoch": 0.9863842662632375, "grad_norm": 1.6047392651075518, "learning_rate": 7.70963759277099e-09, "loss": 0.5873, "step": 326 }, { "epoch": 0.989409984871407, "grad_norm": 1.7363455551140259, "learning_rate": 4.336914979787832e-09, "loss": 0.625, "step": 327 }, { "epoch": 0.9924357034795764, "grad_norm": 1.7014506767679876, "learning_rate": 1.9275951793518154e-09, "loss": 0.611, "step": 328 }, { "epoch": 0.9954614220877458, "grad_norm": 1.7378760213357103, "learning_rate": 4.819104067199653e-10, "loss": 0.6026, "step": 329 }, { "epoch": 0.9984871406959153, "grad_norm": 1.6781131162092493, "learning_rate": 0.0, "loss": 0.6165, "step": 330 }, { "epoch": 0.9984871406959153, "step": 330, "total_flos": 552343531683840.0, "train_loss": 0.660008016499606, "train_runtime": 4533.1032, "train_samples_per_second": 9.332, "train_steps_per_second": 0.073 } ], "logging_steps": 1.0, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 83, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 552343531683840.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }