{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.95475113122172, "eval_steps": 25, "global_step": 3850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06464124111182935, "grad_norm": 0.6707131266593933, "learning_rate": 0.0001987561544441565, "loss": 1.2216, "step": 25 }, { "epoch": 0.06464124111182935, "eval_loss": 0.6498388051986694, "eval_runtime": 13.8809, "eval_samples_per_second": 13.976, "eval_steps_per_second": 1.801, "step": 25 }, { "epoch": 0.1292824822236587, "grad_norm": 1.3595010042190552, "learning_rate": 0.0001974604819901529, "loss": 0.547, "step": 50 }, { "epoch": 0.1292824822236587, "eval_loss": 0.6438911557197571, "eval_runtime": 13.8752, "eval_samples_per_second": 13.982, "eval_steps_per_second": 1.802, "step": 50 }, { "epoch": 0.19392372333548805, "grad_norm": 0.6108447909355164, "learning_rate": 0.00019616480953614927, "loss": 0.6683, "step": 75 }, { "epoch": 0.19392372333548805, "eval_loss": 0.5715335011482239, "eval_runtime": 13.8894, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 75 }, { "epoch": 0.2585649644473174, "grad_norm": 0.5456333160400391, "learning_rate": 0.00019486913708214565, "loss": 0.4938, "step": 100 }, { "epoch": 0.2585649644473174, "eval_loss": 0.5783222913742065, "eval_runtime": 13.8829, "eval_samples_per_second": 13.974, "eval_steps_per_second": 1.801, "step": 100 }, { "epoch": 0.32320620555914675, "grad_norm": 0.49040475487709045, "learning_rate": 0.000193573464628142, "loss": 0.6211, "step": 125 }, { "epoch": 0.32320620555914675, "eval_loss": 0.5169771909713745, "eval_runtime": 13.8894, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 125 }, { "epoch": 0.3878474466709761, "grad_norm": 0.5957172513008118, "learning_rate": 0.00019227779217413838, "loss": 0.4272, "step": 150 }, { "epoch": 0.3878474466709761, "eval_loss": 0.5196932554244995, "eval_runtime": 13.8809, "eval_samples_per_second": 13.976, "eval_steps_per_second": 1.801, "step": 150 }, { "epoch": 0.45248868778280543, "grad_norm": 0.5516940951347351, "learning_rate": 0.00019098211972013476, "loss": 0.5702, "step": 175 }, { "epoch": 0.45248868778280543, "eval_loss": 0.47964370250701904, "eval_runtime": 13.8882, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.8, "step": 175 }, { "epoch": 0.5171299288946348, "grad_norm": 0.6955189108848572, "learning_rate": 0.00018968644726613114, "loss": 0.3935, "step": 200 }, { "epoch": 0.5171299288946348, "eval_loss": 0.4756932258605957, "eval_runtime": 13.8869, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 200 }, { "epoch": 0.5817711700064642, "grad_norm": 0.6203818321228027, "learning_rate": 0.0001883907748121275, "loss": 0.5132, "step": 225 }, { "epoch": 0.5817711700064642, "eval_loss": 0.45499277114868164, "eval_runtime": 13.8818, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.801, "step": 225 }, { "epoch": 0.6464124111182935, "grad_norm": 0.5430071353912354, "learning_rate": 0.00018709510235812387, "loss": 0.3806, "step": 250 }, { "epoch": 0.6464124111182935, "eval_loss": 0.4564308226108551, "eval_runtime": 13.8864, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 250 }, { "epoch": 0.7110536522301228, "grad_norm": 0.47872328758239746, "learning_rate": 0.00018579942990412025, "loss": 0.5195, "step": 275 }, { "epoch": 0.7110536522301228, "eval_loss": 0.42872217297554016, "eval_runtime": 13.8797, "eval_samples_per_second": 13.977, "eval_steps_per_second": 1.801, "step": 275 }, { "epoch": 0.7756948933419522, "grad_norm": 0.43633943796157837, "learning_rate": 0.0001845037574501166, "loss": 0.337, "step": 300 }, { "epoch": 0.7756948933419522, "eval_loss": 0.44021356105804443, "eval_runtime": 13.8816, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.801, "step": 300 }, { "epoch": 0.8403361344537815, "grad_norm": 0.4739358127117157, "learning_rate": 0.00018320808499611298, "loss": 0.4566, "step": 325 }, { "epoch": 0.8403361344537815, "eval_loss": 0.41129282116889954, "eval_runtime": 13.8846, "eval_samples_per_second": 13.972, "eval_steps_per_second": 1.801, "step": 325 }, { "epoch": 0.9049773755656109, "grad_norm": 0.5350446105003357, "learning_rate": 0.00018191241254210936, "loss": 0.3097, "step": 350 }, { "epoch": 0.9049773755656109, "eval_loss": 0.4308871626853943, "eval_runtime": 13.884, "eval_samples_per_second": 13.973, "eval_steps_per_second": 1.801, "step": 350 }, { "epoch": 0.9696186166774402, "grad_norm": 0.5064755082130432, "learning_rate": 0.00018061674008810574, "loss": 0.4391, "step": 375 }, { "epoch": 0.9696186166774402, "eval_loss": 0.3927255868911743, "eval_runtime": 13.8854, "eval_samples_per_second": 13.972, "eval_steps_per_second": 1.8, "step": 375 }, { "epoch": 1.0342598577892697, "grad_norm": 0.594939649105072, "learning_rate": 0.0001793210676341021, "loss": 0.413, "step": 400 }, { "epoch": 1.0342598577892697, "eval_loss": 0.38139256834983826, "eval_runtime": 13.8858, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 400 }, { "epoch": 1.098901098901099, "grad_norm": 0.5481505393981934, "learning_rate": 0.00017802539518009847, "loss": 0.2737, "step": 425 }, { "epoch": 1.098901098901099, "eval_loss": 0.395649790763855, "eval_runtime": 13.8927, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 425 }, { "epoch": 1.1635423400129283, "grad_norm": 0.5053268074989319, "learning_rate": 0.00017672972272609485, "loss": 0.3219, "step": 450 }, { "epoch": 1.1635423400129283, "eval_loss": 0.37703385949134827, "eval_runtime": 13.8769, "eval_samples_per_second": 13.98, "eval_steps_per_second": 1.802, "step": 450 }, { "epoch": 1.2281835811247577, "grad_norm": 0.5772731304168701, "learning_rate": 0.00017543405027209123, "loss": 0.2942, "step": 475 }, { "epoch": 1.2281835811247577, "eval_loss": 0.37387123703956604, "eval_runtime": 13.8976, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.799, "step": 475 }, { "epoch": 1.292824822236587, "grad_norm": 0.5018692016601562, "learning_rate": 0.00017413837781808759, "loss": 0.3059, "step": 500 }, { "epoch": 1.292824822236587, "eval_loss": 0.3657205402851105, "eval_runtime": 13.8767, "eval_samples_per_second": 13.98, "eval_steps_per_second": 1.802, "step": 500 }, { "epoch": 1.3574660633484164, "grad_norm": 0.5072377324104309, "learning_rate": 0.000172842705364084, "loss": 0.3096, "step": 525 }, { "epoch": 1.3574660633484164, "eval_loss": 0.3578204810619354, "eval_runtime": 13.8863, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 525 }, { "epoch": 1.4221073044602457, "grad_norm": 0.849237322807312, "learning_rate": 0.00017154703291008034, "loss": 0.3177, "step": 550 }, { "epoch": 1.4221073044602457, "eval_loss": 0.3545144498348236, "eval_runtime": 13.8893, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 550 }, { "epoch": 1.486748545572075, "grad_norm": 0.5274140238761902, "learning_rate": 0.0001702513604560767, "loss": 0.2612, "step": 575 }, { "epoch": 1.486748545572075, "eval_loss": 0.35140955448150635, "eval_runtime": 13.8855, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 575 }, { "epoch": 1.5513897866839044, "grad_norm": 0.5522105097770691, "learning_rate": 0.00016895568800207308, "loss": 0.3167, "step": 600 }, { "epoch": 1.5513897866839044, "eval_loss": 0.3356325924396515, "eval_runtime": 13.8908, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 600 }, { "epoch": 1.6160310277957337, "grad_norm": 0.553512454032898, "learning_rate": 0.00016766001554806945, "loss": 0.2564, "step": 625 }, { "epoch": 1.6160310277957337, "eval_loss": 0.34089842438697815, "eval_runtime": 13.8734, "eval_samples_per_second": 13.984, "eval_steps_per_second": 1.802, "step": 625 }, { "epoch": 1.680672268907563, "grad_norm": 0.624850869178772, "learning_rate": 0.00016636434309406583, "loss": 0.2806, "step": 650 }, { "epoch": 1.680672268907563, "eval_loss": 0.33262547850608826, "eval_runtime": 13.8806, "eval_samples_per_second": 13.976, "eval_steps_per_second": 1.801, "step": 650 }, { "epoch": 1.7453135100193924, "grad_norm": 0.5675662159919739, "learning_rate": 0.0001650686706400622, "loss": 0.2681, "step": 675 }, { "epoch": 1.7453135100193924, "eval_loss": 0.33418890833854675, "eval_runtime": 13.8914, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 675 }, { "epoch": 1.8099547511312217, "grad_norm": 0.4701014757156372, "learning_rate": 0.00016377299818605857, "loss": 0.2764, "step": 700 }, { "epoch": 1.8099547511312217, "eval_loss": 0.3225066661834717, "eval_runtime": 13.891, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 700 }, { "epoch": 1.874595992243051, "grad_norm": 0.5774489641189575, "learning_rate": 0.00016247732573205495, "loss": 0.2597, "step": 725 }, { "epoch": 1.874595992243051, "eval_loss": 0.31712260842323303, "eval_runtime": 13.8906, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 725 }, { "epoch": 1.9392372333548804, "grad_norm": 0.5142238736152649, "learning_rate": 0.00016118165327805132, "loss": 0.2555, "step": 750 }, { "epoch": 1.9392372333548804, "eval_loss": 0.3142366409301758, "eval_runtime": 13.8911, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 750 }, { "epoch": 2.0038784744667097, "grad_norm": 0.49566784501075745, "learning_rate": 0.00015988598082404768, "loss": 0.235, "step": 775 }, { "epoch": 2.0038784744667097, "eval_loss": 0.3169262409210205, "eval_runtime": 13.8798, "eval_samples_per_second": 13.977, "eval_steps_per_second": 1.801, "step": 775 }, { "epoch": 2.0685197155785393, "grad_norm": 0.5878920555114746, "learning_rate": 0.00015859030837004406, "loss": 0.1971, "step": 800 }, { "epoch": 2.0685197155785393, "eval_loss": 0.32635125517845154, "eval_runtime": 13.8909, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 800 }, { "epoch": 2.1331609566903684, "grad_norm": 0.624754786491394, "learning_rate": 0.00015729463591604044, "loss": 0.1677, "step": 825 }, { "epoch": 2.1331609566903684, "eval_loss": 0.3346332609653473, "eval_runtime": 13.8969, "eval_samples_per_second": 13.96, "eval_steps_per_second": 1.799, "step": 825 }, { "epoch": 2.197802197802198, "grad_norm": 0.6018180847167969, "learning_rate": 0.0001559989634620368, "loss": 0.2027, "step": 850 }, { "epoch": 2.197802197802198, "eval_loss": 0.32181909680366516, "eval_runtime": 13.8871, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 850 }, { "epoch": 2.262443438914027, "grad_norm": 0.9019644856452942, "learning_rate": 0.00015470329100803317, "loss": 0.1793, "step": 875 }, { "epoch": 2.262443438914027, "eval_loss": 0.32273176312446594, "eval_runtime": 13.8886, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 875 }, { "epoch": 2.3270846800258567, "grad_norm": 0.7112553715705872, "learning_rate": 0.00015340761855402955, "loss": 0.1971, "step": 900 }, { "epoch": 2.3270846800258567, "eval_loss": 0.328635036945343, "eval_runtime": 13.8925, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 900 }, { "epoch": 2.391725921137686, "grad_norm": 0.9075572490692139, "learning_rate": 0.00015211194610002593, "loss": 0.199, "step": 925 }, { "epoch": 2.391725921137686, "eval_loss": 0.31903964281082153, "eval_runtime": 13.8926, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 925 }, { "epoch": 2.4563671622495153, "grad_norm": 0.3509347140789032, "learning_rate": 0.00015081627364602228, "loss": 0.2201, "step": 950 }, { "epoch": 2.4563671622495153, "eval_loss": 0.31318098306655884, "eval_runtime": 13.8863, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 950 }, { "epoch": 2.5210084033613445, "grad_norm": 0.6588907837867737, "learning_rate": 0.00014952060119201869, "loss": 0.1717, "step": 975 }, { "epoch": 2.5210084033613445, "eval_loss": 0.3208658695220947, "eval_runtime": 13.8946, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 975 }, { "epoch": 2.585649644473174, "grad_norm": 0.30743467807769775, "learning_rate": 0.00014822492873801504, "loss": 0.2173, "step": 1000 }, { "epoch": 2.585649644473174, "eval_loss": 0.3078968822956085, "eval_runtime": 13.881, "eval_samples_per_second": 13.976, "eval_steps_per_second": 1.801, "step": 1000 }, { "epoch": 2.650290885585003, "grad_norm": 0.9775978922843933, "learning_rate": 0.00014692925628401142, "loss": 0.1641, "step": 1025 }, { "epoch": 2.650290885585003, "eval_loss": 0.3325625956058502, "eval_runtime": 13.8923, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 1025 }, { "epoch": 2.7149321266968327, "grad_norm": 0.410194456577301, "learning_rate": 0.00014563358383000777, "loss": 0.2048, "step": 1050 }, { "epoch": 2.7149321266968327, "eval_loss": 0.3071564733982086, "eval_runtime": 13.8821, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.801, "step": 1050 }, { "epoch": 2.779573367808662, "grad_norm": 0.7554541826248169, "learning_rate": 0.00014433791137600415, "loss": 0.1721, "step": 1075 }, { "epoch": 2.779573367808662, "eval_loss": 0.31251639127731323, "eval_runtime": 13.8879, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.8, "step": 1075 }, { "epoch": 2.8442146089204914, "grad_norm": 0.6487146019935608, "learning_rate": 0.00014304223892200053, "loss": 0.2094, "step": 1100 }, { "epoch": 2.8442146089204914, "eval_loss": 0.30554458498954773, "eval_runtime": 13.9003, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 1100 }, { "epoch": 2.9088558500323205, "grad_norm": 1.1581164598464966, "learning_rate": 0.00014174656646799688, "loss": 0.1646, "step": 1125 }, { "epoch": 2.9088558500323205, "eval_loss": 0.3173879086971283, "eval_runtime": 13.8916, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 1125 }, { "epoch": 2.97349709114415, "grad_norm": 0.627264142036438, "learning_rate": 0.00014045089401399326, "loss": 0.1842, "step": 1150 }, { "epoch": 2.97349709114415, "eval_loss": 0.304733008146286, "eval_runtime": 13.8953, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 1150 }, { "epoch": 3.038138332255979, "grad_norm": 0.5179964900016785, "learning_rate": 0.00013915522155998964, "loss": 0.1582, "step": 1175 }, { "epoch": 3.038138332255979, "eval_loss": 0.3209947943687439, "eval_runtime": 13.8893, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 1175 }, { "epoch": 3.1027795733678087, "grad_norm": 0.4774376153945923, "learning_rate": 0.00013785954910598602, "loss": 0.1306, "step": 1200 }, { "epoch": 3.1027795733678087, "eval_loss": 0.3194412589073181, "eval_runtime": 13.8863, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 1200 }, { "epoch": 3.167420814479638, "grad_norm": 0.5093793869018555, "learning_rate": 0.00013656387665198237, "loss": 0.1294, "step": 1225 }, { "epoch": 3.167420814479638, "eval_loss": 0.3293355107307434, "eval_runtime": 13.8881, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.8, "step": 1225 }, { "epoch": 3.2320620555914674, "grad_norm": 0.2597694993019104, "learning_rate": 0.00013526820419797878, "loss": 0.1283, "step": 1250 }, { "epoch": 3.2320620555914674, "eval_loss": 0.3166593313217163, "eval_runtime": 13.8971, "eval_samples_per_second": 13.96, "eval_steps_per_second": 1.799, "step": 1250 }, { "epoch": 3.2967032967032965, "grad_norm": 0.23225939273834229, "learning_rate": 0.00013397253174397513, "loss": 0.1464, "step": 1275 }, { "epoch": 3.2967032967032965, "eval_loss": 0.3203945457935333, "eval_runtime": 13.89, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 1275 }, { "epoch": 3.361344537815126, "grad_norm": 0.332242488861084, "learning_rate": 0.0001326768592899715, "loss": 0.1398, "step": 1300 }, { "epoch": 3.361344537815126, "eval_loss": 0.31326356530189514, "eval_runtime": 13.8775, "eval_samples_per_second": 13.979, "eval_steps_per_second": 1.801, "step": 1300 }, { "epoch": 3.425985778926955, "grad_norm": 0.5355468988418579, "learning_rate": 0.00013138118683596786, "loss": 0.142, "step": 1325 }, { "epoch": 3.425985778926955, "eval_loss": 0.3070937395095825, "eval_runtime": 13.8888, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 1325 }, { "epoch": 3.490627020038785, "grad_norm": 0.6375517249107361, "learning_rate": 0.00013008551438196424, "loss": 0.1396, "step": 1350 }, { "epoch": 3.490627020038785, "eval_loss": 0.3138720393180847, "eval_runtime": 13.8924, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 1350 }, { "epoch": 3.555268261150614, "grad_norm": 0.3490104377269745, "learning_rate": 0.00012878984192796062, "loss": 0.1476, "step": 1375 }, { "epoch": 3.555268261150614, "eval_loss": 0.30152711272239685, "eval_runtime": 13.894, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 1375 }, { "epoch": 3.6199095022624435, "grad_norm": 0.3556181490421295, "learning_rate": 0.00012749416947395697, "loss": 0.1396, "step": 1400 }, { "epoch": 3.6199095022624435, "eval_loss": 0.31680139899253845, "eval_runtime": 13.8887, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 1400 }, { "epoch": 3.684550743374273, "grad_norm": 0.2230699509382248, "learning_rate": 0.00012619849701995338, "loss": 0.1363, "step": 1425 }, { "epoch": 3.684550743374273, "eval_loss": 0.3081817328929901, "eval_runtime": 13.8817, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.801, "step": 1425 }, { "epoch": 3.749191984486102, "grad_norm": 0.3208047151565552, "learning_rate": 0.00012490282456594973, "loss": 0.144, "step": 1450 }, { "epoch": 3.749191984486102, "eval_loss": 0.31721794605255127, "eval_runtime": 13.8879, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.8, "step": 1450 }, { "epoch": 3.8138332255979313, "grad_norm": 0.32816827297210693, "learning_rate": 0.0001236071521119461, "loss": 0.1645, "step": 1475 }, { "epoch": 3.8138332255979313, "eval_loss": 0.29447051882743835, "eval_runtime": 13.8874, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 1475 }, { "epoch": 3.878474466709761, "grad_norm": 0.4234698712825775, "learning_rate": 0.00012231147965794246, "loss": 0.1345, "step": 1500 }, { "epoch": 3.878474466709761, "eval_loss": 0.32389405369758606, "eval_runtime": 13.8841, "eval_samples_per_second": 13.973, "eval_steps_per_second": 1.801, "step": 1500 }, { "epoch": 3.9431157078215904, "grad_norm": 0.5032609105110168, "learning_rate": 0.00012101580720393886, "loss": 0.1436, "step": 1525 }, { "epoch": 3.9431157078215904, "eval_loss": 0.3067689538002014, "eval_runtime": 13.8918, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 1525 }, { "epoch": 4.0077569489334195, "grad_norm": 0.4656796455383301, "learning_rate": 0.00011972013474993522, "loss": 0.1322, "step": 1550 }, { "epoch": 4.0077569489334195, "eval_loss": 0.3148553967475891, "eval_runtime": 13.8941, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 1550 }, { "epoch": 4.072398190045249, "grad_norm": 0.312979519367218, "learning_rate": 0.0001184244622959316, "loss": 0.1147, "step": 1575 }, { "epoch": 4.072398190045249, "eval_loss": 0.3317333161830902, "eval_runtime": 13.8904, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 1575 }, { "epoch": 4.137039431157079, "grad_norm": 0.7652421593666077, "learning_rate": 0.00011712878984192797, "loss": 0.1092, "step": 1600 }, { "epoch": 4.137039431157079, "eval_loss": 0.32812875509262085, "eval_runtime": 13.8839, "eval_samples_per_second": 13.973, "eval_steps_per_second": 1.801, "step": 1600 }, { "epoch": 4.201680672268908, "grad_norm": 0.4065840244293213, "learning_rate": 0.00011583311738792433, "loss": 0.1088, "step": 1625 }, { "epoch": 4.201680672268908, "eval_loss": 0.32381245493888855, "eval_runtime": 13.8912, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 1625 }, { "epoch": 4.266321913380737, "grad_norm": 0.7044116854667664, "learning_rate": 0.00011453744493392071, "loss": 0.1145, "step": 1650 }, { "epoch": 4.266321913380737, "eval_loss": 0.33820751309394836, "eval_runtime": 13.8893, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 1650 }, { "epoch": 4.330963154492566, "grad_norm": 0.4830712676048279, "learning_rate": 0.00011324177247991708, "loss": 0.1134, "step": 1675 }, { "epoch": 4.330963154492566, "eval_loss": 0.32449984550476074, "eval_runtime": 13.8869, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 1675 }, { "epoch": 4.395604395604396, "grad_norm": 0.5964047908782959, "learning_rate": 0.00011194610002591346, "loss": 0.1202, "step": 1700 }, { "epoch": 4.395604395604396, "eval_loss": 0.34136590361595154, "eval_runtime": 13.8979, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.799, "step": 1700 }, { "epoch": 4.460245636716225, "grad_norm": 0.3279203176498413, "learning_rate": 0.00011065042757190982, "loss": 0.1149, "step": 1725 }, { "epoch": 4.460245636716225, "eval_loss": 0.3160356283187866, "eval_runtime": 13.8933, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.799, "step": 1725 }, { "epoch": 4.524886877828054, "grad_norm": 0.7072445154190063, "learning_rate": 0.0001093547551179062, "loss": 0.1185, "step": 1750 }, { "epoch": 4.524886877828054, "eval_loss": 0.339296817779541, "eval_runtime": 13.8937, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 1750 }, { "epoch": 4.589528118939883, "grad_norm": 0.2717013359069824, "learning_rate": 0.00010805908266390257, "loss": 0.1192, "step": 1775 }, { "epoch": 4.589528118939883, "eval_loss": 0.325735867023468, "eval_runtime": 13.8932, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.799, "step": 1775 }, { "epoch": 4.654169360051713, "grad_norm": 0.6134977340698242, "learning_rate": 0.00010676341020989895, "loss": 0.1157, "step": 1800 }, { "epoch": 4.654169360051713, "eval_loss": 0.33217981457710266, "eval_runtime": 13.888, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.8, "step": 1800 }, { "epoch": 4.7188106011635425, "grad_norm": 0.37782660126686096, "learning_rate": 0.00010546773775589531, "loss": 0.1111, "step": 1825 }, { "epoch": 4.7188106011635425, "eval_loss": 0.324066698551178, "eval_runtime": 13.9001, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 1825 }, { "epoch": 4.783451842275372, "grad_norm": 0.5282606482505798, "learning_rate": 0.00010417206530189169, "loss": 0.1184, "step": 1850 }, { "epoch": 4.783451842275372, "eval_loss": 0.3346935510635376, "eval_runtime": 13.8986, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 1850 }, { "epoch": 4.848093083387201, "grad_norm": 0.29640528559684753, "learning_rate": 0.00010287639284788806, "loss": 0.1097, "step": 1875 }, { "epoch": 4.848093083387201, "eval_loss": 0.33659717440605164, "eval_runtime": 13.8932, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.799, "step": 1875 }, { "epoch": 4.912734324499031, "grad_norm": 0.6442544460296631, "learning_rate": 0.00010158072039388442, "loss": 0.123, "step": 1900 }, { "epoch": 4.912734324499031, "eval_loss": 0.3181850016117096, "eval_runtime": 13.89, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 1900 }, { "epoch": 4.97737556561086, "grad_norm": 0.3405434489250183, "learning_rate": 0.0001002850479398808, "loss": 0.1198, "step": 1925 }, { "epoch": 4.97737556561086, "eval_loss": 0.3185713291168213, "eval_runtime": 13.9051, "eval_samples_per_second": 13.952, "eval_steps_per_second": 1.798, "step": 1925 }, { "epoch": 5.042016806722689, "grad_norm": 0.2473362535238266, "learning_rate": 0.00009898937548587718, "loss": 0.1015, "step": 1950 }, { "epoch": 5.042016806722689, "eval_loss": 0.33879318833351135, "eval_runtime": 13.882, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.801, "step": 1950 }, { "epoch": 5.106658047834518, "grad_norm": 0.6215495467185974, "learning_rate": 0.00009769370303187355, "loss": 0.0995, "step": 1975 }, { "epoch": 5.106658047834518, "eval_loss": 0.35385414958000183, "eval_runtime": 13.895, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 1975 }, { "epoch": 5.171299288946348, "grad_norm": 0.2980802357196808, "learning_rate": 0.00009639803057786993, "loss": 0.1036, "step": 2000 }, { "epoch": 5.171299288946348, "eval_loss": 0.35115987062454224, "eval_runtime": 13.8914, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 2000 }, { "epoch": 5.235940530058177, "grad_norm": 0.32530656456947327, "learning_rate": 0.00009510235812386628, "loss": 0.1091, "step": 2025 }, { "epoch": 5.235940530058177, "eval_loss": 0.33915072679519653, "eval_runtime": 13.8895, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 2025 }, { "epoch": 5.300581771170006, "grad_norm": 0.28688952326774597, "learning_rate": 0.00009380668566986266, "loss": 0.1025, "step": 2050 }, { "epoch": 5.300581771170006, "eval_loss": 0.33742988109588623, "eval_runtime": 13.8962, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 2050 }, { "epoch": 5.365223012281835, "grad_norm": 0.7334425449371338, "learning_rate": 0.00009251101321585903, "loss": 0.1012, "step": 2075 }, { "epoch": 5.365223012281835, "eval_loss": 0.3349464535713196, "eval_runtime": 13.897, "eval_samples_per_second": 13.96, "eval_steps_per_second": 1.799, "step": 2075 }, { "epoch": 5.429864253393665, "grad_norm": 0.4262210726737976, "learning_rate": 0.0000912153407618554, "loss": 0.1001, "step": 2100 }, { "epoch": 5.429864253393665, "eval_loss": 0.3597383499145508, "eval_runtime": 13.8902, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 2100 }, { "epoch": 5.4945054945054945, "grad_norm": 0.32452741265296936, "learning_rate": 0.00008991966830785178, "loss": 0.1034, "step": 2125 }, { "epoch": 5.4945054945054945, "eval_loss": 0.3415992259979248, "eval_runtime": 13.9049, "eval_samples_per_second": 13.952, "eval_steps_per_second": 1.798, "step": 2125 }, { "epoch": 5.559146735617324, "grad_norm": 0.3775513768196106, "learning_rate": 0.00008862399585384815, "loss": 0.1016, "step": 2150 }, { "epoch": 5.559146735617324, "eval_loss": 0.33714666962623596, "eval_runtime": 13.894, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 2150 }, { "epoch": 5.623787976729153, "grad_norm": 0.2996246814727783, "learning_rate": 0.00008732832339984453, "loss": 0.1073, "step": 2175 }, { "epoch": 5.623787976729153, "eval_loss": 0.33110281825065613, "eval_runtime": 13.9004, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.799, "step": 2175 }, { "epoch": 5.688429217840983, "grad_norm": 0.2353794425725937, "learning_rate": 0.0000860326509458409, "loss": 0.1007, "step": 2200 }, { "epoch": 5.688429217840983, "eval_loss": 0.33147233724594116, "eval_runtime": 13.8983, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 2200 }, { "epoch": 5.753070458952812, "grad_norm": 0.284140408039093, "learning_rate": 0.00008473697849183728, "loss": 0.1054, "step": 2225 }, { "epoch": 5.753070458952812, "eval_loss": 0.3283693194389343, "eval_runtime": 13.8987, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 2225 }, { "epoch": 5.817711700064641, "grad_norm": 0.322956383228302, "learning_rate": 0.00008344130603783364, "loss": 0.1023, "step": 2250 }, { "epoch": 5.817711700064641, "eval_loss": 0.3444914221763611, "eval_runtime": 13.9073, "eval_samples_per_second": 13.95, "eval_steps_per_second": 1.798, "step": 2250 }, { "epoch": 5.882352941176471, "grad_norm": 0.34075090289115906, "learning_rate": 0.00008214563358383001, "loss": 0.1049, "step": 2275 }, { "epoch": 5.882352941176471, "eval_loss": 0.34979012608528137, "eval_runtime": 13.8952, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 2275 }, { "epoch": 5.9469941822883, "grad_norm": 0.27347272634506226, "learning_rate": 0.00008084996112982637, "loss": 0.0988, "step": 2300 }, { "epoch": 5.9469941822883, "eval_loss": 0.3358798325061798, "eval_runtime": 13.892, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 2300 }, { "epoch": 6.011635423400129, "grad_norm": 0.2915317714214325, "learning_rate": 0.00007955428867582275, "loss": 0.0991, "step": 2325 }, { "epoch": 6.011635423400129, "eval_loss": 0.34607139229774475, "eval_runtime": 13.903, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 2325 }, { "epoch": 6.076276664511958, "grad_norm": 0.2220218926668167, "learning_rate": 0.00007825861622181913, "loss": 0.0898, "step": 2350 }, { "epoch": 6.076276664511958, "eval_loss": 0.3418421149253845, "eval_runtime": 13.9007, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.798, "step": 2350 }, { "epoch": 6.140917905623788, "grad_norm": 0.27043217420578003, "learning_rate": 0.0000769629437678155, "loss": 0.0919, "step": 2375 }, { "epoch": 6.140917905623788, "eval_loss": 0.35563594102859497, "eval_runtime": 13.8899, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 2375 }, { "epoch": 6.2055591467356175, "grad_norm": 0.3899531960487366, "learning_rate": 0.00007566727131381188, "loss": 0.0872, "step": 2400 }, { "epoch": 6.2055591467356175, "eval_loss": 0.35867059230804443, "eval_runtime": 13.8991, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 2400 }, { "epoch": 6.270200387847447, "grad_norm": 0.48080724477767944, "learning_rate": 0.00007437159885980824, "loss": 0.0973, "step": 2425 }, { "epoch": 6.270200387847447, "eval_loss": 0.35043036937713623, "eval_runtime": 13.8939, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 2425 }, { "epoch": 6.334841628959276, "grad_norm": 0.42479485273361206, "learning_rate": 0.00007307592640580462, "loss": 0.0917, "step": 2450 }, { "epoch": 6.334841628959276, "eval_loss": 0.35338568687438965, "eval_runtime": 13.8934, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.799, "step": 2450 }, { "epoch": 6.399482870071106, "grad_norm": 0.4198790490627289, "learning_rate": 0.00007178025395180099, "loss": 0.0974, "step": 2475 }, { "epoch": 6.399482870071106, "eval_loss": 0.3527772128582001, "eval_runtime": 13.8929, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.799, "step": 2475 }, { "epoch": 6.464124111182935, "grad_norm": 0.3013072907924652, "learning_rate": 0.00007048458149779737, "loss": 0.0924, "step": 2500 }, { "epoch": 6.464124111182935, "eval_loss": 0.3606307804584503, "eval_runtime": 13.8994, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 2500 }, { "epoch": 6.528765352294764, "grad_norm": 1.44315505027771, "learning_rate": 0.00006918890904379373, "loss": 0.0958, "step": 2525 }, { "epoch": 6.528765352294764, "eval_loss": 0.3492783308029175, "eval_runtime": 13.8992, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 2525 }, { "epoch": 6.593406593406593, "grad_norm": 0.2337220013141632, "learning_rate": 0.0000678932365897901, "loss": 0.0935, "step": 2550 }, { "epoch": 6.593406593406593, "eval_loss": 0.3465332090854645, "eval_runtime": 13.8855, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.8, "step": 2550 }, { "epoch": 6.658047834518423, "grad_norm": 0.560575544834137, "learning_rate": 0.00006659756413578648, "loss": 0.0989, "step": 2575 }, { "epoch": 6.658047834518423, "eval_loss": 0.3463946580886841, "eval_runtime": 13.9011, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.798, "step": 2575 }, { "epoch": 6.722689075630252, "grad_norm": 0.23282793164253235, "learning_rate": 0.00006530189168178284, "loss": 0.0883, "step": 2600 }, { "epoch": 6.722689075630252, "eval_loss": 0.3596608340740204, "eval_runtime": 13.8986, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 2600 }, { "epoch": 6.787330316742081, "grad_norm": 0.29077738523483276, "learning_rate": 0.00006400621922777922, "loss": 0.0997, "step": 2625 }, { "epoch": 6.787330316742081, "eval_loss": 0.3466287851333618, "eval_runtime": 13.8997, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 2625 }, { "epoch": 6.85197155785391, "grad_norm": 0.28866180777549744, "learning_rate": 0.00006271054677377559, "loss": 0.0907, "step": 2650 }, { "epoch": 6.85197155785391, "eval_loss": 0.3554084897041321, "eval_runtime": 13.8946, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 2650 }, { "epoch": 6.91661279896574, "grad_norm": 0.2319469451904297, "learning_rate": 0.00006141487431977197, "loss": 0.099, "step": 2675 }, { "epoch": 6.91661279896574, "eval_loss": 0.3529384732246399, "eval_runtime": 13.8906, "eval_samples_per_second": 13.966, "eval_steps_per_second": 1.8, "step": 2675 }, { "epoch": 6.98125404007757, "grad_norm": 0.25588566064834595, "learning_rate": 0.000060119201865768335, "loss": 0.0957, "step": 2700 }, { "epoch": 6.98125404007757, "eval_loss": 0.34903913736343384, "eval_runtime": 13.9001, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 2700 }, { "epoch": 7.045895281189399, "grad_norm": 0.2661280930042267, "learning_rate": 0.00005882352941176471, "loss": 0.0857, "step": 2725 }, { "epoch": 7.045895281189399, "eval_loss": 0.3659530580043793, "eval_runtime": 13.8975, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.799, "step": 2725 }, { "epoch": 7.110536522301228, "grad_norm": 0.26808837056159973, "learning_rate": 0.00005752785695776109, "loss": 0.0914, "step": 2750 }, { "epoch": 7.110536522301228, "eval_loss": 0.37946397066116333, "eval_runtime": 13.8953, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 2750 }, { "epoch": 7.175177763413058, "grad_norm": 0.2569040358066559, "learning_rate": 0.00005623218450375746, "loss": 0.0822, "step": 2775 }, { "epoch": 7.175177763413058, "eval_loss": 0.3785270154476166, "eval_runtime": 13.8941, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 2775 }, { "epoch": 7.239819004524887, "grad_norm": 0.2069663107395172, "learning_rate": 0.00005493651204975383, "loss": 0.0933, "step": 2800 }, { "epoch": 7.239819004524887, "eval_loss": 0.36601343750953674, "eval_runtime": 13.9025, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 2800 }, { "epoch": 7.304460245636716, "grad_norm": 0.3386838138103485, "learning_rate": 0.00005364083959575019, "loss": 0.0848, "step": 2825 }, { "epoch": 7.304460245636716, "eval_loss": 0.3629213273525238, "eval_runtime": 13.9024, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 2825 }, { "epoch": 7.369101486748546, "grad_norm": 0.24702002108097076, "learning_rate": 0.000052345167141746564, "loss": 0.0941, "step": 2850 }, { "epoch": 7.369101486748546, "eval_loss": 0.36621519923210144, "eval_runtime": 13.9011, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.798, "step": 2850 }, { "epoch": 7.433742727860375, "grad_norm": 0.1761961281299591, "learning_rate": 0.000051049494687742936, "loss": 0.0847, "step": 2875 }, { "epoch": 7.433742727860375, "eval_loss": 0.3717571496963501, "eval_runtime": 13.9013, "eval_samples_per_second": 13.955, "eval_steps_per_second": 1.798, "step": 2875 }, { "epoch": 7.498383968972204, "grad_norm": 0.22073306143283844, "learning_rate": 0.00004975382223373931, "loss": 0.0899, "step": 2900 }, { "epoch": 7.498383968972204, "eval_loss": 0.3767780363559723, "eval_runtime": 13.8867, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.8, "step": 2900 }, { "epoch": 7.563025210084033, "grad_norm": 0.23177385330200195, "learning_rate": 0.00004845814977973568, "loss": 0.0857, "step": 2925 }, { "epoch": 7.563025210084033, "eval_loss": 0.3604573607444763, "eval_runtime": 13.9028, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 2925 }, { "epoch": 7.6276664511958625, "grad_norm": 0.2143615335226059, "learning_rate": 0.000047162477325732054, "loss": 0.0931, "step": 2950 }, { "epoch": 7.6276664511958625, "eval_loss": 0.36467450857162476, "eval_runtime": 13.8939, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 2950 }, { "epoch": 7.6923076923076925, "grad_norm": 0.23051580786705017, "learning_rate": 0.000045866804871728434, "loss": 0.0843, "step": 2975 }, { "epoch": 7.6923076923076925, "eval_loss": 0.37799063324928284, "eval_runtime": 13.8887, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.8, "step": 2975 }, { "epoch": 7.756948933419522, "grad_norm": 0.2964944541454315, "learning_rate": 0.000044571132417724806, "loss": 0.0953, "step": 3000 }, { "epoch": 7.756948933419522, "eval_loss": 0.37458544969558716, "eval_runtime": 13.8975, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.799, "step": 3000 }, { "epoch": 7.821590174531351, "grad_norm": 0.1789853572845459, "learning_rate": 0.00004327545996372117, "loss": 0.0846, "step": 3025 }, { "epoch": 7.821590174531351, "eval_loss": 0.36440709233283997, "eval_runtime": 13.8936, "eval_samples_per_second": 13.963, "eval_steps_per_second": 1.799, "step": 3025 }, { "epoch": 7.886231415643181, "grad_norm": 0.2910414934158325, "learning_rate": 0.000041979787509717545, "loss": 0.0945, "step": 3050 }, { "epoch": 7.886231415643181, "eval_loss": 0.363972932100296, "eval_runtime": 13.899, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 3050 }, { "epoch": 7.95087265675501, "grad_norm": 0.16566026210784912, "learning_rate": 0.00004068411505571392, "loss": 0.0856, "step": 3075 }, { "epoch": 7.95087265675501, "eval_loss": 0.369998961687088, "eval_runtime": 13.8899, "eval_samples_per_second": 13.967, "eval_steps_per_second": 1.8, "step": 3075 }, { "epoch": 8.015513897866839, "grad_norm": 0.20665214955806732, "learning_rate": 0.00003938844260171029, "loss": 0.0858, "step": 3100 }, { "epoch": 8.015513897866839, "eval_loss": 0.3743078410625458, "eval_runtime": 13.9039, "eval_samples_per_second": 13.953, "eval_steps_per_second": 1.798, "step": 3100 }, { "epoch": 8.080155138978668, "grad_norm": 0.22216439247131348, "learning_rate": 0.00003809277014770666, "loss": 0.0804, "step": 3125 }, { "epoch": 8.080155138978668, "eval_loss": 0.38528111577033997, "eval_runtime": 13.9066, "eval_samples_per_second": 13.95, "eval_steps_per_second": 1.798, "step": 3125 }, { "epoch": 8.144796380090497, "grad_norm": 0.2925605773925781, "learning_rate": 0.00003679709769370303, "loss": 0.0855, "step": 3150 }, { "epoch": 8.144796380090497, "eval_loss": 0.3879942297935486, "eval_runtime": 13.8926, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 3150 }, { "epoch": 8.209437621202326, "grad_norm": 0.1863240748643875, "learning_rate": 0.0000355014252396994, "loss": 0.0797, "step": 3175 }, { "epoch": 8.209437621202326, "eval_loss": 0.38318556547164917, "eval_runtime": 13.8966, "eval_samples_per_second": 13.96, "eval_steps_per_second": 1.799, "step": 3175 }, { "epoch": 8.274078862314157, "grad_norm": 0.227659672498703, "learning_rate": 0.00003420575278569578, "loss": 0.0873, "step": 3200 }, { "epoch": 8.274078862314157, "eval_loss": 0.3866645097732544, "eval_runtime": 13.8958, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 3200 }, { "epoch": 8.338720103425986, "grad_norm": 0.24623195827007294, "learning_rate": 0.00003291008033169215, "loss": 0.0805, "step": 3225 }, { "epoch": 8.338720103425986, "eval_loss": 0.388787180185318, "eval_runtime": 13.9, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 3225 }, { "epoch": 8.403361344537815, "grad_norm": 0.17468619346618652, "learning_rate": 0.000031614407877688526, "loss": 0.0845, "step": 3250 }, { "epoch": 8.403361344537815, "eval_loss": 0.39511677622795105, "eval_runtime": 13.8992, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 3250 }, { "epoch": 8.468002585649645, "grad_norm": 0.2383006066083908, "learning_rate": 0.000030318735423684895, "loss": 0.0807, "step": 3275 }, { "epoch": 8.468002585649645, "eval_loss": 0.39297351241111755, "eval_runtime": 13.9101, "eval_samples_per_second": 13.947, "eval_steps_per_second": 1.797, "step": 3275 }, { "epoch": 8.532643826761474, "grad_norm": 0.7691563367843628, "learning_rate": 0.000029023062969681264, "loss": 0.0868, "step": 3300 }, { "epoch": 8.532643826761474, "eval_loss": 0.3870464265346527, "eval_runtime": 13.9059, "eval_samples_per_second": 13.951, "eval_steps_per_second": 1.798, "step": 3300 }, { "epoch": 8.597285067873303, "grad_norm": 0.19113022089004517, "learning_rate": 0.000027727390515677637, "loss": 0.0816, "step": 3325 }, { "epoch": 8.597285067873303, "eval_loss": 0.3797835409641266, "eval_runtime": 13.9074, "eval_samples_per_second": 13.949, "eval_steps_per_second": 1.798, "step": 3325 }, { "epoch": 8.661926308985132, "grad_norm": 0.20881256461143494, "learning_rate": 0.00002643171806167401, "loss": 0.0868, "step": 3350 }, { "epoch": 8.661926308985132, "eval_loss": 0.3818240165710449, "eval_runtime": 13.901, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.798, "step": 3350 }, { "epoch": 8.726567550096961, "grad_norm": 0.19840125739574432, "learning_rate": 0.000025136045607670382, "loss": 0.0808, "step": 3375 }, { "epoch": 8.726567550096961, "eval_loss": 0.38280540704727173, "eval_runtime": 13.8951, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 3375 }, { "epoch": 8.791208791208792, "grad_norm": 0.2708618640899658, "learning_rate": 0.000023840373153666755, "loss": 0.0915, "step": 3400 }, { "epoch": 8.791208791208792, "eval_loss": 0.3764805495738983, "eval_runtime": 13.8956, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 3400 }, { "epoch": 8.855850032320621, "grad_norm": 0.23502439260482788, "learning_rate": 0.000022544700699663127, "loss": 0.0827, "step": 3425 }, { "epoch": 8.855850032320621, "eval_loss": 0.378770649433136, "eval_runtime": 13.9002, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 3425 }, { "epoch": 8.92049127343245, "grad_norm": 0.25332191586494446, "learning_rate": 0.0000212490282456595, "loss": 0.0874, "step": 3450 }, { "epoch": 8.92049127343245, "eval_loss": 0.3801264762878418, "eval_runtime": 13.907, "eval_samples_per_second": 13.95, "eval_steps_per_second": 1.798, "step": 3450 }, { "epoch": 8.98513251454428, "grad_norm": 0.25316667556762695, "learning_rate": 0.00001995335579165587, "loss": 0.0891, "step": 3475 }, { "epoch": 8.98513251454428, "eval_loss": 0.3802332878112793, "eval_runtime": 13.9011, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.798, "step": 3475 }, { "epoch": 9.049773755656108, "grad_norm": 0.21907874941825867, "learning_rate": 0.000018657683337652242, "loss": 0.075, "step": 3500 }, { "epoch": 9.049773755656108, "eval_loss": 0.3840136229991913, "eval_runtime": 13.8958, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 3500 }, { "epoch": 9.114414996767938, "grad_norm": 0.24233920872211456, "learning_rate": 0.000017362010883648614, "loss": 0.0847, "step": 3525 }, { "epoch": 9.114414996767938, "eval_loss": 0.39279288053512573, "eval_runtime": 13.8994, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.799, "step": 3525 }, { "epoch": 9.179056237879767, "grad_norm": 0.2361481636762619, "learning_rate": 0.000016066338429644987, "loss": 0.0756, "step": 3550 }, { "epoch": 9.179056237879767, "eval_loss": 0.3995870351791382, "eval_runtime": 13.8927, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.8, "step": 3550 }, { "epoch": 9.243697478991596, "grad_norm": 0.19486679136753082, "learning_rate": 0.00001477066597564136, "loss": 0.0852, "step": 3575 }, { "epoch": 9.243697478991596, "eval_loss": 0.40310657024383545, "eval_runtime": 13.8953, "eval_samples_per_second": 13.962, "eval_steps_per_second": 1.799, "step": 3575 }, { "epoch": 9.308338720103427, "grad_norm": 0.23106832802295685, "learning_rate": 0.00001347499352163773, "loss": 0.0746, "step": 3600 }, { "epoch": 9.308338720103427, "eval_loss": 0.4031190872192383, "eval_runtime": 13.8976, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.799, "step": 3600 }, { "epoch": 9.372979961215256, "grad_norm": 0.21963872015476227, "learning_rate": 0.000012179321067634103, "loss": 0.0844, "step": 3625 }, { "epoch": 9.372979961215256, "eval_loss": 0.4036274552345276, "eval_runtime": 13.9016, "eval_samples_per_second": 13.955, "eval_steps_per_second": 1.798, "step": 3625 }, { "epoch": 9.437621202327085, "grad_norm": 0.19979609549045563, "learning_rate": 0.000010883648613630474, "loss": 0.0763, "step": 3650 }, { "epoch": 9.437621202327085, "eval_loss": 0.4039681851863861, "eval_runtime": 13.8986, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.799, "step": 3650 }, { "epoch": 9.502262443438914, "grad_norm": 0.29937314987182617, "learning_rate": 0.000009587976159626847, "loss": 0.0856, "step": 3675 }, { "epoch": 9.502262443438914, "eval_loss": 0.40228113532066345, "eval_runtime": 13.9023, "eval_samples_per_second": 13.955, "eval_steps_per_second": 1.798, "step": 3675 }, { "epoch": 9.566903684550743, "grad_norm": 0.19431306421756744, "learning_rate": 0.00000829230370562322, "loss": 0.0759, "step": 3700 }, { "epoch": 9.566903684550743, "eval_loss": 0.39987537264823914, "eval_runtime": 13.9031, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 3700 }, { "epoch": 9.631544925662572, "grad_norm": 0.23780672252178192, "learning_rate": 0.000006996631251619591, "loss": 0.0855, "step": 3725 }, { "epoch": 9.631544925662572, "eval_loss": 0.39910781383514404, "eval_runtime": 13.8918, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.8, "step": 3725 }, { "epoch": 9.696186166774401, "grad_norm": 0.28458669781684875, "learning_rate": 0.000005700958797615963, "loss": 0.0749, "step": 3750 }, { "epoch": 9.696186166774401, "eval_loss": 0.3985865116119385, "eval_runtime": 13.9024, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.798, "step": 3750 }, { "epoch": 9.760827407886232, "grad_norm": 0.2190496027469635, "learning_rate": 0.0000044052863436123355, "loss": 0.0887, "step": 3775 }, { "epoch": 9.760827407886232, "eval_loss": 0.39947444200515747, "eval_runtime": 13.8956, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 3775 }, { "epoch": 9.825468648998061, "grad_norm": 0.2251676619052887, "learning_rate": 0.000003109613889608707, "loss": 0.0738, "step": 3800 }, { "epoch": 9.825468648998061, "eval_loss": 0.4002096951007843, "eval_runtime": 13.9037, "eval_samples_per_second": 13.953, "eval_steps_per_second": 1.798, "step": 3800 }, { "epoch": 9.89010989010989, "grad_norm": 0.29096800088882446, "learning_rate": 0.000001813941435605079, "loss": 0.0867, "step": 3825 }, { "epoch": 9.89010989010989, "eval_loss": 0.40017297863960266, "eval_runtime": 13.9019, "eval_samples_per_second": 13.955, "eval_steps_per_second": 1.798, "step": 3825 }, { "epoch": 9.95475113122172, "grad_norm": 0.2308928519487381, "learning_rate": 5.182689816014512e-7, "loss": 0.0796, "step": 3850 }, { "epoch": 9.95475113122172, "eval_loss": 0.4002385139465332, "eval_runtime": 13.8963, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.799, "step": 3850 } ], "logging_steps": 25, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "total_flos": 13719373990686720, "train_batch_size": 1, "trial_name": null, "trial_params": null }