{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.089327560869047, "global_step": 25354, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.92063492063492e-07, "loss": 10.9882, "theoretical_loss": 20.81281780154715, "tokens_seen": 65536 }, { "epoch": 0.0, "learning_rate": 1.984126984126984e-06, "loss": 10.9924, "theoretical_loss": 17.566201104328645, "tokens_seen": 131072 }, { "epoch": 0.0, "learning_rate": 2.9761904761904763e-06, "loss": 10.9305, "theoretical_loss": 15.939477092836569, "tokens_seen": 196608 }, { "epoch": 0.0, "learning_rate": 3.968253968253968e-06, "loss": 10.8158, "theoretical_loss": 14.89231675598857, "tokens_seen": 262144 }, { "epoch": 0.0, "learning_rate": 4.96031746031746e-06, "loss": 10.6288, "theoretical_loss": 14.136216937762974, "tokens_seen": 327680 }, { "epoch": 0.0, "learning_rate": 5.9523809523809525e-06, "loss": 10.4728, "theoretical_loss": 13.552561472550224, "tokens_seen": 393216 }, { "epoch": 0.0, "learning_rate": 6.944444444444444e-06, "loss": 10.2507, "theoretical_loss": 13.08180900140119, "tokens_seen": 458752 }, { "epoch": 0.0, "learning_rate": 7.936507936507936e-06, "loss": 10.0684, "theoretical_loss": 12.690129625483323, "tokens_seen": 524288 }, { "epoch": 0.0, "learning_rate": 8.928571428571428e-06, "loss": 9.9076, "theoretical_loss": 12.356592463873625, "tokens_seen": 589824 }, { "epoch": 0.0, "learning_rate": 9.92063492063492e-06, "loss": 9.8903, "theoretical_loss": 12.067412607035077, "tokens_seen": 655360 }, { "epoch": 0.0, "learning_rate": 1.0912698412698412e-05, "loss": 9.5893, "theoretical_loss": 11.813066231101676, "tokens_seen": 720896 }, { "epoch": 0.0, "learning_rate": 1.1904761904761905e-05, "loss": 9.5802, "theoretical_loss": 11.586719208706729, "tokens_seen": 786432 }, { "epoch": 0.0, "learning_rate": 1.2896825396825396e-05, "loss": 9.569, "theoretical_loss": 11.383314140186787, "tokens_seen": 851968 }, { "epoch": 0.0, "learning_rate": 1.3888888888888888e-05, "loss": 9.5755, "theoretical_loss": 11.199011702111871, "tokens_seen": 917504 }, { "epoch": 0.0, "learning_rate": 1.4880952380952381e-05, "loss": 9.2437, "theoretical_loss": 11.030833917977912, "tokens_seen": 983040 }, { "epoch": 0.0, "learning_rate": 1.5873015873015872e-05, "loss": 9.4752, "theoretical_loss": 10.87642808645695, "tokens_seen": 1048576 }, { "epoch": 0.0, "learning_rate": 1.6865079365079364e-05, "loss": 9.3222, "theoretical_loss": 10.733905740062724, "tokens_seen": 1114112 }, { "epoch": 0.0, "learning_rate": 1.7857142857142855e-05, "loss": 9.2029, "theoretical_loss": 10.60172987623028, "tokens_seen": 1179648 }, { "epoch": 0.0, "learning_rate": 1.884920634920635e-05, "loss": 9.3823, "theoretical_loss": 10.478634172356642, "tokens_seen": 1245184 }, { "epoch": 0.0, "learning_rate": 1.984126984126984e-05, "loss": 9.1709, "theoretical_loss": 10.36356394376333, "tokens_seen": 1310720 }, { "epoch": 0.0, "learning_rate": 2.0833333333333333e-05, "loss": 9.0787, "theoretical_loss": 10.255632220896747, "tokens_seen": 1376256 }, { "epoch": 0.0, "learning_rate": 2.1825396825396824e-05, "loss": 9.0634, "theoretical_loss": 10.15408655327002, "tokens_seen": 1441792 }, { "epoch": 0.0, "learning_rate": 2.2817460317460315e-05, "loss": 9.0218, "theoretical_loss": 10.058283561732598, "tokens_seen": 1507328 }, { "epoch": 0.0, "learning_rate": 2.380952380952381e-05, "loss": 9.0203, "theoretical_loss": 9.967669178840278, "tokens_seen": 1572864 }, { "epoch": 0.0, "objective/train/docs_used": 36240, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 8.640295028686523, "objective/train/theoretical_loss": 9.881763126393109, "objective/train/tokens_used": 22098400, "theoretical_loss": 9.881763126393109, "tokens_seen": 1638400 }, { "epoch": 0.0, "learning_rate": 2.48015873015873e-05, "loss": 9.1474, "theoretical_loss": 9.881763126393109, "tokens_seen": 1638400 }, { "epoch": 0.0, "learning_rate": 2.5793650793650793e-05, "loss": 9.0533, "theoretical_loss": 9.80014659154056, "tokens_seen": 1703936 }, { "epoch": 0.0, "learning_rate": 2.6785714285714284e-05, "loss": 8.826, "theoretical_loss": 9.722452346907446, "tokens_seen": 1769472 }, { "epoch": 0.0, "learning_rate": 2.7777777777777776e-05, "loss": 8.8503, "theoretical_loss": 9.648356759081546, "tokens_seen": 1835008 }, { "epoch": 0.0, "learning_rate": 2.876984126984127e-05, "loss": 8.8149, "theoretical_loss": 9.577573271145639, "tokens_seen": 1900544 }, { "epoch": 0.0, "learning_rate": 2.9761904761904762e-05, "loss": 8.8667, "theoretical_loss": 9.509847046764852, "tokens_seen": 1966080 }, { "epoch": 0.0, "learning_rate": 3.075396825396825e-05, "loss": 8.9725, "theoretical_loss": 9.444950537631936, "tokens_seen": 2031616 }, { "epoch": 0.0, "learning_rate": 3.1746031746031745e-05, "loss": 8.715, "theoretical_loss": 9.382679790910457, "tokens_seen": 2097152 }, { "epoch": 0.0, "learning_rate": 3.273809523809524e-05, "loss": 8.4883, "theoretical_loss": 9.32285135423398, "tokens_seen": 2162688 }, { "epoch": 0.0, "learning_rate": 3.373015873015873e-05, "loss": 8.6853, "theoretical_loss": 9.265299666660276, "tokens_seen": 2228224 }, { "epoch": 0.0, "learning_rate": 3.472222222222222e-05, "loss": 8.5241, "theoretical_loss": 9.209874847444755, "tokens_seen": 2293760 }, { "epoch": 0.0, "learning_rate": 3.571428571428571e-05, "loss": 8.6135, "theoretical_loss": 9.156440812508292, "tokens_seen": 2359296 }, { "epoch": 0.0, "learning_rate": 3.670634920634921e-05, "loss": 8.8331, "theoretical_loss": 9.10487366241335, "tokens_seen": 2424832 }, { "epoch": 0.0, "learning_rate": 3.76984126984127e-05, "loss": 8.7112, "theoretical_loss": 9.055060296533734, "tokens_seen": 2490368 }, { "epoch": 0.0, "learning_rate": 3.8690476190476195e-05, "loss": 8.6425, "theoretical_loss": 9.006897216643829, "tokens_seen": 2555904 }, { "epoch": 0.0, "learning_rate": 3.968253968253968e-05, "loss": 8.7777, "theoretical_loss": 8.960289489909357, "tokens_seen": 2621440 }, { "epoch": 0.0, "learning_rate": 4.067460317460318e-05, "loss": 8.4367, "theoretical_loss": 8.915149846640611, "tokens_seen": 2686976 }, { "epoch": 0.0, "learning_rate": 4.1666666666666665e-05, "loss": 8.6589, "theoretical_loss": 8.871397892478225, "tokens_seen": 2752512 }, { "epoch": 0.0, "learning_rate": 4.265873015873016e-05, "loss": 8.4898, "theoretical_loss": 8.828959418153499, "tokens_seen": 2818048 }, { "epoch": 0.0, "learning_rate": 4.365079365079365e-05, "loss": 8.3764, "theoretical_loss": 8.787765792778412, "tokens_seen": 2883584 }, { "epoch": 0.0, "learning_rate": 4.464285714285714e-05, "loss": 8.0994, "theoretical_loss": 8.747753428911455, "tokens_seen": 2949120 }, { "epoch": 0.0, "learning_rate": 4.563492063492063e-05, "loss": 8.4379, "theoretical_loss": 8.708863309520833, "tokens_seen": 3014656 }, { "epoch": 0.0, "learning_rate": 4.6626984126984126e-05, "loss": 8.3013, "theoretical_loss": 8.671040568508847, "tokens_seen": 3080192 }, { "epoch": 0.0, "learning_rate": 4.761904761904762e-05, "loss": 8.2616, "theoretical_loss": 8.634234117735474, "tokens_seen": 3145728 }, { "epoch": 0.0, "learning_rate": 4.8611111111111115e-05, "loss": 8.1409, "theoretical_loss": 8.598396314536323, "tokens_seen": 3211264 }, { "epoch": 0.0, "objective/train/docs_used": 39163, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 7.331214427947998, "objective/train/theoretical_loss": 8.563482664611069, "objective/train/tokens_used": 23736800, "theoretical_loss": 8.563482664611069, "tokens_seen": 3276800 }, { "epoch": 0.0, "learning_rate": 4.96031746031746e-05, "loss": 8.0325, "theoretical_loss": 8.563482664611069, "tokens_seen": 3276800 }, { "epoch": 0.0, "learning_rate": 5.05952380952381e-05, "loss": 8.0843, "theoretical_loss": 8.529451555895115, "tokens_seen": 3342336 }, { "epoch": 0.0, "learning_rate": 5.1587301587301586e-05, "loss": 7.9905, "theoretical_loss": 8.496264019646002, "tokens_seen": 3407872 }, { "epoch": 0.0, "learning_rate": 5.257936507936508e-05, "loss": 8.0272, "theoretical_loss": 8.463883515497187, "tokens_seen": 3473408 }, { "epoch": 0.0, "learning_rate": 5.357142857142857e-05, "loss": 8.225, "theoretical_loss": 8.432275737672779, "tokens_seen": 3538944 }, { "epoch": 0.0, "learning_rate": 5.4563492063492063e-05, "loss": 7.9747, "theoretical_loss": 8.401408439930716, "tokens_seen": 3604480 }, { "epoch": 0.0, "learning_rate": 5.555555555555555e-05, "loss": 8.0662, "theoretical_loss": 8.371251277120209, "tokens_seen": 3670016 }, { "epoch": 0.0, "learning_rate": 5.6547619047619046e-05, "loss": 7.7727, "theoretical_loss": 8.341775661511075, "tokens_seen": 3735552 }, { "epoch": 0.0, "learning_rate": 5.753968253968254e-05, "loss": 7.8395, "theoretical_loss": 8.31295463228533, "tokens_seen": 3801088 }, { "epoch": 0.0, "learning_rate": 5.8531746031746036e-05, "loss": 7.9093, "theoretical_loss": 8.284762736781182, "tokens_seen": 3866624 }, { "epoch": 0.0, "learning_rate": 5.9523809523809524e-05, "loss": 7.8577, "theoretical_loss": 8.257175922251864, "tokens_seen": 3932160 }, { "epoch": 0.0, "learning_rate": 6.051587301587302e-05, "loss": 7.6734, "theoretical_loss": 8.230171437050114, "tokens_seen": 3997696 }, { "epoch": 0.0, "learning_rate": 6.15079365079365e-05, "loss": 7.7831, "theoretical_loss": 8.20372774027797, "tokens_seen": 4063232 }, { "epoch": 0.0, "learning_rate": 6.25e-05, "loss": 7.7666, "theoretical_loss": 8.177824419053046, "tokens_seen": 4128768 }, { "epoch": 0.0, "learning_rate": 6.349206349206349e-05, "loss": 7.3606, "theoretical_loss": 8.152442112639616, "tokens_seen": 4194304 }, { "epoch": 0.0, "learning_rate": 6.448412698412699e-05, "loss": 7.7604, "theoretical_loss": 8.1275624427775, "tokens_seen": 4259840 }, { "epoch": 0.0, "learning_rate": 6.547619047619048e-05, "loss": 7.6472, "theoretical_loss": 8.10316794961571, "tokens_seen": 4325376 }, { "epoch": 0.0, "learning_rate": 6.646825396825397e-05, "loss": 7.4343, "theoretical_loss": 8.07924203272264, "tokens_seen": 4390912 }, { "epoch": 0.0, "learning_rate": 6.746031746031745e-05, "loss": 7.472, "theoretical_loss": 8.055768896701416, "tokens_seen": 4456448 }, { "epoch": 0.0, "learning_rate": 6.845238095238096e-05, "loss": 7.3591, "theoretical_loss": 8.032733500989007, "tokens_seen": 4521984 }, { "epoch": 0.0, "learning_rate": 6.944444444444444e-05, "loss": 7.4194, "theoretical_loss": 8.010121513461836, "tokens_seen": 4587520 }, { "epoch": 0.0, "learning_rate": 7.043650793650793e-05, "loss": 7.2564, "theoretical_loss": 7.987919267509379, "tokens_seen": 4653056 }, { "epoch": 0.0, "learning_rate": 7.142857142857142e-05, "loss": 7.4497, "theoretical_loss": 7.966113722271801, "tokens_seen": 4718592 }, { "epoch": 0.0, "learning_rate": 7.242063492063492e-05, "loss": 7.2708, "theoretical_loss": 7.944692425767988, "tokens_seen": 4784128 }, { "epoch": 0.0, "learning_rate": 7.341269841269842e-05, "loss": 7.2767, "theoretical_loss": 7.9236434806675184, "tokens_seen": 4849664 }, { "epoch": 0.0, "objective/train/docs_used": 40621, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 7.239190101623535, "objective/train/theoretical_loss": 7.902955512484067, "objective/train/tokens_used": 25375200, "theoretical_loss": 7.902955512484067, "tokens_seen": 4915200 }, { "epoch": 0.0, "learning_rate": 7.440476190476191e-05, "loss": 7.2867, "theoretical_loss": 7.902955512484067, "tokens_seen": 4915200 }, { "epoch": 0.0, "learning_rate": 7.53968253968254e-05, "loss": 7.322, "theoretical_loss": 7.882617639989203, "tokens_seen": 4980736 }, { "epoch": 0.0, "learning_rate": 7.63888888888889e-05, "loss": 7.4332, "theoretical_loss": 7.862619447664628, "tokens_seen": 5046272 }, { "epoch": 0.0, "learning_rate": 7.738095238095239e-05, "loss": 7.3823, "theoretical_loss": 7.842950960027937, "tokens_seen": 5111808 }, { "epoch": 0.0, "learning_rate": 7.837301587301588e-05, "loss": 7.2318, "theoretical_loss": 7.823602617682313, "tokens_seen": 5177344 }, { "epoch": 0.0, "learning_rate": 7.936507936507937e-05, "loss": 7.0715, "theoretical_loss": 7.804565254954165, "tokens_seen": 5242880 }, { "epoch": 0.0, "learning_rate": 8.035714285714287e-05, "loss": 7.2308, "theoretical_loss": 7.7858300789950725, "tokens_seen": 5308416 }, { "epoch": 0.0, "learning_rate": 8.134920634920635e-05, "loss": 6.8802, "theoretical_loss": 7.767388650235364, "tokens_seen": 5373952 }, { "epoch": 0.0, "learning_rate": 8.234126984126984e-05, "loss": 7.1078, "theoretical_loss": 7.749232864086619, "tokens_seen": 5439488 }, { "epoch": 0.0, "learning_rate": 8.333333333333333e-05, "loss": 6.9493, "theoretical_loss": 7.731354933799318, "tokens_seen": 5505024 }, { "epoch": 0.0, "learning_rate": 8.432539682539683e-05, "loss": 7.0778, "theoretical_loss": 7.71374737438992, "tokens_seen": 5570560 }, { "epoch": 0.0, "learning_rate": 8.531746031746032e-05, "loss": 6.9567, "theoretical_loss": 7.696402987558934, "tokens_seen": 5636096 }, { "epoch": 0.0, "learning_rate": 8.630952380952381e-05, "loss": 7.181, "theoretical_loss": 7.679314847528181, "tokens_seen": 5701632 }, { "epoch": 0.0, "learning_rate": 8.73015873015873e-05, "loss": 7.1135, "theoretical_loss": 7.662476287731328, "tokens_seen": 5767168 }, { "epoch": 0.0, "learning_rate": 8.82936507936508e-05, "loss": 6.898, "theoretical_loss": 7.645880888297279, "tokens_seen": 5832704 }, { "epoch": 0.0, "learning_rate": 8.928571428571429e-05, "loss": 7.0754, "theoretical_loss": 7.629522464270861, "tokens_seen": 5898240 }, { "epoch": 0.0, "learning_rate": 9.027777777777777e-05, "loss": 6.9068, "theoretical_loss": 7.613395054519696, "tokens_seen": 5963776 }, { "epoch": 0.0, "learning_rate": 9.126984126984126e-05, "loss": 7.2782, "theoretical_loss": 7.59749291128028, "tokens_seen": 6029312 }, { "epoch": 0.0, "learning_rate": 9.226190476190476e-05, "loss": 6.8508, "theoretical_loss": 7.581810490299888, "tokens_seen": 6094848 }, { "epoch": 0.0, "learning_rate": 9.325396825396825e-05, "loss": 6.8092, "theoretical_loss": 7.5663424415343705, "tokens_seen": 6160384 }, { "epoch": 0.0, "learning_rate": 9.424603174603175e-05, "loss": 6.9951, "theoretical_loss": 7.551083600364949, "tokens_seen": 6225920 }, { "epoch": 0.0, "learning_rate": 9.523809523809524e-05, "loss": 6.9773, "theoretical_loss": 7.536028979299919, "tokens_seen": 6291456 }, { "epoch": 0.0, "learning_rate": 9.623015873015874e-05, "loss": 6.8359, "theoretical_loss": 7.521173760129762, "tokens_seen": 6356992 }, { "epoch": 0.0, "learning_rate": 9.722222222222223e-05, "loss": 6.9984, "theoretical_loss": 7.506513286506497, "tokens_seen": 6422528 }, { "epoch": 0.0, "learning_rate": 9.821428571428572e-05, "loss": 7.0042, "theoretical_loss": 7.492043056920249, "tokens_seen": 6488064 }, { "epoch": 0.0, "objective/train/docs_used": 44240, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.328563213348389, "objective/train/theoretical_loss": 7.4777587180480305, "objective/train/tokens_used": 27013600, "theoretical_loss": 7.4777587180480305, "tokens_seen": 6553600 }, { "epoch": 0.0, "learning_rate": 9.92063492063492e-05, "loss": 6.8583, "theoretical_loss": 7.4777587180480305, "tokens_seen": 6553600 }, { "epoch": 0.0, "learning_rate": 0.00010019841269841271, "loss": 6.8027, "theoretical_loss": 7.463656058451462, "tokens_seen": 6619136 }, { "epoch": 0.0, "learning_rate": 0.0001011904761904762, "loss": 6.8616, "theoretical_loss": 7.449731002601916, "tokens_seen": 6684672 }, { "epoch": 0.0, "learning_rate": 0.00010218253968253968, "loss": 6.7577, "theoretical_loss": 7.435979605213019, "tokens_seen": 6750208 }, { "epoch": 0.0, "learning_rate": 0.00010317460317460317, "loss": 6.9605, "theoretical_loss": 7.422398045861905, "tokens_seen": 6815744 }, { "epoch": 0.0, "learning_rate": 0.00010416666666666667, "loss": 6.8721, "theoretical_loss": 7.408982623881875, "tokens_seen": 6881280 }, { "epoch": 0.0, "learning_rate": 0.00010515873015873016, "loss": 6.7875, "theoretical_loss": 7.395729753510345, "tokens_seen": 6946816 }, { "epoch": 0.0, "learning_rate": 0.00010615079365079365, "loss": 6.9431, "theoretical_loss": 7.3826359592770325, "tokens_seen": 7012352 }, { "epoch": 0.0, "learning_rate": 0.00010714285714285714, "loss": 6.7258, "theoretical_loss": 7.369697871618373, "tokens_seen": 7077888 }, { "epoch": 0.0, "learning_rate": 0.00010813492063492064, "loss": 6.7623, "theoretical_loss": 7.3569122227050885, "tokens_seen": 7143424 }, { "epoch": 0.0, "learning_rate": 0.00010912698412698413, "loss": 6.8068, "theoretical_loss": 7.3442758424706875, "tokens_seen": 7208960 }, { "epoch": 0.0, "learning_rate": 0.00011011904761904761, "loss": 6.736, "theoretical_loss": 7.331785654829519, "tokens_seen": 7274496 }, { "epoch": 0.0, "learning_rate": 0.0001111111111111111, "loss": 6.6337, "theoretical_loss": 7.319438674073677, "tokens_seen": 7340032 }, { "epoch": 0.0, "learning_rate": 0.0001121031746031746, "loss": 6.8302, "theoretical_loss": 7.307232001438824, "tokens_seen": 7405568 }, { "epoch": 0.0, "learning_rate": 0.00011309523809523809, "loss": 6.6336, "theoretical_loss": 7.295162821829564, "tokens_seen": 7471104 }, { "epoch": 0.0, "learning_rate": 0.00011408730158730158, "loss": 6.8159, "theoretical_loss": 7.283228400695652, "tokens_seen": 7536640 }, { "epoch": 0.0, "learning_rate": 0.00011507936507936508, "loss": 6.7659, "theoretical_loss": 7.271426081050832, "tokens_seen": 7602176 }, { "epoch": 0.0, "learning_rate": 0.00011607142857142858, "loss": 6.8116, "theoretical_loss": 7.259753280626623, "tokens_seen": 7667712 }, { "epoch": 0.0, "learning_rate": 0.00011706349206349207, "loss": 6.6805, "theoretical_loss": 7.24820748915387, "tokens_seen": 7733248 }, { "epoch": 0.0, "learning_rate": 0.00011805555555555556, "loss": 6.5565, "theoretical_loss": 7.236786265765262, "tokens_seen": 7798784 }, { "epoch": 0.0, "learning_rate": 0.00011904761904761905, "loss": 6.4731, "theoretical_loss": 7.225487236512497, "tokens_seen": 7864320 }, { "epoch": 0.0, "learning_rate": 0.00012003968253968255, "loss": 6.6966, "theoretical_loss": 7.21430809199212, "tokens_seen": 7929856 }, { "epoch": 0.0, "learning_rate": 0.00012103174603174604, "loss": 6.7285, "theoretical_loss": 7.2032465850744005, "tokens_seen": 7995392 }, { "epoch": 0.0, "learning_rate": 0.00012202380952380953, "loss": 6.6032, "theoretical_loss": 7.192300528730015, "tokens_seen": 8060928 }, { "epoch": 0.0, "learning_rate": 0.000123015873015873, "loss": 6.6353, "theoretical_loss": 7.1814677939495155, "tokens_seen": 8126464 }, { "epoch": 0.0, "objective/train/docs_used": 47165, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.636279582977295, "objective/train/theoretical_loss": 7.1707463077509646, "objective/train/tokens_used": 28652000, "theoretical_loss": 7.1707463077509646, "tokens_seen": 8192000 }, { "epoch": 0.0, "learning_rate": 0.0001240079365079365, "loss": 6.6969, "theoretical_loss": 7.1707463077509646, "tokens_seen": 8192000 }, { "epoch": 0.0, "learning_rate": 0.000125, "loss": 6.6674, "theoretical_loss": 7.160134051271272, "tokens_seen": 8257536 }, { "epoch": 0.0, "learning_rate": 0.0001259920634920635, "loss": 6.5741, "theoretical_loss": 7.149629057937138, "tokens_seen": 8323072 }, { "epoch": 0.0, "learning_rate": 0.00012698412698412698, "loss": 6.4856, "theoretical_loss": 7.139229411711638, "tokens_seen": 8388608 }, { "epoch": 0.0, "learning_rate": 0.00012797619047619048, "loss": 6.4402, "theoretical_loss": 7.128933245412794, "tokens_seen": 8454144 }, { "epoch": 0.0, "learning_rate": 0.00012896825396825398, "loss": 6.7068, "theoretical_loss": 7.118738739100616, "tokens_seen": 8519680 }, { "epoch": 0.0, "learning_rate": 0.00012996031746031748, "loss": 6.5039, "theoretical_loss": 7.1086441185293445, "tokens_seen": 8585216 }, { "epoch": 0.0, "learning_rate": 0.00013095238095238096, "loss": 6.7312, "theoretical_loss": 7.09864765366177, "tokens_seen": 8650752 }, { "epoch": 0.0, "learning_rate": 0.00013194444444444446, "loss": 6.5982, "theoretical_loss": 7.088747657242693, "tokens_seen": 8716288 }, { "epoch": 0.0, "learning_rate": 0.00013293650793650793, "loss": 6.6338, "theoretical_loss": 7.078942483428749, "tokens_seen": 8781824 }, { "epoch": 0.0, "learning_rate": 0.00013392857142857144, "loss": 6.5337, "theoretical_loss": 7.069230526471966, "tokens_seen": 8847360 }, { "epoch": 0.0, "learning_rate": 0.0001349206349206349, "loss": 6.5643, "theoretical_loss": 7.059610219454568, "tokens_seen": 8912896 }, { "epoch": 0.0, "learning_rate": 0.0001359126984126984, "loss": 6.3834, "theoretical_loss": 7.0500800330726685, "tokens_seen": 8978432 }, { "epoch": 0.0, "learning_rate": 0.0001369047619047619, "loss": 6.2631, "theoretical_loss": 7.040638474466625, "tokens_seen": 9043968 }, { "epoch": 0.0, "learning_rate": 0.00013789682539682541, "loss": 6.515, "theoretical_loss": 7.031284086095933, "tokens_seen": 9109504 }, { "epoch": 0.0, "learning_rate": 0.0001388888888888889, "loss": 6.3084, "theoretical_loss": 7.022015444656678, "tokens_seen": 9175040 }, { "epoch": 0.0, "learning_rate": 0.0001398809523809524, "loss": 6.7253, "theoretical_loss": 7.012831160039609, "tokens_seen": 9240576 }, { "epoch": 0.0, "learning_rate": 0.00014087301587301586, "loss": 6.6355, "theoretical_loss": 7.003729874327071, "tokens_seen": 9306112 }, { "epoch": 0.0, "learning_rate": 0.00014186507936507937, "loss": 6.3124, "theoretical_loss": 6.994710260827057, "tokens_seen": 9371648 }, { "epoch": 0.0, "learning_rate": 0.00014285714285714284, "loss": 6.5939, "theoretical_loss": 6.98577102314278, "tokens_seen": 9437184 }, { "epoch": 0.0, "learning_rate": 0.00014384920634920634, "loss": 6.3713, "theoretical_loss": 6.976910894276189, "tokens_seen": 9502720 }, { "epoch": 0.0, "learning_rate": 0.00014484126984126984, "loss": 6.4805, "theoretical_loss": 6.968128635764015, "tokens_seen": 9568256 }, { "epoch": 0.0, "learning_rate": 0.00014583333333333335, "loss": 6.484, "theoretical_loss": 6.959423036844894, "tokens_seen": 9633792 }, { "epoch": 0.0, "learning_rate": 0.00014682539682539685, "loss": 6.4699, "theoretical_loss": 6.950792913656309, "tokens_seen": 9699328 }, { "epoch": 0.0, "learning_rate": 0.00014781746031746032, "loss": 6.5364, "theoretical_loss": 6.942237108460029, "tokens_seen": 9764864 }, { "epoch": 0.0, "objective/train/docs_used": 48678, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.406508922576904, "objective/train/theoretical_loss": 6.9337544888949, "objective/train/tokens_used": 30290400, "theoretical_loss": 6.9337544888949, "tokens_seen": 9830400 }, { "epoch": 0.0, "learning_rate": 0.00014880952380952382, "loss": 6.3458, "theoretical_loss": 6.9337544888949, "tokens_seen": 9830400 }, { "epoch": 0.0, "learning_rate": 0.0001498015873015873, "loss": 6.5242, "theoretical_loss": 6.925343947255817, "tokens_seen": 9895936 }, { "epoch": 0.0, "learning_rate": 0.0001507936507936508, "loss": 6.4498, "theoretical_loss": 6.917004399797798, "tokens_seen": 9961472 }, { "epoch": 0.0, "learning_rate": 0.00015178571428571427, "loss": 6.4252, "theoretical_loss": 6.908734786064147, "tokens_seen": 10027008 }, { "epoch": 0.0, "learning_rate": 0.0001527777777777778, "loss": 6.3569, "theoretical_loss": 6.900534068237688, "tokens_seen": 10092544 }, { "epoch": 0.0, "learning_rate": 0.00015376984126984128, "loss": 6.3459, "theoretical_loss": 6.89240123051416, "tokens_seen": 10158080 }, { "epoch": 0.0, "learning_rate": 0.00015476190476190478, "loss": 6.3345, "theoretical_loss": 6.884335278496871, "tokens_seen": 10223616 }, { "epoch": 0.0, "learning_rate": 0.00015575396825396825, "loss": 6.1536, "theoretical_loss": 6.87633523861175, "tokens_seen": 10289152 }, { "epoch": 0.0, "learning_rate": 0.00015674603174603175, "loss": 6.3782, "theoretical_loss": 6.868400157541997, "tokens_seen": 10354688 }, { "epoch": 0.0, "learning_rate": 0.00015773809523809523, "loss": 6.2656, "theoretical_loss": 6.860529101681551, "tokens_seen": 10420224 }, { "epoch": 0.0, "learning_rate": 0.00015873015873015873, "loss": 6.488, "theoretical_loss": 6.85272115660663, "tokens_seen": 10485760 }, { "epoch": 0.0, "learning_rate": 0.0001597222222222222, "loss": 6.4672, "theoretical_loss": 6.844975426564642, "tokens_seen": 10551296 }, { "epoch": 0.0, "learning_rate": 0.00016071428571428573, "loss": 6.4513, "theoretical_loss": 6.8372910339797945, "tokens_seen": 10616832 }, { "epoch": 0.0, "learning_rate": 0.0001617063492063492, "loss": 6.4301, "theoretical_loss": 6.829667118974749, "tokens_seen": 10682368 }, { "epoch": 0.0, "learning_rate": 0.0001626984126984127, "loss": 6.2815, "theoretical_loss": 6.8221028389077185, "tokens_seen": 10747904 }, { "epoch": 0.0, "learning_rate": 0.00016369047619047618, "loss": 6.054, "theoretical_loss": 6.814597367924395, "tokens_seen": 10813440 }, { "epoch": 0.0, "learning_rate": 0.00016468253968253969, "loss": 6.1561, "theoretical_loss": 6.807149896524181, "tokens_seen": 10878976 }, { "epoch": 0.0, "learning_rate": 0.00016567460317460316, "loss": 6.2108, "theoretical_loss": 6.799759631140145, "tokens_seen": 10944512 }, { "epoch": 0.0, "learning_rate": 0.00016666666666666666, "loss": 6.2156, "theoretical_loss": 6.7924257937322245, "tokens_seen": 11010048 }, { "epoch": 0.0, "learning_rate": 0.00016765873015873016, "loss": 6.1798, "theoretical_loss": 6.785147621393148, "tokens_seen": 11075584 }, { "epoch": 0.0, "learning_rate": 0.00016865079365079366, "loss": 6.1029, "theoretical_loss": 6.777924365966638, "tokens_seen": 11141120 }, { "epoch": 0.0, "learning_rate": 0.00016964285714285717, "loss": 6.3399, "theoretical_loss": 6.770755293677423, "tokens_seen": 11206656 }, { "epoch": 0.0, "learning_rate": 0.00017063492063492064, "loss": 6.3517, "theoretical_loss": 6.763639684772625, "tokens_seen": 11272192 }, { "epoch": 0.0, "learning_rate": 0.00017162698412698414, "loss": 6.1829, "theoretical_loss": 6.756576833174123, "tokens_seen": 11337728 }, { "epoch": 0.0, "learning_rate": 0.00017261904761904762, "loss": 6.2728, "theoretical_loss": 6.749566046141486, "tokens_seen": 11403264 }, { "epoch": 0.0, "objective/train/docs_used": 51612, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.3861236572265625, "objective/train/theoretical_loss": 6.7426066439450905, "objective/train/tokens_used": 31928800, "theoretical_loss": 6.7426066439450905, "tokens_seen": 11468800 }, { "epoch": 0.0, "learning_rate": 0.00017361111111111112, "loss": 6.3208, "theoretical_loss": 6.7426066439450905, "tokens_seen": 11468800 }, { "epoch": 0.0, "learning_rate": 0.0001746031746031746, "loss": 6.2016, "theoretical_loss": 6.735697959549075, "tokens_seen": 11534336 }, { "epoch": 0.0, "learning_rate": 0.0001755952380952381, "loss": 6.4435, "theoretical_loss": 6.728839338303761, "tokens_seen": 11599872 }, { "epoch": 0.0, "learning_rate": 0.0001765873015873016, "loss": 6.1695, "theoretical_loss": 6.722030137647226, "tokens_seen": 11665408 }, { "epoch": 0.0, "learning_rate": 0.0001775793650793651, "loss": 6.192, "theoretical_loss": 6.715269726815689, "tokens_seen": 11730944 }, { "epoch": 0.0, "learning_rate": 0.00017857142857142857, "loss": 6.0874, "theoretical_loss": 6.7085574865624125, "tokens_seen": 11796480 }, { "epoch": 0.0, "learning_rate": 0.00017956349206349207, "loss": 6.2323, "theoretical_loss": 6.701892808884824, "tokens_seen": 11862016 }, { "epoch": 0.0, "learning_rate": 0.00018055555555555555, "loss": 6.418, "theoretical_loss": 6.695275096759559, "tokens_seen": 11927552 }, { "epoch": 0.0, "learning_rate": 0.00018154761904761905, "loss": 6.1257, "theoretical_loss": 6.68870376388518, "tokens_seen": 11993088 }, { "epoch": 0.0, "learning_rate": 0.00018253968253968252, "loss": 6.1984, "theoretical_loss": 6.682178234432274, "tokens_seen": 12058624 }, { "epoch": 0.0, "learning_rate": 0.00018353174603174602, "loss": 6.1225, "theoretical_loss": 6.675697942800715, "tokens_seen": 12124160 }, { "epoch": 0.0, "learning_rate": 0.00018452380952380953, "loss": 5.9573, "theoretical_loss": 6.669262333383815, "tokens_seen": 12189696 }, { "epoch": 0.0, "learning_rate": 0.00018551587301587303, "loss": 6.0214, "theoretical_loss": 6.662870860339158, "tokens_seen": 12255232 }, { "epoch": 0.0, "learning_rate": 0.0001865079365079365, "loss": 6.3566, "theoretical_loss": 6.656522987365879, "tokens_seen": 12320768 }, { "epoch": 0.0, "learning_rate": 0.0001875, "loss": 6.0185, "theoretical_loss": 6.6502181874881705, "tokens_seen": 12386304 }, { "epoch": 0.0, "learning_rate": 0.0001884920634920635, "loss": 6.2232, "theoretical_loss": 6.643955942844831, "tokens_seen": 12451840 }, { "epoch": 0.0, "learning_rate": 0.00018948412698412698, "loss": 6.2963, "theoretical_loss": 6.637735744484626, "tokens_seen": 12517376 }, { "epoch": 0.0, "learning_rate": 0.00019047619047619048, "loss": 6.0402, "theoretical_loss": 6.631557092167304, "tokens_seen": 12582912 }, { "epoch": 0.0, "learning_rate": 0.00019146825396825398, "loss": 6.2384, "theoretical_loss": 6.625419494170049, "tokens_seen": 12648448 }, { "epoch": 0.0, "learning_rate": 0.00019246031746031748, "loss": 6.0744, "theoretical_loss": 6.619322467099223, "tokens_seen": 12713984 }, { "epoch": 0.0, "learning_rate": 0.00019345238095238096, "loss": 6.084, "theoretical_loss": 6.613265535707211, "tokens_seen": 12779520 }, { "epoch": 0.0, "learning_rate": 0.00019444444444444446, "loss": 6.1142, "theoretical_loss": 6.607248232714213, "tokens_seen": 12845056 }, { "epoch": 0.0, "learning_rate": 0.00019543650793650793, "loss": 6.1338, "theoretical_loss": 6.60127009863481, "tokens_seen": 12910592 }, { "epoch": 0.0, "learning_rate": 0.00019642857142857144, "loss": 6.0052, "theoretical_loss": 6.59533068160918, "tokens_seen": 12976128 }, { "epoch": 0.0, "learning_rate": 0.0001974206349206349, "loss": 6.2726, "theoretical_loss": 6.589429537238785, "tokens_seen": 13041664 }, { "epoch": 0.0, "objective/train/docs_used": 54492, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.0954179763793945, "objective/train/theoretical_loss": 6.583566228426414, "objective/train/tokens_used": 33567200, "theoretical_loss": 6.583566228426414, "tokens_seen": 13107200 }, { "epoch": 0.0, "learning_rate": 0.0001984126984126984, "loss": 6.0669, "theoretical_loss": 6.583566228426414, "tokens_seen": 13107200 }, { "epoch": 0.0, "learning_rate": 0.00019940476190476191, "loss": 6.3194, "theoretical_loss": 6.5777403252204305, "tokens_seen": 13172736 }, { "epoch": 0.0, "learning_rate": 0.00020039682539682542, "loss": 6.3362, "theoretical_loss": 6.571951404663098, "tokens_seen": 13238272 }, { "epoch": 0.0, "learning_rate": 0.0002013888888888889, "loss": 6.0787, "theoretical_loss": 6.566199050642863, "tokens_seen": 13303808 }, { "epoch": 0.0, "learning_rate": 0.0002023809523809524, "loss": 6.1372, "theoretical_loss": 6.560482853750463, "tokens_seen": 13369344 }, { "epoch": 0.0, "learning_rate": 0.00020337301587301587, "loss": 6.376, "theoretical_loss": 6.554802411138745, "tokens_seen": 13434880 }, { "epoch": 0.0, "learning_rate": 0.00020436507936507937, "loss": 6.1034, "theoretical_loss": 6.549157326386091, "tokens_seen": 13500416 }, { "epoch": 0.0, "learning_rate": 0.00020535714285714284, "loss": 6.2188, "theoretical_loss": 6.54354720936333, "tokens_seen": 13565952 }, { "epoch": 0.0, "learning_rate": 0.00020634920634920634, "loss": 5.8478, "theoretical_loss": 6.537971676104026, "tokens_seen": 13631488 }, { "epoch": 0.0, "learning_rate": 0.00020734126984126985, "loss": 5.9413, "theoretical_loss": 6.532430348678068, "tokens_seen": 13697024 }, { "epoch": 0.0, "learning_rate": 0.00020833333333333335, "loss": 6.0273, "theoretical_loss": 6.5269228550684195, "tokens_seen": 13762560 }, { "epoch": 0.0, "learning_rate": 0.00020932539682539685, "loss": 6.0542, "theoretical_loss": 6.521448829050978, "tokens_seen": 13828096 }, { "epoch": 0.0, "learning_rate": 0.00021031746031746032, "loss": 5.8503, "theoretical_loss": 6.516007910077416, "tokens_seen": 13893632 }, { "epoch": 0.0, "learning_rate": 0.00021130952380952382, "loss": 6.0639, "theoretical_loss": 6.51059974316095, "tokens_seen": 13959168 }, { "epoch": 0.0, "learning_rate": 0.0002123015873015873, "loss": 6.2358, "theoretical_loss": 6.50522397876491, "tokens_seen": 14024704 }, { "epoch": 0.0, "learning_rate": 0.0002132936507936508, "loss": 6.0715, "theoretical_loss": 6.499880272694068, "tokens_seen": 14090240 }, { "epoch": 0.0, "learning_rate": 0.00021428571428571427, "loss": 6.0494, "theoretical_loss": 6.494568285988618, "tokens_seen": 14155776 }, { "epoch": 0.0, "learning_rate": 0.0002152777777777778, "loss": 6.1305, "theoretical_loss": 6.489287684820745, "tokens_seen": 14221312 }, { "epoch": 0.0, "learning_rate": 0.00021626984126984128, "loss": 6.0004, "theoretical_loss": 6.484038140393699, "tokens_seen": 14286848 }, { "epoch": 0.0, "learning_rate": 0.00021726190476190478, "loss": 6.2037, "theoretical_loss": 6.4788193288433105, "tokens_seen": 14352384 }, { "epoch": 0.0, "learning_rate": 0.00021825396825396825, "loss": 6.0852, "theoretical_loss": 6.473630931141869, "tokens_seen": 14417920 }, { "epoch": 0.0, "learning_rate": 0.00021924603174603176, "loss": 6.0212, "theoretical_loss": 6.468472633004308, "tokens_seen": 14483456 }, { "epoch": 0.0, "learning_rate": 0.00022023809523809523, "loss": 5.9655, "theoretical_loss": 6.463344124796616, "tokens_seen": 14548992 }, { "epoch": 0.0, "learning_rate": 0.00022123015873015873, "loss": 5.9862, "theoretical_loss": 6.45824510144643, "tokens_seen": 14614528 }, { "epoch": 0.0, "learning_rate": 0.0002222222222222222, "loss": 6.0558, "theoretical_loss": 6.45317526235573, "tokens_seen": 14680064 }, { "epoch": 0.0, "objective/train/docs_used": 56313, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 6.086879253387451, "objective/train/theoretical_loss": 6.448134311315593, "objective/train/tokens_used": 35205600, "theoretical_loss": 6.448134311315593, "tokens_seen": 14745600 }, { "epoch": 0.0, "learning_rate": 0.00022321428571428573, "loss": 6.0624, "theoretical_loss": 6.448134311315593, "tokens_seen": 14745600 }, { "epoch": 0.0, "learning_rate": 0.0002242063492063492, "loss": 6.0122, "theoretical_loss": 6.443121956422939, "tokens_seen": 14811136 }, { "epoch": 0.0, "learning_rate": 0.0002251984126984127, "loss": 5.8401, "theoretical_loss": 6.438137909999214, "tokens_seen": 14876672 }, { "epoch": 0.0, "learning_rate": 0.00022619047619047618, "loss": 5.8943, "theoretical_loss": 6.433181888510964, "tokens_seen": 14942208 }, { "epoch": 0.0, "learning_rate": 0.00022718253968253969, "loss": 6.0433, "theoretical_loss": 6.428253612492239, "tokens_seen": 15007744 }, { "epoch": 0.0, "learning_rate": 0.00022817460317460316, "loss": 5.8834, "theoretical_loss": 6.4233528064687855, "tokens_seen": 15073280 }, { "epoch": 0.0, "learning_rate": 0.00022916666666666666, "loss": 5.9581, "theoretical_loss": 6.418479198883969, "tokens_seen": 15138816 }, { "epoch": 0.0, "learning_rate": 0.00023015873015873016, "loss": 6.1002, "theoretical_loss": 6.413632522026391, "tokens_seen": 15204352 }, { "epoch": 0.0, "learning_rate": 0.00023115079365079367, "loss": 6.0166, "theoretical_loss": 6.40881251195914, "tokens_seen": 15269888 }, { "epoch": 0.0, "learning_rate": 0.00023214285714285717, "loss": 5.8998, "theoretical_loss": 6.404018908450656, "tokens_seen": 15335424 }, { "epoch": 0.0, "learning_rate": 0.00023313492063492064, "loss": 5.9655, "theoretical_loss": 6.399251454907132, "tokens_seen": 15400960 }, { "epoch": 0.0, "learning_rate": 0.00023412698412698414, "loss": 5.7726, "theoretical_loss": 6.394509898306452, "tokens_seen": 15466496 }, { "epoch": 0.0, "learning_rate": 0.00023511904761904762, "loss": 5.8575, "theoretical_loss": 6.389793989133574, "tokens_seen": 15532032 }, { "epoch": 0.0, "learning_rate": 0.00023611111111111112, "loss": 5.9759, "theoretical_loss": 6.385103481317387, "tokens_seen": 15597568 }, { "epoch": 0.0, "learning_rate": 0.0002371031746031746, "loss": 5.97, "theoretical_loss": 6.380438132168923, "tokens_seen": 15663104 }, { "epoch": 0.0, "learning_rate": 0.0002380952380952381, "loss": 6.0219, "theoretical_loss": 6.375797702320966, "tokens_seen": 15728640 }, { "epoch": 0.0, "learning_rate": 0.0002390873015873016, "loss": 5.8308, "theoretical_loss": 6.371181955668966, "tokens_seen": 15794176 }, { "epoch": 0.0, "learning_rate": 0.0002400793650793651, "loss": 5.9361, "theoretical_loss": 6.366590659313248, "tokens_seen": 15859712 }, { "epoch": 0.0, "learning_rate": 0.00024107142857142857, "loss": 5.8208, "theoretical_loss": 6.36202358350248, "tokens_seen": 15925248 }, { "epoch": 0.0, "learning_rate": 0.00024206349206349207, "loss": 5.9479, "theoretical_loss": 6.357480501578371, "tokens_seen": 15990784 }, { "epoch": 0.0, "learning_rate": 0.00024305555555555555, "loss": 6.1544, "theoretical_loss": 6.352961189921553, "tokens_seen": 16056320 }, { "epoch": 0.0, "learning_rate": 0.00024404761904761905, "loss": 5.7589, "theoretical_loss": 6.348465427898629, "tokens_seen": 16121856 }, { "epoch": 0.0, "learning_rate": 0.00024503968253968255, "loss": 6.0063, "theoretical_loss": 6.343992997810366, "tokens_seen": 16187392 }, { "epoch": 0.0, "learning_rate": 0.000246031746031746, "loss": 5.9188, "theoretical_loss": 6.33954368484097, "tokens_seen": 16252928 }, { "epoch": 0.0, "learning_rate": 0.00024702380952380955, "loss": 5.6843, "theoretical_loss": 6.33511727700846, "tokens_seen": 16318464 }, { "epoch": 0.0, "objective/train/docs_used": 59194, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.91401481628418, "objective/train/theoretical_loss": 6.330713565116083, "objective/train/tokens_used": 36844000, "theoretical_loss": 6.330713565116083, "tokens_seen": 16384000 }, { "epoch": 0.0, "learning_rate": 0.000248015873015873, "loss": 5.6106, "theoretical_loss": 6.330713565116083, "tokens_seen": 16384000 }, { "epoch": 0.0, "learning_rate": 0.0002490079365079365, "loss": 5.8086, "theoretical_loss": 6.326332342704751, "tokens_seen": 16449536 }, { "epoch": 0.01, "learning_rate": 0.00025, "loss": 6.0893, "theoretical_loss": 6.32197340600647, "tokens_seen": 16515072 }, { "epoch": 0.01, "learning_rate": 0.0002509920634920635, "loss": 5.7681, "theoretical_loss": 6.3176365538987636, "tokens_seen": 16580608 }, { "epoch": 0.01, "learning_rate": 0.000251984126984127, "loss": 5.8809, "theoretical_loss": 6.313321587860021, "tokens_seen": 16646144 }, { "epoch": 0.01, "learning_rate": 0.00025297619047619046, "loss": 6.0753, "theoretical_loss": 6.309028311925785, "tokens_seen": 16711680 }, { "epoch": 0.01, "learning_rate": 0.00025396825396825396, "loss": 6.0053, "theoretical_loss": 6.304756532645939, "tokens_seen": 16777216 }, { "epoch": 0.01, "learning_rate": 0.00025496031746031746, "loss": 6.0781, "theoretical_loss": 6.300506059042775, "tokens_seen": 16842752 }, { "epoch": 0.01, "learning_rate": 0.00025595238095238096, "loss": 5.8909, "theoretical_loss": 6.296276702569918, "tokens_seen": 16908288 }, { "epoch": 0.01, "learning_rate": 0.0002569444444444444, "loss": 5.9127, "theoretical_loss": 6.292068277072099, "tokens_seen": 16973824 }, { "epoch": 0.01, "learning_rate": 0.00025793650793650796, "loss": 5.9811, "theoretical_loss": 6.28788059874573, "tokens_seen": 17039360 }, { "epoch": 0.01, "learning_rate": 0.00025892857142857146, "loss": 5.8947, "theoretical_loss": 6.283713486100297, "tokens_seen": 17104896 }, { "epoch": 0.01, "learning_rate": 0.00025992063492063497, "loss": 5.7892, "theoretical_loss": 6.279566759920507, "tokens_seen": 17170432 }, { "epoch": 0.01, "learning_rate": 0.0002609126984126984, "loss": 5.9186, "theoretical_loss": 6.275440243229228, "tokens_seen": 17235968 }, { "epoch": 0.01, "learning_rate": 0.0002619047619047619, "loss": 5.5963, "theoretical_loss": 6.271333761251142, "tokens_seen": 17301504 }, { "epoch": 0.01, "learning_rate": 0.0002628968253968254, "loss": 6.0761, "theoretical_loss": 6.267247141377137, "tokens_seen": 17367040 }, { "epoch": 0.01, "learning_rate": 0.0002638888888888889, "loss": 6.0247, "theoretical_loss": 6.2631802131294085, "tokens_seen": 17432576 }, { "epoch": 0.01, "learning_rate": 0.00026488095238095237, "loss": 5.894, "theoretical_loss": 6.259132808127246, "tokens_seen": 17498112 }, { "epoch": 0.01, "learning_rate": 0.00026587301587301587, "loss": 5.8622, "theoretical_loss": 6.255104760053497, "tokens_seen": 17563648 }, { "epoch": 0.01, "learning_rate": 0.00026686507936507937, "loss": 5.9411, "theoretical_loss": 6.251095904621689, "tokens_seen": 17629184 }, { "epoch": 0.01, "learning_rate": 0.00026785714285714287, "loss": 5.7838, "theoretical_loss": 6.247106079543801, "tokens_seen": 17694720 }, { "epoch": 0.01, "learning_rate": 0.0002688492063492063, "loss": 5.8726, "theoretical_loss": 6.243135124498652, "tokens_seen": 17760256 }, { "epoch": 0.01, "learning_rate": 0.0002698412698412698, "loss": 5.9666, "theoretical_loss": 6.239182881100916, "tokens_seen": 17825792 }, { "epoch": 0.01, "learning_rate": 0.0002708333333333333, "loss": 5.632, "theoretical_loss": 6.235249192870732, "tokens_seen": 17891328 }, { "epoch": 0.01, "learning_rate": 0.0002718253968253968, "loss": 5.9117, "theoretical_loss": 6.231333905203899, "tokens_seen": 17956864 }, { "epoch": 0.01, "objective/train/docs_used": 62100, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.888058662414551, "objective/train/theoretical_loss": 6.227436865342643, "objective/train/tokens_used": 38482400, "theoretical_loss": 6.227436865342643, "tokens_seen": 18022400 }, { "epoch": 0.01, "learning_rate": 0.0002728174603174603, "loss": 5.9095, "theoretical_loss": 6.227436865342643, "tokens_seen": 18022400 }, { "epoch": 0.01, "learning_rate": 0.0002738095238095238, "loss": 5.9651, "theoretical_loss": 6.223557922346955, "tokens_seen": 18087936 }, { "epoch": 0.01, "learning_rate": 0.0002748015873015873, "loss": 5.9025, "theoretical_loss": 6.219696927066456, "tokens_seen": 18153472 }, { "epoch": 0.01, "learning_rate": 0.00027579365079365083, "loss": 5.8309, "theoretical_loss": 6.215853732112821, "tokens_seen": 18219008 }, { "epoch": 0.01, "learning_rate": 0.00027678571428571433, "loss": 5.8131, "theoretical_loss": 6.212028191832702, "tokens_seen": 18284544 }, { "epoch": 0.01, "learning_rate": 0.0002777777777777778, "loss": 5.7637, "theoretical_loss": 6.208220162281178, "tokens_seen": 18350080 }, { "epoch": 0.01, "learning_rate": 0.0002787698412698413, "loss": 5.6733, "theoretical_loss": 6.204429501195701, "tokens_seen": 18415616 }, { "epoch": 0.01, "learning_rate": 0.0002797619047619048, "loss": 5.7202, "theoretical_loss": 6.20065606797053, "tokens_seen": 18481152 }, { "epoch": 0.01, "learning_rate": 0.0002807539682539683, "loss": 5.6929, "theoretical_loss": 6.19689972363164, "tokens_seen": 18546688 }, { "epoch": 0.01, "learning_rate": 0.00028174603174603173, "loss": 5.9452, "theoretical_loss": 6.1931603308120975, "tokens_seen": 18612224 }, { "epoch": 0.01, "learning_rate": 0.00028273809523809523, "loss": 5.7951, "theoretical_loss": 6.189437753727901, "tokens_seen": 18677760 }, { "epoch": 0.01, "learning_rate": 0.00028373015873015873, "loss": 5.641, "theoretical_loss": 6.185731858154261, "tokens_seen": 18743296 }, { "epoch": 0.01, "learning_rate": 0.00028472222222222223, "loss": 5.5868, "theoretical_loss": 6.182042511402313, "tokens_seen": 18808832 }, { "epoch": 0.01, "learning_rate": 0.0002857142857142857, "loss": 5.695, "theoretical_loss": 6.17836958229627, "tokens_seen": 18874368 }, { "epoch": 0.01, "learning_rate": 0.0002867063492063492, "loss": 5.7386, "theoretical_loss": 6.1747129411509825, "tokens_seen": 18939904 }, { "epoch": 0.01, "learning_rate": 0.0002876984126984127, "loss": 5.7513, "theoretical_loss": 6.171072459749913, "tokens_seen": 19005440 }, { "epoch": 0.01, "learning_rate": 0.0002886904761904762, "loss": 5.8827, "theoretical_loss": 6.1674480113235095, "tokens_seen": 19070976 }, { "epoch": 0.01, "learning_rate": 0.0002896825396825397, "loss": 5.723, "theoretical_loss": 6.163839470527964, "tokens_seen": 19136512 }, { "epoch": 0.01, "learning_rate": 0.0002906746031746032, "loss": 5.8091, "theoretical_loss": 6.160246713424372, "tokens_seen": 19202048 }, { "epoch": 0.01, "learning_rate": 0.0002916666666666667, "loss": 5.6575, "theoretical_loss": 6.156669617458243, "tokens_seen": 19267584 }, { "epoch": 0.01, "learning_rate": 0.0002926587301587302, "loss": 5.6244, "theoretical_loss": 6.153108061439397, "tokens_seen": 19333120 }, { "epoch": 0.01, "learning_rate": 0.0002936507936507937, "loss": 5.6921, "theoretical_loss": 6.149561925522211, "tokens_seen": 19398656 }, { "epoch": 0.01, "learning_rate": 0.00029464285714285714, "loss": 5.6634, "theoretical_loss": 6.146031091186222, "tokens_seen": 19464192 }, { "epoch": 0.01, "learning_rate": 0.00029563492063492064, "loss": 5.8238, "theoretical_loss": 6.142515441217064, "tokens_seen": 19529728 }, { "epoch": 0.01, "learning_rate": 0.00029662698412698414, "loss": 5.7809, "theoretical_loss": 6.1390148596877605, "tokens_seen": 19595264 }, { "epoch": 0.01, "objective/train/docs_used": 64900, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.566831111907959, "objective/train/theoretical_loss": 6.135529231940326, "objective/train/tokens_used": 40120800, "theoretical_loss": 6.135529231940326, "tokens_seen": 19660800 }, { "epoch": 0.01, "learning_rate": 0.00029761904761904765, "loss": 5.6619, "theoretical_loss": 6.135529231940326, "tokens_seen": 19660800 }, { "epoch": 0.01, "learning_rate": 0.0002986111111111111, "loss": 5.7348, "theoretical_loss": 6.132058444567705, "tokens_seen": 19726336 }, { "epoch": 0.01, "learning_rate": 0.0002996031746031746, "loss": 5.7043, "theoretical_loss": 6.128602385396022, "tokens_seen": 19791872 }, { "epoch": 0.01, "learning_rate": 0.0003005952380952381, "loss": 5.7255, "theoretical_loss": 6.125160943467138, "tokens_seen": 19857408 }, { "epoch": 0.01, "learning_rate": 0.0003015873015873016, "loss": 5.5479, "theoretical_loss": 6.121734009021521, "tokens_seen": 19922944 }, { "epoch": 0.01, "learning_rate": 0.00030257936507936505, "loss": 5.5617, "theoretical_loss": 6.118321473481398, "tokens_seen": 19988480 }, { "epoch": 0.01, "learning_rate": 0.00030357142857142855, "loss": 5.6942, "theoretical_loss": 6.114923229434213, "tokens_seen": 20054016 }, { "epoch": 0.01, "learning_rate": 0.00030456349206349205, "loss": 5.6712, "theoretical_loss": 6.111539170616359, "tokens_seen": 20119552 }, { "epoch": 0.01, "learning_rate": 0.0003055555555555556, "loss": 5.7459, "theoretical_loss": 6.108169191897195, "tokens_seen": 20185088 }, { "epoch": 0.01, "learning_rate": 0.00030654761904761905, "loss": 5.8681, "theoretical_loss": 6.104813189263336, "tokens_seen": 20250624 }, { "epoch": 0.01, "learning_rate": 0.00030753968253968255, "loss": 5.7064, "theoretical_loss": 6.101471059803204, "tokens_seen": 20316160 }, { "epoch": 0.01, "learning_rate": 0.00030853174603174605, "loss": 5.6727, "theoretical_loss": 6.098142701691856, "tokens_seen": 20381696 }, { "epoch": 0.01, "learning_rate": 0.00030952380952380956, "loss": 5.5902, "theoretical_loss": 6.094828014176053, "tokens_seen": 20447232 }, { "epoch": 0.01, "learning_rate": 0.000310515873015873, "loss": 5.6721, "theoretical_loss": 6.091526897559593, "tokens_seen": 20512768 }, { "epoch": 0.01, "learning_rate": 0.0003115079365079365, "loss": 5.8291, "theoretical_loss": 6.088239253188885, "tokens_seen": 20578304 }, { "epoch": 0.01, "learning_rate": 0.0003125, "loss": 5.477, "theoretical_loss": 6.084964983438763, "tokens_seen": 20643840 }, { "epoch": 0.01, "learning_rate": 0.0003134920634920635, "loss": 5.7125, "theoretical_loss": 6.0817039916985465, "tokens_seen": 20709376 }, { "epoch": 0.01, "learning_rate": 0.000314484126984127, "loss": 5.4744, "theoretical_loss": 6.078456182358325, "tokens_seen": 20774912 }, { "epoch": 0.01, "learning_rate": 0.00031547619047619046, "loss": 5.6602, "theoretical_loss": 6.075221460795472, "tokens_seen": 20840448 }, { "epoch": 0.01, "learning_rate": 0.00031646825396825396, "loss": 5.7366, "theoretical_loss": 6.071999733361386, "tokens_seen": 20905984 }, { "epoch": 0.01, "learning_rate": 0.00031746031746031746, "loss": 5.5388, "theoretical_loss": 6.068790907368448, "tokens_seen": 20971520 }, { "epoch": 0.01, "learning_rate": 0.00031845238095238096, "loss": 5.5893, "theoretical_loss": 6.0655948910771915, "tokens_seen": 21037056 }, { "epoch": 0.01, "learning_rate": 0.0003194444444444444, "loss": 5.8566, "theoretical_loss": 6.062411593683687, "tokens_seen": 21102592 }, { "epoch": 0.01, "learning_rate": 0.00032043650793650796, "loss": 5.7343, "theoretical_loss": 6.059240925307134, "tokens_seen": 21168128 }, { "epoch": 0.01, "learning_rate": 0.00032142857142857147, "loss": 5.5727, "theoretical_loss": 6.056082796977648, "tokens_seen": 21233664 }, { "epoch": 0.01, "objective/train/docs_used": 67698, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.528525352478027, "objective/train/theoretical_loss": 6.052937120624258, "objective/train/tokens_used": 41759200, "theoretical_loss": 6.052937120624258, "tokens_seen": 21299200 }, { "epoch": 0.01, "learning_rate": 0.00032242063492063497, "loss": 5.5647, "theoretical_loss": 6.052937120624258, "tokens_seen": 21299200 }, { "epoch": 0.01, "learning_rate": 0.0003234126984126984, "loss": 5.6199, "theoretical_loss": 6.049803809063083, "tokens_seen": 21364736 }, { "epoch": 0.01, "learning_rate": 0.0003244047619047619, "loss": 5.7615, "theoretical_loss": 6.0466827759857145, "tokens_seen": 21430272 }, { "epoch": 0.01, "learning_rate": 0.0003253968253968254, "loss": 5.7515, "theoretical_loss": 6.04357393594778, "tokens_seen": 21495808 }, { "epoch": 0.01, "learning_rate": 0.0003263888888888889, "loss": 5.5601, "theoretical_loss": 6.040477204357686, "tokens_seen": 21561344 }, { "epoch": 0.01, "learning_rate": 0.00032738095238095237, "loss": 5.644, "theoretical_loss": 6.037392497465552, "tokens_seen": 21626880 }, { "epoch": 0.01, "learning_rate": 0.00032837301587301587, "loss": 5.7359, "theoretical_loss": 6.034319732352309, "tokens_seen": 21692416 }, { "epoch": 0.01, "learning_rate": 0.00032936507936507937, "loss": 5.781, "theoretical_loss": 6.031258826918979, "tokens_seen": 21757952 }, { "epoch": 0.01, "learning_rate": 0.00033035714285714287, "loss": 5.527, "theoretical_loss": 6.0282096998761245, "tokens_seen": 21823488 }, { "epoch": 0.01, "learning_rate": 0.0003313492063492063, "loss": 5.7481, "theoretical_loss": 6.025172270733464, "tokens_seen": 21889024 }, { "epoch": 0.01, "learning_rate": 0.0003323412698412698, "loss": 5.6225, "theoretical_loss": 6.0221464597896475, "tokens_seen": 21954560 }, { "epoch": 0.01, "learning_rate": 0.0003333333333333333, "loss": 5.7295, "theoretical_loss": 6.0191321881221995, "tokens_seen": 22020096 }, { "epoch": 0.01, "learning_rate": 0.0003343253968253968, "loss": 5.749, "theoretical_loss": 6.016129377577614, "tokens_seen": 22085632 }, { "epoch": 0.01, "learning_rate": 0.0003353174603174603, "loss": 5.5909, "theoretical_loss": 6.01313795076161, "tokens_seen": 22151168 }, { "epoch": 0.01, "learning_rate": 0.0003363095238095238, "loss": 5.5183, "theoretical_loss": 6.010157831029533, "tokens_seen": 22216704 }, { "epoch": 0.01, "learning_rate": 0.00033730158730158733, "loss": 5.6762, "theoretical_loss": 6.007188942476907, "tokens_seen": 22282240 }, { "epoch": 0.01, "learning_rate": 0.00033829365079365083, "loss": 5.7283, "theoretical_loss": 6.0042312099301425, "tokens_seen": 22347776 }, { "epoch": 0.01, "learning_rate": 0.00033928571428571433, "loss": 5.6462, "theoretical_loss": 6.001284558937368, "tokens_seen": 22413312 }, { "epoch": 0.01, "learning_rate": 0.0003402777777777778, "loss": 5.5794, "theoretical_loss": 5.998348915759426, "tokens_seen": 22478848 }, { "epoch": 0.01, "learning_rate": 0.0003412698412698413, "loss": 5.525, "theoretical_loss": 5.995424207360987, "tokens_seen": 22544384 }, { "epoch": 0.01, "learning_rate": 0.0003422619047619048, "loss": 5.5266, "theoretical_loss": 5.992510361401818, "tokens_seen": 22609920 }, { "epoch": 0.01, "learning_rate": 0.0003432539682539683, "loss": 5.4959, "theoretical_loss": 5.989607306228168, "tokens_seen": 22675456 }, { "epoch": 0.01, "learning_rate": 0.00034424603174603173, "loss": 5.5175, "theoretical_loss": 5.986714970864292, "tokens_seen": 22740992 }, { "epoch": 0.01, "learning_rate": 0.00034523809523809523, "loss": 5.6895, "theoretical_loss": 5.983833285004112, "tokens_seen": 22806528 }, { "epoch": 0.01, "learning_rate": 0.00034623015873015873, "loss": 5.6666, "theoretical_loss": 5.980962179002983, "tokens_seen": 22872064 }, { "epoch": 0.01, "objective/train/docs_used": 70317, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.590818405151367, "objective/train/theoretical_loss": 5.978101583869607, "objective/train/tokens_used": 43397600, "theoretical_loss": 5.978101583869607, "tokens_seen": 22937600 }, { "epoch": 0.01, "learning_rate": 0.00034722222222222224, "loss": 5.596, "theoretical_loss": 5.978101583869607, "tokens_seen": 22937600 }, { "epoch": 0.01, "learning_rate": 0.0003482142857142857, "loss": 5.5981, "theoretical_loss": 5.975251431258057, "tokens_seen": 23003136 }, { "epoch": 0.01, "learning_rate": 0.0003492063492063492, "loss": 5.578, "theoretical_loss": 5.972411653459913, "tokens_seen": 23068672 }, { "epoch": 0.01, "learning_rate": 0.0003501984126984127, "loss": 5.7015, "theoretical_loss": 5.9695821833965335, "tokens_seen": 23134208 }, { "epoch": 0.01, "learning_rate": 0.0003511904761904762, "loss": 5.451, "theoretical_loss": 5.966762954611432, "tokens_seen": 23199744 }, { "epoch": 0.01, "learning_rate": 0.0003521825396825397, "loss": 5.5237, "theoretical_loss": 5.963953901262764, "tokens_seen": 23265280 }, { "epoch": 0.01, "learning_rate": 0.0003531746031746032, "loss": 5.6503, "theoretical_loss": 5.961154958115937, "tokens_seen": 23330816 }, { "epoch": 0.01, "learning_rate": 0.0003541666666666667, "loss": 5.7299, "theoretical_loss": 5.958366060536315, "tokens_seen": 23396352 }, { "epoch": 0.01, "learning_rate": 0.0003551587301587302, "loss": 5.6056, "theoretical_loss": 5.955587144482044, "tokens_seen": 23461888 }, { "epoch": 0.01, "learning_rate": 0.0003561507936507937, "loss": 5.4389, "theoretical_loss": 5.952818146496978, "tokens_seen": 23527424 }, { "epoch": 0.01, "learning_rate": 0.00035714285714285714, "loss": 5.3916, "theoretical_loss": 5.950059003703704, "tokens_seen": 23592960 }, { "epoch": 0.01, "learning_rate": 0.00035813492063492064, "loss": 5.4824, "theoretical_loss": 5.94730965379668, "tokens_seen": 23658496 }, { "epoch": 0.01, "learning_rate": 0.00035912698412698415, "loss": 5.5433, "theoretical_loss": 5.944570035035458, "tokens_seen": 23724032 }, { "epoch": 0.01, "learning_rate": 0.00036011904761904765, "loss": 5.6627, "theoretical_loss": 5.941840086238027, "tokens_seen": 23789568 }, { "epoch": 0.01, "learning_rate": 0.0003611111111111111, "loss": 5.6087, "theoretical_loss": 5.939119746774228, "tokens_seen": 23855104 }, { "epoch": 0.01, "learning_rate": 0.0003621031746031746, "loss": 5.6528, "theoretical_loss": 5.936408956559284, "tokens_seen": 23920640 }, { "epoch": 0.01, "learning_rate": 0.0003630952380952381, "loss": 5.4345, "theoretical_loss": 5.933707656047414, "tokens_seen": 23986176 }, { "epoch": 0.01, "learning_rate": 0.0003640873015873016, "loss": 5.5503, "theoretical_loss": 5.93101578622554, "tokens_seen": 24051712 }, { "epoch": 0.01, "learning_rate": 0.00036507936507936505, "loss": 5.516, "theoretical_loss": 5.928333288607086, "tokens_seen": 24117248 }, { "epoch": 0.01, "learning_rate": 0.00036607142857142855, "loss": 5.644, "theoretical_loss": 5.925660105225867, "tokens_seen": 24182784 }, { "epoch": 0.01, "learning_rate": 0.00036706349206349205, "loss": 5.704, "theoretical_loss": 5.92299617863006, "tokens_seen": 24248320 }, { "epoch": 0.01, "learning_rate": 0.0003680555555555556, "loss": 5.4612, "theoretical_loss": 5.920341451876267, "tokens_seen": 24313856 }, { "epoch": 0.01, "learning_rate": 0.00036904761904761905, "loss": 5.5863, "theoretical_loss": 5.9176958685236585, "tokens_seen": 24379392 }, { "epoch": 0.01, "learning_rate": 0.00037003968253968255, "loss": 5.3849, "theoretical_loss": 5.9150593726282015, "tokens_seen": 24444928 }, { "epoch": 0.01, "learning_rate": 0.00037103174603174606, "loss": 5.5262, "theoretical_loss": 5.912431908736972, "tokens_seen": 24510464 }, { "epoch": 0.01, "objective/train/docs_used": 71675, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.531198024749756, "objective/train/theoretical_loss": 5.909813421882534, "objective/train/tokens_used": 45036000, "theoretical_loss": 5.909813421882534, "tokens_seen": 24576000 }, { "epoch": 0.01, "learning_rate": 0.00037202380952380956, "loss": 5.549, "theoretical_loss": 5.909813421882534, "tokens_seen": 24576000 }, { "epoch": 0.01, "learning_rate": 0.000373015873015873, "loss": 5.4436, "theoretical_loss": 5.907203857577422, "tokens_seen": 24641536 }, { "epoch": 0.01, "learning_rate": 0.0003740079365079365, "loss": 5.5545, "theoretical_loss": 5.9046031618086765, "tokens_seen": 24707072 }, { "epoch": 0.01, "learning_rate": 0.000375, "loss": 5.217, "theoretical_loss": 5.902011281032472, "tokens_seen": 24772608 }, { "epoch": 0.01, "learning_rate": 0.0003759920634920635, "loss": 5.5304, "theoretical_loss": 5.899428162168808, "tokens_seen": 24838144 }, { "epoch": 0.01, "learning_rate": 0.000376984126984127, "loss": 5.4246, "theoretical_loss": 5.896853752596286, "tokens_seen": 24903680 }, { "epoch": 0.01, "learning_rate": 0.00037797619047619046, "loss": 5.5621, "theoretical_loss": 5.894288000146949, "tokens_seen": 24969216 }, { "epoch": 0.01, "learning_rate": 0.00037896825396825396, "loss": 5.2542, "theoretical_loss": 5.891730853101199, "tokens_seen": 25034752 }, { "epoch": 0.01, "learning_rate": 0.00037996031746031746, "loss": 5.4454, "theoretical_loss": 5.88918226018278, "tokens_seen": 25100288 }, { "epoch": 0.01, "learning_rate": 0.00038095238095238096, "loss": 5.4325, "theoretical_loss": 5.8866421705538325, "tokens_seen": 25165824 }, { "epoch": 0.01, "learning_rate": 0.0003819444444444444, "loss": 5.1965, "theoretical_loss": 5.8841105338100155, "tokens_seen": 25231360 }, { "epoch": 0.01, "learning_rate": 0.00038293650793650797, "loss": 5.4776, "theoretical_loss": 5.881587299975694, "tokens_seen": 25296896 }, { "epoch": 0.01, "learning_rate": 0.00038392857142857147, "loss": 5.3966, "theoretical_loss": 5.8790724194991935, "tokens_seen": 25362432 }, { "epoch": 0.01, "learning_rate": 0.00038492063492063497, "loss": 5.4151, "theoretical_loss": 5.876565843248124, "tokens_seen": 25427968 }, { "epoch": 0.01, "learning_rate": 0.0003859126984126984, "loss": 5.1842, "theoretical_loss": 5.8740675225047525, "tokens_seen": 25493504 }, { "epoch": 0.01, "learning_rate": 0.0003869047619047619, "loss": 5.2931, "theoretical_loss": 5.871577408961457, "tokens_seen": 25559040 }, { "epoch": 0.01, "learning_rate": 0.0003878968253968254, "loss": 5.5767, "theoretical_loss": 5.869095454716231, "tokens_seen": 25624576 }, { "epoch": 0.01, "learning_rate": 0.0003888888888888889, "loss": 5.4841, "theoretical_loss": 5.866621612268246, "tokens_seen": 25690112 }, { "epoch": 0.01, "learning_rate": 0.00038988095238095237, "loss": 5.3775, "theoretical_loss": 5.864155834513486, "tokens_seen": 25755648 }, { "epoch": 0.01, "learning_rate": 0.00039087301587301587, "loss": 5.4684, "theoretical_loss": 5.8616980747404295, "tokens_seen": 25821184 }, { "epoch": 0.01, "learning_rate": 0.00039186507936507937, "loss": 5.4561, "theoretical_loss": 5.859248286625787, "tokens_seen": 25886720 }, { "epoch": 0.01, "learning_rate": 0.0003928571428571429, "loss": 5.3155, "theoretical_loss": 5.856806424230314, "tokens_seen": 25952256 }, { "epoch": 0.01, "learning_rate": 0.0003938492063492063, "loss": 5.632, "theoretical_loss": 5.854372441994654, "tokens_seen": 26017792 }, { "epoch": 0.01, "learning_rate": 0.0003948412698412698, "loss": 5.5455, "theoretical_loss": 5.851946294735258, "tokens_seen": 26083328 }, { "epoch": 0.01, "learning_rate": 0.0003958333333333333, "loss": 5.425, "theoretical_loss": 5.849527937640345, "tokens_seen": 26148864 }, { "epoch": 0.01, "objective/train/docs_used": 74395, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.870551586151123, "objective/train/theoretical_loss": 5.8471173262659235, "objective/train/tokens_used": 46674400, "theoretical_loss": 5.8471173262659235, "tokens_seen": 26214400 }, { "epoch": 0.01, "learning_rate": 0.0003968253968253968, "loss": 5.1402, "theoretical_loss": 5.8471173262659235, "tokens_seen": 26214400 }, { "epoch": 0.01, "learning_rate": 0.0003978174603174603, "loss": 5.3716, "theoretical_loss": 5.84471441653186, "tokens_seen": 26279936 }, { "epoch": 0.01, "learning_rate": 0.00039880952380952383, "loss": 5.5112, "theoretical_loss": 5.842319164718004, "tokens_seen": 26345472 }, { "epoch": 0.01, "learning_rate": 0.00039980158730158733, "loss": 5.1401, "theoretical_loss": 5.83993152746036, "tokens_seen": 26411008 }, { "epoch": 0.01, "learning_rate": 0.00040079365079365083, "loss": 5.6602, "theoretical_loss": 5.83755146174731, "tokens_seen": 26476544 }, { "epoch": 0.01, "learning_rate": 0.00040178571428571433, "loss": 5.3841, "theoretical_loss": 5.835178924915889, "tokens_seen": 26542080 }, { "epoch": 0.01, "learning_rate": 0.0004027777777777778, "loss": 5.3389, "theoretical_loss": 5.832813874648102, "tokens_seen": 26607616 }, { "epoch": 0.01, "learning_rate": 0.0004037698412698413, "loss": 5.521, "theoretical_loss": 5.8304562689673, "tokens_seen": 26673152 }, { "epoch": 0.01, "learning_rate": 0.0004047619047619048, "loss": 5.3763, "theoretical_loss": 5.828106066234588, "tokens_seen": 26738688 }, { "epoch": 0.01, "learning_rate": 0.0004057539682539683, "loss": 5.5239, "theoretical_loss": 5.825763225145295, "tokens_seen": 26804224 }, { "epoch": 0.01, "learning_rate": 0.00040674603174603173, "loss": 5.2245, "theoretical_loss": 5.823427704725473, "tokens_seen": 26869760 }, { "epoch": 0.01, "learning_rate": 0.00040773809523809523, "loss": 5.4116, "theoretical_loss": 5.82109946432846, "tokens_seen": 26935296 }, { "epoch": 0.01, "learning_rate": 0.00040873015873015874, "loss": 5.5206, "theoretical_loss": 5.818778463631473, "tokens_seen": 27000832 }, { "epoch": 0.01, "learning_rate": 0.00040972222222222224, "loss": 5.3176, "theoretical_loss": 5.816464662632243, "tokens_seen": 27066368 }, { "epoch": 0.01, "learning_rate": 0.0004107142857142857, "loss": 5.5284, "theoretical_loss": 5.8141580216457065, "tokens_seen": 27131904 }, { "epoch": 0.01, "learning_rate": 0.0004117063492063492, "loss": 5.5378, "theoretical_loss": 5.811858501300729, "tokens_seen": 27197440 }, { "epoch": 0.01, "learning_rate": 0.0004126984126984127, "loss": 5.3653, "theoretical_loss": 5.809566062536868, "tokens_seen": 27262976 }, { "epoch": 0.01, "learning_rate": 0.0004136904761904762, "loss": 5.385, "theoretical_loss": 5.807280666601191, "tokens_seen": 27328512 }, { "epoch": 0.01, "learning_rate": 0.0004146825396825397, "loss": 5.4605, "theoretical_loss": 5.805002275045111, "tokens_seen": 27394048 }, { "epoch": 0.01, "learning_rate": 0.0004156746031746032, "loss": 5.296, "theoretical_loss": 5.8027308497212875, "tokens_seen": 27459584 }, { "epoch": 0.01, "learning_rate": 0.0004166666666666667, "loss": 5.3494, "theoretical_loss": 5.800466352780546, "tokens_seen": 27525120 }, { "epoch": 0.01, "learning_rate": 0.0004176587301587302, "loss": 5.2574, "theoretical_loss": 5.798208746668847, "tokens_seen": 27590656 }, { "epoch": 0.01, "learning_rate": 0.0004186507936507937, "loss": 5.0877, "theoretical_loss": 5.795957994124291, "tokens_seen": 27656192 }, { "epoch": 0.01, "learning_rate": 0.00041964285714285714, "loss": 5.4098, "theoretical_loss": 5.7937140581741575, "tokens_seen": 27721728 }, { "epoch": 0.01, "learning_rate": 0.00042063492063492065, "loss": 5.2452, "theoretical_loss": 5.791476902131985, "tokens_seen": 27787264 }, { "epoch": 0.01, "objective/train/docs_used": 77021, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.236026287078857, "objective/train/theoretical_loss": 5.789246489594688, "objective/train/tokens_used": 48312800, "theoretical_loss": 5.789246489594688, "tokens_seen": 27852800 }, { "epoch": 0.01, "learning_rate": 0.00042162698412698415, "loss": 5.2499, "theoretical_loss": 5.789246489594688, "tokens_seen": 27852800 }, { "epoch": 0.01, "learning_rate": 0.00042261904761904765, "loss": 5.5142, "theoretical_loss": 5.787022784439701, "tokens_seen": 27918336 }, { "epoch": 0.01, "learning_rate": 0.0004236111111111111, "loss": 5.282, "theoretical_loss": 5.784805750822171, "tokens_seen": 27983872 }, { "epoch": 0.01, "learning_rate": 0.0004246031746031746, "loss": 5.3784, "theoretical_loss": 5.782595353172176, "tokens_seen": 28049408 }, { "epoch": 0.01, "learning_rate": 0.0004255952380952381, "loss": 5.031, "theoretical_loss": 5.780391556191977, "tokens_seen": 28114944 }, { "epoch": 0.01, "learning_rate": 0.0004265873015873016, "loss": 5.3311, "theoretical_loss": 5.778194324853311, "tokens_seen": 28180480 }, { "epoch": 0.01, "learning_rate": 0.00042757936507936505, "loss": 5.3294, "theoretical_loss": 5.776003624394711, "tokens_seen": 28246016 }, { "epoch": 0.01, "learning_rate": 0.00042857142857142855, "loss": 5.3188, "theoretical_loss": 5.773819420318858, "tokens_seen": 28311552 }, { "epoch": 0.01, "learning_rate": 0.00042956349206349205, "loss": 5.331, "theoretical_loss": 5.771641678389971, "tokens_seen": 28377088 }, { "epoch": 0.01, "learning_rate": 0.0004305555555555556, "loss": 5.3138, "theoretical_loss": 5.769470364631225, "tokens_seen": 28442624 }, { "epoch": 0.01, "learning_rate": 0.00043154761904761905, "loss": 5.4144, "theoretical_loss": 5.767305445322201, "tokens_seen": 28508160 }, { "epoch": 0.01, "learning_rate": 0.00043253968253968256, "loss": 5.3346, "theoretical_loss": 5.765146886996363, "tokens_seen": 28573696 }, { "epoch": 0.01, "learning_rate": 0.00043353174603174606, "loss": 5.3439, "theoretical_loss": 5.762994656438579, "tokens_seen": 28639232 }, { "epoch": 0.01, "learning_rate": 0.00043452380952380956, "loss": 5.3321, "theoretical_loss": 5.760848720682651, "tokens_seen": 28704768 }, { "epoch": 0.01, "learning_rate": 0.000435515873015873, "loss": 5.1672, "theoretical_loss": 5.758709047008894, "tokens_seen": 28770304 }, { "epoch": 0.01, "learning_rate": 0.0004365079365079365, "loss": 5.2031, "theoretical_loss": 5.756575602941732, "tokens_seen": 28835840 }, { "epoch": 0.01, "learning_rate": 0.0004375, "loss": 5.1737, "theoretical_loss": 5.75444835624733, "tokens_seen": 28901376 }, { "epoch": 0.01, "learning_rate": 0.0004384920634920635, "loss": 5.3551, "theoretical_loss": 5.752327274931249, "tokens_seen": 28966912 }, { "epoch": 0.01, "learning_rate": 0.000439484126984127, "loss": 5.4541, "theoretical_loss": 5.750212327236129, "tokens_seen": 29032448 }, { "epoch": 0.01, "learning_rate": 0.00044047619047619046, "loss": 5.392, "theoretical_loss": 5.7481034816394105, "tokens_seen": 29097984 }, { "epoch": 0.01, "learning_rate": 0.00044146825396825396, "loss": 5.3021, "theoretical_loss": 5.7460007068510635, "tokens_seen": 29163520 }, { "epoch": 0.01, "learning_rate": 0.00044246031746031746, "loss": 5.289, "theoretical_loss": 5.74390397181136, "tokens_seen": 29229056 }, { "epoch": 0.01, "learning_rate": 0.00044345238095238096, "loss": 5.2307, "theoretical_loss": 5.741813245688668, "tokens_seen": 29294592 }, { "epoch": 0.01, "learning_rate": 0.0004444444444444444, "loss": 5.3211, "theoretical_loss": 5.739728497877267, "tokens_seen": 29360128 }, { "epoch": 0.01, "learning_rate": 0.00044543650793650797, "loss": 5.5305, "theoretical_loss": 5.737649697995197, "tokens_seen": 29425664 }, { "epoch": 0.01, "objective/train/docs_used": 79798, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.395312786102295, "objective/train/theoretical_loss": 5.7355768158821245, "objective/train/tokens_used": 49951200, "theoretical_loss": 5.7355768158821245, "tokens_seen": 29491200 }, { "epoch": 0.01, "learning_rate": 0.00044642857142857147, "loss": 5.462, "theoretical_loss": 5.7355768158821245, "tokens_seen": 29491200 }, { "epoch": 0.01, "learning_rate": 0.00044742063492063497, "loss": 5.3297, "theoretical_loss": 5.73350982159724, "tokens_seen": 29556736 }, { "epoch": 0.01, "learning_rate": 0.0004484126984126984, "loss": 5.4285, "theoretical_loss": 5.731448685417178, "tokens_seen": 29622272 }, { "epoch": 0.01, "learning_rate": 0.0004494047619047619, "loss": 5.4189, "theoretical_loss": 5.729393377833956, "tokens_seen": 29687808 }, { "epoch": 0.01, "learning_rate": 0.0004503968253968254, "loss": 5.2553, "theoretical_loss": 5.7273438695529535, "tokens_seen": 29753344 }, { "epoch": 0.01, "learning_rate": 0.0004513888888888889, "loss": 5.3209, "theoretical_loss": 5.725300131490888, "tokens_seen": 29818880 }, { "epoch": 0.01, "learning_rate": 0.00045238095238095237, "loss": 5.2443, "theoretical_loss": 5.7232621347738455, "tokens_seen": 29884416 }, { "epoch": 0.01, "learning_rate": 0.00045337301587301587, "loss": 5.2922, "theoretical_loss": 5.721229850735305, "tokens_seen": 29949952 }, { "epoch": 0.01, "learning_rate": 0.00045436507936507937, "loss": 5.4053, "theoretical_loss": 5.719203250914208, "tokens_seen": 30015488 }, { "epoch": 0.01, "learning_rate": 0.0004553571428571429, "loss": 5.5051, "theoretical_loss": 5.717182307053037, "tokens_seen": 30081024 }, { "epoch": 0.01, "learning_rate": 0.0004563492063492063, "loss": 5.3054, "theoretical_loss": 5.715166991095922, "tokens_seen": 30146560 }, { "epoch": 0.01, "learning_rate": 0.0004573412698412698, "loss": 5.3957, "theoretical_loss": 5.713157275186761, "tokens_seen": 30212096 }, { "epoch": 0.01, "learning_rate": 0.0004583333333333333, "loss": 5.4036, "theoretical_loss": 5.71115313166738, "tokens_seen": 30277632 }, { "epoch": 0.01, "learning_rate": 0.0004593253968253968, "loss": 5.1919, "theoretical_loss": 5.709154533075688, "tokens_seen": 30343168 }, { "epoch": 0.01, "learning_rate": 0.00046031746031746033, "loss": 5.3468, "theoretical_loss": 5.707161452143879, "tokens_seen": 30408704 }, { "epoch": 0.01, "learning_rate": 0.00046130952380952383, "loss": 5.4207, "theoretical_loss": 5.7051738617966326, "tokens_seen": 30474240 }, { "epoch": 0.01, "learning_rate": 0.00046230158730158733, "loss": 5.2871, "theoretical_loss": 5.7031917351493515, "tokens_seen": 30539776 }, { "epoch": 0.01, "learning_rate": 0.00046329365079365083, "loss": 5.1817, "theoretical_loss": 5.701215045506411, "tokens_seen": 30605312 }, { "epoch": 0.01, "learning_rate": 0.00046428571428571433, "loss": 5.0701, "theoretical_loss": 5.699243766359421, "tokens_seen": 30670848 }, { "epoch": 0.01, "learning_rate": 0.0004652777777777778, "loss": 5.0038, "theoretical_loss": 5.697277871385534, "tokens_seen": 30736384 }, { "epoch": 0.01, "learning_rate": 0.0004662698412698413, "loss": 5.2376, "theoretical_loss": 5.695317334445736, "tokens_seen": 30801920 }, { "epoch": 0.01, "learning_rate": 0.0004672619047619048, "loss": 5.2305, "theoretical_loss": 5.693362129583184, "tokens_seen": 30867456 }, { "epoch": 0.01, "learning_rate": 0.0004682539682539683, "loss": 5.2078, "theoretical_loss": 5.691412231021549, "tokens_seen": 30932992 }, { "epoch": 0.01, "learning_rate": 0.00046924603174603173, "loss": 5.2874, "theoretical_loss": 5.689467613163388, "tokens_seen": 30998528 }, { "epoch": 0.01, "learning_rate": 0.00047023809523809523, "loss": 5.2812, "theoretical_loss": 5.687528250588518, "tokens_seen": 31064064 }, { "epoch": 0.01, "objective/train/docs_used": 82602, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.465230941772461, "objective/train/theoretical_loss": 5.6855941180524265, "objective/train/tokens_used": 51589600, "theoretical_loss": 5.6855941180524265, "tokens_seen": 31129600 }, { "epoch": 0.01, "learning_rate": 0.00047123015873015874, "loss": 5.3918, "theoretical_loss": 5.6855941180524265, "tokens_seen": 31129600 }, { "epoch": 0.01, "learning_rate": 0.00047222222222222224, "loss": 4.9912, "theoretical_loss": 5.683665190484683, "tokens_seen": 31195136 }, { "epoch": 0.01, "learning_rate": 0.0004732142857142857, "loss": 5.0617, "theoretical_loss": 5.681741442987381, "tokens_seen": 31260672 }, { "epoch": 0.01, "learning_rate": 0.0004742063492063492, "loss": 5.2282, "theoretical_loss": 5.679822850833591, "tokens_seen": 31326208 }, { "epoch": 0.01, "learning_rate": 0.0004751984126984127, "loss": 5.2731, "theoretical_loss": 5.677909389465831, "tokens_seen": 31391744 }, { "epoch": 0.01, "learning_rate": 0.0004761904761904762, "loss": 5.0976, "theoretical_loss": 5.676001034494554, "tokens_seen": 31457280 }, { "epoch": 0.01, "learning_rate": 0.0004771825396825397, "loss": 4.7839, "theoretical_loss": 5.674097761696653, "tokens_seen": 31522816 }, { "epoch": 0.01, "learning_rate": 0.0004781746031746032, "loss": 5.3917, "theoretical_loss": 5.672199547013983, "tokens_seen": 31588352 }, { "epoch": 0.01, "learning_rate": 0.0004791666666666667, "loss": 5.1195, "theoretical_loss": 5.670306366551898, "tokens_seen": 31653888 }, { "epoch": 0.01, "learning_rate": 0.0004801587301587302, "loss": 5.0443, "theoretical_loss": 5.6684181965778, "tokens_seen": 31719424 }, { "epoch": 0.01, "learning_rate": 0.0004811507936507937, "loss": 5.2041, "theoretical_loss": 5.666535013519715, "tokens_seen": 31784960 }, { "epoch": 0.01, "learning_rate": 0.00048214285714285715, "loss": 5.3055, "theoretical_loss": 5.6646567939648715, "tokens_seen": 31850496 }, { "epoch": 0.01, "learning_rate": 0.00048313492063492065, "loss": 5.1063, "theoretical_loss": 5.6627835146583045, "tokens_seen": 31916032 }, { "epoch": 0.01, "learning_rate": 0.00048412698412698415, "loss": 5.1879, "theoretical_loss": 5.660915152501465, "tokens_seen": 31981568 }, { "epoch": 0.01, "learning_rate": 0.00048511904761904765, "loss": 5.147, "theoretical_loss": 5.659051684550857, "tokens_seen": 32047104 }, { "epoch": 0.01, "learning_rate": 0.0004861111111111111, "loss": 5.2982, "theoretical_loss": 5.657193088016677, "tokens_seen": 32112640 }, { "epoch": 0.01, "learning_rate": 0.0004871031746031746, "loss": 5.0787, "theoretical_loss": 5.655339340261474, "tokens_seen": 32178176 }, { "epoch": 0.01, "learning_rate": 0.0004880952380952381, "loss": 5.0581, "theoretical_loss": 5.653490418798825, "tokens_seen": 32243712 }, { "epoch": 0.01, "learning_rate": 0.0004890873015873016, "loss": 5.0563, "theoretical_loss": 5.651646301292022, "tokens_seen": 32309248 }, { "epoch": 0.01, "learning_rate": 0.0004900793650793651, "loss": 5.031, "theoretical_loss": 5.649806965552774, "tokens_seen": 32374784 }, { "epoch": 0.01, "learning_rate": 0.0004910714285714286, "loss": 5.1185, "theoretical_loss": 5.6479723895399205, "tokens_seen": 32440320 }, { "epoch": 0.01, "learning_rate": 0.000492063492063492, "loss": 5.0566, "theoretical_loss": 5.6461425513581665, "tokens_seen": 32505856 }, { "epoch": 0.01, "learning_rate": 0.0004930555555555556, "loss": 5.2047, "theoretical_loss": 5.6443174292568195, "tokens_seen": 32571392 }, { "epoch": 0.01, "learning_rate": 0.0004940476190476191, "loss": 5.1356, "theoretical_loss": 5.6424970016285485, "tokens_seen": 32636928 }, { "epoch": 0.01, "learning_rate": 0.0004950396825396826, "loss": 5.1215, "theoretical_loss": 5.640681247008156, "tokens_seen": 32702464 }, { "epoch": 0.01, "objective/train/docs_used": 84200, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.386771202087402, "objective/train/theoretical_loss": 5.638870144071353, "objective/train/tokens_used": 53228000, "theoretical_loss": 5.638870144071353, "tokens_seen": 32768000 }, { "epoch": 0.01, "learning_rate": 0.000496031746031746, "loss": 5.1595, "theoretical_loss": 5.638870144071353, "tokens_seen": 32768000 }, { "epoch": 0.01, "learning_rate": 0.0004970238095238095, "loss": 5.1837, "theoretical_loss": 5.637063671633564, "tokens_seen": 32833536 }, { "epoch": 0.01, "learning_rate": 0.000498015873015873, "loss": 4.7304, "theoretical_loss": 5.635261808648728, "tokens_seen": 32899072 }, { "epoch": 0.01, "learning_rate": 0.0004990079365079365, "loss": 5.0359, "theoretical_loss": 5.6334645342081195, "tokens_seen": 32964608 }, { "epoch": 0.01, "learning_rate": 0.0005, "loss": 5.168, "theoretical_loss": 5.631671827539186, "tokens_seen": 33030144 }, { "epoch": 0.01, "learning_rate": 0.0004999899699097292, "loss": 5.2413, "theoretical_loss": 5.629883668004389, "tokens_seen": 33095680 }, { "epoch": 0.01, "learning_rate": 0.0004999799398194584, "loss": 5.0038, "theoretical_loss": 5.628100035100061, "tokens_seen": 33161216 }, { "epoch": 0.01, "learning_rate": 0.0004999699097291876, "loss": 5.1702, "theoretical_loss": 5.626320908455279, "tokens_seen": 33226752 }, { "epoch": 0.01, "learning_rate": 0.0004999598796389167, "loss": 4.9585, "theoretical_loss": 5.6245462678307385, "tokens_seen": 33292288 }, { "epoch": 0.01, "learning_rate": 0.000499949849548646, "loss": 5.1635, "theoretical_loss": 5.622776093117652, "tokens_seen": 33357824 }, { "epoch": 0.01, "learning_rate": 0.0004999398194583751, "loss": 5.0631, "theoretical_loss": 5.621010364336651, "tokens_seen": 33423360 }, { "epoch": 0.01, "learning_rate": 0.0004999297893681044, "loss": 5.2771, "theoretical_loss": 5.619249061636698, "tokens_seen": 33488896 }, { "epoch": 0.01, "learning_rate": 0.0004999197592778335, "loss": 5.0846, "theoretical_loss": 5.61749216529402, "tokens_seen": 33554432 }, { "epoch": 0.01, "learning_rate": 0.0004999097291875627, "loss": 5.205, "theoretical_loss": 5.615739655711037, "tokens_seen": 33619968 }, { "epoch": 0.01, "learning_rate": 0.0004998996990972919, "loss": 5.1364, "theoretical_loss": 5.61399151341532, "tokens_seen": 33685504 }, { "epoch": 0.01, "learning_rate": 0.0004998896690070211, "loss": 5.1684, "theoretical_loss": 5.6122477190585425, "tokens_seen": 33751040 }, { "epoch": 0.01, "learning_rate": 0.0004998796389167503, "loss": 5.072, "theoretical_loss": 5.610508253415453, "tokens_seen": 33816576 }, { "epoch": 0.01, "learning_rate": 0.0004998696088264795, "loss": 5.2499, "theoretical_loss": 5.6087730973828585, "tokens_seen": 33882112 }, { "epoch": 0.01, "learning_rate": 0.0004998595787362087, "loss": 5.186, "theoretical_loss": 5.6070422319786095, "tokens_seen": 33947648 }, { "epoch": 0.01, "learning_rate": 0.0004998495486459378, "loss": 5.2648, "theoretical_loss": 5.605315638340606, "tokens_seen": 34013184 }, { "epoch": 0.01, "learning_rate": 0.000499839518555667, "loss": 5.0135, "theoretical_loss": 5.603593297725807, "tokens_seen": 34078720 }, { "epoch": 0.01, "learning_rate": 0.0004998294884653962, "loss": 5.1018, "theoretical_loss": 5.601875191509249, "tokens_seen": 34144256 }, { "epoch": 0.01, "learning_rate": 0.0004998194583751254, "loss": 5.1234, "theoretical_loss": 5.600161301183084, "tokens_seen": 34209792 }, { "epoch": 0.01, "learning_rate": 0.0004998094282848546, "loss": 5.0933, "theoretical_loss": 5.598451608355614, "tokens_seen": 34275328 }, { "epoch": 0.01, "learning_rate": 0.0004997993981945837, "loss": 5.0015, "theoretical_loss": 5.596746094750342, "tokens_seen": 34340864 }, { "epoch": 0.01, "objective/train/docs_used": 86919, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.115919589996338, "objective/train/theoretical_loss": 5.595044742205037, "objective/train/tokens_used": 54866400, "theoretical_loss": 5.595044742205037, "tokens_seen": 34406400 }, { "epoch": 0.01, "learning_rate": 0.0004997893681043129, "loss": 5.0966, "theoretical_loss": 5.595044742205037, "tokens_seen": 34406400 }, { "epoch": 0.01, "learning_rate": 0.0004997793380140421, "loss": 5.1401, "theoretical_loss": 5.5933475326707995, "tokens_seen": 34471936 }, { "epoch": 0.01, "learning_rate": 0.0004997693079237714, "loss": 5.21, "theoretical_loss": 5.591654448211143, "tokens_seen": 34537472 }, { "epoch": 0.01, "learning_rate": 0.0004997592778335005, "loss": 5.0291, "theoretical_loss": 5.589965471001077, "tokens_seen": 34603008 }, { "epoch": 0.01, "learning_rate": 0.0004997492477432298, "loss": 4.9979, "theoretical_loss": 5.5882805833262115, "tokens_seen": 34668544 }, { "epoch": 0.01, "learning_rate": 0.0004997392176529588, "loss": 5.1176, "theoretical_loss": 5.586599767581859, "tokens_seen": 34734080 }, { "epoch": 0.01, "learning_rate": 0.0004997291875626881, "loss": 5.0064, "theoretical_loss": 5.584923006272151, "tokens_seen": 34799616 }, { "epoch": 0.01, "learning_rate": 0.0004997191574724173, "loss": 4.9069, "theoretical_loss": 5.583250282009159, "tokens_seen": 34865152 }, { "epoch": 0.01, "learning_rate": 0.0004997091273821465, "loss": 4.96, "theoretical_loss": 5.581581577512031, "tokens_seen": 34930688 }, { "epoch": 0.01, "learning_rate": 0.0004996990972918757, "loss": 5.0907, "theoretical_loss": 5.579916875606134, "tokens_seen": 34996224 }, { "epoch": 0.01, "learning_rate": 0.0004996890672016048, "loss": 4.8884, "theoretical_loss": 5.578256159222196, "tokens_seen": 35061760 }, { "epoch": 0.01, "learning_rate": 0.000499679037111334, "loss": 5.0087, "theoretical_loss": 5.576599411395472, "tokens_seen": 35127296 }, { "epoch": 0.01, "learning_rate": 0.0004996690070210632, "loss": 4.9011, "theoretical_loss": 5.574946615264906, "tokens_seen": 35192832 }, { "epoch": 0.01, "learning_rate": 0.0004996589769307924, "loss": 4.7994, "theoretical_loss": 5.5732977540723105, "tokens_seen": 35258368 }, { "epoch": 0.01, "learning_rate": 0.0004996489468405216, "loss": 5.0924, "theoretical_loss": 5.571652811161542, "tokens_seen": 35323904 }, { "epoch": 0.01, "learning_rate": 0.0004996389167502507, "loss": 5.0754, "theoretical_loss": 5.570011769977693, "tokens_seen": 35389440 }, { "epoch": 0.01, "learning_rate": 0.0004996288866599799, "loss": 5.1565, "theoretical_loss": 5.568374614066299, "tokens_seen": 35454976 }, { "epoch": 0.01, "learning_rate": 0.0004996188565697091, "loss": 4.9573, "theoretical_loss": 5.566741327072535, "tokens_seen": 35520512 }, { "epoch": 0.01, "learning_rate": 0.0004996088264794383, "loss": 4.9423, "theoretical_loss": 5.565111892740433, "tokens_seen": 35586048 }, { "epoch": 0.01, "learning_rate": 0.0004995987963891675, "loss": 4.922, "theoretical_loss": 5.563486294912105, "tokens_seen": 35651584 }, { "epoch": 0.01, "learning_rate": 0.0004995887662988968, "loss": 5.0515, "theoretical_loss": 5.56186451752697, "tokens_seen": 35717120 }, { "epoch": 0.01, "learning_rate": 0.0004995787362086258, "loss": 4.9958, "theoretical_loss": 5.560246544620993, "tokens_seen": 35782656 }, { "epoch": 0.01, "learning_rate": 0.0004995687061183551, "loss": 5.1636, "theoretical_loss": 5.558632360325929, "tokens_seen": 35848192 }, { "epoch": 0.01, "learning_rate": 0.0004995586760280842, "loss": 4.8019, "theoretical_loss": 5.557021948868571, "tokens_seen": 35913728 }, { "epoch": 0.01, "learning_rate": 0.0004995486459378135, "loss": 5.1178, "theoretical_loss": 5.555415294570011, "tokens_seen": 35979264 }, { "epoch": 0.01, "objective/train/docs_used": 89797, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.99353551864624, "objective/train/theoretical_loss": 5.553812381844907, "objective/train/tokens_used": 56504800, "theoretical_loss": 5.553812381844907, "tokens_seen": 36044800 }, { "epoch": 0.01, "learning_rate": 0.0004995386158475427, "loss": 5.0038, "theoretical_loss": 5.553812381844907, "tokens_seen": 36044800 }, { "epoch": 0.01, "learning_rate": 0.0004995285857572718, "loss": 4.7839, "theoretical_loss": 5.552213195200755, "tokens_seen": 36110336 }, { "epoch": 0.01, "learning_rate": 0.000499518555667001, "loss": 5.135, "theoretical_loss": 5.550617719237167, "tokens_seen": 36175872 }, { "epoch": 0.01, "learning_rate": 0.0004995085255767302, "loss": 5.0185, "theoretical_loss": 5.549025938645155, "tokens_seen": 36241408 }, { "epoch": 0.01, "learning_rate": 0.0004994984954864594, "loss": 4.9681, "theoretical_loss": 5.547437838206435, "tokens_seen": 36306944 }, { "epoch": 0.01, "learning_rate": 0.0004994884653961886, "loss": 4.9417, "theoretical_loss": 5.545853402792717, "tokens_seen": 36372480 }, { "epoch": 0.01, "learning_rate": 0.0004994784353059178, "loss": 5.0949, "theoretical_loss": 5.544272617365014, "tokens_seen": 36438016 }, { "epoch": 0.01, "learning_rate": 0.0004994684052156469, "loss": 5.239, "theoretical_loss": 5.542695466972956, "tokens_seen": 36503552 }, { "epoch": 0.01, "learning_rate": 0.0004994583751253761, "loss": 5.1184, "theoretical_loss": 5.541121936754111, "tokens_seen": 36569088 }, { "epoch": 0.01, "learning_rate": 0.0004994483450351053, "loss": 5.0122, "theoretical_loss": 5.539552011933312, "tokens_seen": 36634624 }, { "epoch": 0.01, "learning_rate": 0.0004994383149448345, "loss": 5.1478, "theoretical_loss": 5.537985677821986, "tokens_seen": 36700160 }, { "epoch": 0.01, "learning_rate": 0.0004994282848545637, "loss": 4.8757, "theoretical_loss": 5.536422919817495, "tokens_seen": 36765696 }, { "epoch": 0.01, "learning_rate": 0.0004994182547642928, "loss": 5.0535, "theoretical_loss": 5.5348637234024824, "tokens_seen": 36831232 }, { "epoch": 0.01, "learning_rate": 0.0004994082246740221, "loss": 4.9883, "theoretical_loss": 5.53330807414422, "tokens_seen": 36896768 }, { "epoch": 0.01, "learning_rate": 0.0004993981945837512, "loss": 5.1022, "theoretical_loss": 5.5317559576939725, "tokens_seen": 36962304 }, { "epoch": 0.01, "learning_rate": 0.0004993881644934805, "loss": 4.9078, "theoretical_loss": 5.530207359786353, "tokens_seen": 37027840 }, { "epoch": 0.01, "learning_rate": 0.0004993781344032096, "loss": 4.8219, "theoretical_loss": 5.5286622662386975, "tokens_seen": 37093376 }, { "epoch": 0.01, "learning_rate": 0.0004993681043129389, "loss": 4.8296, "theoretical_loss": 5.52712066295044, "tokens_seen": 37158912 }, { "epoch": 0.01, "learning_rate": 0.000499358074222668, "loss": 4.782, "theoretical_loss": 5.525582535902489, "tokens_seen": 37224448 }, { "epoch": 0.01, "learning_rate": 0.0004993480441323972, "loss": 4.8344, "theoretical_loss": 5.524047871156618, "tokens_seen": 37289984 }, { "epoch": 0.01, "learning_rate": 0.0004993380140421264, "loss": 5.0178, "theoretical_loss": 5.52251665485486, "tokens_seen": 37355520 }, { "epoch": 0.01, "learning_rate": 0.0004993279839518556, "loss": 4.8575, "theoretical_loss": 5.520988873218897, "tokens_seen": 37421056 }, { "epoch": 0.01, "learning_rate": 0.0004993179538615848, "loss": 4.944, "theoretical_loss": 5.519464512549478, "tokens_seen": 37486592 }, { "epoch": 0.01, "learning_rate": 0.0004993079237713139, "loss": 4.9993, "theoretical_loss": 5.5179435592258095, "tokens_seen": 37552128 }, { "epoch": 0.01, "learning_rate": 0.0004992978936810431, "loss": 4.8035, "theoretical_loss": 5.516425999704987, "tokens_seen": 37617664 }, { "epoch": 0.01, "objective/train/docs_used": 92627, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 5.006066799163818, "objective/train/theoretical_loss": 5.514911820521407, "objective/train/tokens_used": 58143200, "theoretical_loss": 5.514911820521407, "tokens_seen": 37683200 }, { "epoch": 0.01, "learning_rate": 0.0004992878635907723, "loss": 5.0221, "theoretical_loss": 5.514911820521407, "tokens_seen": 37683200 }, { "epoch": 0.01, "learning_rate": 0.0004992778335005015, "loss": 5.0618, "theoretical_loss": 5.5134010082861895, "tokens_seen": 37748736 }, { "epoch": 0.01, "learning_rate": 0.0004992678034102307, "loss": 4.9917, "theoretical_loss": 5.511893549686616, "tokens_seen": 37814272 }, { "epoch": 0.01, "learning_rate": 0.0004992577733199598, "loss": 4.9486, "theoretical_loss": 5.51038943148556, "tokens_seen": 37879808 }, { "epoch": 0.01, "learning_rate": 0.000499247743229689, "loss": 4.9724, "theoretical_loss": 5.508888640520928, "tokens_seen": 37945344 }, { "epoch": 0.01, "learning_rate": 0.0004992377131394183, "loss": 4.947, "theoretical_loss": 5.50739116370511, "tokens_seen": 38010880 }, { "epoch": 0.01, "learning_rate": 0.0004992276830491475, "loss": 4.9617, "theoretical_loss": 5.505896988024423, "tokens_seen": 38076416 }, { "epoch": 0.01, "learning_rate": 0.0004992176529588767, "loss": 4.8146, "theoretical_loss": 5.5044061005385725, "tokens_seen": 38141952 }, { "epoch": 0.01, "learning_rate": 0.0004992076228686059, "loss": 4.8697, "theoretical_loss": 5.502918488380116, "tokens_seen": 38207488 }, { "epoch": 0.01, "learning_rate": 0.000499197592778335, "loss": 5.1328, "theoretical_loss": 5.501434138753918, "tokens_seen": 38273024 }, { "epoch": 0.01, "learning_rate": 0.0004991875626880642, "loss": 4.8343, "theoretical_loss": 5.499953038936635, "tokens_seen": 38338560 }, { "epoch": 0.01, "learning_rate": 0.0004991775325977934, "loss": 5.0288, "theoretical_loss": 5.498475176276176, "tokens_seen": 38404096 }, { "epoch": 0.01, "learning_rate": 0.0004991675025075226, "loss": 5.0794, "theoretical_loss": 5.497000538191195, "tokens_seen": 38469632 }, { "epoch": 0.01, "learning_rate": 0.0004991574724172518, "loss": 5.0806, "theoretical_loss": 5.495529112170568, "tokens_seen": 38535168 }, { "epoch": 0.01, "learning_rate": 0.000499147442326981, "loss": 4.9336, "theoretical_loss": 5.494060885772887, "tokens_seen": 38600704 }, { "epoch": 0.01, "learning_rate": 0.0004991374122367101, "loss": 4.8124, "theoretical_loss": 5.492595846625951, "tokens_seen": 38666240 }, { "epoch": 0.01, "learning_rate": 0.0004991273821464393, "loss": 4.8791, "theoretical_loss": 5.491133982426266, "tokens_seen": 38731776 }, { "epoch": 0.01, "learning_rate": 0.0004991173520561685, "loss": 5.009, "theoretical_loss": 5.489675280938547, "tokens_seen": 38797312 }, { "epoch": 0.01, "learning_rate": 0.0004991073219658977, "loss": 4.9827, "theoretical_loss": 5.488219729995227, "tokens_seen": 38862848 }, { "epoch": 0.01, "learning_rate": 0.000499097291875627, "loss": 4.9909, "theoretical_loss": 5.486767317495966, "tokens_seen": 38928384 }, { "epoch": 0.01, "learning_rate": 0.000499087261785356, "loss": 4.8335, "theoretical_loss": 5.48531803140717, "tokens_seen": 38993920 }, { "epoch": 0.01, "learning_rate": 0.0004990772316950853, "loss": 5.0399, "theoretical_loss": 5.483871859761511, "tokens_seen": 39059456 }, { "epoch": 0.01, "learning_rate": 0.0004990672016048144, "loss": 4.8211, "theoretical_loss": 5.482428790657449, "tokens_seen": 39124992 }, { "epoch": 0.01, "learning_rate": 0.0004990571715145437, "loss": 4.9803, "theoretical_loss": 5.480988812258763, "tokens_seen": 39190528 }, { "epoch": 0.01, "learning_rate": 0.0004990471414242729, "loss": 4.8649, "theoretical_loss": 5.479551912794086, "tokens_seen": 39256064 }, { "epoch": 0.01, "objective/train/docs_used": 95351, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.7135090827941895, "objective/train/theoretical_loss": 5.478118080556438, "objective/train/tokens_used": 59781600, "theoretical_loss": 5.478118080556438, "tokens_seen": 39321600 }, { "epoch": 0.01, "learning_rate": 0.000499037111334002, "loss": 4.803, "theoretical_loss": 5.478118080556438, "tokens_seen": 39321600 }, { "epoch": 0.01, "learning_rate": 0.0004990270812437312, "loss": 5.0687, "theoretical_loss": 5.476687303902768, "tokens_seen": 39387136 }, { "epoch": 0.01, "learning_rate": 0.0004990170511534604, "loss": 4.7608, "theoretical_loss": 5.475259571253502, "tokens_seen": 39452672 }, { "epoch": 0.01, "learning_rate": 0.0004990070210631896, "loss": 4.8082, "theoretical_loss": 5.473834871092089, "tokens_seen": 39518208 }, { "epoch": 0.01, "learning_rate": 0.0004989969909729188, "loss": 4.6849, "theoretical_loss": 5.4724131919645576, "tokens_seen": 39583744 }, { "epoch": 0.01, "learning_rate": 0.000498986960882648, "loss": 5.0514, "theoretical_loss": 5.470994522479069, "tokens_seen": 39649280 }, { "epoch": 0.01, "learning_rate": 0.0004989769307923771, "loss": 4.9242, "theoretical_loss": 5.4695788513054815, "tokens_seen": 39714816 }, { "epoch": 0.01, "learning_rate": 0.0004989669007021063, "loss": 4.7622, "theoretical_loss": 5.468166167174912, "tokens_seen": 39780352 }, { "epoch": 0.01, "learning_rate": 0.0004989568706118355, "loss": 4.7981, "theoretical_loss": 5.466756458879306, "tokens_seen": 39845888 }, { "epoch": 0.01, "learning_rate": 0.0004989468405215647, "loss": 4.9278, "theoretical_loss": 5.465349715271013, "tokens_seen": 39911424 }, { "epoch": 0.01, "learning_rate": 0.0004989368104312939, "loss": 4.9603, "theoretical_loss": 5.463945925262355, "tokens_seen": 39976960 }, { "epoch": 0.01, "learning_rate": 0.000498926780341023, "loss": 4.9503, "theoretical_loss": 5.462545077825214, "tokens_seen": 40042496 }, { "epoch": 0.01, "learning_rate": 0.0004989167502507523, "loss": 4.9889, "theoretical_loss": 5.461147161990611, "tokens_seen": 40108032 }, { "epoch": 0.01, "learning_rate": 0.0004989067201604814, "loss": 4.9718, "theoretical_loss": 5.459752166848292, "tokens_seen": 40173568 }, { "epoch": 0.01, "learning_rate": 0.0004988966900702107, "loss": 4.8495, "theoretical_loss": 5.458360081546321, "tokens_seen": 40239104 }, { "epoch": 0.01, "learning_rate": 0.0004988866599799398, "loss": 4.8663, "theoretical_loss": 5.456970895290674, "tokens_seen": 40304640 }, { "epoch": 0.01, "learning_rate": 0.0004988766298896691, "loss": 4.8309, "theoretical_loss": 5.455584597344835, "tokens_seen": 40370176 }, { "epoch": 0.01, "learning_rate": 0.0004988665997993982, "loss": 4.8161, "theoretical_loss": 5.454201177029395, "tokens_seen": 40435712 }, { "epoch": 0.01, "learning_rate": 0.0004988565697091274, "loss": 4.8777, "theoretical_loss": 5.452820623721662, "tokens_seen": 40501248 }, { "epoch": 0.01, "learning_rate": 0.0004988465396188566, "loss": 4.6986, "theoretical_loss": 5.45144292685526, "tokens_seen": 40566784 }, { "epoch": 0.01, "learning_rate": 0.0004988365095285858, "loss": 4.7928, "theoretical_loss": 5.450068075919752, "tokens_seen": 40632320 }, { "epoch": 0.01, "learning_rate": 0.000498826479438315, "loss": 4.8082, "theoretical_loss": 5.44869606046024, "tokens_seen": 40697856 }, { "epoch": 0.01, "learning_rate": 0.0004988164493480441, "loss": 4.8257, "theoretical_loss": 5.447326870076996, "tokens_seen": 40763392 }, { "epoch": 0.01, "learning_rate": 0.0004988064192577733, "loss": 4.897, "theoretical_loss": 5.445960494425072, "tokens_seen": 40828928 }, { "epoch": 0.01, "learning_rate": 0.0004987963891675025, "loss": 4.9007, "theoretical_loss": 5.444596923213931, "tokens_seen": 40894464 }, { "epoch": 0.01, "objective/train/docs_used": 97874, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.755735874176025, "objective/train/theoretical_loss": 5.443236146207074, "objective/train/tokens_used": 61420000, "theoretical_loss": 5.443236146207074, "tokens_seen": 40960000 }, { "epoch": 0.01, "learning_rate": 0.0004987863590772317, "loss": 4.9183, "theoretical_loss": 5.443236146207074, "tokens_seen": 40960000 }, { "epoch": 0.01, "learning_rate": 0.0004987763289869609, "loss": 4.7758, "theoretical_loss": 5.441878153221662, "tokens_seen": 41025536 }, { "epoch": 0.01, "learning_rate": 0.00049876629889669, "loss": 4.8559, "theoretical_loss": 5.440522934128164, "tokens_seen": 41091072 }, { "epoch": 0.01, "learning_rate": 0.0004987562688064192, "loss": 4.9575, "theoretical_loss": 5.439170478849976, "tokens_seen": 41156608 }, { "epoch": 0.01, "learning_rate": 0.0004987462387161484, "loss": 4.9176, "theoretical_loss": 5.437820777363078, "tokens_seen": 41222144 }, { "epoch": 0.01, "learning_rate": 0.0004987362086258777, "loss": 4.8991, "theoretical_loss": 5.4364738196956655, "tokens_seen": 41287680 }, { "epoch": 0.01, "learning_rate": 0.0004987261785356068, "loss": 4.7547, "theoretical_loss": 5.435129595927794, "tokens_seen": 41353216 }, { "epoch": 0.01, "learning_rate": 0.0004987161484453361, "loss": 4.7133, "theoretical_loss": 5.433788096191039, "tokens_seen": 41418752 }, { "epoch": 0.01, "learning_rate": 0.0004987061183550651, "loss": 4.8071, "theoretical_loss": 5.432449310668134, "tokens_seen": 41484288 }, { "epoch": 0.01, "learning_rate": 0.0004986960882647944, "loss": 4.8461, "theoretical_loss": 5.4311132295926345, "tokens_seen": 41549824 }, { "epoch": 0.01, "learning_rate": 0.0004986860581745236, "loss": 4.913, "theoretical_loss": 5.42977984324857, "tokens_seen": 41615360 }, { "epoch": 0.01, "learning_rate": 0.0004986760280842528, "loss": 4.9086, "theoretical_loss": 5.428449141970107, "tokens_seen": 41680896 }, { "epoch": 0.01, "learning_rate": 0.000498665997993982, "loss": 4.9386, "theoretical_loss": 5.427121116141212, "tokens_seen": 41746432 }, { "epoch": 0.01, "learning_rate": 0.0004986559679037111, "loss": 4.9283, "theoretical_loss": 5.42579575619531, "tokens_seen": 41811968 }, { "epoch": 0.01, "learning_rate": 0.0004986459378134403, "loss": 4.7521, "theoretical_loss": 5.424473052614967, "tokens_seen": 41877504 }, { "epoch": 0.01, "learning_rate": 0.0004986359077231695, "loss": 4.9531, "theoretical_loss": 5.423152995931552, "tokens_seen": 41943040 }, { "epoch": 0.01, "learning_rate": 0.0004986258776328987, "loss": 4.7598, "theoretical_loss": 5.421835576724906, "tokens_seen": 42008576 }, { "epoch": 0.01, "learning_rate": 0.0004986158475426279, "loss": 4.741, "theoretical_loss": 5.420520785623031, "tokens_seen": 42074112 }, { "epoch": 0.01, "learning_rate": 0.000498605817452357, "loss": 4.8552, "theoretical_loss": 5.4192086133017625, "tokens_seen": 42139648 }, { "epoch": 0.01, "learning_rate": 0.0004985957873620862, "loss": 4.4737, "theoretical_loss": 5.417899050484451, "tokens_seen": 42205184 }, { "epoch": 0.01, "learning_rate": 0.0004985857572718154, "loss": 4.8658, "theoretical_loss": 5.416592087941646, "tokens_seen": 42270720 }, { "epoch": 0.01, "learning_rate": 0.0004985757271815446, "loss": 4.7093, "theoretical_loss": 5.415287716490787, "tokens_seen": 42336256 }, { "epoch": 0.01, "learning_rate": 0.0004985656970912738, "loss": 4.922, "theoretical_loss": 5.413985926995892, "tokens_seen": 42401792 }, { "epoch": 0.01, "learning_rate": 0.0004985556670010031, "loss": 4.8776, "theoretical_loss": 5.412686710367245, "tokens_seen": 42467328 }, { "epoch": 0.01, "learning_rate": 0.0004985456369107321, "loss": 4.777, "theoretical_loss": 5.411390057561097, "tokens_seen": 42532864 }, { "epoch": 0.01, "objective/train/docs_used": 100753, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.816022872924805, "objective/train/theoretical_loss": 5.410095959579362, "objective/train/tokens_used": 63058400, "theoretical_loss": 5.410095959579362, "tokens_seen": 42598400 }, { "epoch": 0.01, "learning_rate": 0.0004985356068204614, "loss": 4.8128, "theoretical_loss": 5.410095959579362, "tokens_seen": 42598400 }, { "epoch": 0.01, "learning_rate": 0.0004985255767301905, "loss": 4.8134, "theoretical_loss": 5.408804407469308, "tokens_seen": 42663936 }, { "epoch": 0.01, "learning_rate": 0.0004985155466399198, "loss": 4.7602, "theoretical_loss": 5.407515392323276, "tokens_seen": 42729472 }, { "epoch": 0.01, "learning_rate": 0.000498505516549649, "loss": 4.8106, "theoretical_loss": 5.406228905278368, "tokens_seen": 42795008 }, { "epoch": 0.01, "learning_rate": 0.0004984954864593782, "loss": 4.7571, "theoretical_loss": 5.404944937516161, "tokens_seen": 42860544 }, { "epoch": 0.01, "learning_rate": 0.0004984854563691073, "loss": 4.893, "theoretical_loss": 5.403663480262418, "tokens_seen": 42926080 }, { "epoch": 0.01, "learning_rate": 0.0004984754262788365, "loss": 4.6546, "theoretical_loss": 5.402384524786797, "tokens_seen": 42991616 }, { "epoch": 0.01, "learning_rate": 0.0004984653961885657, "loss": 4.7938, "theoretical_loss": 5.401108062402562, "tokens_seen": 43057152 }, { "epoch": 0.01, "learning_rate": 0.0004984553660982949, "loss": 4.7062, "theoretical_loss": 5.399834084466306, "tokens_seen": 43122688 }, { "epoch": 0.01, "learning_rate": 0.0004984453360080241, "loss": 4.8931, "theoretical_loss": 5.398562582377666, "tokens_seen": 43188224 }, { "epoch": 0.01, "learning_rate": 0.0004984353059177532, "loss": 4.7835, "theoretical_loss": 5.397293547579041, "tokens_seen": 43253760 }, { "epoch": 0.01, "learning_rate": 0.0004984252758274825, "loss": 4.783, "theoretical_loss": 5.396026971555319, "tokens_seen": 43319296 }, { "epoch": 0.01, "learning_rate": 0.0004984152457372116, "loss": 4.7901, "theoretical_loss": 5.394762845833601, "tokens_seen": 43384832 }, { "epoch": 0.01, "learning_rate": 0.0004984052156469409, "loss": 4.6659, "theoretical_loss": 5.393501161982926, "tokens_seen": 43450368 }, { "epoch": 0.01, "learning_rate": 0.00049839518555667, "loss": 4.6294, "theoretical_loss": 5.392241911614005, "tokens_seen": 43515904 }, { "epoch": 0.01, "learning_rate": 0.0004983851554663993, "loss": 4.9399, "theoretical_loss": 5.390985086378949, "tokens_seen": 43581440 }, { "epoch": 0.01, "learning_rate": 0.0004983751253761284, "loss": 4.7793, "theoretical_loss": 5.389730677971002, "tokens_seen": 43646976 }, { "epoch": 0.01, "learning_rate": 0.0004983650952858576, "loss": 4.7091, "theoretical_loss": 5.388478678124285, "tokens_seen": 43712512 }, { "epoch": 0.01, "learning_rate": 0.0004983550651955868, "loss": 4.6184, "theoretical_loss": 5.387229078613521, "tokens_seen": 43778048 }, { "epoch": 0.01, "learning_rate": 0.000498345035105316, "loss": 4.679, "theoretical_loss": 5.385981871253785, "tokens_seen": 43843584 }, { "epoch": 0.01, "learning_rate": 0.0004983350050150452, "loss": 4.6663, "theoretical_loss": 5.384737047900243, "tokens_seen": 43909120 }, { "epoch": 0.01, "learning_rate": 0.0004983249749247743, "loss": 4.8546, "theoretical_loss": 5.3834946004478965, "tokens_seen": 43974656 }, { "epoch": 0.01, "learning_rate": 0.0004983149448345035, "loss": 4.7143, "theoretical_loss": 5.382254520831328, "tokens_seen": 44040192 }, { "epoch": 0.01, "learning_rate": 0.0004983049147442327, "loss": 4.7268, "theoretical_loss": 5.381016801024449, "tokens_seen": 44105728 }, { "epoch": 0.01, "learning_rate": 0.0004982948846539619, "loss": 4.8047, "theoretical_loss": 5.379781433040252, "tokens_seen": 44171264 }, { "epoch": 0.01, "objective/train/docs_used": 102279, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.590313911437988, "objective/train/theoretical_loss": 5.378548408930558, "objective/train/tokens_used": 64696800, "theoretical_loss": 5.378548408930558, "tokens_seen": 44236800 }, { "epoch": 0.01, "learning_rate": 0.0004982848545636911, "loss": 4.7113, "theoretical_loss": 5.378548408930558, "tokens_seen": 44236800 }, { "epoch": 0.01, "learning_rate": 0.0004982748244734202, "loss": 4.7989, "theoretical_loss": 5.377317720785777, "tokens_seen": 44302336 }, { "epoch": 0.01, "learning_rate": 0.0004982647943831494, "loss": 4.4996, "theoretical_loss": 5.37608936073466, "tokens_seen": 44367872 }, { "epoch": 0.01, "learning_rate": 0.0004982547642928786, "loss": 4.7684, "theoretical_loss": 5.374863320944057, "tokens_seen": 44433408 }, { "epoch": 0.01, "learning_rate": 0.0004982447342026079, "loss": 4.5463, "theoretical_loss": 5.373639593618675, "tokens_seen": 44498944 }, { "epoch": 0.01, "learning_rate": 0.000498234704112337, "loss": 4.3462, "theoretical_loss": 5.372418171000847, "tokens_seen": 44564480 }, { "epoch": 0.01, "learning_rate": 0.0004982246740220663, "loss": 4.7321, "theoretical_loss": 5.371199045370283, "tokens_seen": 44630016 }, { "epoch": 0.01, "learning_rate": 0.0004982146439317953, "loss": 4.8854, "theoretical_loss": 5.369982209043851, "tokens_seen": 44695552 }, { "epoch": 0.01, "learning_rate": 0.0004982046138415246, "loss": 4.7716, "theoretical_loss": 5.368767654375327, "tokens_seen": 44761088 }, { "epoch": 0.01, "learning_rate": 0.0004981945837512538, "loss": 4.712, "theoretical_loss": 5.367555373755179, "tokens_seen": 44826624 }, { "epoch": 0.01, "learning_rate": 0.000498184553660983, "loss": 4.746, "theoretical_loss": 5.366345359610327, "tokens_seen": 44892160 }, { "epoch": 0.01, "learning_rate": 0.0004981745235707122, "loss": 4.8641, "theoretical_loss": 5.365137604403923, "tokens_seen": 44957696 }, { "epoch": 0.01, "learning_rate": 0.0004981644934804413, "loss": 4.6909, "theoretical_loss": 5.363932100635117, "tokens_seen": 45023232 }, { "epoch": 0.01, "learning_rate": 0.0004981544633901705, "loss": 4.6995, "theoretical_loss": 5.362728840838843, "tokens_seen": 45088768 }, { "epoch": 0.01, "learning_rate": 0.0004981444332998997, "loss": 4.6854, "theoretical_loss": 5.361527817585586, "tokens_seen": 45154304 }, { "epoch": 0.01, "learning_rate": 0.0004981344032096289, "loss": 4.6322, "theoretical_loss": 5.360329023481169, "tokens_seen": 45219840 }, { "epoch": 0.01, "learning_rate": 0.0004981243731193581, "loss": 4.6479, "theoretical_loss": 5.359132451166534, "tokens_seen": 45285376 }, { "epoch": 0.01, "learning_rate": 0.0004981143430290873, "loss": 4.6406, "theoretical_loss": 5.357938093317518, "tokens_seen": 45350912 }, { "epoch": 0.01, "learning_rate": 0.0004981043129388164, "loss": 4.4345, "theoretical_loss": 5.356745942644645, "tokens_seen": 45416448 }, { "epoch": 0.01, "learning_rate": 0.0004980942828485456, "loss": 4.9082, "theoretical_loss": 5.355555991892905, "tokens_seen": 45481984 }, { "epoch": 0.01, "learning_rate": 0.0004980842527582748, "loss": 4.7047, "theoretical_loss": 5.35436823384155, "tokens_seen": 45547520 }, { "epoch": 0.01, "learning_rate": 0.000498074222668004, "loss": 4.7846, "theoretical_loss": 5.353182661303873, "tokens_seen": 45613056 }, { "epoch": 0.01, "learning_rate": 0.0004980641925777333, "loss": 4.7408, "theoretical_loss": 5.35199926712701, "tokens_seen": 45678592 }, { "epoch": 0.01, "learning_rate": 0.0004980541624874623, "loss": 4.8273, "theoretical_loss": 5.350818044191721, "tokens_seen": 45744128 }, { "epoch": 0.01, "learning_rate": 0.0004980441323971916, "loss": 4.6955, "theoretical_loss": 5.349638985412193, "tokens_seen": 45809664 }, { "epoch": 0.01, "objective/train/docs_used": 105336, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.803884983062744, "objective/train/theoretical_loss": 5.348462083735834, "objective/train/tokens_used": 66335200, "theoretical_loss": 5.348462083735834, "tokens_seen": 45875200 }, { "epoch": 0.01, "learning_rate": 0.0004980341023069207, "loss": 4.7167, "theoretical_loss": 5.348462083735834, "tokens_seen": 45875200 }, { "epoch": 0.01, "learning_rate": 0.00049802407221665, "loss": 4.6347, "theoretical_loss": 5.347287332143064, "tokens_seen": 45940736 }, { "epoch": 0.01, "learning_rate": 0.0004980140421263792, "loss": 4.6561, "theoretical_loss": 5.346114723647119, "tokens_seen": 46006272 }, { "epoch": 0.01, "learning_rate": 0.0004980040120361084, "loss": 4.6456, "theoretical_loss": 5.344944251293852, "tokens_seen": 46071808 }, { "epoch": 0.01, "learning_rate": 0.0004979939819458375, "loss": 4.7118, "theoretical_loss": 5.343775908161532, "tokens_seen": 46137344 }, { "epoch": 0.01, "learning_rate": 0.0004979839518555667, "loss": 4.7093, "theoretical_loss": 5.342609687360644, "tokens_seen": 46202880 }, { "epoch": 0.01, "learning_rate": 0.0004979739217652959, "loss": 4.4997, "theoretical_loss": 5.341445582033705, "tokens_seen": 46268416 }, { "epoch": 0.01, "learning_rate": 0.0004979638916750251, "loss": 4.926, "theoretical_loss": 5.3402835853550545, "tokens_seen": 46333952 }, { "epoch": 0.01, "learning_rate": 0.0004979538615847543, "loss": 4.6076, "theoretical_loss": 5.339123690530673, "tokens_seen": 46399488 }, { "epoch": 0.01, "learning_rate": 0.0004979438314944834, "loss": 4.5893, "theoretical_loss": 5.337965890797989, "tokens_seen": 46465024 }, { "epoch": 0.01, "learning_rate": 0.0004979338014042126, "loss": 4.6241, "theoretical_loss": 5.336810179425685, "tokens_seen": 46530560 }, { "epoch": 0.01, "learning_rate": 0.0004979237713139418, "loss": 4.6573, "theoretical_loss": 5.335656549713516, "tokens_seen": 46596096 }, { "epoch": 0.01, "learning_rate": 0.000497913741223671, "loss": 4.8395, "theoretical_loss": 5.334504994992115, "tokens_seen": 46661632 }, { "epoch": 0.01, "learning_rate": 0.0004979037111334002, "loss": 4.4759, "theoretical_loss": 5.333355508622814, "tokens_seen": 46727168 }, { "epoch": 0.01, "learning_rate": 0.0004978936810431293, "loss": 4.6277, "theoretical_loss": 5.332208083997459, "tokens_seen": 46792704 }, { "epoch": 0.01, "learning_rate": 0.0004978836509528586, "loss": 4.5771, "theoretical_loss": 5.33106271453822, "tokens_seen": 46858240 }, { "epoch": 0.01, "learning_rate": 0.0004978736208625877, "loss": 4.6222, "theoretical_loss": 5.329919393697422, "tokens_seen": 46923776 }, { "epoch": 0.01, "learning_rate": 0.000497863590772317, "loss": 4.5664, "theoretical_loss": 5.328778114957351, "tokens_seen": 46989312 }, { "epoch": 0.01, "learning_rate": 0.0004978535606820461, "loss": 4.5498, "theoretical_loss": 5.327638871830089, "tokens_seen": 47054848 }, { "epoch": 0.01, "learning_rate": 0.0004978435305917754, "loss": 4.599, "theoretical_loss": 5.326501657857326, "tokens_seen": 47120384 }, { "epoch": 0.01, "learning_rate": 0.0004978335005015045, "loss": 4.7156, "theoretical_loss": 5.32536646661019, "tokens_seen": 47185920 }, { "epoch": 0.01, "learning_rate": 0.0004978234704112337, "loss": 4.5681, "theoretical_loss": 5.324233291689069, "tokens_seen": 47251456 }, { "epoch": 0.01, "learning_rate": 0.0004978134403209629, "loss": 4.6256, "theoretical_loss": 5.323102126723439, "tokens_seen": 47316992 }, { "epoch": 0.01, "learning_rate": 0.0004978034102306921, "loss": 4.686, "theoretical_loss": 5.321972965371691, "tokens_seen": 47382528 }, { "epoch": 0.01, "learning_rate": 0.0004977933801404213, "loss": 4.7664, "theoretical_loss": 5.320845801320959, "tokens_seen": 47448064 }, { "epoch": 0.01, "objective/train/docs_used": 108007, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.417771816253662, "objective/train/theoretical_loss": 5.319720628286955, "objective/train/tokens_used": 67973600, "theoretical_loss": 5.319720628286955, "tokens_seen": 47513600 }, { "epoch": 0.01, "learning_rate": 0.0004977833500501504, "loss": 4.4891, "theoretical_loss": 5.319720628286955, "tokens_seen": 47513600 }, { "epoch": 0.01, "learning_rate": 0.0004977733199598796, "loss": 4.6702, "theoretical_loss": 5.318597440013795, "tokens_seen": 47579136 }, { "epoch": 0.01, "learning_rate": 0.0004977632898696088, "loss": 4.5472, "theoretical_loss": 5.317476230273831, "tokens_seen": 47644672 }, { "epoch": 0.01, "learning_rate": 0.000497753259779338, "loss": 4.5772, "theoretical_loss": 5.316356992867491, "tokens_seen": 47710208 }, { "epoch": 0.01, "learning_rate": 0.0004977432296890672, "loss": 4.5223, "theoretical_loss": 5.31523972162311, "tokens_seen": 47775744 }, { "epoch": 0.01, "learning_rate": 0.0004977331995987965, "loss": 4.6454, "theoretical_loss": 5.314124410396767, "tokens_seen": 47841280 }, { "epoch": 0.01, "learning_rate": 0.0004977231695085255, "loss": 4.6003, "theoretical_loss": 5.31301105307212, "tokens_seen": 47906816 }, { "epoch": 0.01, "learning_rate": 0.0004977131394182548, "loss": 4.629, "theoretical_loss": 5.311899643560251, "tokens_seen": 47972352 }, { "epoch": 0.01, "learning_rate": 0.000497703109327984, "loss": 4.7082, "theoretical_loss": 5.310790175799497, "tokens_seen": 48037888 }, { "epoch": 0.01, "learning_rate": 0.0004976930792377132, "loss": 4.6847, "theoretical_loss": 5.3096826437553, "tokens_seen": 48103424 }, { "epoch": 0.01, "learning_rate": 0.0004976830491474424, "loss": 4.5777, "theoretical_loss": 5.308577041420046, "tokens_seen": 48168960 }, { "epoch": 0.01, "learning_rate": 0.0004976730190571715, "loss": 4.7092, "theoretical_loss": 5.3074733628129005, "tokens_seen": 48234496 }, { "epoch": 0.01, "learning_rate": 0.0004976629889669007, "loss": 4.7267, "theoretical_loss": 5.3063716019796665, "tokens_seen": 48300032 }, { "epoch": 0.01, "learning_rate": 0.0004976529588766299, "loss": 4.6459, "theoretical_loss": 5.305271752992619, "tokens_seen": 48365568 }, { "epoch": 0.01, "learning_rate": 0.0004976429287863591, "loss": 4.76, "theoretical_loss": 5.304173809950358, "tokens_seen": 48431104 }, { "epoch": 0.01, "learning_rate": 0.0004976328986960883, "loss": 4.5692, "theoretical_loss": 5.303077766977653, "tokens_seen": 48496640 }, { "epoch": 0.01, "learning_rate": 0.0004976228686058175, "loss": 4.7771, "theoretical_loss": 5.3019836182252895, "tokens_seen": 48562176 }, { "epoch": 0.01, "learning_rate": 0.0004976128385155466, "loss": 4.287, "theoretical_loss": 5.300891357869929, "tokens_seen": 48627712 }, { "epoch": 0.01, "learning_rate": 0.0004976028084252758, "loss": 4.6951, "theoretical_loss": 5.299800980113945, "tokens_seen": 48693248 }, { "epoch": 0.01, "learning_rate": 0.000497592778335005, "loss": 4.4413, "theoretical_loss": 5.298712479185288, "tokens_seen": 48758784 }, { "epoch": 0.01, "learning_rate": 0.0004975827482447342, "loss": 4.6678, "theoretical_loss": 5.297625849337331, "tokens_seen": 48824320 }, { "epoch": 0.01, "learning_rate": 0.0004975727181544635, "loss": 4.6205, "theoretical_loss": 5.296541084848727, "tokens_seen": 48889856 }, { "epoch": 0.01, "learning_rate": 0.0004975626880641925, "loss": 4.4521, "theoretical_loss": 5.295458180023262, "tokens_seen": 48955392 }, { "epoch": 0.01, "learning_rate": 0.0004975526579739218, "loss": 4.4898, "theoretical_loss": 5.294377129189715, "tokens_seen": 49020928 }, { "epoch": 0.01, "learning_rate": 0.0004975426278836509, "loss": 4.434, "theoretical_loss": 5.293297926701706, "tokens_seen": 49086464 }, { "epoch": 0.01, "objective/train/docs_used": 110370, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.414011478424072, "objective/train/theoretical_loss": 5.292220566937567, "objective/train/tokens_used": 69612000, "theoretical_loss": 5.292220566937567, "tokens_seen": 49152000 }, { "epoch": 0.01, "learning_rate": 0.0004975325977933802, "loss": 4.4522, "theoretical_loss": 5.292220566937567, "tokens_seen": 49152000 }, { "epoch": 0.01, "learning_rate": 0.0004975225677031094, "loss": 4.5149, "theoretical_loss": 5.29114504430019, "tokens_seen": 49217536 }, { "epoch": 0.01, "learning_rate": 0.0004975125376128386, "loss": 4.5722, "theoretical_loss": 5.290071353216895, "tokens_seen": 49283072 }, { "epoch": 0.01, "learning_rate": 0.0004975025075225677, "loss": 4.6173, "theoretical_loss": 5.288999488139284, "tokens_seen": 49348608 }, { "epoch": 0.01, "learning_rate": 0.0004974924774322969, "loss": 4.467, "theoretical_loss": 5.28792944354311, "tokens_seen": 49414144 }, { "epoch": 0.01, "learning_rate": 0.0004974824473420261, "loss": 4.662, "theoretical_loss": 5.286861213928137, "tokens_seen": 49479680 }, { "epoch": 0.02, "learning_rate": 0.0004974724172517553, "loss": 4.5867, "theoretical_loss": 5.285794793817999, "tokens_seen": 49545216 }, { "epoch": 0.02, "learning_rate": 0.0004974623871614845, "loss": 4.5959, "theoretical_loss": 5.284730177760077, "tokens_seen": 49610752 }, { "epoch": 0.02, "learning_rate": 0.0004974523570712136, "loss": 4.6009, "theoretical_loss": 5.283667360325351, "tokens_seen": 49676288 }, { "epoch": 0.02, "learning_rate": 0.0004974423269809428, "loss": 4.6977, "theoretical_loss": 5.2826063361082785, "tokens_seen": 49741824 }, { "epoch": 0.02, "learning_rate": 0.000497432296890672, "loss": 4.5974, "theoretical_loss": 5.281547099726654, "tokens_seen": 49807360 }, { "epoch": 0.02, "learning_rate": 0.0004974222668004012, "loss": 4.7487, "theoretical_loss": 5.280489645821483, "tokens_seen": 49872896 }, { "epoch": 0.02, "learning_rate": 0.0004974122367101304, "loss": 4.5144, "theoretical_loss": 5.279433969056848, "tokens_seen": 49938432 }, { "epoch": 0.02, "learning_rate": 0.0004974022066198595, "loss": 4.408, "theoretical_loss": 5.278380064119782, "tokens_seen": 50003968 }, { "epoch": 0.02, "learning_rate": 0.0004973921765295888, "loss": 4.586, "theoretical_loss": 5.277327925720137, "tokens_seen": 50069504 }, { "epoch": 0.02, "learning_rate": 0.0004973821464393179, "loss": 4.6509, "theoretical_loss": 5.276277548590457, "tokens_seen": 50135040 }, { "epoch": 0.02, "learning_rate": 0.0004973721163490472, "loss": 4.4472, "theoretical_loss": 5.275228927485855, "tokens_seen": 50200576 }, { "epoch": 0.02, "learning_rate": 0.0004973620862587763, "loss": 4.5831, "theoretical_loss": 5.2741820571838804, "tokens_seen": 50266112 }, { "epoch": 0.02, "learning_rate": 0.0004973520561685056, "loss": 4.4805, "theoretical_loss": 5.273136932484399, "tokens_seen": 50331648 }, { "epoch": 0.02, "learning_rate": 0.0004973420260782347, "loss": 4.5786, "theoretical_loss": 5.272093548209467, "tokens_seen": 50397184 }, { "epoch": 0.02, "learning_rate": 0.0004973319959879639, "loss": 4.3, "theoretical_loss": 5.271051899203207, "tokens_seen": 50462720 }, { "epoch": 0.02, "learning_rate": 0.0004973219658976931, "loss": 4.5309, "theoretical_loss": 5.270011980331685, "tokens_seen": 50528256 }, { "epoch": 0.02, "learning_rate": 0.0004973119358074223, "loss": 4.5175, "theoretical_loss": 5.268973786482794, "tokens_seen": 50593792 }, { "epoch": 0.02, "learning_rate": 0.0004973019057171515, "loss": 4.5032, "theoretical_loss": 5.267937312566123, "tokens_seen": 50659328 }, { "epoch": 0.02, "learning_rate": 0.0004972918756268806, "loss": 4.5004, "theoretical_loss": 5.266902553512847, "tokens_seen": 50724864 }, { "epoch": 0.02, "objective/train/docs_used": 113141, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.864771842956543, "objective/train/theoretical_loss": 5.265869504275602, "objective/train/tokens_used": 71250400, "theoretical_loss": 5.265869504275602, "tokens_seen": 50790400 }, { "epoch": 0.02, "learning_rate": 0.0004972818455366098, "loss": 4.4227, "theoretical_loss": 5.265869504275602, "tokens_seen": 50790400 }, { "epoch": 0.02, "learning_rate": 0.000497271815446339, "loss": 4.4984, "theoretical_loss": 5.264838159828369, "tokens_seen": 50855936 }, { "epoch": 0.02, "learning_rate": 0.0004972617853560682, "loss": 4.6557, "theoretical_loss": 5.263808515166355, "tokens_seen": 50921472 }, { "epoch": 0.02, "learning_rate": 0.0004972517552657974, "loss": 4.5067, "theoretical_loss": 5.262780565305875, "tokens_seen": 50987008 }, { "epoch": 0.02, "learning_rate": 0.0004972417251755266, "loss": 4.4877, "theoretical_loss": 5.261754305284241, "tokens_seen": 51052544 }, { "epoch": 0.02, "learning_rate": 0.0004972316950852557, "loss": 4.4446, "theoretical_loss": 5.260729730159641, "tokens_seen": 51118080 }, { "epoch": 0.02, "learning_rate": 0.0004972216649949849, "loss": 4.486, "theoretical_loss": 5.259706835011027, "tokens_seen": 51183616 }, { "epoch": 0.02, "learning_rate": 0.0004972116349047142, "loss": 4.7461, "theoretical_loss": 5.2586856149380035, "tokens_seen": 51249152 }, { "epoch": 0.02, "learning_rate": 0.0004972016048144433, "loss": 4.4907, "theoretical_loss": 5.257666065060709, "tokens_seen": 51314688 }, { "epoch": 0.02, "learning_rate": 0.0004971915747241726, "loss": 4.3394, "theoretical_loss": 5.256648180519708, "tokens_seen": 51380224 }, { "epoch": 0.02, "learning_rate": 0.0004971815446339017, "loss": 4.4946, "theoretical_loss": 5.255631956475881, "tokens_seen": 51445760 }, { "epoch": 0.02, "learning_rate": 0.0004971715145436309, "loss": 4.48, "theoretical_loss": 5.25461738811031, "tokens_seen": 51511296 }, { "epoch": 0.02, "learning_rate": 0.0004971614844533601, "loss": 4.605, "theoretical_loss": 5.25360447062417, "tokens_seen": 51576832 }, { "epoch": 0.02, "learning_rate": 0.0004971514543630893, "loss": 4.6123, "theoretical_loss": 5.252593199238619, "tokens_seen": 51642368 }, { "epoch": 0.02, "learning_rate": 0.0004971414242728185, "loss": 4.3952, "theoretical_loss": 5.2515835691946915, "tokens_seen": 51707904 }, { "epoch": 0.02, "learning_rate": 0.0004971313941825477, "loss": 4.2219, "theoretical_loss": 5.2505755757531904, "tokens_seen": 51773440 }, { "epoch": 0.02, "learning_rate": 0.0004971213640922768, "loss": 4.4433, "theoretical_loss": 5.24956921419458, "tokens_seen": 51838976 }, { "epoch": 0.02, "learning_rate": 0.000497111334002006, "loss": 4.4231, "theoretical_loss": 5.248564479818876, "tokens_seen": 51904512 }, { "epoch": 0.02, "learning_rate": 0.0004971013039117352, "loss": 4.3818, "theoretical_loss": 5.247561367945544, "tokens_seen": 51970048 }, { "epoch": 0.02, "learning_rate": 0.0004970912738214644, "loss": 4.3699, "theoretical_loss": 5.246559873913396, "tokens_seen": 52035584 }, { "epoch": 0.02, "learning_rate": 0.0004970812437311936, "loss": 4.5293, "theoretical_loss": 5.245559993080484, "tokens_seen": 52101120 }, { "epoch": 0.02, "learning_rate": 0.0004970712136409227, "loss": 4.5855, "theoretical_loss": 5.24456172082399, "tokens_seen": 52166656 }, { "epoch": 0.02, "learning_rate": 0.0004970611835506519, "loss": 4.3564, "theoretical_loss": 5.243565052540136, "tokens_seen": 52232192 }, { "epoch": 0.02, "learning_rate": 0.0004970511534603811, "loss": 4.6024, "theoretical_loss": 5.242569983644074, "tokens_seen": 52297728 }, { "epoch": 0.02, "learning_rate": 0.0004970411233701103, "loss": 4.5138, "theoretical_loss": 5.241576509569784, "tokens_seen": 52363264 }, { "epoch": 0.02, "objective/train/docs_used": 115841, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.4031195640563965, "objective/train/theoretical_loss": 5.240584625769978, "objective/train/tokens_used": 72888800, "theoretical_loss": 5.240584625769978, "tokens_seen": 52428800 }, { "epoch": 0.02, "learning_rate": 0.0004970310932798396, "loss": 4.5139, "theoretical_loss": 5.240584625769978, "tokens_seen": 52428800 }, { "epoch": 0.02, "learning_rate": 0.0004970210631895686, "loss": 4.5802, "theoretical_loss": 5.239594327715992, "tokens_seen": 52494336 }, { "epoch": 0.02, "learning_rate": 0.0004970110330992979, "loss": 4.3654, "theoretical_loss": 5.238605610897698, "tokens_seen": 52559872 }, { "epoch": 0.02, "learning_rate": 0.0004970010030090271, "loss": 4.3094, "theoretical_loss": 5.237618470823394, "tokens_seen": 52625408 }, { "epoch": 0.02, "learning_rate": 0.0004969909729187563, "loss": 4.3776, "theoretical_loss": 5.2366329030197125, "tokens_seen": 52690944 }, { "epoch": 0.02, "learning_rate": 0.0004969809428284855, "loss": 4.4353, "theoretical_loss": 5.235648903031521, "tokens_seen": 52756480 }, { "epoch": 0.02, "learning_rate": 0.0004969709127382147, "loss": 4.3403, "theoretical_loss": 5.2346664664218245, "tokens_seen": 52822016 }, { "epoch": 0.02, "learning_rate": 0.0004969608826479438, "loss": 4.3184, "theoretical_loss": 5.233685588771669, "tokens_seen": 52887552 }, { "epoch": 0.02, "learning_rate": 0.000496950852557673, "loss": 4.5992, "theoretical_loss": 5.232706265680049, "tokens_seen": 52953088 }, { "epoch": 0.02, "learning_rate": 0.0004969408224674022, "loss": 4.4904, "theoretical_loss": 5.231728492763811, "tokens_seen": 53018624 }, { "epoch": 0.02, "learning_rate": 0.0004969307923771314, "loss": 4.4165, "theoretical_loss": 5.230752265657554, "tokens_seen": 53084160 }, { "epoch": 0.02, "learning_rate": 0.0004969207622868606, "loss": 4.361, "theoretical_loss": 5.229777580013545, "tokens_seen": 53149696 }, { "epoch": 0.02, "learning_rate": 0.0004969107321965897, "loss": 4.4311, "theoretical_loss": 5.228804431501619, "tokens_seen": 53215232 }, { "epoch": 0.02, "learning_rate": 0.000496900702106319, "loss": 4.5651, "theoretical_loss": 5.227832815809087, "tokens_seen": 53280768 }, { "epoch": 0.02, "learning_rate": 0.0004968906720160481, "loss": 4.4309, "theoretical_loss": 5.226862728640651, "tokens_seen": 53346304 }, { "epoch": 0.02, "learning_rate": 0.0004968806419257774, "loss": 4.4437, "theoretical_loss": 5.2258941657183, "tokens_seen": 53411840 }, { "epoch": 0.02, "learning_rate": 0.0004968706118355065, "loss": 4.5457, "theoretical_loss": 5.2249271227812315, "tokens_seen": 53477376 }, { "epoch": 0.02, "learning_rate": 0.0004968605817452358, "loss": 4.614, "theoretical_loss": 5.223961595585755, "tokens_seen": 53542912 }, { "epoch": 0.02, "learning_rate": 0.0004968505516549649, "loss": 4.1335, "theoretical_loss": 5.222997579905204, "tokens_seen": 53608448 }, { "epoch": 0.02, "learning_rate": 0.0004968405215646941, "loss": 4.379, "theoretical_loss": 5.222035071529845, "tokens_seen": 53673984 }, { "epoch": 0.02, "learning_rate": 0.0004968304914744233, "loss": 4.5265, "theoretical_loss": 5.2210740662667945, "tokens_seen": 53739520 }, { "epoch": 0.02, "learning_rate": 0.0004968204613841525, "loss": 4.4555, "theoretical_loss": 5.220114559939923, "tokens_seen": 53805056 }, { "epoch": 0.02, "learning_rate": 0.0004968104312938817, "loss": 4.2934, "theoretical_loss": 5.219156548389775, "tokens_seen": 53870592 }, { "epoch": 0.02, "learning_rate": 0.0004968004012036108, "loss": 4.2318, "theoretical_loss": 5.218200027473481, "tokens_seen": 53936128 }, { "epoch": 0.02, "learning_rate": 0.00049679037111334, "loss": 4.4897, "theoretical_loss": 5.217244993064664, "tokens_seen": 54001664 }, { "epoch": 0.02, "objective/train/docs_used": 118581, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.235632419586182, "objective/train/theoretical_loss": 5.216291441053366, "objective/train/tokens_used": 74527200, "theoretical_loss": 5.216291441053366, "tokens_seen": 54067200 }, { "epoch": 0.02, "learning_rate": 0.0004967803410230692, "loss": 4.3089, "theoretical_loss": 5.216291441053366, "tokens_seen": 54067200 }, { "epoch": 0.02, "learning_rate": 0.0004967703109327984, "loss": 4.4575, "theoretical_loss": 5.215339367345955, "tokens_seen": 54132736 }, { "epoch": 0.02, "learning_rate": 0.0004967602808425276, "loss": 4.4368, "theoretical_loss": 5.214388767865036, "tokens_seen": 54198272 }, { "epoch": 0.02, "learning_rate": 0.0004967502507522568, "loss": 4.4379, "theoretical_loss": 5.2134396385493815, "tokens_seen": 54263808 }, { "epoch": 0.02, "learning_rate": 0.0004967402206619859, "loss": 4.2878, "theoretical_loss": 5.212491975353835, "tokens_seen": 54329344 }, { "epoch": 0.02, "learning_rate": 0.0004967301905717151, "loss": 4.4054, "theoretical_loss": 5.211545774249233, "tokens_seen": 54394880 }, { "epoch": 0.02, "learning_rate": 0.0004967201604814444, "loss": 4.4136, "theoretical_loss": 5.210601031222324, "tokens_seen": 54460416 }, { "epoch": 0.02, "learning_rate": 0.0004967101303911735, "loss": 4.4705, "theoretical_loss": 5.209657742275683, "tokens_seen": 54525952 }, { "epoch": 0.02, "learning_rate": 0.0004967001003009028, "loss": 4.3145, "theoretical_loss": 5.208715903427631, "tokens_seen": 54591488 }, { "epoch": 0.02, "learning_rate": 0.000496690070210632, "loss": 4.3954, "theoretical_loss": 5.207775510712159, "tokens_seen": 54657024 }, { "epoch": 0.02, "learning_rate": 0.0004966800401203611, "loss": 4.3114, "theoretical_loss": 5.2068365601788384, "tokens_seen": 54722560 }, { "epoch": 0.02, "learning_rate": 0.0004966700100300903, "loss": 4.5148, "theoretical_loss": 5.205899047892753, "tokens_seen": 54788096 }, { "epoch": 0.02, "learning_rate": 0.0004966599799398195, "loss": 4.4433, "theoretical_loss": 5.2049629699344075, "tokens_seen": 54853632 }, { "epoch": 0.02, "learning_rate": 0.0004966499498495487, "loss": 4.3502, "theoretical_loss": 5.204028322399658, "tokens_seen": 54919168 }, { "epoch": 0.02, "learning_rate": 0.0004966399197592779, "loss": 4.3441, "theoretical_loss": 5.203095101399628, "tokens_seen": 54984704 }, { "epoch": 0.02, "learning_rate": 0.000496629889669007, "loss": 4.2882, "theoretical_loss": 5.202163303060633, "tokens_seen": 55050240 }, { "epoch": 0.02, "learning_rate": 0.0004966198595787362, "loss": 4.1062, "theoretical_loss": 5.201232923524104, "tokens_seen": 55115776 }, { "epoch": 0.02, "learning_rate": 0.0004966098294884654, "loss": 4.4164, "theoretical_loss": 5.20030395894651, "tokens_seen": 55181312 }, { "epoch": 0.02, "learning_rate": 0.0004965997993981946, "loss": 4.2151, "theoretical_loss": 5.199376405499277, "tokens_seen": 55246848 }, { "epoch": 0.02, "learning_rate": 0.0004965897693079238, "loss": 4.4393, "theoretical_loss": 5.198450259368721, "tokens_seen": 55312384 }, { "epoch": 0.02, "learning_rate": 0.0004965797392176529, "loss": 4.1344, "theoretical_loss": 5.197525516755965, "tokens_seen": 55377920 }, { "epoch": 0.02, "learning_rate": 0.0004965697091273821, "loss": 4.3788, "theoretical_loss": 5.196602173876867, "tokens_seen": 55443456 }, { "epoch": 0.02, "learning_rate": 0.0004965596790371113, "loss": 4.4126, "theoretical_loss": 5.195680226961947, "tokens_seen": 55508992 }, { "epoch": 0.02, "learning_rate": 0.0004965496489468405, "loss": 4.2708, "theoretical_loss": 5.194759672256309, "tokens_seen": 55574528 }, { "epoch": 0.02, "learning_rate": 0.0004965396188565698, "loss": 4.3842, "theoretical_loss": 5.19384050601957, "tokens_seen": 55640064 }, { "epoch": 0.02, "objective/train/docs_used": 121212, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.1444597244262695, "objective/train/theoretical_loss": 5.192922724525789, "objective/train/tokens_used": 76165600, "theoretical_loss": 5.192922724525789, "tokens_seen": 55705600 }, { "epoch": 0.02, "learning_rate": 0.0004965295887662988, "loss": 4.2186, "theoretical_loss": 5.192922724525789, "tokens_seen": 55705600 }, { "epoch": 0.02, "learning_rate": 0.0004965195586760281, "loss": 4.1946, "theoretical_loss": 5.19200632406339, "tokens_seen": 55771136 }, { "epoch": 0.02, "learning_rate": 0.0004965095285857573, "loss": 4.3589, "theoretical_loss": 5.19109130093509, "tokens_seen": 55836672 }, { "epoch": 0.02, "learning_rate": 0.0004964994984954865, "loss": 4.2025, "theoretical_loss": 5.190177651457833, "tokens_seen": 55902208 }, { "epoch": 0.02, "learning_rate": 0.0004964894684052157, "loss": 4.3106, "theoretical_loss": 5.189265371962712, "tokens_seen": 55967744 }, { "epoch": 0.02, "learning_rate": 0.0004964794383149449, "loss": 4.2048, "theoretical_loss": 5.188354458794902, "tokens_seen": 56033280 }, { "epoch": 0.02, "learning_rate": 0.000496469408224674, "loss": 4.2852, "theoretical_loss": 5.187444908313586, "tokens_seen": 56098816 }, { "epoch": 0.02, "learning_rate": 0.0004964593781344032, "loss": 4.2389, "theoretical_loss": 5.186536716891892, "tokens_seen": 56164352 }, { "epoch": 0.02, "learning_rate": 0.0004964493480441324, "loss": 4.2972, "theoretical_loss": 5.185629880916814, "tokens_seen": 56229888 }, { "epoch": 0.02, "learning_rate": 0.0004964393179538616, "loss": 4.405, "theoretical_loss": 5.18472439678915, "tokens_seen": 56295424 }, { "epoch": 0.02, "learning_rate": 0.0004964292878635908, "loss": 4.2377, "theoretical_loss": 5.18382026092343, "tokens_seen": 56360960 }, { "epoch": 0.02, "learning_rate": 0.00049641925777332, "loss": 4.4402, "theoretical_loss": 5.182917469747851, "tokens_seen": 56426496 }, { "epoch": 0.02, "learning_rate": 0.0004964092276830491, "loss": 4.3639, "theoretical_loss": 5.182016019704204, "tokens_seen": 56492032 }, { "epoch": 0.02, "learning_rate": 0.0004963991975927783, "loss": 4.3409, "theoretical_loss": 5.1811159072478095, "tokens_seen": 56557568 }, { "epoch": 0.02, "learning_rate": 0.0004963891675025075, "loss": 4.4243, "theoretical_loss": 5.180217128847451, "tokens_seen": 56623104 }, { "epoch": 0.02, "learning_rate": 0.0004963791374122367, "loss": 4.1986, "theoretical_loss": 5.17931968098531, "tokens_seen": 56688640 }, { "epoch": 0.02, "learning_rate": 0.0004963691073219659, "loss": 4.2784, "theoretical_loss": 5.178423560156894, "tokens_seen": 56754176 }, { "epoch": 0.02, "learning_rate": 0.0004963590772316951, "loss": 4.2823, "theoretical_loss": 5.177528762870973, "tokens_seen": 56819712 }, { "epoch": 0.02, "learning_rate": 0.0004963490471414242, "loss": 4.321, "theoretical_loss": 5.176635285649521, "tokens_seen": 56885248 }, { "epoch": 0.02, "learning_rate": 0.0004963390170511535, "loss": 3.9992, "theoretical_loss": 5.175743125027638, "tokens_seen": 56950784 }, { "epoch": 0.02, "learning_rate": 0.0004963289869608827, "loss": 4.2855, "theoretical_loss": 5.174852277553498, "tokens_seen": 57016320 }, { "epoch": 0.02, "learning_rate": 0.0004963189568706119, "loss": 4.4971, "theoretical_loss": 5.173962739788276, "tokens_seen": 57081856 }, { "epoch": 0.02, "learning_rate": 0.000496308926780341, "loss": 4.1936, "theoretical_loss": 5.17307450830609, "tokens_seen": 57147392 }, { "epoch": 0.02, "learning_rate": 0.0004962988966900702, "loss": 4.3968, "theoretical_loss": 5.172187579693933, "tokens_seen": 57212928 }, { "epoch": 0.02, "learning_rate": 0.0004962888665997994, "loss": 4.3373, "theoretical_loss": 5.1713019505516105, "tokens_seen": 57278464 }, { "epoch": 0.02, "objective/train/docs_used": 122670, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.256080150604248, "objective/train/theoretical_loss": 5.170417617491682, "objective/train/tokens_used": 77804000, "theoretical_loss": 5.170417617491682, "tokens_seen": 57344000 }, { "epoch": 0.02, "learning_rate": 0.0004962788365095286, "loss": 4.2986, "theoretical_loss": 5.170417617491682, "tokens_seen": 57344000 }, { "epoch": 0.02, "learning_rate": 0.0004962688064192578, "loss": 4.2528, "theoretical_loss": 5.169534577139395, "tokens_seen": 57409536 }, { "epoch": 0.02, "learning_rate": 0.000496258776328987, "loss": 4.2352, "theoretical_loss": 5.168652826132623, "tokens_seen": 57475072 }, { "epoch": 0.02, "learning_rate": 0.0004962487462387161, "loss": 4.191, "theoretical_loss": 5.167772361121805, "tokens_seen": 57540608 }, { "epoch": 0.02, "learning_rate": 0.0004962387161484453, "loss": 4.2218, "theoretical_loss": 5.166893178769884, "tokens_seen": 57606144 }, { "epoch": 0.02, "learning_rate": 0.0004962286860581746, "loss": 4.1902, "theoretical_loss": 5.1660152757522475, "tokens_seen": 57671680 }, { "epoch": 0.02, "learning_rate": 0.0004962186559679037, "loss": 4.192, "theoretical_loss": 5.165138648756665, "tokens_seen": 57737216 }, { "epoch": 0.02, "learning_rate": 0.000496208625877633, "loss": 4.0931, "theoretical_loss": 5.164263294483226, "tokens_seen": 57802752 }, { "epoch": 0.02, "learning_rate": 0.0004961985957873621, "loss": 4.1215, "theoretical_loss": 5.163389209644287, "tokens_seen": 57868288 }, { "epoch": 0.02, "learning_rate": 0.0004961885656970913, "loss": 4.4525, "theoretical_loss": 5.162516390964408, "tokens_seen": 57933824 }, { "epoch": 0.02, "learning_rate": 0.0004961785356068205, "loss": 4.2821, "theoretical_loss": 5.1616448351802875, "tokens_seen": 57999360 }, { "epoch": 0.02, "learning_rate": 0.0004961685055165497, "loss": 4.1095, "theoretical_loss": 5.160774539040716, "tokens_seen": 58064896 }, { "epoch": 0.02, "learning_rate": 0.0004961584754262789, "loss": 4.3305, "theoretical_loss": 5.159905499306511, "tokens_seen": 58130432 }, { "epoch": 0.02, "learning_rate": 0.0004961484453360081, "loss": 4.1279, "theoretical_loss": 5.159037712750455, "tokens_seen": 58195968 }, { "epoch": 0.02, "learning_rate": 0.0004961384152457372, "loss": 4.3076, "theoretical_loss": 5.158171176157245, "tokens_seen": 58261504 }, { "epoch": 0.02, "learning_rate": 0.0004961283851554664, "loss": 4.1626, "theoretical_loss": 5.157305886323435, "tokens_seen": 58327040 }, { "epoch": 0.02, "learning_rate": 0.0004961183550651956, "loss": 4.2295, "theoretical_loss": 5.156441840057371, "tokens_seen": 58392576 }, { "epoch": 0.02, "learning_rate": 0.0004961083249749248, "loss": 4.1819, "theoretical_loss": 5.155579034179144, "tokens_seen": 58458112 }, { "epoch": 0.02, "learning_rate": 0.000496098294884654, "loss": 4.1157, "theoretical_loss": 5.15471746552053, "tokens_seen": 58523648 }, { "epoch": 0.02, "learning_rate": 0.0004960882647943831, "loss": 4.2755, "theoretical_loss": 5.153857130924929, "tokens_seen": 58589184 }, { "epoch": 0.02, "learning_rate": 0.0004960782347041123, "loss": 4.1704, "theoretical_loss": 5.1529980272473175, "tokens_seen": 58654720 }, { "epoch": 0.02, "learning_rate": 0.0004960682046138415, "loss": 4.2012, "theoretical_loss": 5.152140151354191, "tokens_seen": 58720256 }, { "epoch": 0.02, "learning_rate": 0.0004960581745235707, "loss": 4.2999, "theoretical_loss": 5.151283500123505, "tokens_seen": 58785792 }, { "epoch": 0.02, "learning_rate": 0.0004960481444333, "loss": 4.1182, "theoretical_loss": 5.150428070444621, "tokens_seen": 58851328 }, { "epoch": 0.02, "learning_rate": 0.000496038114343029, "loss": 4.3979, "theoretical_loss": 5.149573859218261, "tokens_seen": 58916864 }, { "epoch": 0.02, "objective/train/docs_used": 125600, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.472883224487305, "objective/train/theoretical_loss": 5.1487208633564405, "objective/train/tokens_used": 79442400, "theoretical_loss": 5.1487208633564405, "tokens_seen": 58982400 }, { "epoch": 0.02, "learning_rate": 0.0004960280842527583, "loss": 4.482, "theoretical_loss": 5.1487208633564405, "tokens_seen": 58982400 }, { "epoch": 0.02, "learning_rate": 0.0004960180541624875, "loss": 4.1883, "theoretical_loss": 5.147869079782423, "tokens_seen": 59047936 }, { "epoch": 0.02, "learning_rate": 0.0004960080240722167, "loss": 4.2775, "theoretical_loss": 5.147018505430666, "tokens_seen": 59113472 }, { "epoch": 0.02, "learning_rate": 0.0004959979939819459, "loss": 4.185, "theoretical_loss": 5.146169137246765, "tokens_seen": 59179008 }, { "epoch": 0.02, "learning_rate": 0.0004959879638916751, "loss": 4.3182, "theoretical_loss": 5.145320972187402, "tokens_seen": 59244544 }, { "epoch": 0.02, "learning_rate": 0.0004959779338014042, "loss": 4.1473, "theoretical_loss": 5.144474007220293, "tokens_seen": 59310080 }, { "epoch": 0.02, "learning_rate": 0.0004959679037111334, "loss": 4.0125, "theoretical_loss": 5.143628239324139, "tokens_seen": 59375616 }, { "epoch": 0.02, "learning_rate": 0.0004959578736208626, "loss": 4.2036, "theoretical_loss": 5.142783665488567, "tokens_seen": 59441152 }, { "epoch": 0.02, "learning_rate": 0.0004959478435305918, "loss": 4.337, "theoretical_loss": 5.1419402827140885, "tokens_seen": 59506688 }, { "epoch": 0.02, "learning_rate": 0.000495937813440321, "loss": 4.1653, "theoretical_loss": 5.141098088012036, "tokens_seen": 59572224 }, { "epoch": 0.02, "learning_rate": 0.0004959277833500501, "loss": 4.3892, "theoretical_loss": 5.140257078404524, "tokens_seen": 59637760 }, { "epoch": 0.02, "learning_rate": 0.0004959177532597793, "loss": 4.2111, "theoretical_loss": 5.13941725092439, "tokens_seen": 59703296 }, { "epoch": 0.02, "learning_rate": 0.0004959077231695085, "loss": 4.39, "theoretical_loss": 5.138578602615146, "tokens_seen": 59768832 }, { "epoch": 0.02, "learning_rate": 0.0004958976930792377, "loss": 4.2749, "theoretical_loss": 5.137741130530934, "tokens_seen": 59834368 }, { "epoch": 0.02, "learning_rate": 0.0004958876629889669, "loss": 4.253, "theoretical_loss": 5.1369048317364685, "tokens_seen": 59899904 }, { "epoch": 0.02, "learning_rate": 0.000495877632898696, "loss": 4.1452, "theoretical_loss": 5.13606970330699, "tokens_seen": 59965440 }, { "epoch": 0.02, "learning_rate": 0.0004958676028084253, "loss": 4.2293, "theoretical_loss": 5.135235742328217, "tokens_seen": 60030976 }, { "epoch": 0.02, "learning_rate": 0.0004958575727181544, "loss": 4.0665, "theoretical_loss": 5.134402945896297, "tokens_seen": 60096512 }, { "epoch": 0.02, "learning_rate": 0.0004958475426278837, "loss": 4.2399, "theoretical_loss": 5.133571311117755, "tokens_seen": 60162048 }, { "epoch": 0.02, "learning_rate": 0.0004958375125376129, "loss": 4.4003, "theoretical_loss": 5.132740835109448, "tokens_seen": 60227584 }, { "epoch": 0.02, "learning_rate": 0.0004958274824473421, "loss": 4.0205, "theoretical_loss": 5.131911514998518, "tokens_seen": 60293120 }, { "epoch": 0.02, "learning_rate": 0.0004958174523570712, "loss": 4.199, "theoretical_loss": 5.131083347922338, "tokens_seen": 60358656 }, { "epoch": 0.02, "learning_rate": 0.0004958074222668004, "loss": 4.2801, "theoretical_loss": 5.130256331028474, "tokens_seen": 60424192 }, { "epoch": 0.02, "learning_rate": 0.0004957973921765296, "loss": 4.0595, "theoretical_loss": 5.129430461474628, "tokens_seen": 60489728 }, { "epoch": 0.02, "learning_rate": 0.0004957873620862588, "loss": 4.1621, "theoretical_loss": 5.128605736428597, "tokens_seen": 60555264 }, { "epoch": 0.02, "objective/train/docs_used": 128428, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.065517425537109, "objective/train/theoretical_loss": 5.127782153068225, "objective/train/tokens_used": 81080800, "theoretical_loss": 5.127782153068225, "tokens_seen": 60620800 }, { "epoch": 0.02, "learning_rate": 0.000495777331995988, "loss": 4.1243, "theoretical_loss": 5.127782153068225, "tokens_seen": 60620800 }, { "epoch": 0.02, "learning_rate": 0.0004957673019057172, "loss": 4.0784, "theoretical_loss": 5.126959708581356, "tokens_seen": 60686336 }, { "epoch": 0.02, "learning_rate": 0.0004957572718154463, "loss": 4.2492, "theoretical_loss": 5.1261384001657895, "tokens_seen": 60751872 }, { "epoch": 0.02, "learning_rate": 0.0004957472417251755, "loss": 4.1702, "theoretical_loss": 5.125318225029231, "tokens_seen": 60817408 }, { "epoch": 0.02, "learning_rate": 0.0004957372116349047, "loss": 4.1188, "theoretical_loss": 5.124499180389249, "tokens_seen": 60882944 }, { "epoch": 0.02, "learning_rate": 0.0004957271815446339, "loss": 3.9981, "theoretical_loss": 5.12368126347323, "tokens_seen": 60948480 }, { "epoch": 0.02, "learning_rate": 0.0004957171514543631, "loss": 4.2976, "theoretical_loss": 5.122864471518334, "tokens_seen": 61014016 }, { "epoch": 0.02, "learning_rate": 0.0004957071213640923, "loss": 4.2015, "theoretical_loss": 5.122048801771443, "tokens_seen": 61079552 }, { "epoch": 0.02, "learning_rate": 0.0004956970912738214, "loss": 4.2576, "theoretical_loss": 5.121234251489128, "tokens_seen": 61145088 }, { "epoch": 0.02, "learning_rate": 0.0004956870611835507, "loss": 4.186, "theoretical_loss": 5.120420817937591, "tokens_seen": 61210624 }, { "epoch": 0.02, "learning_rate": 0.0004956770310932798, "loss": 4.2294, "theoretical_loss": 5.119608498392633, "tokens_seen": 61276160 }, { "epoch": 0.02, "learning_rate": 0.0004956670010030091, "loss": 4.2561, "theoretical_loss": 5.118797290139605, "tokens_seen": 61341696 }, { "epoch": 0.02, "learning_rate": 0.0004956569709127383, "loss": 4.2512, "theoretical_loss": 5.117987190473361, "tokens_seen": 61407232 }, { "epoch": 0.02, "learning_rate": 0.0004956469408224674, "loss": 4.1595, "theoretical_loss": 5.1171781966982195, "tokens_seen": 61472768 }, { "epoch": 0.02, "learning_rate": 0.0004956369107321966, "loss": 4.1855, "theoretical_loss": 5.116370306127921, "tokens_seen": 61538304 }, { "epoch": 0.02, "learning_rate": 0.0004956268806419258, "loss": 4.2902, "theoretical_loss": 5.11556351608558, "tokens_seen": 61603840 }, { "epoch": 0.02, "learning_rate": 0.000495616850551655, "loss": 4.2424, "theoretical_loss": 5.114757823903647, "tokens_seen": 61669376 }, { "epoch": 0.02, "learning_rate": 0.0004956068204613842, "loss": 4.0196, "theoretical_loss": 5.113953226923864, "tokens_seen": 61734912 }, { "epoch": 0.02, "learning_rate": 0.0004955967903711133, "loss": 4.2195, "theoretical_loss": 5.113149722497221, "tokens_seen": 61800448 }, { "epoch": 0.02, "learning_rate": 0.0004955867602808425, "loss": 3.9841, "theoretical_loss": 5.112347307983919, "tokens_seen": 61865984 }, { "epoch": 0.02, "learning_rate": 0.0004955767301905717, "loss": 4.1058, "theoretical_loss": 5.111545980753322, "tokens_seen": 61931520 }, { "epoch": 0.02, "learning_rate": 0.0004955667001003009, "loss": 4.1658, "theoretical_loss": 5.110745738183919, "tokens_seen": 61997056 }, { "epoch": 0.02, "learning_rate": 0.0004955566700100301, "loss": 4.0938, "theoretical_loss": 5.109946577663284, "tokens_seen": 62062592 }, { "epoch": 0.02, "learning_rate": 0.0004955466399197592, "loss": 4.2543, "theoretical_loss": 5.109148496588032, "tokens_seen": 62128128 }, { "epoch": 0.02, "learning_rate": 0.0004955366098294884, "loss": 4.0179, "theoretical_loss": 5.108351492363779, "tokens_seen": 62193664 }, { "epoch": 0.02, "objective/train/docs_used": 131355, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.294135570526123, "objective/train/theoretical_loss": 5.107555562405102, "objective/train/tokens_used": 82719200, "theoretical_loss": 5.107555562405102, "tokens_seen": 62259200 }, { "epoch": 0.02, "learning_rate": 0.0004955265797392177, "loss": 4.235, "theoretical_loss": 5.107555562405102, "tokens_seen": 62259200 }, { "epoch": 0.02, "learning_rate": 0.0004955165496489468, "loss": 4.0742, "theoretical_loss": 5.106760704135499, "tokens_seen": 62324736 }, { "epoch": 0.02, "learning_rate": 0.0004955065195586761, "loss": 4.0379, "theoretical_loss": 5.105966914987349, "tokens_seen": 62390272 }, { "epoch": 0.02, "learning_rate": 0.0004954964894684052, "loss": 4.1993, "theoretical_loss": 5.1051741924018685, "tokens_seen": 62455808 }, { "epoch": 0.02, "learning_rate": 0.0004954864593781344, "loss": 3.9683, "theoretical_loss": 5.10438253382908, "tokens_seen": 62521344 }, { "epoch": 0.02, "learning_rate": 0.0004954764292878636, "loss": 4.1085, "theoretical_loss": 5.103591936727762, "tokens_seen": 62586880 }, { "epoch": 0.02, "learning_rate": 0.0004954663991975928, "loss": 4.3479, "theoretical_loss": 5.102802398565418, "tokens_seen": 62652416 }, { "epoch": 0.02, "learning_rate": 0.000495456369107322, "loss": 4.2432, "theoretical_loss": 5.102013916818235, "tokens_seen": 62717952 }, { "epoch": 0.02, "learning_rate": 0.0004954463390170512, "loss": 4.1302, "theoretical_loss": 5.101226488971042, "tokens_seen": 62783488 }, { "epoch": 0.02, "learning_rate": 0.0004954363089267803, "loss": 4.1294, "theoretical_loss": 5.100440112517276, "tokens_seen": 62849024 }, { "epoch": 0.02, "learning_rate": 0.0004954262788365095, "loss": 4.1946, "theoretical_loss": 5.09965478495894, "tokens_seen": 62914560 }, { "epoch": 0.02, "learning_rate": 0.0004954162487462387, "loss": 4.2068, "theoretical_loss": 5.098870503806567, "tokens_seen": 62980096 }, { "epoch": 0.02, "learning_rate": 0.0004954062186559679, "loss": 3.9795, "theoretical_loss": 5.09808726657918, "tokens_seen": 63045632 }, { "epoch": 0.02, "learning_rate": 0.0004953961885656971, "loss": 4.2351, "theoretical_loss": 5.097305070804255, "tokens_seen": 63111168 }, { "epoch": 0.02, "learning_rate": 0.0004953861584754263, "loss": 4.1814, "theoretical_loss": 5.096523914017688, "tokens_seen": 63176704 }, { "epoch": 0.02, "learning_rate": 0.0004953761283851555, "loss": 4.0852, "theoretical_loss": 5.095743793763747, "tokens_seen": 63242240 }, { "epoch": 0.02, "learning_rate": 0.0004953660982948846, "loss": 3.8904, "theoretical_loss": 5.094964707595047, "tokens_seen": 63307776 }, { "epoch": 0.02, "learning_rate": 0.0004953560682046139, "loss": 4.1595, "theoretical_loss": 5.094186653072505, "tokens_seen": 63373312 }, { "epoch": 0.02, "learning_rate": 0.0004953460381143431, "loss": 4.029, "theoretical_loss": 5.093409627765306, "tokens_seen": 63438848 }, { "epoch": 0.02, "learning_rate": 0.0004953360080240723, "loss": 4.1613, "theoretical_loss": 5.092633629250866, "tokens_seen": 63504384 }, { "epoch": 0.02, "learning_rate": 0.0004953259779338014, "loss": 4.2231, "theoretical_loss": 5.091858655114796, "tokens_seen": 63569920 }, { "epoch": 0.02, "learning_rate": 0.0004953159478435306, "loss": 4.3171, "theoretical_loss": 5.091084702950868, "tokens_seen": 63635456 }, { "epoch": 0.02, "learning_rate": 0.0004953059177532598, "loss": 4.2317, "theoretical_loss": 5.090311770360971, "tokens_seen": 63700992 }, { "epoch": 0.02, "learning_rate": 0.000495295887662989, "loss": 4.1302, "theoretical_loss": 5.089539854955088, "tokens_seen": 63766528 }, { "epoch": 0.02, "learning_rate": 0.0004952858575727182, "loss": 4.0565, "theoretical_loss": 5.088768954351249, "tokens_seen": 63832064 }, { "epoch": 0.02, "objective/train/docs_used": 134120, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.161616325378418, "objective/train/theoretical_loss": 5.087999066175502, "objective/train/tokens_used": 84357600, "theoretical_loss": 5.087999066175502, "tokens_seen": 63897600 }, { "epoch": 0.02, "learning_rate": 0.0004952758274824474, "loss": 4.1542, "theoretical_loss": 5.087999066175502, "tokens_seen": 63897600 }, { "epoch": 0.02, "learning_rate": 0.0004952657973921765, "loss": 4.1301, "theoretical_loss": 5.0872301880618735, "tokens_seen": 63963136 }, { "epoch": 0.02, "learning_rate": 0.0004952557673019057, "loss": 3.9617, "theoretical_loss": 5.086462317652341, "tokens_seen": 64028672 }, { "epoch": 0.02, "learning_rate": 0.0004952457372116349, "loss": 3.7772, "theoretical_loss": 5.085695452596788, "tokens_seen": 64094208 }, { "epoch": 0.02, "learning_rate": 0.0004952357071213641, "loss": 4.1847, "theoretical_loss": 5.084929590552976, "tokens_seen": 64159744 }, { "epoch": 0.02, "learning_rate": 0.0004952256770310933, "loss": 4.0907, "theoretical_loss": 5.0841647291865115, "tokens_seen": 64225280 }, { "epoch": 0.02, "learning_rate": 0.0004952156469408225, "loss": 4.0874, "theoretical_loss": 5.083400866170806, "tokens_seen": 64290816 }, { "epoch": 0.02, "learning_rate": 0.0004952056168505516, "loss": 4.1376, "theoretical_loss": 5.082637999187046, "tokens_seen": 64356352 }, { "epoch": 0.02, "learning_rate": 0.0004951955867602809, "loss": 3.8776, "theoretical_loss": 5.081876125924159, "tokens_seen": 64421888 }, { "epoch": 0.02, "learning_rate": 0.00049518555667001, "loss": 4.1629, "theoretical_loss": 5.0811152440787755, "tokens_seen": 64487424 }, { "epoch": 0.02, "learning_rate": 0.0004951755265797393, "loss": 4.0412, "theoretical_loss": 5.0803553513552036, "tokens_seen": 64552960 }, { "epoch": 0.02, "learning_rate": 0.0004951654964894685, "loss": 4.1742, "theoretical_loss": 5.079596445465386, "tokens_seen": 64618496 }, { "epoch": 0.02, "learning_rate": 0.0004951554663991976, "loss": 4.1667, "theoretical_loss": 5.078838524128878, "tokens_seen": 64684032 }, { "epoch": 0.02, "learning_rate": 0.0004951454363089268, "loss": 4.1905, "theoretical_loss": 5.078081585072802, "tokens_seen": 64749568 }, { "epoch": 0.02, "learning_rate": 0.000495135406218656, "loss": 3.9638, "theoretical_loss": 5.077325626031826, "tokens_seen": 64815104 }, { "epoch": 0.02, "learning_rate": 0.0004951253761283852, "loss": 3.8881, "theoretical_loss": 5.076570644748123, "tokens_seen": 64880640 }, { "epoch": 0.02, "learning_rate": 0.0004951153460381144, "loss": 4.04, "theoretical_loss": 5.075816638971341, "tokens_seen": 64946176 }, { "epoch": 0.02, "learning_rate": 0.0004951053159478435, "loss": 4.2046, "theoretical_loss": 5.075063606458576, "tokens_seen": 65011712 }, { "epoch": 0.02, "learning_rate": 0.0004950952858575727, "loss": 4.195, "theoretical_loss": 5.074311544974331, "tokens_seen": 65077248 }, { "epoch": 0.02, "learning_rate": 0.0004950852557673019, "loss": 3.9225, "theoretical_loss": 5.07356045229049, "tokens_seen": 65142784 }, { "epoch": 0.02, "learning_rate": 0.0004950752256770311, "loss": 4.221, "theoretical_loss": 5.072810326186285, "tokens_seen": 65208320 }, { "epoch": 0.02, "learning_rate": 0.0004950651955867603, "loss": 4.1412, "theoretical_loss": 5.072061164448261, "tokens_seen": 65273856 }, { "epoch": 0.02, "learning_rate": 0.0004950551654964894, "loss": 4.0405, "theoretical_loss": 5.071312964870252, "tokens_seen": 65339392 }, { "epoch": 0.02, "learning_rate": 0.0004950451354062186, "loss": 4.2494, "theoretical_loss": 5.070565725253344, "tokens_seen": 65404928 }, { "epoch": 0.02, "learning_rate": 0.0004950351053159479, "loss": 4.1054, "theoretical_loss": 5.069819443405842, "tokens_seen": 65470464 }, { "epoch": 0.02, "objective/train/docs_used": 137067, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.935450315475464, "objective/train/theoretical_loss": 5.069074117143246, "objective/train/tokens_used": 85996000, "theoretical_loss": 5.069074117143246, "tokens_seen": 65536000 }, { "epoch": 0.02, "learning_rate": 0.000495025075225677, "loss": 3.9259, "theoretical_loss": 5.069074117143246, "tokens_seen": 65536000 }, { "epoch": 0.02, "learning_rate": 0.0004950150451354063, "loss": 3.9163, "theoretical_loss": 5.068329744288216, "tokens_seen": 65601536 }, { "epoch": 0.02, "learning_rate": 0.0004950050150451354, "loss": 3.8929, "theoretical_loss": 5.067586322670541, "tokens_seen": 65667072 }, { "epoch": 0.02, "learning_rate": 0.0004949949849548646, "loss": 4.1936, "theoretical_loss": 5.0668438501271105, "tokens_seen": 65732608 }, { "epoch": 0.02, "learning_rate": 0.0004949849548645938, "loss": 4.0016, "theoretical_loss": 5.066102324501883, "tokens_seen": 65798144 }, { "epoch": 0.02, "learning_rate": 0.000494974924774323, "loss": 4.2001, "theoretical_loss": 5.065361743645855, "tokens_seen": 65863680 }, { "epoch": 0.02, "learning_rate": 0.0004949648946840522, "loss": 3.9806, "theoretical_loss": 5.064622105417033, "tokens_seen": 65929216 }, { "epoch": 0.02, "learning_rate": 0.0004949548645937814, "loss": 3.9397, "theoretical_loss": 5.063883407680405, "tokens_seen": 65994752 }, { "epoch": 0.02, "learning_rate": 0.0004949448345035105, "loss": 4.0455, "theoretical_loss": 5.063145648307904, "tokens_seen": 66060288 }, { "epoch": 0.02, "learning_rate": 0.0004949348044132397, "loss": 3.8713, "theoretical_loss": 5.062408825178388, "tokens_seen": 66125824 }, { "epoch": 0.02, "learning_rate": 0.0004949247743229689, "loss": 4.3171, "theoretical_loss": 5.061672936177604, "tokens_seen": 66191360 }, { "epoch": 0.02, "learning_rate": 0.0004949147442326981, "loss": 4.1452, "theoretical_loss": 5.06093797919816, "tokens_seen": 66256896 }, { "epoch": 0.02, "learning_rate": 0.0004949047141424273, "loss": 4.1444, "theoretical_loss": 5.060203952139497, "tokens_seen": 66322432 }, { "epoch": 0.02, "learning_rate": 0.0004948946840521565, "loss": 4.0652, "theoretical_loss": 5.059470852907861, "tokens_seen": 66387968 }, { "epoch": 0.02, "learning_rate": 0.0004948846539618856, "loss": 4.1365, "theoretical_loss": 5.0587386794162725, "tokens_seen": 66453504 }, { "epoch": 0.02, "learning_rate": 0.0004948746238716148, "loss": 3.9927, "theoretical_loss": 5.058007429584498, "tokens_seen": 66519040 }, { "epoch": 0.02, "learning_rate": 0.000494864593781344, "loss": 3.8353, "theoretical_loss": 5.057277101339023, "tokens_seen": 66584576 }, { "epoch": 0.02, "learning_rate": 0.0004948545636910733, "loss": 4.0003, "theoretical_loss": 5.056547692613021, "tokens_seen": 66650112 }, { "epoch": 0.02, "learning_rate": 0.0004948445336008024, "loss": 3.9769, "theoretical_loss": 5.055819201346331, "tokens_seen": 66715648 }, { "epoch": 0.02, "learning_rate": 0.0004948345035105316, "loss": 4.1418, "theoretical_loss": 5.055091625485421, "tokens_seen": 66781184 }, { "epoch": 0.02, "learning_rate": 0.0004948244734202607, "loss": 4.0081, "theoretical_loss": 5.054364962983367, "tokens_seen": 66846720 }, { "epoch": 0.02, "learning_rate": 0.00049481444332999, "loss": 3.9146, "theoretical_loss": 5.053639211799824, "tokens_seen": 66912256 }, { "epoch": 0.02, "learning_rate": 0.0004948044132397192, "loss": 3.9315, "theoretical_loss": 5.052914369900997, "tokens_seen": 66977792 }, { "epoch": 0.02, "learning_rate": 0.0004947943831494484, "loss": 3.9892, "theoretical_loss": 5.052190435259614, "tokens_seen": 67043328 }, { "epoch": 0.02, "learning_rate": 0.0004947843530591776, "loss": 4.1295, "theoretical_loss": 5.051467405854897, "tokens_seen": 67108864 }, { "epoch": 0.02, "objective/train/docs_used": 139457, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.231722354888916, "objective/train/theoretical_loss": 5.05074527967254, "objective/train/tokens_used": 87634400, "theoretical_loss": 5.05074527967254, "tokens_seen": 67174400 }, { "epoch": 0.02, "learning_rate": 0.0004947743229689067, "loss": 4.2334, "theoretical_loss": 5.05074527967254, "tokens_seen": 67174400 }, { "epoch": 0.02, "learning_rate": 0.0004947642928786359, "loss": 4.0289, "theoretical_loss": 5.050024054704677, "tokens_seen": 67239936 }, { "epoch": 0.02, "learning_rate": 0.0004947542627883651, "loss": 3.8336, "theoretical_loss": 5.049303728949859, "tokens_seen": 67305472 }, { "epoch": 0.02, "learning_rate": 0.0004947442326980943, "loss": 4.1588, "theoretical_loss": 5.048584300413019, "tokens_seen": 67371008 }, { "epoch": 0.02, "learning_rate": 0.0004947342026078235, "loss": 4.3102, "theoretical_loss": 5.04786576710546, "tokens_seen": 67436544 }, { "epoch": 0.02, "learning_rate": 0.0004947241725175527, "loss": 4.1837, "theoretical_loss": 5.0471481270448155, "tokens_seen": 67502080 }, { "epoch": 0.02, "learning_rate": 0.0004947141424272818, "loss": 3.9582, "theoretical_loss": 5.046431378255027, "tokens_seen": 67567616 }, { "epoch": 0.02, "learning_rate": 0.0004947041123370111, "loss": 3.9631, "theoretical_loss": 5.045715518766322, "tokens_seen": 67633152 }, { "epoch": 0.02, "learning_rate": 0.0004946940822467402, "loss": 3.9578, "theoretical_loss": 5.0450005466151815, "tokens_seen": 67698688 }, { "epoch": 0.02, "learning_rate": 0.0004946840521564695, "loss": 4.0774, "theoretical_loss": 5.044286459844319, "tokens_seen": 67764224 }, { "epoch": 0.02, "learning_rate": 0.0004946740220661987, "loss": 4.0998, "theoretical_loss": 5.043573256502652, "tokens_seen": 67829760 }, { "epoch": 0.02, "learning_rate": 0.0004946639919759278, "loss": 4.1742, "theoretical_loss": 5.0428609346452795, "tokens_seen": 67895296 }, { "epoch": 0.02, "learning_rate": 0.000494653961885657, "loss": 4.1121, "theoretical_loss": 5.042149492333452, "tokens_seen": 67960832 }, { "epoch": 0.02, "learning_rate": 0.0004946439317953862, "loss": 4.1038, "theoretical_loss": 5.041438927634549, "tokens_seen": 68026368 }, { "epoch": 0.02, "learning_rate": 0.0004946339017051154, "loss": 4.0825, "theoretical_loss": 5.040729238622053, "tokens_seen": 68091904 }, { "epoch": 0.02, "learning_rate": 0.0004946238716148446, "loss": 4.062, "theoretical_loss": 5.040020423375525, "tokens_seen": 68157440 }, { "epoch": 0.02, "learning_rate": 0.0004946138415245737, "loss": 4.0421, "theoretical_loss": 5.039312479980579, "tokens_seen": 68222976 }, { "epoch": 0.02, "learning_rate": 0.0004946038114343029, "loss": 4.0054, "theoretical_loss": 5.038605406528857, "tokens_seen": 68288512 }, { "epoch": 0.02, "learning_rate": 0.0004945937813440321, "loss": 4.0021, "theoretical_loss": 5.037899201118005, "tokens_seen": 68354048 }, { "epoch": 0.02, "learning_rate": 0.0004945837512537613, "loss": 4.0666, "theoretical_loss": 5.037193861851646, "tokens_seen": 68419584 }, { "epoch": 0.02, "learning_rate": 0.0004945737211634905, "loss": 4.0285, "theoretical_loss": 5.03648938683936, "tokens_seen": 68485120 }, { "epoch": 0.02, "learning_rate": 0.0004945636910732196, "loss": 4.0458, "theoretical_loss": 5.035785774196654, "tokens_seen": 68550656 }, { "epoch": 0.02, "learning_rate": 0.0004945536609829488, "loss": 4.1159, "theoretical_loss": 5.035083022044944, "tokens_seen": 68616192 }, { "epoch": 0.02, "learning_rate": 0.0004945436308926781, "loss": 3.9503, "theoretical_loss": 5.034381128511525, "tokens_seen": 68681728 }, { "epoch": 0.02, "learning_rate": 0.0004945336008024072, "loss": 4.0066, "theoretical_loss": 5.0336800917295506, "tokens_seen": 68747264 }, { "epoch": 0.02, "objective/train/docs_used": 142114, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7973508834838867, "objective/train/theoretical_loss": 5.032979909838007, "objective/train/tokens_used": 89272800, "theoretical_loss": 5.032979909838007, "tokens_seen": 68812800 }, { "epoch": 0.02, "learning_rate": 0.0004945235707121365, "loss": 3.9512, "theoretical_loss": 5.032979909838007, "tokens_seen": 68812800 }, { "epoch": 0.02, "learning_rate": 0.0004945135406218656, "loss": 4.0956, "theoretical_loss": 5.032280580981691, "tokens_seen": 68878336 }, { "epoch": 0.02, "learning_rate": 0.0004945035105315948, "loss": 3.9395, "theoretical_loss": 5.031582103311187, "tokens_seen": 68943872 }, { "epoch": 0.02, "learning_rate": 0.000494493480441324, "loss": 3.957, "theoretical_loss": 5.030884474982842, "tokens_seen": 69009408 }, { "epoch": 0.02, "learning_rate": 0.0004944834503510532, "loss": 4.1577, "theoretical_loss": 5.030187694158739, "tokens_seen": 69074944 }, { "epoch": 0.02, "learning_rate": 0.0004944734202607824, "loss": 4.0106, "theoretical_loss": 5.02949175900668, "tokens_seen": 69140480 }, { "epoch": 0.02, "learning_rate": 0.0004944633901705116, "loss": 4.0667, "theoretical_loss": 5.028796667700159, "tokens_seen": 69206016 }, { "epoch": 0.02, "learning_rate": 0.0004944533600802407, "loss": 4.0383, "theoretical_loss": 5.0281024184183405, "tokens_seen": 69271552 }, { "epoch": 0.02, "learning_rate": 0.0004944433299899699, "loss": 4.1313, "theoretical_loss": 5.0274090093460355, "tokens_seen": 69337088 }, { "epoch": 0.02, "learning_rate": 0.0004944332998996991, "loss": 3.7901, "theoretical_loss": 5.026716438673677, "tokens_seen": 69402624 }, { "epoch": 0.02, "learning_rate": 0.0004944232698094283, "loss": 4.0969, "theoretical_loss": 5.0260247045973045, "tokens_seen": 69468160 }, { "epoch": 0.02, "learning_rate": 0.0004944132397191575, "loss": 4.0319, "theoretical_loss": 5.02533380531853, "tokens_seen": 69533696 }, { "epoch": 0.02, "learning_rate": 0.0004944032096288867, "loss": 4.1798, "theoretical_loss": 5.024643739044526, "tokens_seen": 69599232 }, { "epoch": 0.02, "learning_rate": 0.0004943931795386158, "loss": 3.7498, "theoretical_loss": 5.023954503987998, "tokens_seen": 69664768 }, { "epoch": 0.02, "learning_rate": 0.000494383149448345, "loss": 4.0847, "theoretical_loss": 5.023266098367161, "tokens_seen": 69730304 }, { "epoch": 0.02, "learning_rate": 0.0004943731193580742, "loss": 3.9896, "theoretical_loss": 5.022578520405721, "tokens_seen": 69795840 }, { "epoch": 0.02, "learning_rate": 0.0004943630892678035, "loss": 4.062, "theoretical_loss": 5.0218917683328534, "tokens_seen": 69861376 }, { "epoch": 0.02, "learning_rate": 0.0004943530591775326, "loss": 4.0661, "theoretical_loss": 5.021205840383175, "tokens_seen": 69926912 }, { "epoch": 0.02, "learning_rate": 0.0004943430290872618, "loss": 4.0297, "theoretical_loss": 5.020520734796728, "tokens_seen": 69992448 }, { "epoch": 0.02, "learning_rate": 0.0004943329989969909, "loss": 4.1665, "theoretical_loss": 5.019836449818957, "tokens_seen": 70057984 }, { "epoch": 0.02, "learning_rate": 0.0004943229689067202, "loss": 3.9306, "theoretical_loss": 5.019152983700687, "tokens_seen": 70123520 }, { "epoch": 0.02, "learning_rate": 0.0004943129388164494, "loss": 4.0787, "theoretical_loss": 5.018470334698101, "tokens_seen": 70189056 }, { "epoch": 0.02, "learning_rate": 0.0004943029087261786, "loss": 3.914, "theoretical_loss": 5.01778850107272, "tokens_seen": 70254592 }, { "epoch": 0.02, "learning_rate": 0.0004942928786359078, "loss": 3.9696, "theoretical_loss": 5.017107481091379, "tokens_seen": 70320128 }, { "epoch": 0.02, "learning_rate": 0.0004942828485456369, "loss": 4.0437, "theoretical_loss": 5.016427273026212, "tokens_seen": 70385664 }, { "epoch": 0.02, "objective/train/docs_used": 144953, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.3670573234558105, "objective/train/theoretical_loss": 5.015747875154622, "objective/train/tokens_used": 90911200, "theoretical_loss": 5.015747875154622, "tokens_seen": 70451200 }, { "epoch": 0.02, "learning_rate": 0.0004942728184553661, "loss": 4.0221, "theoretical_loss": 5.015747875154622, "tokens_seen": 70451200 }, { "epoch": 0.02, "learning_rate": 0.0004942627883650953, "loss": 3.9573, "theoretical_loss": 5.015069285759269, "tokens_seen": 70516736 }, { "epoch": 0.02, "learning_rate": 0.0004942527582748245, "loss": 3.9717, "theoretical_loss": 5.01439150312804, "tokens_seen": 70582272 }, { "epoch": 0.02, "learning_rate": 0.0004942427281845537, "loss": 4.1384, "theoretical_loss": 5.0137145255540405, "tokens_seen": 70647808 }, { "epoch": 0.02, "learning_rate": 0.0004942326980942828, "loss": 4.1492, "theoretical_loss": 5.013038351335559, "tokens_seen": 70713344 }, { "epoch": 0.02, "learning_rate": 0.000494222668004012, "loss": 4.1445, "theoretical_loss": 5.012362978776057, "tokens_seen": 70778880 }, { "epoch": 0.02, "learning_rate": 0.0004942126379137412, "loss": 3.8555, "theoretical_loss": 5.011688406184147, "tokens_seen": 70844416 }, { "epoch": 0.02, "learning_rate": 0.0004942026078234704, "loss": 4.118, "theoretical_loss": 5.011014631873566, "tokens_seen": 70909952 }, { "epoch": 0.02, "learning_rate": 0.0004941925777331996, "loss": 4.0121, "theoretical_loss": 5.010341654163167, "tokens_seen": 70975488 }, { "epoch": 0.02, "learning_rate": 0.0004941825476429289, "loss": 3.9979, "theoretical_loss": 5.009669471376882, "tokens_seen": 71041024 }, { "epoch": 0.02, "learning_rate": 0.0004941725175526579, "loss": 4.0427, "theoretical_loss": 5.008998081843721, "tokens_seen": 71106560 }, { "epoch": 0.02, "learning_rate": 0.0004941624874623872, "loss": 3.759, "theoretical_loss": 5.008327483897736, "tokens_seen": 71172096 }, { "epoch": 0.02, "learning_rate": 0.0004941524573721163, "loss": 3.9499, "theoretical_loss": 5.00765767587801, "tokens_seen": 71237632 }, { "epoch": 0.02, "learning_rate": 0.0004941424272818456, "loss": 4.0108, "theoretical_loss": 5.006988656128635, "tokens_seen": 71303168 }, { "epoch": 0.02, "learning_rate": 0.0004941323971915748, "loss": 4.0896, "theoretical_loss": 5.006320422998691, "tokens_seen": 71368704 }, { "epoch": 0.02, "learning_rate": 0.0004941223671013039, "loss": 4.0297, "theoretical_loss": 5.00565297484223, "tokens_seen": 71434240 }, { "epoch": 0.02, "learning_rate": 0.0004941123370110331, "loss": 3.9761, "theoretical_loss": 5.004986310018252, "tokens_seen": 71499776 }, { "epoch": 0.02, "learning_rate": 0.0004941023069207623, "loss": 4.0212, "theoretical_loss": 5.004320426890686, "tokens_seen": 71565312 }, { "epoch": 0.02, "learning_rate": 0.0004940922768304915, "loss": 3.8745, "theoretical_loss": 5.003655323828376, "tokens_seen": 71630848 }, { "epoch": 0.02, "learning_rate": 0.0004940822467402207, "loss": 4.0426, "theoretical_loss": 5.002990999205057, "tokens_seen": 71696384 }, { "epoch": 0.02, "learning_rate": 0.0004940722166499498, "loss": 3.7595, "theoretical_loss": 5.002327451399335, "tokens_seen": 71761920 }, { "epoch": 0.02, "learning_rate": 0.000494062186559679, "loss": 4.0643, "theoretical_loss": 5.001664678794671, "tokens_seen": 71827456 }, { "epoch": 0.02, "learning_rate": 0.0004940521564694082, "loss": 4.0287, "theoretical_loss": 5.001002679779363, "tokens_seen": 71892992 }, { "epoch": 0.02, "learning_rate": 0.0004940421263791374, "loss": 4.062, "theoretical_loss": 5.0003414527465235, "tokens_seen": 71958528 }, { "epoch": 0.02, "learning_rate": 0.0004940320962888666, "loss": 4.0174, "theoretical_loss": 4.99968099609406, "tokens_seen": 72024064 }, { "epoch": 0.02, "objective/train/docs_used": 146265, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.239278793334961, "objective/train/theoretical_loss": 4.999021308224664, "objective/train/tokens_used": 92549600, "theoretical_loss": 4.999021308224664, "tokens_seen": 72089600 }, { "epoch": 0.02, "learning_rate": 0.0004940220661985958, "loss": 4.0257, "theoretical_loss": 4.999021308224664, "tokens_seen": 72089600 }, { "epoch": 0.02, "learning_rate": 0.0004940120361083249, "loss": 3.9197, "theoretical_loss": 4.998362387545782, "tokens_seen": 72155136 }, { "epoch": 0.02, "learning_rate": 0.0004940020060180542, "loss": 3.9914, "theoretical_loss": 4.997704232469606, "tokens_seen": 72220672 }, { "epoch": 0.02, "learning_rate": 0.0004939919759277834, "loss": 3.8925, "theoretical_loss": 4.997046841413049, "tokens_seen": 72286208 }, { "epoch": 0.02, "learning_rate": 0.0004939819458375126, "loss": 4.0499, "theoretical_loss": 4.996390212797728, "tokens_seen": 72351744 }, { "epoch": 0.02, "learning_rate": 0.0004939719157472418, "loss": 3.9382, "theoretical_loss": 4.995734345049949, "tokens_seen": 72417280 }, { "epoch": 0.02, "learning_rate": 0.000493961885656971, "loss": 3.7116, "theoretical_loss": 4.995079236600686, "tokens_seen": 72482816 }, { "epoch": 0.02, "learning_rate": 0.0004939518555667001, "loss": 3.911, "theoretical_loss": 4.994424885885564, "tokens_seen": 72548352 }, { "epoch": 0.02, "learning_rate": 0.0004939418254764293, "loss": 4.1174, "theoretical_loss": 4.993771291344839, "tokens_seen": 72613888 }, { "epoch": 0.02, "learning_rate": 0.0004939317953861585, "loss": 4.0971, "theoretical_loss": 4.993118451423381, "tokens_seen": 72679424 }, { "epoch": 0.02, "learning_rate": 0.0004939217652958877, "loss": 4.0222, "theoretical_loss": 4.992466364570659, "tokens_seen": 72744960 }, { "epoch": 0.02, "learning_rate": 0.0004939117352056169, "loss": 3.986, "theoretical_loss": 4.991815029240721, "tokens_seen": 72810496 }, { "epoch": 0.02, "learning_rate": 0.000493901705115346, "loss": 4.0267, "theoretical_loss": 4.991164443892175, "tokens_seen": 72876032 }, { "epoch": 0.02, "learning_rate": 0.0004938916750250752, "loss": 3.9407, "theoretical_loss": 4.990514606988173, "tokens_seen": 72941568 }, { "epoch": 0.02, "learning_rate": 0.0004938816449348044, "loss": 3.7718, "theoretical_loss": 4.989865516996396, "tokens_seen": 73007104 }, { "epoch": 0.02, "learning_rate": 0.0004938716148445337, "loss": 3.9444, "theoretical_loss": 4.98921717238903, "tokens_seen": 73072640 }, { "epoch": 0.02, "learning_rate": 0.0004938615847542628, "loss": 4.0837, "theoretical_loss": 4.988569571642756, "tokens_seen": 73138176 }, { "epoch": 0.02, "learning_rate": 0.000493851554663992, "loss": 3.9803, "theoretical_loss": 4.98792271323873, "tokens_seen": 73203712 }, { "epoch": 0.02, "learning_rate": 0.0004938415245737211, "loss": 3.9733, "theoretical_loss": 4.9872765956625615, "tokens_seen": 73269248 }, { "epoch": 0.02, "learning_rate": 0.0004938314944834504, "loss": 3.909, "theoretical_loss": 4.9866312174043035, "tokens_seen": 73334784 }, { "epoch": 0.02, "learning_rate": 0.0004938214643931796, "loss": 3.9689, "theoretical_loss": 4.9859865769584335, "tokens_seen": 73400320 }, { "epoch": 0.02, "learning_rate": 0.0004938114343029088, "loss": 3.9131, "theoretical_loss": 4.9853426728238315, "tokens_seen": 73465856 }, { "epoch": 0.02, "learning_rate": 0.000493801404212638, "loss": 4.0438, "theoretical_loss": 4.984699503503771, "tokens_seen": 73531392 }, { "epoch": 0.02, "learning_rate": 0.0004937913741223671, "loss": 3.9365, "theoretical_loss": 4.984057067505898, "tokens_seen": 73596928 }, { "epoch": 0.02, "learning_rate": 0.0004937813440320963, "loss": 4.069, "theoretical_loss": 4.9834153633422105, "tokens_seen": 73662464 }, { "epoch": 0.02, "objective/train/docs_used": 149094, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.849897861480713, "objective/train/theoretical_loss": 4.982774389529053, "objective/train/tokens_used": 94188000, "theoretical_loss": 4.982774389529053, "tokens_seen": 73728000 }, { "epoch": 0.02, "learning_rate": 0.0004937713139418255, "loss": 3.9546, "theoretical_loss": 4.982774389529053, "tokens_seen": 73728000 }, { "epoch": 0.02, "learning_rate": 0.0004937612838515547, "loss": 3.81, "theoretical_loss": 4.9821341445870875, "tokens_seen": 73793536 }, { "epoch": 0.02, "learning_rate": 0.0004937512537612839, "loss": 4.045, "theoretical_loss": 4.981494627041286, "tokens_seen": 73859072 }, { "epoch": 0.02, "learning_rate": 0.000493741223671013, "loss": 3.8663, "theoretical_loss": 4.98085583542091, "tokens_seen": 73924608 }, { "epoch": 0.02, "learning_rate": 0.0004937311935807422, "loss": 4.1956, "theoretical_loss": 4.980217768259496, "tokens_seen": 73990144 }, { "epoch": 0.02, "learning_rate": 0.0004937211634904714, "loss": 3.966, "theoretical_loss": 4.979580424094836, "tokens_seen": 74055680 }, { "epoch": 0.02, "learning_rate": 0.0004937111334002006, "loss": 3.893, "theoretical_loss": 4.978943801468967, "tokens_seen": 74121216 }, { "epoch": 0.02, "learning_rate": 0.0004937011033099298, "loss": 3.9932, "theoretical_loss": 4.978307898928149, "tokens_seen": 74186752 }, { "epoch": 0.02, "learning_rate": 0.0004936910732196591, "loss": 3.9426, "theoretical_loss": 4.977672715022855, "tokens_seen": 74252288 }, { "epoch": 0.02, "learning_rate": 0.0004936810431293881, "loss": 4.2044, "theoretical_loss": 4.97703824830775, "tokens_seen": 74317824 }, { "epoch": 0.02, "learning_rate": 0.0004936710130391174, "loss": 3.7717, "theoretical_loss": 4.976404497341676, "tokens_seen": 74383360 }, { "epoch": 0.02, "learning_rate": 0.0004936609829488465, "loss": 3.8478, "theoretical_loss": 4.975771460687641, "tokens_seen": 74448896 }, { "epoch": 0.02, "learning_rate": 0.0004936509528585758, "loss": 3.7693, "theoretical_loss": 4.975139136912794, "tokens_seen": 74514432 }, { "epoch": 0.02, "learning_rate": 0.000493640922768305, "loss": 3.9908, "theoretical_loss": 4.974507524588424, "tokens_seen": 74579968 }, { "epoch": 0.02, "learning_rate": 0.0004936308926780341, "loss": 4.0607, "theoretical_loss": 4.973876622289927, "tokens_seen": 74645504 }, { "epoch": 0.02, "learning_rate": 0.0004936208625877633, "loss": 3.9677, "theoretical_loss": 4.973246428596802, "tokens_seen": 74711040 }, { "epoch": 0.02, "learning_rate": 0.0004936108324974925, "loss": 3.6206, "theoretical_loss": 4.972616942092634, "tokens_seen": 74776576 }, { "epoch": 0.02, "learning_rate": 0.0004936008024072217, "loss": 3.9187, "theoretical_loss": 4.971988161365077, "tokens_seen": 74842112 }, { "epoch": 0.02, "learning_rate": 0.0004935907723169509, "loss": 4.0964, "theoretical_loss": 4.9713600850058395, "tokens_seen": 74907648 }, { "epoch": 0.02, "learning_rate": 0.00049358074222668, "loss": 3.9787, "theoretical_loss": 4.970732711610667, "tokens_seen": 74973184 }, { "epoch": 0.02, "learning_rate": 0.0004935707121364092, "loss": 4.0178, "theoretical_loss": 4.97010603977933, "tokens_seen": 75038720 }, { "epoch": 0.02, "learning_rate": 0.0004935606820461384, "loss": 3.8845, "theoretical_loss": 4.96948006811561, "tokens_seen": 75104256 }, { "epoch": 0.02, "learning_rate": 0.0004935506519558676, "loss": 4.0952, "theoretical_loss": 4.968854795227281, "tokens_seen": 75169792 }, { "epoch": 0.02, "learning_rate": 0.0004935406218655968, "loss": 3.7761, "theoretical_loss": 4.968230219726093, "tokens_seen": 75235328 }, { "epoch": 0.02, "learning_rate": 0.000493530591775326, "loss": 3.9844, "theoretical_loss": 4.967606340227765, "tokens_seen": 75300864 }, { "epoch": 0.02, "objective/train/docs_used": 151789, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.175643444061279, "objective/train/theoretical_loss": 4.966983155351962, "objective/train/tokens_used": 95826400, "theoretical_loss": 4.966983155351962, "tokens_seen": 75366400 }, { "epoch": 0.02, "learning_rate": 0.0004935205616850551, "loss": 4.0763, "theoretical_loss": 4.966983155351962, "tokens_seen": 75366400 }, { "epoch": 0.02, "learning_rate": 0.0004935105315947844, "loss": 3.89, "theoretical_loss": 4.966360663722287, "tokens_seen": 75431936 }, { "epoch": 0.02, "learning_rate": 0.0004935005015045135, "loss": 3.9747, "theoretical_loss": 4.96573886396626, "tokens_seen": 75497472 }, { "epoch": 0.02, "learning_rate": 0.0004934904714142428, "loss": 3.9217, "theoretical_loss": 4.965117754715307, "tokens_seen": 75563008 }, { "epoch": 0.02, "learning_rate": 0.0004934804413239719, "loss": 3.7822, "theoretical_loss": 4.964497334604748, "tokens_seen": 75628544 }, { "epoch": 0.02, "learning_rate": 0.0004934704112337011, "loss": 3.8835, "theoretical_loss": 4.963877602273776, "tokens_seen": 75694080 }, { "epoch": 0.02, "learning_rate": 0.0004934603811434303, "loss": 4.0015, "theoretical_loss": 4.963258556365449, "tokens_seen": 75759616 }, { "epoch": 0.02, "learning_rate": 0.0004934503510531595, "loss": 4.0118, "theoretical_loss": 4.962640195526673, "tokens_seen": 75825152 }, { "epoch": 0.02, "learning_rate": 0.0004934403209628887, "loss": 4.1615, "theoretical_loss": 4.962022518408183, "tokens_seen": 75890688 }, { "epoch": 0.02, "learning_rate": 0.0004934302908726179, "loss": 4.0, "theoretical_loss": 4.96140552366454, "tokens_seen": 75956224 }, { "epoch": 0.02, "learning_rate": 0.000493420260782347, "loss": 3.9934, "theoretical_loss": 4.9607892099541075, "tokens_seen": 76021760 }, { "epoch": 0.02, "learning_rate": 0.0004934102306920762, "loss": 3.9141, "theoretical_loss": 4.9601735759390415, "tokens_seen": 76087296 }, { "epoch": 0.02, "learning_rate": 0.0004934002006018054, "loss": 3.838, "theoretical_loss": 4.959558620285274, "tokens_seen": 76152832 }, { "epoch": 0.02, "learning_rate": 0.0004933901705115346, "loss": 3.9634, "theoretical_loss": 4.958944341662502, "tokens_seen": 76218368 }, { "epoch": 0.02, "learning_rate": 0.0004933801404212638, "loss": 3.9765, "theoretical_loss": 4.958330738744172, "tokens_seen": 76283904 }, { "epoch": 0.02, "learning_rate": 0.000493370110330993, "loss": 4.02, "theoretical_loss": 4.957717810207466, "tokens_seen": 76349440 }, { "epoch": 0.02, "learning_rate": 0.0004933600802407221, "loss": 4.0177, "theoretical_loss": 4.957105554733289, "tokens_seen": 76414976 }, { "epoch": 0.02, "learning_rate": 0.0004933500501504513, "loss": 3.8828, "theoretical_loss": 4.956493971006253, "tokens_seen": 76480512 }, { "epoch": 0.02, "learning_rate": 0.0004933400200601805, "loss": 3.873, "theoretical_loss": 4.955883057714669, "tokens_seen": 76546048 }, { "epoch": 0.02, "learning_rate": 0.0004933299899699098, "loss": 3.9547, "theoretical_loss": 4.955272813550524, "tokens_seen": 76611584 }, { "epoch": 0.02, "learning_rate": 0.0004933199598796389, "loss": 3.9313, "theoretical_loss": 4.954663237209477, "tokens_seen": 76677120 }, { "epoch": 0.02, "learning_rate": 0.0004933099297893682, "loss": 4.1077, "theoretical_loss": 4.954054327390841, "tokens_seen": 76742656 }, { "epoch": 0.02, "learning_rate": 0.0004932998996990972, "loss": 3.847, "theoretical_loss": 4.9534460827975675, "tokens_seen": 76808192 }, { "epoch": 0.02, "learning_rate": 0.0004932898696088265, "loss": 3.9527, "theoretical_loss": 4.952838502136241, "tokens_seen": 76873728 }, { "epoch": 0.02, "learning_rate": 0.0004932798395185557, "loss": 3.9222, "theoretical_loss": 4.952231584117056, "tokens_seen": 76939264 }, { "epoch": 0.02, "objective/train/docs_used": 154459, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7871055603027344, "objective/train/theoretical_loss": 4.951625327453812, "objective/train/tokens_used": 97464800, "theoretical_loss": 4.951625327453812, "tokens_seen": 77004800 }, { "epoch": 0.02, "learning_rate": 0.0004932698094282849, "loss": 3.6864, "theoretical_loss": 4.951625327453812, "tokens_seen": 77004800 }, { "epoch": 0.02, "learning_rate": 0.0004932597793380141, "loss": 4.1088, "theoretical_loss": 4.951019730863894, "tokens_seen": 77070336 }, { "epoch": 0.02, "learning_rate": 0.0004932497492477432, "loss": 3.9973, "theoretical_loss": 4.950414793068266, "tokens_seen": 77135872 }, { "epoch": 0.02, "learning_rate": 0.0004932397191574724, "loss": 3.945, "theoretical_loss": 4.94981051279145, "tokens_seen": 77201408 }, { "epoch": 0.02, "learning_rate": 0.0004932296890672016, "loss": 3.8127, "theoretical_loss": 4.94920688876152, "tokens_seen": 77266944 }, { "epoch": 0.02, "learning_rate": 0.0004932196589769308, "loss": 3.8468, "theoretical_loss": 4.948603919710088, "tokens_seen": 77332480 }, { "epoch": 0.02, "learning_rate": 0.00049320962888666, "loss": 3.9881, "theoretical_loss": 4.948001604372287, "tokens_seen": 77398016 }, { "epoch": 0.02, "learning_rate": 0.0004931995987963893, "loss": 3.7925, "theoretical_loss": 4.947399941486762, "tokens_seen": 77463552 }, { "epoch": 0.02, "learning_rate": 0.0004931895687061183, "loss": 3.9202, "theoretical_loss": 4.946798929795658, "tokens_seen": 77529088 }, { "epoch": 0.02, "learning_rate": 0.0004931795386158476, "loss": 3.9039, "theoretical_loss": 4.946198568044602, "tokens_seen": 77594624 }, { "epoch": 0.02, "learning_rate": 0.0004931695085255767, "loss": 3.743, "theoretical_loss": 4.945598854982698, "tokens_seen": 77660160 }, { "epoch": 0.02, "learning_rate": 0.000493159478435306, "loss": 3.8027, "theoretical_loss": 4.944999789362508, "tokens_seen": 77725696 }, { "epoch": 0.02, "learning_rate": 0.0004931494483450352, "loss": 3.9098, "theoretical_loss": 4.944401369940043, "tokens_seen": 77791232 }, { "epoch": 0.02, "learning_rate": 0.0004931394182547643, "loss": 3.9303, "theoretical_loss": 4.9438035954747495, "tokens_seen": 77856768 }, { "epoch": 0.02, "learning_rate": 0.0004931293881644935, "loss": 3.8478, "theoretical_loss": 4.9432064647294975, "tokens_seen": 77922304 }, { "epoch": 0.02, "learning_rate": 0.0004931193580742227, "loss": 3.9932, "theoretical_loss": 4.942609976470566, "tokens_seen": 77987840 }, { "epoch": 0.02, "learning_rate": 0.0004931093279839519, "loss": 3.7249, "theoretical_loss": 4.942014129467637, "tokens_seen": 78053376 }, { "epoch": 0.02, "learning_rate": 0.0004930992978936811, "loss": 3.9785, "theoretical_loss": 4.941418922493774, "tokens_seen": 78118912 }, { "epoch": 0.02, "learning_rate": 0.0004930892678034102, "loss": 4.0042, "theoretical_loss": 4.940824354325419, "tokens_seen": 78184448 }, { "epoch": 0.02, "learning_rate": 0.0004930792377131394, "loss": 3.7392, "theoretical_loss": 4.940230423742372, "tokens_seen": 78249984 }, { "epoch": 0.02, "learning_rate": 0.0004930692076228686, "loss": 4.0698, "theoretical_loss": 4.939637129527789, "tokens_seen": 78315520 }, { "epoch": 0.02, "learning_rate": 0.0004930591775325978, "loss": 3.9662, "theoretical_loss": 4.939044470468156, "tokens_seen": 78381056 }, { "epoch": 0.02, "learning_rate": 0.000493049147442327, "loss": 3.9963, "theoretical_loss": 4.938452445353294, "tokens_seen": 78446592 }, { "epoch": 0.02, "learning_rate": 0.0004930391173520562, "loss": 3.8801, "theoretical_loss": 4.937861052976332, "tokens_seen": 78512128 }, { "epoch": 0.02, "learning_rate": 0.0004930290872617853, "loss": 4.0235, "theoretical_loss": 4.937270292133704, "tokens_seen": 78577664 }, { "epoch": 0.02, "objective/train/docs_used": 157231, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.538254737854004, "objective/train/theoretical_loss": 4.9366801616251355, "objective/train/tokens_used": 99103200, "theoretical_loss": 4.9366801616251355, "tokens_seen": 78643200 }, { "epoch": 0.02, "learning_rate": 0.0004930190571715146, "loss": 3.6767, "theoretical_loss": 4.9366801616251355, "tokens_seen": 78643200 }, { "epoch": 0.02, "learning_rate": 0.0004930090270812437, "loss": 3.9388, "theoretical_loss": 4.93609066025363, "tokens_seen": 78708736 }, { "epoch": 0.02, "learning_rate": 0.000492998996990973, "loss": 4.0195, "theoretical_loss": 4.935501786825457, "tokens_seen": 78774272 }, { "epoch": 0.02, "learning_rate": 0.0004929889669007021, "loss": 4.0291, "theoretical_loss": 4.934913540150143, "tokens_seen": 78839808 }, { "epoch": 0.02, "learning_rate": 0.0004929789368104313, "loss": 3.9964, "theoretical_loss": 4.934325919040461, "tokens_seen": 78905344 }, { "epoch": 0.02, "learning_rate": 0.0004929689067201605, "loss": 3.8374, "theoretical_loss": 4.933738922312413, "tokens_seen": 78970880 }, { "epoch": 0.02, "learning_rate": 0.0004929588766298897, "loss": 3.7884, "theoretical_loss": 4.933152548785222, "tokens_seen": 79036416 }, { "epoch": 0.02, "learning_rate": 0.0004929488465396189, "loss": 4.0148, "theoretical_loss": 4.932566797281324, "tokens_seen": 79101952 }, { "epoch": 0.02, "learning_rate": 0.0004929388164493481, "loss": 3.6818, "theoretical_loss": 4.931981666626351, "tokens_seen": 79167488 }, { "epoch": 0.02, "learning_rate": 0.0004929287863590773, "loss": 3.7694, "theoretical_loss": 4.931397155649121, "tokens_seen": 79233024 }, { "epoch": 0.02, "learning_rate": 0.0004929187562688064, "loss": 3.7043, "theoretical_loss": 4.930813263181631, "tokens_seen": 79298560 }, { "epoch": 0.02, "learning_rate": 0.0004929087261785356, "loss": 3.8084, "theoretical_loss": 4.93022998805904, "tokens_seen": 79364096 }, { "epoch": 0.02, "learning_rate": 0.0004928986960882648, "loss": 3.7386, "theoretical_loss": 4.929647329119659, "tokens_seen": 79429632 }, { "epoch": 0.02, "learning_rate": 0.000492888665997994, "loss": 4.0392, "theoretical_loss": 4.9290652852049455, "tokens_seen": 79495168 }, { "epoch": 0.02, "learning_rate": 0.0004928786359077232, "loss": 3.8436, "theoretical_loss": 4.928483855159485, "tokens_seen": 79560704 }, { "epoch": 0.02, "learning_rate": 0.0004928686058174523, "loss": 3.7666, "theoretical_loss": 4.927903037830983, "tokens_seen": 79626240 }, { "epoch": 0.02, "learning_rate": 0.0004928585757271815, "loss": 3.8876, "theoretical_loss": 4.9273228320702565, "tokens_seen": 79691776 }, { "epoch": 0.02, "learning_rate": 0.0004928485456369107, "loss": 3.8625, "theoretical_loss": 4.926743236731218, "tokens_seen": 79757312 }, { "epoch": 0.02, "learning_rate": 0.00049283851554664, "loss": 3.9097, "theoretical_loss": 4.926164250670868, "tokens_seen": 79822848 }, { "epoch": 0.02, "learning_rate": 0.0004928284854563691, "loss": 3.7908, "theoretical_loss": 4.925585872749284, "tokens_seen": 79888384 }, { "epoch": 0.02, "learning_rate": 0.0004928184553660984, "loss": 3.9244, "theoretical_loss": 4.925008101829608, "tokens_seen": 79953920 }, { "epoch": 0.02, "learning_rate": 0.0004928084252758274, "loss": 3.9244, "theoretical_loss": 4.9244309367780374, "tokens_seen": 80019456 }, { "epoch": 0.02, "learning_rate": 0.0004927983951855567, "loss": 3.8718, "theoretical_loss": 4.923854376463816, "tokens_seen": 80084992 }, { "epoch": 0.02, "learning_rate": 0.0004927883650952859, "loss": 3.9542, "theoretical_loss": 4.923278419759217, "tokens_seen": 80150528 }, { "epoch": 0.02, "learning_rate": 0.0004927783350050151, "loss": 3.8026, "theoretical_loss": 4.92270306553954, "tokens_seen": 80216064 }, { "epoch": 0.02, "objective/train/docs_used": 158547, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.087661266326904, "objective/train/theoretical_loss": 4.922128312683096, "objective/train/tokens_used": 100741600, "theoretical_loss": 4.922128312683096, "tokens_seen": 80281600 }, { "epoch": 0.02, "learning_rate": 0.0004927683049147443, "loss": 3.9703, "theoretical_loss": 4.922128312683096, "tokens_seen": 80281600 }, { "epoch": 0.02, "learning_rate": 0.0004927582748244734, "loss": 4.0453, "theoretical_loss": 4.921554160071194, "tokens_seen": 80347136 }, { "epoch": 0.02, "learning_rate": 0.0004927482447342026, "loss": 3.9074, "theoretical_loss": 4.920980606588142, "tokens_seen": 80412672 }, { "epoch": 0.02, "learning_rate": 0.0004927382146439318, "loss": 3.9357, "theoretical_loss": 4.920407651121222, "tokens_seen": 80478208 }, { "epoch": 0.02, "learning_rate": 0.000492728184553661, "loss": 3.8219, "theoretical_loss": 4.919835292560689, "tokens_seen": 80543744 }, { "epoch": 0.02, "learning_rate": 0.0004927181544633902, "loss": 3.8784, "theoretical_loss": 4.919263529799759, "tokens_seen": 80609280 }, { "epoch": 0.02, "learning_rate": 0.0004927081243731193, "loss": 3.9352, "theoretical_loss": 4.918692361734598, "tokens_seen": 80674816 }, { "epoch": 0.02, "learning_rate": 0.0004926980942828485, "loss": 3.8361, "theoretical_loss": 4.91812178726431, "tokens_seen": 80740352 }, { "epoch": 0.02, "learning_rate": 0.0004926880641925777, "loss": 3.8587, "theoretical_loss": 4.917551805290929, "tokens_seen": 80805888 }, { "epoch": 0.02, "learning_rate": 0.0004926780341023069, "loss": 3.8611, "theoretical_loss": 4.916982414719408, "tokens_seen": 80871424 }, { "epoch": 0.02, "learning_rate": 0.0004926680040120361, "loss": 3.9623, "theoretical_loss": 4.9164136144576105, "tokens_seen": 80936960 }, { "epoch": 0.02, "learning_rate": 0.0004926579739217654, "loss": 3.7275, "theoretical_loss": 4.915845403416299, "tokens_seen": 81002496 }, { "epoch": 0.02, "learning_rate": 0.0004926479438314944, "loss": 3.8576, "theoretical_loss": 4.915277780509124, "tokens_seen": 81068032 }, { "epoch": 0.02, "learning_rate": 0.0004926379137412237, "loss": 3.9434, "theoretical_loss": 4.914710744652614, "tokens_seen": 81133568 }, { "epoch": 0.02, "learning_rate": 0.0004926278836509528, "loss": 3.8827, "theoretical_loss": 4.914144294766169, "tokens_seen": 81199104 }, { "epoch": 0.02, "learning_rate": 0.0004926178535606821, "loss": 3.8772, "theoretical_loss": 4.913578429772047, "tokens_seen": 81264640 }, { "epoch": 0.02, "learning_rate": 0.0004926078234704113, "loss": 3.9044, "theoretical_loss": 4.913013148595355, "tokens_seen": 81330176 }, { "epoch": 0.02, "learning_rate": 0.0004925977933801404, "loss": 3.8868, "theoretical_loss": 4.912448450164041, "tokens_seen": 81395712 }, { "epoch": 0.02, "learning_rate": 0.0004925877632898696, "loss": 3.7639, "theoretical_loss": 4.91188433340888, "tokens_seen": 81461248 }, { "epoch": 0.02, "learning_rate": 0.0004925777331995988, "loss": 3.9118, "theoretical_loss": 4.911320797263471, "tokens_seen": 81526784 }, { "epoch": 0.02, "learning_rate": 0.000492567703109328, "loss": 3.8558, "theoretical_loss": 4.910757840664219, "tokens_seen": 81592320 }, { "epoch": 0.02, "learning_rate": 0.0004925576730190572, "loss": 3.7093, "theoretical_loss": 4.910195462550334, "tokens_seen": 81657856 }, { "epoch": 0.02, "learning_rate": 0.0004925476429287864, "loss": 3.9947, "theoretical_loss": 4.909633661863811, "tokens_seen": 81723392 }, { "epoch": 0.02, "learning_rate": 0.0004925376128385155, "loss": 3.7547, "theoretical_loss": 4.909072437549434, "tokens_seen": 81788928 }, { "epoch": 0.02, "learning_rate": 0.0004925275827482447, "loss": 3.6629, "theoretical_loss": 4.908511788554753, "tokens_seen": 81854464 }, { "epoch": 0.02, "objective/train/docs_used": 161230, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.66131854057312, "objective/train/theoretical_loss": 4.907951713830082, "objective/train/tokens_used": 102380000, "theoretical_loss": 4.907951713830082, "tokens_seen": 81920000 }, { "epoch": 0.02, "learning_rate": 0.0004925175526579739, "loss": 3.9337, "theoretical_loss": 4.907951713830082, "tokens_seen": 81920000 }, { "epoch": 0.02, "learning_rate": 0.0004925075225677031, "loss": 3.8424, "theoretical_loss": 4.907392212328489, "tokens_seen": 81985536 }, { "epoch": 0.02, "learning_rate": 0.0004924974924774323, "loss": 3.8054, "theoretical_loss": 4.906833283005785, "tokens_seen": 82051072 }, { "epoch": 0.02, "learning_rate": 0.0004924874623871615, "loss": 3.8893, "theoretical_loss": 4.906274924820515, "tokens_seen": 82116608 }, { "epoch": 0.02, "learning_rate": 0.0004924774322968907, "loss": 4.0462, "theoretical_loss": 4.90571713673395, "tokens_seen": 82182144 }, { "epoch": 0.02, "learning_rate": 0.0004924674022066199, "loss": 3.8713, "theoretical_loss": 4.905159917710073, "tokens_seen": 82247680 }, { "epoch": 0.02, "learning_rate": 0.0004924573721163491, "loss": 3.6772, "theoretical_loss": 4.904603266715578, "tokens_seen": 82313216 }, { "epoch": 0.02, "learning_rate": 0.0004924473420260783, "loss": 3.8641, "theoretical_loss": 4.904047182719854, "tokens_seen": 82378752 }, { "epoch": 0.02, "learning_rate": 0.0004924373119358075, "loss": 3.9288, "theoretical_loss": 4.903491664694977, "tokens_seen": 82444288 }, { "epoch": 0.03, "learning_rate": 0.0004924272818455366, "loss": 3.8435, "theoretical_loss": 4.902936711615702, "tokens_seen": 82509824 }, { "epoch": 0.03, "learning_rate": 0.0004924172517552658, "loss": 3.6423, "theoretical_loss": 4.902382322459456, "tokens_seen": 82575360 }, { "epoch": 0.03, "learning_rate": 0.000492407221664995, "loss": 3.7453, "theoretical_loss": 4.901828496206322, "tokens_seen": 82640896 }, { "epoch": 0.03, "learning_rate": 0.0004923971915747242, "loss": 3.8669, "theoretical_loss": 4.90127523183904, "tokens_seen": 82706432 }, { "epoch": 0.03, "learning_rate": 0.0004923871614844534, "loss": 3.9096, "theoretical_loss": 4.900722528342988, "tokens_seen": 82771968 }, { "epoch": 0.03, "learning_rate": 0.0004923771313941825, "loss": 3.8983, "theoretical_loss": 4.900170384706181, "tokens_seen": 82837504 }, { "epoch": 0.03, "learning_rate": 0.0004923671013039117, "loss": 3.8518, "theoretical_loss": 4.899618799919256, "tokens_seen": 82903040 }, { "epoch": 0.03, "learning_rate": 0.0004923570712136409, "loss": 3.9242, "theoretical_loss": 4.899067772975469, "tokens_seen": 82968576 }, { "epoch": 0.03, "learning_rate": 0.0004923470411233702, "loss": 3.8165, "theoretical_loss": 4.898517302870679, "tokens_seen": 83034112 }, { "epoch": 0.03, "learning_rate": 0.0004923370110330993, "loss": 3.7829, "theoretical_loss": 4.897967388603346, "tokens_seen": 83099648 }, { "epoch": 0.03, "learning_rate": 0.0004923269809428286, "loss": 4.027, "theoretical_loss": 4.897418029174519, "tokens_seen": 83165184 }, { "epoch": 0.03, "learning_rate": 0.0004923169508525576, "loss": 3.809, "theoretical_loss": 4.896869223587828, "tokens_seen": 83230720 }, { "epoch": 0.03, "learning_rate": 0.0004923069207622869, "loss": 3.5978, "theoretical_loss": 4.896320970849472, "tokens_seen": 83296256 }, { "epoch": 0.03, "learning_rate": 0.0004922968906720161, "loss": 3.8513, "theoretical_loss": 4.895773269968219, "tokens_seen": 83361792 }, { "epoch": 0.03, "learning_rate": 0.0004922868605817453, "loss": 3.8171, "theoretical_loss": 4.895226119955386, "tokens_seen": 83427328 }, { "epoch": 0.03, "learning_rate": 0.0004922768304914745, "loss": 3.9333, "theoretical_loss": 4.894679519824841, "tokens_seen": 83492864 }, { "epoch": 0.03, "objective/train/docs_used": 164075, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.917760133743286, "objective/train/theoretical_loss": 4.894133468592984, "objective/train/tokens_used": 104018400, "theoretical_loss": 4.894133468592984, "tokens_seen": 83558400 }, { "epoch": 0.03, "learning_rate": 0.0004922668004012036, "loss": 3.8042, "theoretical_loss": 4.894133468592984, "tokens_seen": 83558400 }, { "epoch": 0.03, "learning_rate": 0.0004922567703109328, "loss": 3.7611, "theoretical_loss": 4.8935879652787495, "tokens_seen": 83623936 }, { "epoch": 0.03, "learning_rate": 0.000492246740220662, "loss": 3.7287, "theoretical_loss": 4.893043008903591, "tokens_seen": 83689472 }, { "epoch": 0.03, "learning_rate": 0.0004922367101303912, "loss": 3.9224, "theoretical_loss": 4.892498598491473, "tokens_seen": 83755008 }, { "epoch": 0.03, "learning_rate": 0.0004922266800401204, "loss": 3.8353, "theoretical_loss": 4.891954733068863, "tokens_seen": 83820544 }, { "epoch": 0.03, "learning_rate": 0.0004922166499498495, "loss": 3.8987, "theoretical_loss": 4.891411411664727, "tokens_seen": 83886080 }, { "epoch": 0.03, "learning_rate": 0.0004922066198595787, "loss": 3.8912, "theoretical_loss": 4.890868633310515, "tokens_seen": 83951616 }, { "epoch": 0.03, "learning_rate": 0.0004921965897693079, "loss": 3.7295, "theoretical_loss": 4.890326397040158, "tokens_seen": 84017152 }, { "epoch": 0.03, "learning_rate": 0.0004921865596790371, "loss": 3.9006, "theoretical_loss": 4.889784701890056, "tokens_seen": 84082688 }, { "epoch": 0.03, "learning_rate": 0.0004921765295887663, "loss": 3.8, "theoretical_loss": 4.8892435468990705, "tokens_seen": 84148224 }, { "epoch": 0.03, "learning_rate": 0.0004921664994984956, "loss": 3.8752, "theoretical_loss": 4.88870293110852, "tokens_seen": 84213760 }, { "epoch": 0.03, "learning_rate": 0.0004921564694082246, "loss": 3.7345, "theoretical_loss": 4.888162853562166, "tokens_seen": 84279296 }, { "epoch": 0.03, "learning_rate": 0.0004921464393179539, "loss": 3.8029, "theoretical_loss": 4.88762331330621, "tokens_seen": 84344832 }, { "epoch": 0.03, "learning_rate": 0.000492136409227683, "loss": 3.911, "theoretical_loss": 4.88708430938928, "tokens_seen": 84410368 }, { "epoch": 0.03, "learning_rate": 0.0004921263791374123, "loss": 3.8969, "theoretical_loss": 4.8865458408624285, "tokens_seen": 84475904 }, { "epoch": 0.03, "learning_rate": 0.0004921163490471415, "loss": 3.7947, "theoretical_loss": 4.8860079067791204, "tokens_seen": 84541440 }, { "epoch": 0.03, "learning_rate": 0.0004921063189568706, "loss": 3.8612, "theoretical_loss": 4.885470506195227, "tokens_seen": 84606976 }, { "epoch": 0.03, "learning_rate": 0.0004920962888665998, "loss": 3.8006, "theoretical_loss": 4.884933638169014, "tokens_seen": 84672512 }, { "epoch": 0.03, "learning_rate": 0.000492086258776329, "loss": 3.954, "theoretical_loss": 4.88439730176114, "tokens_seen": 84738048 }, { "epoch": 0.03, "learning_rate": 0.0004920762286860582, "loss": 3.7704, "theoretical_loss": 4.883861496034644, "tokens_seen": 84803584 }, { "epoch": 0.03, "learning_rate": 0.0004920661985957874, "loss": 3.8477, "theoretical_loss": 4.88332622005494, "tokens_seen": 84869120 }, { "epoch": 0.03, "learning_rate": 0.0004920561685055166, "loss": 3.862, "theoretical_loss": 4.8827914728898065, "tokens_seen": 84934656 }, { "epoch": 0.03, "learning_rate": 0.0004920461384152457, "loss": 3.8259, "theoretical_loss": 4.88225725360938, "tokens_seen": 85000192 }, { "epoch": 0.03, "learning_rate": 0.0004920361083249749, "loss": 3.9433, "theoretical_loss": 4.881723561286149, "tokens_seen": 85065728 }, { "epoch": 0.03, "learning_rate": 0.0004920260782347041, "loss": 3.8617, "theoretical_loss": 4.881190394994943, "tokens_seen": 85131264 }, { "epoch": 0.03, "objective/train/docs_used": 167121, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6758368015289307, "objective/train/theoretical_loss": 4.880657753812926, "objective/train/tokens_used": 105656800, "theoretical_loss": 4.880657753812926, "tokens_seen": 85196800 }, { "epoch": 0.03, "learning_rate": 0.0004920160481444333, "loss": 3.8256, "theoretical_loss": 4.880657753812926, "tokens_seen": 85196800 }, { "epoch": 0.03, "learning_rate": 0.0004920060180541625, "loss": 3.6626, "theoretical_loss": 4.880125636819594, "tokens_seen": 85262336 }, { "epoch": 0.03, "learning_rate": 0.0004919959879638916, "loss": 4.0169, "theoretical_loss": 4.879594043096755, "tokens_seen": 85327872 }, { "epoch": 0.03, "learning_rate": 0.0004919859578736209, "loss": 3.8117, "theoretical_loss": 4.879062971728534, "tokens_seen": 85393408 }, { "epoch": 0.03, "learning_rate": 0.00049197592778335, "loss": 3.7823, "theoretical_loss": 4.87853242180136, "tokens_seen": 85458944 }, { "epoch": 0.03, "learning_rate": 0.0004919658976930793, "loss": 3.7133, "theoretical_loss": 4.878002392403959, "tokens_seen": 85524480 }, { "epoch": 0.03, "learning_rate": 0.0004919558676028084, "loss": 3.876, "theoretical_loss": 4.877472882627343, "tokens_seen": 85590016 }, { "epoch": 0.03, "learning_rate": 0.0004919458375125377, "loss": 3.8415, "theoretical_loss": 4.8769438915648085, "tokens_seen": 85655552 }, { "epoch": 0.03, "learning_rate": 0.0004919358074222668, "loss": 3.644, "theoretical_loss": 4.876415418311928, "tokens_seen": 85721088 }, { "epoch": 0.03, "learning_rate": 0.000491925777331996, "loss": 3.8158, "theoretical_loss": 4.875887461966537, "tokens_seen": 85786624 }, { "epoch": 0.03, "learning_rate": 0.0004919157472417252, "loss": 3.787, "theoretical_loss": 4.875360021628733, "tokens_seen": 85852160 }, { "epoch": 0.03, "learning_rate": 0.0004919057171514544, "loss": 3.7131, "theoretical_loss": 4.874833096400865, "tokens_seen": 85917696 }, { "epoch": 0.03, "learning_rate": 0.0004918956870611836, "loss": 3.7312, "theoretical_loss": 4.874306685387525, "tokens_seen": 85983232 }, { "epoch": 0.03, "learning_rate": 0.0004918856569709127, "loss": 3.8239, "theoretical_loss": 4.873780787695547, "tokens_seen": 86048768 }, { "epoch": 0.03, "learning_rate": 0.0004918756268806419, "loss": 3.8682, "theoretical_loss": 4.87325540243399, "tokens_seen": 86114304 }, { "epoch": 0.03, "learning_rate": 0.0004918655967903711, "loss": 3.9031, "theoretical_loss": 4.872730528714139, "tokens_seen": 86179840 }, { "epoch": 0.03, "learning_rate": 0.0004918555667001003, "loss": 3.7074, "theoretical_loss": 4.872206165649493, "tokens_seen": 86245376 }, { "epoch": 0.03, "learning_rate": 0.0004918455366098295, "loss": 3.7849, "theoretical_loss": 4.871682312355761, "tokens_seen": 86310912 }, { "epoch": 0.03, "learning_rate": 0.0004918355065195586, "loss": 3.8876, "theoretical_loss": 4.871158967950852, "tokens_seen": 86376448 }, { "epoch": 0.03, "learning_rate": 0.0004918254764292878, "loss": 3.8094, "theoretical_loss": 4.870636131554869, "tokens_seen": 86441984 }, { "epoch": 0.03, "learning_rate": 0.000491815446339017, "loss": 3.8882, "theoretical_loss": 4.8701138022901045, "tokens_seen": 86507520 }, { "epoch": 0.03, "learning_rate": 0.0004918054162487463, "loss": 3.818, "theoretical_loss": 4.869591979281028, "tokens_seen": 86573056 }, { "epoch": 0.03, "learning_rate": 0.0004917953861584754, "loss": 3.9174, "theoretical_loss": 4.8690706616542805, "tokens_seen": 86638592 }, { "epoch": 0.03, "learning_rate": 0.0004917853560682047, "loss": 3.7477, "theoretical_loss": 4.868549848538675, "tokens_seen": 86704128 }, { "epoch": 0.03, "learning_rate": 0.0004917753259779337, "loss": 3.7602, "theoretical_loss": 4.868029539065176, "tokens_seen": 86769664 }, { "epoch": 0.03, "objective/train/docs_used": 170013, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.95674204826355, "objective/train/theoretical_loss": 4.867509732366907, "objective/train/tokens_used": 107295200, "theoretical_loss": 4.867509732366907, "tokens_seen": 86835200 }, { "epoch": 0.03, "learning_rate": 0.000491765295887663, "loss": 3.9083, "theoretical_loss": 4.867509732366907, "tokens_seen": 86835200 }, { "epoch": 0.03, "learning_rate": 0.0004917552657973922, "loss": 3.8751, "theoretical_loss": 4.866990427579129, "tokens_seen": 86900736 }, { "epoch": 0.03, "learning_rate": 0.0004917452357071214, "loss": 3.8234, "theoretical_loss": 4.866471623839248, "tokens_seen": 86966272 }, { "epoch": 0.03, "learning_rate": 0.0004917352056168506, "loss": 3.8019, "theoretical_loss": 4.8659533202867955, "tokens_seen": 87031808 }, { "epoch": 0.03, "learning_rate": 0.0004917251755265797, "loss": 3.9246, "theoretical_loss": 4.86543551606343, "tokens_seen": 87097344 }, { "epoch": 0.03, "learning_rate": 0.0004917151454363089, "loss": 3.7829, "theoretical_loss": 4.864918210312927, "tokens_seen": 87162880 }, { "epoch": 0.03, "learning_rate": 0.0004917051153460381, "loss": 3.8191, "theoretical_loss": 4.864401402181173, "tokens_seen": 87228416 }, { "epoch": 0.03, "learning_rate": 0.0004916950852557673, "loss": 3.7456, "theoretical_loss": 4.863885090816158, "tokens_seen": 87293952 }, { "epoch": 0.03, "learning_rate": 0.0004916850551654965, "loss": 3.8745, "theoretical_loss": 4.863369275367968, "tokens_seen": 87359488 }, { "epoch": 0.03, "learning_rate": 0.0004916750250752258, "loss": 3.8965, "theoretical_loss": 4.862853954988781, "tokens_seen": 87425024 }, { "epoch": 0.03, "learning_rate": 0.0004916649949849548, "loss": 3.7952, "theoretical_loss": 4.862339128832857, "tokens_seen": 87490560 }, { "epoch": 0.03, "learning_rate": 0.0004916549648946841, "loss": 3.8899, "theoretical_loss": 4.861824796056533, "tokens_seen": 87556096 }, { "epoch": 0.03, "learning_rate": 0.0004916449348044132, "loss": 3.6942, "theoretical_loss": 4.861310955818219, "tokens_seen": 87621632 }, { "epoch": 0.03, "learning_rate": 0.0004916349047141425, "loss": 3.8929, "theoretical_loss": 4.860797607278385, "tokens_seen": 87687168 }, { "epoch": 0.03, "learning_rate": 0.0004916248746238717, "loss": 3.8, "theoretical_loss": 4.86028474959956, "tokens_seen": 87752704 }, { "epoch": 0.03, "learning_rate": 0.0004916148445336008, "loss": 3.6308, "theoretical_loss": 4.859772381946323, "tokens_seen": 87818240 }, { "epoch": 0.03, "learning_rate": 0.00049160481444333, "loss": 3.7281, "theoretical_loss": 4.859260503485298, "tokens_seen": 87883776 }, { "epoch": 0.03, "learning_rate": 0.0004915947843530592, "loss": 3.7924, "theoretical_loss": 4.858749113385144, "tokens_seen": 87949312 }, { "epoch": 0.03, "learning_rate": 0.0004915847542627884, "loss": 3.7335, "theoretical_loss": 4.858238210816554, "tokens_seen": 88014848 }, { "epoch": 0.03, "learning_rate": 0.0004915747241725176, "loss": 3.7622, "theoretical_loss": 4.8577277949522415, "tokens_seen": 88080384 }, { "epoch": 0.03, "learning_rate": 0.0004915646940822468, "loss": 3.7617, "theoretical_loss": 4.857217864966943, "tokens_seen": 88145920 }, { "epoch": 0.03, "learning_rate": 0.0004915546639919759, "loss": 3.7003, "theoretical_loss": 4.856708420037402, "tokens_seen": 88211456 }, { "epoch": 0.03, "learning_rate": 0.0004915446339017051, "loss": 3.6099, "theoretical_loss": 4.8561994593423705, "tokens_seen": 88276992 }, { "epoch": 0.03, "learning_rate": 0.0004915346038114343, "loss": 3.6149, "theoretical_loss": 4.8556909820625975, "tokens_seen": 88342528 }, { "epoch": 0.03, "learning_rate": 0.0004915245737211635, "loss": 3.7565, "theoretical_loss": 4.855182987380823, "tokens_seen": 88408064 }, { "epoch": 0.03, "objective/train/docs_used": 171411, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 4.086789131164551, "objective/train/theoretical_loss": 4.854675474481779, "objective/train/tokens_used": 108933600, "theoretical_loss": 4.854675474481779, "tokens_seen": 88473600 }, { "epoch": 0.03, "learning_rate": 0.0004915145436308927, "loss": 3.8653, "theoretical_loss": 4.854675474481779, "tokens_seen": 88473600 }, { "epoch": 0.03, "learning_rate": 0.0004915045135406218, "loss": 3.6881, "theoretical_loss": 4.8541684425521705, "tokens_seen": 88539136 }, { "epoch": 0.03, "learning_rate": 0.0004914944834503511, "loss": 3.7421, "theoretical_loss": 4.85366189078068, "tokens_seen": 88604672 }, { "epoch": 0.03, "learning_rate": 0.0004914844533600802, "loss": 3.8453, "theoretical_loss": 4.853155818357957, "tokens_seen": 88670208 }, { "epoch": 0.03, "learning_rate": 0.0004914744232698095, "loss": 3.8291, "theoretical_loss": 4.852650224476609, "tokens_seen": 88735744 }, { "epoch": 0.03, "learning_rate": 0.0004914643931795386, "loss": 3.7445, "theoretical_loss": 4.852145108331205, "tokens_seen": 88801280 }, { "epoch": 0.03, "learning_rate": 0.0004914543630892679, "loss": 3.639, "theoretical_loss": 4.851640469118255, "tokens_seen": 88866816 }, { "epoch": 0.03, "learning_rate": 0.000491444332998997, "loss": 3.8614, "theoretical_loss": 4.851136306036219, "tokens_seen": 88932352 }, { "epoch": 0.03, "learning_rate": 0.0004914343029087262, "loss": 3.8589, "theoretical_loss": 4.850632618285486, "tokens_seen": 88997888 }, { "epoch": 0.03, "learning_rate": 0.0004914242728184554, "loss": 3.8876, "theoretical_loss": 4.850129405068383, "tokens_seen": 89063424 }, { "epoch": 0.03, "learning_rate": 0.0004914142427281846, "loss": 3.7332, "theoretical_loss": 4.849626665589156, "tokens_seen": 89128960 }, { "epoch": 0.03, "learning_rate": 0.0004914042126379138, "loss": 3.7111, "theoretical_loss": 4.849124399053969, "tokens_seen": 89194496 }, { "epoch": 0.03, "learning_rate": 0.0004913941825476429, "loss": 3.4025, "theoretical_loss": 4.8486226046709024, "tokens_seen": 89260032 }, { "epoch": 0.03, "learning_rate": 0.0004913841524573721, "loss": 3.7824, "theoretical_loss": 4.8481212816499415, "tokens_seen": 89325568 }, { "epoch": 0.03, "learning_rate": 0.0004913741223671013, "loss": 3.7842, "theoretical_loss": 4.847620429202967, "tokens_seen": 89391104 }, { "epoch": 0.03, "learning_rate": 0.0004913640922768305, "loss": 3.8446, "theoretical_loss": 4.847120046543763, "tokens_seen": 89456640 }, { "epoch": 0.03, "learning_rate": 0.0004913540621865597, "loss": 3.7927, "theoretical_loss": 4.846620132887992, "tokens_seen": 89522176 }, { "epoch": 0.03, "learning_rate": 0.0004913440320962888, "loss": 3.8718, "theoretical_loss": 4.8461206874532055, "tokens_seen": 89587712 }, { "epoch": 0.03, "learning_rate": 0.000491334002006018, "loss": 3.6709, "theoretical_loss": 4.845621709458831, "tokens_seen": 89653248 }, { "epoch": 0.03, "learning_rate": 0.0004913239719157472, "loss": 3.968, "theoretical_loss": 4.845123198126162, "tokens_seen": 89718784 }, { "epoch": 0.03, "learning_rate": 0.0004913139418254765, "loss": 3.7329, "theoretical_loss": 4.844625152678364, "tokens_seen": 89784320 }, { "epoch": 0.03, "learning_rate": 0.0004913039117352056, "loss": 3.791, "theoretical_loss": 4.844127572340455, "tokens_seen": 89849856 }, { "epoch": 0.03, "learning_rate": 0.0004912938816449349, "loss": 3.4753, "theoretical_loss": 4.84363045633931, "tokens_seen": 89915392 }, { "epoch": 0.03, "learning_rate": 0.0004912838515546639, "loss": 3.6891, "theoretical_loss": 4.843133803903651, "tokens_seen": 89980928 }, { "epoch": 0.03, "learning_rate": 0.0004912738214643932, "loss": 3.7444, "theoretical_loss": 4.84263761426404, "tokens_seen": 90046464 }, { "epoch": 0.03, "objective/train/docs_used": 174465, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.986464500427246, "objective/train/theoretical_loss": 4.842141886652876, "objective/train/tokens_used": 110572000, "theoretical_loss": 4.842141886652876, "tokens_seen": 90112000 }, { "epoch": 0.03, "learning_rate": 0.0004912637913741224, "loss": 3.8477, "theoretical_loss": 4.842141886652876, "tokens_seen": 90112000 }, { "epoch": 0.03, "learning_rate": 0.0004912537612838516, "loss": 3.6286, "theoretical_loss": 4.841646620304388, "tokens_seen": 90177536 }, { "epoch": 0.03, "learning_rate": 0.0004912437311935808, "loss": 3.7422, "theoretical_loss": 4.841151814454632, "tokens_seen": 90243072 }, { "epoch": 0.03, "learning_rate": 0.00049123370110331, "loss": 3.8501, "theoretical_loss": 4.840657468341476, "tokens_seen": 90308608 }, { "epoch": 0.03, "learning_rate": 0.0004912236710130391, "loss": 3.5873, "theoretical_loss": 4.84016358120461, "tokens_seen": 90374144 }, { "epoch": 0.03, "learning_rate": 0.0004912136409227683, "loss": 3.9249, "theoretical_loss": 4.839670152285526, "tokens_seen": 90439680 }, { "epoch": 0.03, "learning_rate": 0.0004912036108324975, "loss": 3.8918, "theoretical_loss": 4.8391771808275195, "tokens_seen": 90505216 }, { "epoch": 0.03, "learning_rate": 0.0004911935807422267, "loss": 3.726, "theoretical_loss": 4.838684666075682, "tokens_seen": 90570752 }, { "epoch": 0.03, "learning_rate": 0.0004911835506519559, "loss": 3.9148, "theoretical_loss": 4.838192607276896, "tokens_seen": 90636288 }, { "epoch": 0.03, "learning_rate": 0.000491173520561685, "loss": 3.9637, "theoretical_loss": 4.837701003679829, "tokens_seen": 90701824 }, { "epoch": 0.03, "learning_rate": 0.0004911634904714142, "loss": 3.8333, "theoretical_loss": 4.8372098545349305, "tokens_seen": 90767360 }, { "epoch": 0.03, "learning_rate": 0.0004911534603811434, "loss": 3.9592, "theoretical_loss": 4.836719159094422, "tokens_seen": 90832896 }, { "epoch": 0.03, "learning_rate": 0.0004911434302908726, "loss": 3.7236, "theoretical_loss": 4.836228916612292, "tokens_seen": 90898432 }, { "epoch": 0.03, "learning_rate": 0.0004911334002006019, "loss": 3.8823, "theoretical_loss": 4.835739126344298, "tokens_seen": 90963968 }, { "epoch": 0.03, "learning_rate": 0.0004911233701103309, "loss": 3.9575, "theoretical_loss": 4.8352497875479505, "tokens_seen": 91029504 }, { "epoch": 0.03, "learning_rate": 0.0004911133400200602, "loss": 3.9343, "theoretical_loss": 4.834760899482514, "tokens_seen": 91095040 }, { "epoch": 0.03, "learning_rate": 0.0004911033099297893, "loss": 3.9543, "theoretical_loss": 4.834272461409001, "tokens_seen": 91160576 }, { "epoch": 0.03, "learning_rate": 0.0004910932798395186, "loss": 3.8803, "theoretical_loss": 4.833784472590165, "tokens_seen": 91226112 }, { "epoch": 0.03, "learning_rate": 0.0004910832497492478, "loss": 3.6482, "theoretical_loss": 4.833296932290495, "tokens_seen": 91291648 }, { "epoch": 0.03, "learning_rate": 0.000491073219658977, "loss": 3.759, "theoretical_loss": 4.832809839776213, "tokens_seen": 91357184 }, { "epoch": 0.03, "learning_rate": 0.0004910631895687061, "loss": 3.8633, "theoretical_loss": 4.832323194315265, "tokens_seen": 91422720 }, { "epoch": 0.03, "learning_rate": 0.0004910531594784353, "loss": 4.0146, "theoretical_loss": 4.831836995177319, "tokens_seen": 91488256 }, { "epoch": 0.03, "learning_rate": 0.0004910431293881645, "loss": 3.7656, "theoretical_loss": 4.831351241633756, "tokens_seen": 91553792 }, { "epoch": 0.03, "learning_rate": 0.0004910330992978937, "loss": 3.7154, "theoretical_loss": 4.8308659329576695, "tokens_seen": 91619328 }, { "epoch": 0.03, "learning_rate": 0.0004910230692076229, "loss": 3.7462, "theoretical_loss": 4.830381068423856, "tokens_seen": 91684864 }, { "epoch": 0.03, "objective/train/docs_used": 177483, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5936269760131836, "objective/train/theoretical_loss": 4.8298966473088125, "objective/train/tokens_used": 112210400, "theoretical_loss": 4.8298966473088125, "tokens_seen": 91750400 }, { "epoch": 0.03, "learning_rate": 0.000491013039117352, "loss": 3.8713, "theoretical_loss": 4.8298966473088125, "tokens_seen": 91750400 }, { "epoch": 0.03, "learning_rate": 0.0004910030090270812, "loss": 3.6462, "theoretical_loss": 4.829412668890729, "tokens_seen": 91815936 }, { "epoch": 0.03, "learning_rate": 0.0004909929789368104, "loss": 4.0673, "theoretical_loss": 4.8289291324494865, "tokens_seen": 91881472 }, { "epoch": 0.03, "learning_rate": 0.0004909829488465397, "loss": 3.6328, "theoretical_loss": 4.828446037266647, "tokens_seen": 91947008 }, { "epoch": 0.03, "learning_rate": 0.0004909729187562688, "loss": 3.7804, "theoretical_loss": 4.827963382625454, "tokens_seen": 92012544 }, { "epoch": 0.03, "learning_rate": 0.0004909628886659981, "loss": 3.8435, "theoretical_loss": 4.827481167810825, "tokens_seen": 92078080 }, { "epoch": 0.03, "learning_rate": 0.0004909528585757272, "loss": 3.612, "theoretical_loss": 4.826999392109344, "tokens_seen": 92143616 }, { "epoch": 0.03, "learning_rate": 0.0004909428284854564, "loss": 3.9394, "theoretical_loss": 4.826518054809259, "tokens_seen": 92209152 }, { "epoch": 0.03, "learning_rate": 0.0004909327983951856, "loss": 3.8127, "theoretical_loss": 4.826037155200478, "tokens_seen": 92274688 }, { "epoch": 0.03, "learning_rate": 0.0004909227683049148, "loss": 3.4751, "theoretical_loss": 4.825556692574562, "tokens_seen": 92340224 }, { "epoch": 0.03, "learning_rate": 0.000490912738214644, "loss": 3.5561, "theoretical_loss": 4.825076666224717, "tokens_seen": 92405760 }, { "epoch": 0.03, "learning_rate": 0.0004909027081243731, "loss": 3.6715, "theoretical_loss": 4.824597075445799, "tokens_seen": 92471296 }, { "epoch": 0.03, "learning_rate": 0.0004908926780341023, "loss": 3.6564, "theoretical_loss": 4.824117919534297, "tokens_seen": 92536832 }, { "epoch": 0.03, "learning_rate": 0.0004908826479438315, "loss": 3.7949, "theoretical_loss": 4.823639197788334, "tokens_seen": 92602368 }, { "epoch": 0.03, "learning_rate": 0.0004908726178535607, "loss": 3.6944, "theoretical_loss": 4.823160909507665, "tokens_seen": 92667904 }, { "epoch": 0.03, "learning_rate": 0.0004908625877632899, "loss": 3.7235, "theoretical_loss": 4.822683053993664, "tokens_seen": 92733440 }, { "epoch": 0.03, "learning_rate": 0.000490852557673019, "loss": 3.8657, "theoretical_loss": 4.822205630549329, "tokens_seen": 92798976 }, { "epoch": 0.03, "learning_rate": 0.0004908425275827482, "loss": 3.677, "theoretical_loss": 4.821728638479267, "tokens_seen": 92864512 }, { "epoch": 0.03, "learning_rate": 0.0004908324974924774, "loss": 3.9142, "theoretical_loss": 4.821252077089696, "tokens_seen": 92930048 }, { "epoch": 0.03, "learning_rate": 0.0004908224674022067, "loss": 3.7216, "theoretical_loss": 4.820775945688437, "tokens_seen": 92995584 }, { "epoch": 0.03, "learning_rate": 0.0004908124373119358, "loss": 3.6402, "theoretical_loss": 4.820300243584913, "tokens_seen": 93061120 }, { "epoch": 0.03, "learning_rate": 0.0004908024072216651, "loss": 3.7207, "theoretical_loss": 4.819824970090138, "tokens_seen": 93126656 }, { "epoch": 0.03, "learning_rate": 0.0004907923771313941, "loss": 3.9008, "theoretical_loss": 4.819350124516717, "tokens_seen": 93192192 }, { "epoch": 0.03, "learning_rate": 0.0004907823470411234, "loss": 3.5414, "theoretical_loss": 4.818875706178841, "tokens_seen": 93257728 }, { "epoch": 0.03, "learning_rate": 0.0004907723169508526, "loss": 3.8364, "theoretical_loss": 4.818401714392279, "tokens_seen": 93323264 }, { "epoch": 0.03, "objective/train/docs_used": 180773, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.663616180419922, "objective/train/theoretical_loss": 4.817928148474378, "objective/train/tokens_used": 113848800, "theoretical_loss": 4.817928148474378, "tokens_seen": 93388800 }, { "epoch": 0.03, "learning_rate": 0.0004907622868605818, "loss": 3.7221, "theoretical_loss": 4.817928148474378, "tokens_seen": 93388800 }, { "epoch": 0.03, "learning_rate": 0.000490752256770311, "loss": 3.7328, "theoretical_loss": 4.817455007744052, "tokens_seen": 93454336 }, { "epoch": 0.03, "learning_rate": 0.0004907422266800401, "loss": 3.8401, "theoretical_loss": 4.816982291521785, "tokens_seen": 93519872 }, { "epoch": 0.03, "learning_rate": 0.0004907321965897693, "loss": 3.7226, "theoretical_loss": 4.816509999129618, "tokens_seen": 93585408 }, { "epoch": 0.03, "learning_rate": 0.0004907221664994985, "loss": 3.8853, "theoretical_loss": 4.816038129891151, "tokens_seen": 93650944 }, { "epoch": 0.03, "learning_rate": 0.0004907121364092277, "loss": 3.5473, "theoretical_loss": 4.815566683131536, "tokens_seen": 93716480 }, { "epoch": 0.03, "learning_rate": 0.0004907021063189569, "loss": 3.8009, "theoretical_loss": 4.815095658177472, "tokens_seen": 93782016 }, { "epoch": 0.03, "learning_rate": 0.000490692076228686, "loss": 3.7725, "theoretical_loss": 4.814625054357199, "tokens_seen": 93847552 }, { "epoch": 0.03, "learning_rate": 0.0004906820461384152, "loss": 3.633, "theoretical_loss": 4.814154871000497, "tokens_seen": 93913088 }, { "epoch": 0.03, "learning_rate": 0.0004906720160481444, "loss": 3.951, "theoretical_loss": 4.813685107438679, "tokens_seen": 93978624 }, { "epoch": 0.03, "learning_rate": 0.0004906619859578736, "loss": 3.6644, "theoretical_loss": 4.813215763004585, "tokens_seen": 94044160 }, { "epoch": 0.03, "learning_rate": 0.0004906519558676028, "loss": 4.0132, "theoretical_loss": 4.812746837032582, "tokens_seen": 94109696 }, { "epoch": 0.03, "learning_rate": 0.0004906419257773321, "loss": 3.7624, "theoretical_loss": 4.812278328858554, "tokens_seen": 94175232 }, { "epoch": 0.03, "learning_rate": 0.0004906318956870611, "loss": 3.8803, "theoretical_loss": 4.811810237819904, "tokens_seen": 94240768 }, { "epoch": 0.03, "learning_rate": 0.0004906218655967904, "loss": 3.6891, "theoretical_loss": 4.81134256325554, "tokens_seen": 94306304 }, { "epoch": 0.03, "learning_rate": 0.0004906118355065195, "loss": 3.7854, "theoretical_loss": 4.810875304505881, "tokens_seen": 94371840 }, { "epoch": 0.03, "learning_rate": 0.0004906018054162488, "loss": 3.7933, "theoretical_loss": 4.810408460912846, "tokens_seen": 94437376 }, { "epoch": 0.03, "learning_rate": 0.000490591775325978, "loss": 3.7832, "theoretical_loss": 4.809942031819853, "tokens_seen": 94502912 }, { "epoch": 0.03, "learning_rate": 0.0004905817452357072, "loss": 3.7385, "theoretical_loss": 4.809476016571809, "tokens_seen": 94568448 }, { "epoch": 0.03, "learning_rate": 0.0004905717151454363, "loss": 3.7876, "theoretical_loss": 4.809010414515113, "tokens_seen": 94633984 }, { "epoch": 0.03, "learning_rate": 0.0004905616850551655, "loss": 3.751, "theoretical_loss": 4.808545224997644, "tokens_seen": 94699520 }, { "epoch": 0.03, "learning_rate": 0.0004905516549648947, "loss": 3.7538, "theoretical_loss": 4.808080447368766, "tokens_seen": 94765056 }, { "epoch": 0.03, "learning_rate": 0.0004905416248746239, "loss": 3.7115, "theoretical_loss": 4.807616080979315, "tokens_seen": 94830592 }, { "epoch": 0.03, "learning_rate": 0.0004905315947843531, "loss": 3.5892, "theoretical_loss": 4.807152125181597, "tokens_seen": 94896128 }, { "epoch": 0.03, "learning_rate": 0.0004905215646940822, "loss": 3.6909, "theoretical_loss": 4.806688579329387, "tokens_seen": 94961664 }, { "epoch": 0.03, "objective/train/docs_used": 182208, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.473114252090454, "objective/train/theoretical_loss": 4.8062254427779205, "objective/train/tokens_used": 115487200, "theoretical_loss": 4.8062254427779205, "tokens_seen": 95027200 }, { "epoch": 0.03, "learning_rate": 0.0004905115346038114, "loss": 3.7036, "theoretical_loss": 4.8062254427779205, "tokens_seen": 95027200 }, { "epoch": 0.03, "learning_rate": 0.0004905015045135406, "loss": 3.8284, "theoretical_loss": 4.80576271488389, "tokens_seen": 95092736 }, { "epoch": 0.03, "learning_rate": 0.0004904914744232698, "loss": 3.773, "theoretical_loss": 4.805300395005444, "tokens_seen": 95158272 }, { "epoch": 0.03, "learning_rate": 0.000490481444332999, "loss": 3.7999, "theoretical_loss": 4.804838482502181, "tokens_seen": 95223808 }, { "epoch": 0.03, "learning_rate": 0.0004904714142427281, "loss": 3.7321, "theoretical_loss": 4.8043769767351385, "tokens_seen": 95289344 }, { "epoch": 0.03, "learning_rate": 0.0004904613841524574, "loss": 3.7742, "theoretical_loss": 4.8039158770668005, "tokens_seen": 95354880 }, { "epoch": 0.03, "learning_rate": 0.0004904513540621865, "loss": 3.927, "theoretical_loss": 4.803455182861087, "tokens_seen": 95420416 }, { "epoch": 0.03, "learning_rate": 0.0004904413239719158, "loss": 3.6625, "theoretical_loss": 4.802994893483348, "tokens_seen": 95485952 }, { "epoch": 0.03, "learning_rate": 0.0004904312938816449, "loss": 3.8359, "theoretical_loss": 4.802535008300364, "tokens_seen": 95551488 }, { "epoch": 0.03, "learning_rate": 0.0004904212637913742, "loss": 3.8231, "theoretical_loss": 4.802075526680335, "tokens_seen": 95617024 }, { "epoch": 0.03, "learning_rate": 0.0004904112337011033, "loss": 3.8136, "theoretical_loss": 4.801616447992888, "tokens_seen": 95682560 }, { "epoch": 0.03, "learning_rate": 0.0004904012036108325, "loss": 3.7203, "theoretical_loss": 4.801157771609061, "tokens_seen": 95748096 }, { "epoch": 0.03, "learning_rate": 0.0004903911735205617, "loss": 3.6651, "theoretical_loss": 4.8006994969013, "tokens_seen": 95813632 }, { "epoch": 0.03, "learning_rate": 0.0004903811434302909, "loss": 3.7355, "theoretical_loss": 4.800241623243467, "tokens_seen": 95879168 }, { "epoch": 0.03, "learning_rate": 0.0004903711133400201, "loss": 3.8095, "theoretical_loss": 4.799784150010819, "tokens_seen": 95944704 }, { "epoch": 0.03, "learning_rate": 0.0004903610832497492, "loss": 3.6297, "theoretical_loss": 4.799327076580017, "tokens_seen": 96010240 }, { "epoch": 0.03, "learning_rate": 0.0004903510531594784, "loss": 3.8643, "theoretical_loss": 4.798870402329115, "tokens_seen": 96075776 }, { "epoch": 0.03, "learning_rate": 0.0004903410230692076, "loss": 3.767, "theoretical_loss": 4.798414126637558, "tokens_seen": 96141312 }, { "epoch": 0.03, "learning_rate": 0.0004903309929789368, "loss": 3.9448, "theoretical_loss": 4.797958248886179, "tokens_seen": 96206848 }, { "epoch": 0.03, "learning_rate": 0.000490320962888666, "loss": 3.7056, "theoretical_loss": 4.797502768457193, "tokens_seen": 96272384 }, { "epoch": 0.03, "learning_rate": 0.0004903109327983952, "loss": 3.7626, "theoretical_loss": 4.797047684734192, "tokens_seen": 96337920 }, { "epoch": 0.03, "learning_rate": 0.0004903009027081243, "loss": 3.9715, "theoretical_loss": 4.796592997102147, "tokens_seen": 96403456 }, { "epoch": 0.03, "learning_rate": 0.0004902908726178535, "loss": 3.8285, "theoretical_loss": 4.796138704947397, "tokens_seen": 96468992 }, { "epoch": 0.03, "learning_rate": 0.0004902808425275828, "loss": 3.6196, "theoretical_loss": 4.795684807657649, "tokens_seen": 96534528 }, { "epoch": 0.03, "learning_rate": 0.0004902708124373119, "loss": 3.8131, "theoretical_loss": 4.795231304621968, "tokens_seen": 96600064 }, { "epoch": 0.03, "objective/train/docs_used": 184806, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2286107540130615, "objective/train/theoretical_loss": 4.794778195230787, "objective/train/tokens_used": 117125600, "theoretical_loss": 4.794778195230787, "tokens_seen": 96665600 }, { "epoch": 0.03, "learning_rate": 0.0004902607823470412, "loss": 3.6947, "theoretical_loss": 4.794778195230787, "tokens_seen": 96665600 }, { "epoch": 0.03, "learning_rate": 0.0004902507522567703, "loss": 3.5902, "theoretical_loss": 4.794325478875885, "tokens_seen": 96731136 }, { "epoch": 0.03, "learning_rate": 0.0004902407221664995, "loss": 3.7462, "theoretical_loss": 4.793873154950399, "tokens_seen": 96796672 }, { "epoch": 0.03, "learning_rate": 0.0004902306920762287, "loss": 3.7638, "theoretical_loss": 4.793421222848808, "tokens_seen": 96862208 }, { "epoch": 0.03, "learning_rate": 0.0004902206619859579, "loss": 3.8195, "theoretical_loss": 4.7929696819669365, "tokens_seen": 96927744 }, { "epoch": 0.03, "learning_rate": 0.0004902106318956871, "loss": 3.6739, "theoretical_loss": 4.792518531701948, "tokens_seen": 96993280 }, { "epoch": 0.03, "learning_rate": 0.0004902006018054163, "loss": 3.463, "theoretical_loss": 4.792067771452341, "tokens_seen": 97058816 }, { "epoch": 0.03, "learning_rate": 0.0004901905717151454, "loss": 3.83, "theoretical_loss": 4.791617400617948, "tokens_seen": 97124352 }, { "epoch": 0.03, "learning_rate": 0.0004901805416248746, "loss": 3.7611, "theoretical_loss": 4.791167418599925, "tokens_seen": 97189888 }, { "epoch": 0.03, "learning_rate": 0.0004901705115346038, "loss": 3.4349, "theoretical_loss": 4.790717824800755, "tokens_seen": 97255424 }, { "epoch": 0.03, "learning_rate": 0.000490160481444333, "loss": 3.638, "theoretical_loss": 4.790268618624239, "tokens_seen": 97320960 }, { "epoch": 0.03, "learning_rate": 0.0004901504513540623, "loss": 3.7789, "theoretical_loss": 4.789819799475499, "tokens_seen": 97386496 }, { "epoch": 0.03, "learning_rate": 0.0004901404212637913, "loss": 3.676, "theoretical_loss": 4.789371366760961, "tokens_seen": 97452032 }, { "epoch": 0.03, "learning_rate": 0.0004901303911735206, "loss": 3.7225, "theoretical_loss": 4.788923319888369, "tokens_seen": 97517568 }, { "epoch": 0.03, "learning_rate": 0.0004901203610832497, "loss": 3.6228, "theoretical_loss": 4.788475658266766, "tokens_seen": 97583104 }, { "epoch": 0.03, "learning_rate": 0.000490110330992979, "loss": 3.6363, "theoretical_loss": 4.788028381306497, "tokens_seen": 97648640 }, { "epoch": 0.03, "learning_rate": 0.0004901003009027082, "loss": 3.7938, "theoretical_loss": 4.787581488419207, "tokens_seen": 97714176 }, { "epoch": 0.03, "learning_rate": 0.0004900902708124374, "loss": 3.8034, "theoretical_loss": 4.787134979017832, "tokens_seen": 97779712 }, { "epoch": 0.03, "learning_rate": 0.0004900802407221665, "loss": 3.9646, "theoretical_loss": 4.786688852516599, "tokens_seen": 97845248 }, { "epoch": 0.03, "learning_rate": 0.0004900702106318957, "loss": 3.7032, "theoretical_loss": 4.786243108331024, "tokens_seen": 97910784 }, { "epoch": 0.03, "learning_rate": 0.0004900601805416249, "loss": 3.5955, "theoretical_loss": 4.7857977458779, "tokens_seen": 97976320 }, { "epoch": 0.03, "learning_rate": 0.0004900501504513541, "loss": 3.7743, "theoretical_loss": 4.785352764575304, "tokens_seen": 98041856 }, { "epoch": 0.03, "learning_rate": 0.0004900401203610833, "loss": 3.689, "theoretical_loss": 4.784908163842585, "tokens_seen": 98107392 }, { "epoch": 0.03, "learning_rate": 0.0004900300902708124, "loss": 3.5962, "theoretical_loss": 4.784463943100367, "tokens_seen": 98172928 }, { "epoch": 0.03, "learning_rate": 0.0004900200601805416, "loss": 3.587, "theoretical_loss": 4.7840201017705395, "tokens_seen": 98238464 }, { "epoch": 0.03, "objective/train/docs_used": 187560, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.8499672412872314, "objective/train/theoretical_loss": 4.783576639276257, "objective/train/tokens_used": 118764000, "theoretical_loss": 4.783576639276257, "tokens_seen": 98304000 }, { "epoch": 0.03, "learning_rate": 0.0004900100300902708, "loss": 3.8187, "theoretical_loss": 4.783576639276257, "tokens_seen": 98304000 }, { "epoch": 0.03, "learning_rate": 0.00049, "loss": 3.6056, "theoretical_loss": 4.783133555041934, "tokens_seen": 98369536 }, { "epoch": 0.03, "learning_rate": 0.0004899899699097292, "loss": 3.766, "theoretical_loss": 4.782690848493245, "tokens_seen": 98435072 }, { "epoch": 0.03, "learning_rate": 0.0004899799398194583, "loss": 3.7992, "theoretical_loss": 4.7822485190571165, "tokens_seen": 98500608 }, { "epoch": 0.03, "learning_rate": 0.0004899699097291876, "loss": 3.7486, "theoretical_loss": 4.781806566161723, "tokens_seen": 98566144 }, { "epoch": 0.03, "learning_rate": 0.0004899598796389167, "loss": 3.6369, "theoretical_loss": 4.781364989236488, "tokens_seen": 98631680 }, { "epoch": 0.03, "learning_rate": 0.000489949849548646, "loss": 3.7901, "theoretical_loss": 4.78092378771208, "tokens_seen": 98697216 }, { "epoch": 0.03, "learning_rate": 0.0004899398194583751, "loss": 3.7577, "theoretical_loss": 4.780482961020402, "tokens_seen": 98762752 }, { "epoch": 0.03, "learning_rate": 0.0004899297893681044, "loss": 3.5609, "theoretical_loss": 4.780042508594596, "tokens_seen": 98828288 }, { "epoch": 0.03, "learning_rate": 0.0004899197592778335, "loss": 3.7707, "theoretical_loss": 4.779602429869035, "tokens_seen": 98893824 }, { "epoch": 0.03, "learning_rate": 0.0004899097291875627, "loss": 3.9065, "theoretical_loss": 4.779162724279324, "tokens_seen": 98959360 }, { "epoch": 0.03, "learning_rate": 0.0004898996990972919, "loss": 3.7514, "theoretical_loss": 4.7787233912622895, "tokens_seen": 99024896 }, { "epoch": 0.03, "learning_rate": 0.0004898896690070211, "loss": 3.6845, "theoretical_loss": 4.778284430255981, "tokens_seen": 99090432 }, { "epoch": 0.03, "learning_rate": 0.0004898796389167503, "loss": 3.7866, "theoretical_loss": 4.77784584069967, "tokens_seen": 99155968 }, { "epoch": 0.03, "learning_rate": 0.0004898696088264794, "loss": 3.696, "theoretical_loss": 4.777407622033838, "tokens_seen": 99221504 }, { "epoch": 0.03, "learning_rate": 0.0004898595787362086, "loss": 3.6766, "theoretical_loss": 4.776969773700181, "tokens_seen": 99287040 }, { "epoch": 0.03, "learning_rate": 0.0004898495486459378, "loss": 3.7536, "theoretical_loss": 4.776532295141601, "tokens_seen": 99352576 }, { "epoch": 0.03, "learning_rate": 0.000489839518555667, "loss": 3.6263, "theoretical_loss": 4.776095185802211, "tokens_seen": 99418112 }, { "epoch": 0.03, "learning_rate": 0.0004898294884653962, "loss": 3.7065, "theoretical_loss": 4.775658445127318, "tokens_seen": 99483648 }, { "epoch": 0.03, "learning_rate": 0.0004898194583751254, "loss": 3.4468, "theoretical_loss": 4.775222072563429, "tokens_seen": 99549184 }, { "epoch": 0.03, "learning_rate": 0.0004898094282848545, "loss": 3.7618, "theoretical_loss": 4.7747860675582485, "tokens_seen": 99614720 }, { "epoch": 0.03, "learning_rate": 0.0004897993981945837, "loss": 3.6917, "theoretical_loss": 4.77435042956067, "tokens_seen": 99680256 }, { "epoch": 0.03, "learning_rate": 0.000489789368104313, "loss": 3.8064, "theoretical_loss": 4.773915158020776, "tokens_seen": 99745792 }, { "epoch": 0.03, "learning_rate": 0.0004897793380140421, "loss": 3.8477, "theoretical_loss": 4.773480252389831, "tokens_seen": 99811328 }, { "epoch": 0.03, "learning_rate": 0.0004897693079237714, "loss": 3.6201, "theoretical_loss": 4.773045712120284, "tokens_seen": 99876864 }, { "epoch": 0.03, "objective/train/docs_used": 190327, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.8209116458892822, "objective/train/theoretical_loss": 4.77261153666576, "objective/train/tokens_used": 120402400, "theoretical_loss": 4.77261153666576, "tokens_seen": 99942400 }, { "epoch": 0.03, "learning_rate": 0.0004897592778335005, "loss": 3.808, "theoretical_loss": 4.77261153666576, "tokens_seen": 99942400 }, { "epoch": 0.03, "learning_rate": 0.0004897492477432297, "loss": 3.7831, "theoretical_loss": 4.772177725481062, "tokens_seen": 100007936 }, { "epoch": 0.03, "learning_rate": 0.0004897392176529589, "loss": 3.6166, "theoretical_loss": 4.77174427802216, "tokens_seen": 100073472 }, { "epoch": 0.03, "learning_rate": 0.0004897291875626881, "loss": 3.7938, "theoretical_loss": 4.771311193746191, "tokens_seen": 100139008 }, { "epoch": 0.03, "learning_rate": 0.0004897191574724173, "loss": 3.7496, "theoretical_loss": 4.770878472111465, "tokens_seen": 100204544 }, { "epoch": 0.03, "learning_rate": 0.0004897091273821465, "loss": 3.7514, "theoretical_loss": 4.770446112577445, "tokens_seen": 100270080 }, { "epoch": 0.03, "learning_rate": 0.0004896990972918756, "loss": 3.6621, "theoretical_loss": 4.770014114604756, "tokens_seen": 100335616 }, { "epoch": 0.03, "learning_rate": 0.0004896890672016048, "loss": 3.6932, "theoretical_loss": 4.769582477655177, "tokens_seen": 100401152 }, { "epoch": 0.03, "learning_rate": 0.000489679037111334, "loss": 3.646, "theoretical_loss": 4.769151201191641, "tokens_seen": 100466688 }, { "epoch": 0.03, "learning_rate": 0.0004896690070210632, "loss": 3.7592, "theoretical_loss": 4.768720284678228, "tokens_seen": 100532224 }, { "epoch": 0.03, "learning_rate": 0.0004896589769307924, "loss": 3.7866, "theoretical_loss": 4.768289727580161, "tokens_seen": 100597760 }, { "epoch": 0.03, "learning_rate": 0.0004896489468405215, "loss": 3.6201, "theoretical_loss": 4.767859529363809, "tokens_seen": 100663296 }, { "epoch": 0.03, "learning_rate": 0.0004896389167502507, "loss": 3.6428, "theoretical_loss": 4.767429689496682, "tokens_seen": 100728832 }, { "epoch": 0.03, "learning_rate": 0.0004896288866599799, "loss": 3.7344, "theoretical_loss": 4.767000207447417, "tokens_seen": 100794368 }, { "epoch": 0.03, "learning_rate": 0.0004896188565697091, "loss": 3.7596, "theoretical_loss": 4.766571082685794, "tokens_seen": 100859904 }, { "epoch": 0.03, "learning_rate": 0.0004896088264794384, "loss": 3.8807, "theoretical_loss": 4.766142314682716, "tokens_seen": 100925440 }, { "epoch": 0.03, "learning_rate": 0.0004895987963891674, "loss": 3.7832, "theoretical_loss": 4.765713902910214, "tokens_seen": 100990976 }, { "epoch": 0.03, "learning_rate": 0.0004895887662988967, "loss": 3.8406, "theoretical_loss": 4.765285846841444, "tokens_seen": 101056512 }, { "epoch": 0.03, "learning_rate": 0.0004895787362086259, "loss": 3.7048, "theoretical_loss": 4.76485814595068, "tokens_seen": 101122048 }, { "epoch": 0.03, "learning_rate": 0.0004895687061183551, "loss": 3.8846, "theoretical_loss": 4.764430799713314, "tokens_seen": 101187584 }, { "epoch": 0.03, "learning_rate": 0.0004895586760280843, "loss": 3.6069, "theoretical_loss": 4.764003807605853, "tokens_seen": 101253120 }, { "epoch": 0.03, "learning_rate": 0.0004895486459378135, "loss": 3.6876, "theoretical_loss": 4.763577169105912, "tokens_seen": 101318656 }, { "epoch": 0.03, "learning_rate": 0.0004895386158475426, "loss": 3.6549, "theoretical_loss": 4.763150883692218, "tokens_seen": 101384192 }, { "epoch": 0.03, "learning_rate": 0.0004895285857572718, "loss": 3.6719, "theoretical_loss": 4.762724950844598, "tokens_seen": 101449728 }, { "epoch": 0.03, "learning_rate": 0.000489518555667001, "loss": 3.6664, "theoretical_loss": 4.762299370043984, "tokens_seen": 101515264 }, { "epoch": 0.03, "objective/train/docs_used": 193223, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.8491604328155518, "objective/train/theoretical_loss": 4.761874140772408, "objective/train/tokens_used": 122040800, "theoretical_loss": 4.761874140772408, "tokens_seen": 101580800 }, { "epoch": 0.03, "learning_rate": 0.0004895085255767302, "loss": 3.6685, "theoretical_loss": 4.761874140772408, "tokens_seen": 101580800 }, { "epoch": 0.03, "learning_rate": 0.0004894984954864594, "loss": 3.5151, "theoretical_loss": 4.761449262512993, "tokens_seen": 101646336 }, { "epoch": 0.03, "learning_rate": 0.0004894884653961885, "loss": 3.7341, "theoretical_loss": 4.761024734749958, "tokens_seen": 101711872 }, { "epoch": 0.03, "learning_rate": 0.0004894784353059178, "loss": 3.7577, "theoretical_loss": 4.76060055696861, "tokens_seen": 101777408 }, { "epoch": 0.03, "learning_rate": 0.0004894684052156469, "loss": 3.5263, "theoretical_loss": 4.760176728655345, "tokens_seen": 101842944 }, { "epoch": 0.03, "learning_rate": 0.0004894583751253762, "loss": 3.6827, "theoretical_loss": 4.75975324929764, "tokens_seen": 101908480 }, { "epoch": 0.03, "learning_rate": 0.0004894483450351053, "loss": 3.5838, "theoretical_loss": 4.759330118384053, "tokens_seen": 101974016 }, { "epoch": 0.03, "learning_rate": 0.0004894383149448346, "loss": 3.4147, "theoretical_loss": 4.758907335404221, "tokens_seen": 102039552 }, { "epoch": 0.03, "learning_rate": 0.0004894282848545637, "loss": 3.6316, "theoretical_loss": 4.758484899848854, "tokens_seen": 102105088 }, { "epoch": 0.03, "learning_rate": 0.0004894182547642929, "loss": 3.7185, "theoretical_loss": 4.7580628112097365, "tokens_seen": 102170624 }, { "epoch": 0.03, "learning_rate": 0.0004894082246740221, "loss": 3.921, "theoretical_loss": 4.7576410689797175, "tokens_seen": 102236160 }, { "epoch": 0.03, "learning_rate": 0.0004893981945837513, "loss": 3.6135, "theoretical_loss": 4.757219672652717, "tokens_seen": 102301696 }, { "epoch": 0.03, "learning_rate": 0.0004893881644934805, "loss": 3.678, "theoretical_loss": 4.756798621723712, "tokens_seen": 102367232 }, { "epoch": 0.03, "learning_rate": 0.0004893781344032096, "loss": 3.7806, "theoretical_loss": 4.756377915688748, "tokens_seen": 102432768 }, { "epoch": 0.03, "learning_rate": 0.0004893681043129388, "loss": 3.5997, "theoretical_loss": 4.755957554044917, "tokens_seen": 102498304 }, { "epoch": 0.03, "learning_rate": 0.000489358074222668, "loss": 3.5603, "theoretical_loss": 4.755537536290373, "tokens_seen": 102563840 }, { "epoch": 0.03, "learning_rate": 0.0004893480441323972, "loss": 3.6917, "theoretical_loss": 4.755117861924321, "tokens_seen": 102629376 }, { "epoch": 0.03, "learning_rate": 0.0004893380140421264, "loss": 3.6926, "theoretical_loss": 4.754698530447009, "tokens_seen": 102694912 }, { "epoch": 0.03, "learning_rate": 0.0004893279839518556, "loss": 3.5328, "theoretical_loss": 4.754279541359738, "tokens_seen": 102760448 }, { "epoch": 0.03, "learning_rate": 0.0004893179538615847, "loss": 3.6989, "theoretical_loss": 4.753860894164845, "tokens_seen": 102825984 }, { "epoch": 0.03, "learning_rate": 0.0004893079237713139, "loss": 3.6835, "theoretical_loss": 4.75344258836571, "tokens_seen": 102891520 }, { "epoch": 0.03, "learning_rate": 0.0004892978936810432, "loss": 3.7437, "theoretical_loss": 4.753024623466752, "tokens_seen": 102957056 }, { "epoch": 0.03, "learning_rate": 0.0004892878635907723, "loss": 3.7374, "theoretical_loss": 4.752606998973421, "tokens_seen": 103022592 }, { "epoch": 0.03, "learning_rate": 0.0004892778335005016, "loss": 3.8671, "theoretical_loss": 4.752189714392202, "tokens_seen": 103088128 }, { "epoch": 0.03, "learning_rate": 0.0004892678034102307, "loss": 3.6205, "theoretical_loss": 4.7517727692306035, "tokens_seen": 103153664 }, { "epoch": 0.03, "objective/train/docs_used": 194897, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.8129758834838867, "objective/train/theoretical_loss": 4.751356162997164, "objective/train/tokens_used": 123679200, "theoretical_loss": 4.751356162997164, "tokens_seen": 103219200 }, { "epoch": 0.03, "learning_rate": 0.0004892577733199599, "loss": 3.8056, "theoretical_loss": 4.751356162997164, "tokens_seen": 103219200 }, { "epoch": 0.03, "learning_rate": 0.0004892477432296891, "loss": 3.5845, "theoretical_loss": 4.750939895201443, "tokens_seen": 103284736 }, { "epoch": 0.03, "learning_rate": 0.0004892377131394183, "loss": 3.5825, "theoretical_loss": 4.750523965354024, "tokens_seen": 103350272 }, { "epoch": 0.03, "learning_rate": 0.0004892276830491475, "loss": 3.6629, "theoretical_loss": 4.750108372966501, "tokens_seen": 103415808 }, { "epoch": 0.03, "learning_rate": 0.0004892176529588767, "loss": 3.7654, "theoretical_loss": 4.749693117551491, "tokens_seen": 103481344 }, { "epoch": 0.03, "learning_rate": 0.0004892076228686058, "loss": 3.6769, "theoretical_loss": 4.749278198622617, "tokens_seen": 103546880 }, { "epoch": 0.03, "learning_rate": 0.000489197592778335, "loss": 3.6635, "theoretical_loss": 4.748863615694514, "tokens_seen": 103612416 }, { "epoch": 0.03, "learning_rate": 0.0004891875626880642, "loss": 3.8492, "theoretical_loss": 4.748449368282822, "tokens_seen": 103677952 }, { "epoch": 0.03, "learning_rate": 0.0004891775325977934, "loss": 3.6718, "theoretical_loss": 4.748035455904185, "tokens_seen": 103743488 }, { "epoch": 0.03, "learning_rate": 0.0004891675025075226, "loss": 3.7957, "theoretical_loss": 4.747621878076252, "tokens_seen": 103809024 }, { "epoch": 0.03, "learning_rate": 0.0004891574724172517, "loss": 3.8361, "theoretical_loss": 4.747208634317664, "tokens_seen": 103874560 }, { "epoch": 0.03, "learning_rate": 0.0004891474423269809, "loss": 3.7287, "theoretical_loss": 4.746795724148061, "tokens_seen": 103940096 }, { "epoch": 0.03, "learning_rate": 0.0004891374122367101, "loss": 3.5614, "theoretical_loss": 4.746383147088078, "tokens_seen": 104005632 }, { "epoch": 0.03, "learning_rate": 0.0004891273821464393, "loss": 3.7828, "theoretical_loss": 4.745970902659338, "tokens_seen": 104071168 }, { "epoch": 0.03, "learning_rate": 0.0004891173520561686, "loss": 3.6487, "theoretical_loss": 4.745558990384451, "tokens_seen": 104136704 }, { "epoch": 0.03, "learning_rate": 0.0004891073219658976, "loss": 3.9319, "theoretical_loss": 4.7451474097870125, "tokens_seen": 104202240 }, { "epoch": 0.03, "learning_rate": 0.0004890972918756269, "loss": 3.7078, "theoretical_loss": 4.744736160391602, "tokens_seen": 104267776 }, { "epoch": 0.03, "learning_rate": 0.0004890872617853561, "loss": 3.6757, "theoretical_loss": 4.744325241723777, "tokens_seen": 104333312 }, { "epoch": 0.03, "learning_rate": 0.0004890772316950853, "loss": 3.6144, "theoretical_loss": 4.743914653310073, "tokens_seen": 104398848 }, { "epoch": 0.03, "learning_rate": 0.0004890672016048145, "loss": 3.7475, "theoretical_loss": 4.743504394678, "tokens_seen": 104464384 }, { "epoch": 0.03, "learning_rate": 0.0004890571715145437, "loss": 3.3277, "theoretical_loss": 4.743094465356039, "tokens_seen": 104529920 }, { "epoch": 0.03, "learning_rate": 0.0004890471414242728, "loss": 3.7092, "theoretical_loss": 4.742684864873641, "tokens_seen": 104595456 }, { "epoch": 0.03, "learning_rate": 0.000489037111334002, "loss": 3.6497, "theoretical_loss": 4.742275592761223, "tokens_seen": 104660992 }, { "epoch": 0.03, "learning_rate": 0.0004890270812437312, "loss": 3.7419, "theoretical_loss": 4.741866648550168, "tokens_seen": 104726528 }, { "epoch": 0.03, "learning_rate": 0.0004890170511534604, "loss": 3.8306, "theoretical_loss": 4.741458031772817, "tokens_seen": 104792064 }, { "epoch": 0.03, "objective/train/docs_used": 197518, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7364706993103027, "objective/train/theoretical_loss": 4.741049741962473, "objective/train/tokens_used": 125317600, "theoretical_loss": 4.741049741962473, "tokens_seen": 104857600 }, { "epoch": 0.03, "learning_rate": 0.0004890070210631896, "loss": 3.7107, "theoretical_loss": 4.741049741962473, "tokens_seen": 104857600 }, { "epoch": 0.03, "learning_rate": 0.0004889969909729187, "loss": 3.6636, "theoretical_loss": 4.740641778653395, "tokens_seen": 104923136 }, { "epoch": 0.03, "learning_rate": 0.0004889869608826479, "loss": 3.7717, "theoretical_loss": 4.740234141380794, "tokens_seen": 104988672 }, { "epoch": 0.03, "learning_rate": 0.0004889769307923771, "loss": 3.6215, "theoretical_loss": 4.739826829680833, "tokens_seen": 105054208 }, { "epoch": 0.03, "learning_rate": 0.0004889669007021063, "loss": 3.6056, "theoretical_loss": 4.739419843090626, "tokens_seen": 105119744 }, { "epoch": 0.03, "learning_rate": 0.0004889568706118355, "loss": 3.713, "theoretical_loss": 4.739013181148229, "tokens_seen": 105185280 }, { "epoch": 0.03, "learning_rate": 0.0004889468405215647, "loss": 3.7278, "theoretical_loss": 4.738606843392644, "tokens_seen": 105250816 }, { "epoch": 0.03, "learning_rate": 0.0004889368104312939, "loss": 3.4764, "theoretical_loss": 4.738200829363815, "tokens_seen": 105316352 }, { "epoch": 0.03, "learning_rate": 0.000488926780341023, "loss": 3.8085, "theoretical_loss": 4.737795138602624, "tokens_seen": 105381888 }, { "epoch": 0.03, "learning_rate": 0.0004889167502507523, "loss": 3.5483, "theoretical_loss": 4.737389770650887, "tokens_seen": 105447424 }, { "epoch": 0.03, "learning_rate": 0.0004889067201604815, "loss": 3.7326, "theoretical_loss": 4.736984725051357, "tokens_seen": 105512960 }, { "epoch": 0.03, "learning_rate": 0.0004888966900702107, "loss": 3.6618, "theoretical_loss": 4.736580001347717, "tokens_seen": 105578496 }, { "epoch": 0.03, "learning_rate": 0.0004888866599799398, "loss": 3.5957, "theoretical_loss": 4.736175599084576, "tokens_seen": 105644032 }, { "epoch": 0.03, "learning_rate": 0.000488876629889669, "loss": 3.6386, "theoretical_loss": 4.735771517807473, "tokens_seen": 105709568 }, { "epoch": 0.03, "learning_rate": 0.0004888665997993982, "loss": 3.7061, "theoretical_loss": 4.735367757062869, "tokens_seen": 105775104 }, { "epoch": 0.03, "learning_rate": 0.0004888565697091274, "loss": 3.5672, "theoretical_loss": 4.734964316398148, "tokens_seen": 105840640 }, { "epoch": 0.03, "learning_rate": 0.0004888465396188566, "loss": 3.611, "theoretical_loss": 4.734561195361609, "tokens_seen": 105906176 }, { "epoch": 0.03, "learning_rate": 0.0004888365095285858, "loss": 3.7687, "theoretical_loss": 4.734158393502471, "tokens_seen": 105971712 }, { "epoch": 0.03, "learning_rate": 0.0004888264794383149, "loss": 3.6085, "theoretical_loss": 4.733755910370867, "tokens_seen": 106037248 }, { "epoch": 0.03, "learning_rate": 0.0004888164493480441, "loss": 3.7007, "theoretical_loss": 4.73335374551784, "tokens_seen": 106102784 }, { "epoch": 0.03, "learning_rate": 0.0004888064192577733, "loss": 3.66, "theoretical_loss": 4.732951898495341, "tokens_seen": 106168320 }, { "epoch": 0.03, "learning_rate": 0.0004887963891675025, "loss": 3.24, "theoretical_loss": 4.7325503688562325, "tokens_seen": 106233856 }, { "epoch": 0.03, "learning_rate": 0.0004887863590772317, "loss": 3.7322, "theoretical_loss": 4.732149156154276, "tokens_seen": 106299392 }, { "epoch": 0.03, "learning_rate": 0.000488776328986961, "loss": 3.6656, "theoretical_loss": 4.731748259944139, "tokens_seen": 106364928 }, { "epoch": 0.03, "learning_rate": 0.00048876629889669, "loss": 3.7687, "theoretical_loss": 4.731347679781386, "tokens_seen": 106430464 }, { "epoch": 0.03, "objective/train/docs_used": 200313, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.267194986343384, "objective/train/theoretical_loss": 4.730947415222481, "objective/train/tokens_used": 126956000, "theoretical_loss": 4.730947415222481, "tokens_seen": 106496000 }, { "epoch": 0.03, "learning_rate": 0.0004887562688064193, "loss": 3.5399, "theoretical_loss": 4.730947415222481, "tokens_seen": 106496000 }, { "epoch": 0.03, "learning_rate": 0.0004887462387161484, "loss": 3.8275, "theoretical_loss": 4.730547465824781, "tokens_seen": 106561536 }, { "epoch": 0.03, "learning_rate": 0.0004887362086258777, "loss": 3.6262, "theoretical_loss": 4.730147831146537, "tokens_seen": 106627072 }, { "epoch": 0.03, "learning_rate": 0.0004887261785356069, "loss": 3.648, "theoretical_loss": 4.72974851074689, "tokens_seen": 106692608 }, { "epoch": 0.03, "learning_rate": 0.000488716148445336, "loss": 3.6835, "theoretical_loss": 4.729349504185867, "tokens_seen": 106758144 }, { "epoch": 0.03, "learning_rate": 0.0004887061183550652, "loss": 3.5786, "theoretical_loss": 4.728950811024383, "tokens_seen": 106823680 }, { "epoch": 0.03, "learning_rate": 0.0004886960882647944, "loss": 3.7741, "theoretical_loss": 4.7285524308242355, "tokens_seen": 106889216 }, { "epoch": 0.03, "learning_rate": 0.0004886860581745236, "loss": 3.6262, "theoretical_loss": 4.728154363148102, "tokens_seen": 106954752 }, { "epoch": 0.03, "learning_rate": 0.0004886760280842528, "loss": 3.8291, "theoretical_loss": 4.72775660755954, "tokens_seen": 107020288 }, { "epoch": 0.03, "learning_rate": 0.0004886659979939819, "loss": 3.5973, "theoretical_loss": 4.72735916362298, "tokens_seen": 107085824 }, { "epoch": 0.03, "learning_rate": 0.0004886559679037111, "loss": 3.7821, "theoretical_loss": 4.7269620309037315, "tokens_seen": 107151360 }, { "epoch": 0.03, "learning_rate": 0.0004886459378134403, "loss": 3.6485, "theoretical_loss": 4.726565208967973, "tokens_seen": 107216896 }, { "epoch": 0.03, "learning_rate": 0.0004886359077231695, "loss": 3.4702, "theoretical_loss": 4.726168697382751, "tokens_seen": 107282432 }, { "epoch": 0.03, "learning_rate": 0.0004886258776328988, "loss": 3.5645, "theoretical_loss": 4.725772495715983, "tokens_seen": 107347968 }, { "epoch": 0.03, "learning_rate": 0.0004886158475426278, "loss": 3.5625, "theoretical_loss": 4.725376603536446, "tokens_seen": 107413504 }, { "epoch": 0.03, "learning_rate": 0.0004886058174523571, "loss": 3.6855, "theoretical_loss": 4.724981020413787, "tokens_seen": 107479040 }, { "epoch": 0.03, "learning_rate": 0.0004885957873620863, "loss": 3.743, "theoretical_loss": 4.724585745918505, "tokens_seen": 107544576 }, { "epoch": 0.03, "learning_rate": 0.0004885857572718155, "loss": 3.6769, "theoretical_loss": 4.7241907796219635, "tokens_seen": 107610112 }, { "epoch": 0.03, "learning_rate": 0.0004885757271815447, "loss": 3.6149, "theoretical_loss": 4.723796121096381, "tokens_seen": 107675648 }, { "epoch": 0.03, "learning_rate": 0.0004885656970912739, "loss": 3.4559, "theoretical_loss": 4.723401769914824, "tokens_seen": 107741184 }, { "epoch": 0.03, "learning_rate": 0.000488555667001003, "loss": 3.5958, "theoretical_loss": 4.723007725651219, "tokens_seen": 107806720 }, { "epoch": 0.03, "learning_rate": 0.0004885456369107322, "loss": 3.713, "theoretical_loss": 4.722613987880335, "tokens_seen": 107872256 }, { "epoch": 0.03, "learning_rate": 0.0004885356068204614, "loss": 3.4987, "theoretical_loss": 4.722220556177792, "tokens_seen": 107937792 }, { "epoch": 0.03, "learning_rate": 0.0004885255767301906, "loss": 3.6946, "theoretical_loss": 4.721827430120053, "tokens_seen": 108003328 }, { "epoch": 0.03, "learning_rate": 0.0004885155466399198, "loss": 3.6752, "theoretical_loss": 4.721434609284424, "tokens_seen": 108068864 }, { "epoch": 0.03, "objective/train/docs_used": 203124, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5385873317718506, "objective/train/theoretical_loss": 4.721042093249051, "objective/train/tokens_used": 128594400, "theoretical_loss": 4.721042093249051, "tokens_seen": 108134400 }, { "epoch": 0.03, "learning_rate": 0.000488505516549649, "loss": 3.6886, "theoretical_loss": 4.721042093249051, "tokens_seen": 108134400 }, { "epoch": 0.03, "learning_rate": 0.0004884954864593781, "loss": 3.6082, "theoretical_loss": 4.720649881592919, "tokens_seen": 108199936 }, { "epoch": 0.03, "learning_rate": 0.0004884854563691073, "loss": 3.5941, "theoretical_loss": 4.7202579738958494, "tokens_seen": 108265472 }, { "epoch": 0.03, "learning_rate": 0.0004884754262788365, "loss": 3.6131, "theoretical_loss": 4.7198663697384955, "tokens_seen": 108331008 }, { "epoch": 0.03, "learning_rate": 0.0004884653961885657, "loss": 3.6118, "theoretical_loss": 4.719475068702346, "tokens_seen": 108396544 }, { "epoch": 0.03, "learning_rate": 0.0004884553660982949, "loss": 3.5467, "theoretical_loss": 4.719084070369714, "tokens_seen": 108462080 }, { "epoch": 0.03, "learning_rate": 0.0004884453360080241, "loss": 3.7319, "theoretical_loss": 4.718693374323747, "tokens_seen": 108527616 }, { "epoch": 0.03, "learning_rate": 0.0004884353059177532, "loss": 3.5742, "theoretical_loss": 4.718302980148412, "tokens_seen": 108593152 }, { "epoch": 0.03, "learning_rate": 0.0004884252758274825, "loss": 3.6418, "theoretical_loss": 4.717912887428501, "tokens_seen": 108658688 }, { "epoch": 0.03, "learning_rate": 0.0004884152457372117, "loss": 3.6894, "theoretical_loss": 4.717523095749626, "tokens_seen": 108724224 }, { "epoch": 0.03, "learning_rate": 0.0004884052156469409, "loss": 3.6784, "theoretical_loss": 4.717133604698222, "tokens_seen": 108789760 }, { "epoch": 0.03, "learning_rate": 0.00048839518555667, "loss": 3.6138, "theoretical_loss": 4.7167444138615355, "tokens_seen": 108855296 }, { "epoch": 0.03, "learning_rate": 0.0004883851554663992, "loss": 3.6859, "theoretical_loss": 4.716355522827633, "tokens_seen": 108920832 }, { "epoch": 0.03, "learning_rate": 0.0004883751253761284, "loss": 3.5154, "theoretical_loss": 4.715966931185388, "tokens_seen": 108986368 }, { "epoch": 0.03, "learning_rate": 0.0004883650952858576, "loss": 3.731, "theoretical_loss": 4.715578638524491, "tokens_seen": 109051904 }, { "epoch": 0.03, "learning_rate": 0.0004883550651955868, "loss": 3.6547, "theoretical_loss": 4.715190644435435, "tokens_seen": 109117440 }, { "epoch": 0.03, "learning_rate": 0.000488345035105316, "loss": 3.6713, "theoretical_loss": 4.714802948509522, "tokens_seen": 109182976 }, { "epoch": 0.03, "learning_rate": 0.0004883350050150451, "loss": 3.7671, "theoretical_loss": 4.71441555033886, "tokens_seen": 109248512 }, { "epoch": 0.03, "learning_rate": 0.0004883249749247743, "loss": 3.7949, "theoretical_loss": 4.714028449516356, "tokens_seen": 109314048 }, { "epoch": 0.03, "learning_rate": 0.0004883149448345035, "loss": 3.7142, "theoretical_loss": 4.713641645635718, "tokens_seen": 109379584 }, { "epoch": 0.03, "learning_rate": 0.0004883049147442327, "loss": 3.6105, "theoretical_loss": 4.713255138291454, "tokens_seen": 109445120 }, { "epoch": 0.03, "learning_rate": 0.0004882948846539619, "loss": 3.7381, "theoretical_loss": 4.712868927078868, "tokens_seen": 109510656 }, { "epoch": 0.03, "learning_rate": 0.0004882848545636911, "loss": 3.6663, "theoretical_loss": 4.712483011594056, "tokens_seen": 109576192 }, { "epoch": 0.03, "learning_rate": 0.0004882748244734203, "loss": 3.5321, "theoretical_loss": 4.7120973914339075, "tokens_seen": 109641728 }, { "epoch": 0.03, "learning_rate": 0.00048826479438314946, "loss": 3.647, "theoretical_loss": 4.7117120661961005, "tokens_seen": 109707264 }, { "epoch": 0.03, "objective/train/docs_used": 205882, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5943007469177246, "objective/train/theoretical_loss": 4.711327035479103, "objective/train/tokens_used": 130232800, "theoretical_loss": 4.711327035479103, "tokens_seen": 109772800 }, { "epoch": 0.03, "learning_rate": 0.00048825476429287864, "loss": 3.6294, "theoretical_loss": 4.711327035479103, "tokens_seen": 109772800 }, { "epoch": 0.03, "learning_rate": 0.0004882447342026078, "loss": 3.5685, "theoretical_loss": 4.710942298882169, "tokens_seen": 109838336 }, { "epoch": 0.03, "learning_rate": 0.000488234704112337, "loss": 3.6673, "theoretical_loss": 4.710557856005335, "tokens_seen": 109903872 }, { "epoch": 0.03, "learning_rate": 0.00048822467402206624, "loss": 3.6369, "theoretical_loss": 4.710173706449419, "tokens_seen": 109969408 }, { "epoch": 0.03, "learning_rate": 0.00048821464393179536, "loss": 3.5978, "theoretical_loss": 4.709789849816021, "tokens_seen": 110034944 }, { "epoch": 0.03, "learning_rate": 0.0004882046138415246, "loss": 3.6491, "theoretical_loss": 4.7094062857075185, "tokens_seen": 110100480 }, { "epoch": 0.03, "learning_rate": 0.0004881945837512537, "loss": 3.7124, "theoretical_loss": 4.709023013727063, "tokens_seen": 110166016 }, { "epoch": 0.03, "learning_rate": 0.00048818455366098296, "loss": 3.7461, "theoretical_loss": 4.708640033478584, "tokens_seen": 110231552 }, { "epoch": 0.03, "learning_rate": 0.00048817452357071214, "loss": 3.5964, "theoretical_loss": 4.708257344566778, "tokens_seen": 110297088 }, { "epoch": 0.03, "learning_rate": 0.0004881644934804413, "loss": 3.654, "theoretical_loss": 4.7078749465971175, "tokens_seen": 110362624 }, { "epoch": 0.03, "learning_rate": 0.0004881544633901705, "loss": 3.5805, "theoretical_loss": 4.707492839175837, "tokens_seen": 110428160 }, { "epoch": 0.03, "learning_rate": 0.00048814443329989974, "loss": 3.6071, "theoretical_loss": 4.707111021909941, "tokens_seen": 110493696 }, { "epoch": 0.03, "learning_rate": 0.00048813440320962887, "loss": 3.4389, "theoretical_loss": 4.706729494407197, "tokens_seen": 110559232 }, { "epoch": 0.03, "learning_rate": 0.0004881243731193581, "loss": 3.6907, "theoretical_loss": 4.706348256276138, "tokens_seen": 110624768 }, { "epoch": 0.03, "learning_rate": 0.00048811434302908723, "loss": 3.6221, "theoretical_loss": 4.705967307126051, "tokens_seen": 110690304 }, { "epoch": 0.03, "learning_rate": 0.00048810431293881646, "loss": 3.7253, "theoretical_loss": 4.705586646566987, "tokens_seen": 110755840 }, { "epoch": 0.03, "learning_rate": 0.00048809428284854564, "loss": 3.7269, "theoretical_loss": 4.705206274209751, "tokens_seen": 110821376 }, { "epoch": 0.03, "learning_rate": 0.0004880842527582748, "loss": 3.6898, "theoretical_loss": 4.704826189665905, "tokens_seen": 110886912 }, { "epoch": 0.03, "learning_rate": 0.000488074222668004, "loss": 3.6357, "theoretical_loss": 4.704446392547759, "tokens_seen": 110952448 }, { "epoch": 0.03, "learning_rate": 0.0004880641925777332, "loss": 3.5915, "theoretical_loss": 4.7040668824683785, "tokens_seen": 111017984 }, { "epoch": 0.03, "learning_rate": 0.00048805416248746237, "loss": 3.7866, "theoretical_loss": 4.7036876590415755, "tokens_seen": 111083520 }, { "epoch": 0.03, "learning_rate": 0.0004880441323971916, "loss": 3.5941, "theoretical_loss": 4.7033087218819105, "tokens_seen": 111149056 }, { "epoch": 0.03, "learning_rate": 0.00048803410230692073, "loss": 3.5593, "theoretical_loss": 4.7029300706046895, "tokens_seen": 111214592 }, { "epoch": 0.03, "learning_rate": 0.00048802407221664997, "loss": 3.6039, "theoretical_loss": 4.702551704825957, "tokens_seen": 111280128 }, { "epoch": 0.03, "learning_rate": 0.00048801404212637915, "loss": 3.5394, "theoretical_loss": 4.702173624162507, "tokens_seen": 111345664 }, { "epoch": 0.03, "objective/train/docs_used": 208312, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7646846771240234, "objective/train/theoretical_loss": 4.701795828231866, "objective/train/tokens_used": 131871200, "theoretical_loss": 4.701795828231866, "tokens_seen": 111411200 }, { "epoch": 0.03, "learning_rate": 0.00048800401203610833, "loss": 3.6299, "theoretical_loss": 4.701795828231866, "tokens_seen": 111411200 }, { "epoch": 0.03, "learning_rate": 0.0004879939819458375, "loss": 3.6882, "theoretical_loss": 4.701418316652299, "tokens_seen": 111476736 }, { "epoch": 0.03, "learning_rate": 0.0004879839518555667, "loss": 3.7519, "theoretical_loss": 4.701041089042813, "tokens_seen": 111542272 }, { "epoch": 0.03, "learning_rate": 0.0004879739217652959, "loss": 3.7532, "theoretical_loss": 4.700664145023142, "tokens_seen": 111607808 }, { "epoch": 0.03, "learning_rate": 0.0004879638916750251, "loss": 3.7032, "theoretical_loss": 4.700287484213753, "tokens_seen": 111673344 }, { "epoch": 0.03, "learning_rate": 0.0004879538615847543, "loss": 3.759, "theoretical_loss": 4.699911106235849, "tokens_seen": 111738880 }, { "epoch": 0.03, "learning_rate": 0.00048794383149448347, "loss": 3.6025, "theoretical_loss": 4.6995350107113545, "tokens_seen": 111804416 }, { "epoch": 0.03, "learning_rate": 0.00048793380140421265, "loss": 3.6804, "theoretical_loss": 4.699159197262922, "tokens_seen": 111869952 }, { "epoch": 0.03, "learning_rate": 0.00048792377131394183, "loss": 3.6993, "theoretical_loss": 4.698783665513934, "tokens_seen": 111935488 }, { "epoch": 0.03, "learning_rate": 0.00048791374122367107, "loss": 3.4967, "theoretical_loss": 4.698408415088491, "tokens_seen": 112001024 }, { "epoch": 0.03, "learning_rate": 0.0004879037111334002, "loss": 3.6463, "theoretical_loss": 4.698033445611415, "tokens_seen": 112066560 }, { "epoch": 0.03, "learning_rate": 0.00048789368104312943, "loss": 3.8927, "theoretical_loss": 4.6976587567082495, "tokens_seen": 112132096 }, { "epoch": 0.03, "learning_rate": 0.00048788365095285856, "loss": 3.6471, "theoretical_loss": 4.697284348005253, "tokens_seen": 112197632 }, { "epoch": 0.03, "learning_rate": 0.0004878736208625878, "loss": 3.4436, "theoretical_loss": 4.696910219129402, "tokens_seen": 112263168 }, { "epoch": 0.03, "learning_rate": 0.00048786359077231697, "loss": 3.761, "theoretical_loss": 4.696536369708386, "tokens_seen": 112328704 }, { "epoch": 0.03, "learning_rate": 0.00048785356068204615, "loss": 3.5448, "theoretical_loss": 4.696162799370606, "tokens_seen": 112394240 }, { "epoch": 0.03, "learning_rate": 0.00048784353059177533, "loss": 3.6518, "theoretical_loss": 4.695789507745176, "tokens_seen": 112459776 }, { "epoch": 0.03, "learning_rate": 0.00048783350050150457, "loss": 3.5411, "theoretical_loss": 4.695416494461917, "tokens_seen": 112525312 }, { "epoch": 0.03, "learning_rate": 0.0004878234704112337, "loss": 3.7453, "theoretical_loss": 4.695043759151353, "tokens_seen": 112590848 }, { "epoch": 0.03, "learning_rate": 0.00048781344032096293, "loss": 3.593, "theoretical_loss": 4.694671301444722, "tokens_seen": 112656384 }, { "epoch": 0.03, "learning_rate": 0.00048780341023069206, "loss": 3.5234, "theoretical_loss": 4.694299120973957, "tokens_seen": 112721920 }, { "epoch": 0.03, "learning_rate": 0.0004877933801404213, "loss": 3.5445, "theoretical_loss": 4.693927217371698, "tokens_seen": 112787456 }, { "epoch": 0.03, "learning_rate": 0.0004877833500501505, "loss": 3.5058, "theoretical_loss": 4.693555590271282, "tokens_seen": 112852992 }, { "epoch": 0.03, "learning_rate": 0.00048777331995987966, "loss": 3.7503, "theoretical_loss": 4.693184239306744, "tokens_seen": 112918528 }, { "epoch": 0.03, "learning_rate": 0.00048776328986960884, "loss": 3.5331, "theoretical_loss": 4.692813164112819, "tokens_seen": 112984064 }, { "epoch": 0.03, "objective/train/docs_used": 209733, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6223511695861816, "objective/train/theoretical_loss": 4.692442364324931, "objective/train/tokens_used": 133509600, "theoretical_loss": 4.692442364324931, "tokens_seen": 113049600 }, { "epoch": 0.03, "learning_rate": 0.000487753259779338, "loss": 3.8004, "theoretical_loss": 4.692442364324931, "tokens_seen": 113049600 }, { "epoch": 0.03, "learning_rate": 0.0004877432296890672, "loss": 3.7384, "theoretical_loss": 4.692071839579201, "tokens_seen": 113115136 }, { "epoch": 0.03, "learning_rate": 0.00048773319959879644, "loss": 3.8432, "theoretical_loss": 4.6917015895124425, "tokens_seen": 113180672 }, { "epoch": 0.03, "learning_rate": 0.00048772316950852556, "loss": 3.6426, "theoretical_loss": 4.691331613762153, "tokens_seen": 113246208 }, { "epoch": 0.03, "learning_rate": 0.0004877131394182548, "loss": 3.5893, "theoretical_loss": 4.690961911966523, "tokens_seen": 113311744 }, { "epoch": 0.03, "learning_rate": 0.0004877031093279839, "loss": 3.5231, "theoretical_loss": 4.690592483764427, "tokens_seen": 113377280 }, { "epoch": 0.03, "learning_rate": 0.00048769307923771316, "loss": 3.8004, "theoretical_loss": 4.690223328795424, "tokens_seen": 113442816 }, { "epoch": 0.03, "learning_rate": 0.00048768304914744234, "loss": 3.7025, "theoretical_loss": 4.689854446699757, "tokens_seen": 113508352 }, { "epoch": 0.03, "learning_rate": 0.0004876730190571715, "loss": 3.7021, "theoretical_loss": 4.689485837118347, "tokens_seen": 113573888 }, { "epoch": 0.03, "learning_rate": 0.0004876629889669007, "loss": 3.7127, "theoretical_loss": 4.689117499692798, "tokens_seen": 113639424 }, { "epoch": 0.03, "learning_rate": 0.00048765295887662994, "loss": 3.5209, "theoretical_loss": 4.688749434065389, "tokens_seen": 113704960 }, { "epoch": 0.03, "learning_rate": 0.00048764292878635907, "loss": 3.5222, "theoretical_loss": 4.688381639879076, "tokens_seen": 113770496 }, { "epoch": 0.03, "learning_rate": 0.0004876328986960883, "loss": 3.6631, "theoretical_loss": 4.68801411677749, "tokens_seen": 113836032 }, { "epoch": 0.03, "learning_rate": 0.00048762286860581743, "loss": 3.4776, "theoretical_loss": 4.687646864404934, "tokens_seen": 113901568 }, { "epoch": 0.03, "learning_rate": 0.00048761283851554666, "loss": 3.6454, "theoretical_loss": 4.687279882406381, "tokens_seen": 113967104 }, { "epoch": 0.03, "learning_rate": 0.00048760280842527584, "loss": 3.631, "theoretical_loss": 4.686913170427477, "tokens_seen": 114032640 }, { "epoch": 0.03, "learning_rate": 0.000487592778335005, "loss": 3.8286, "theoretical_loss": 4.68654672811453, "tokens_seen": 114098176 }, { "epoch": 0.03, "learning_rate": 0.0004875827482447342, "loss": 3.549, "theoretical_loss": 4.68618055511452, "tokens_seen": 114163712 }, { "epoch": 0.03, "learning_rate": 0.0004875727181544634, "loss": 3.611, "theoretical_loss": 4.685814651075088, "tokens_seen": 114229248 }, { "epoch": 0.03, "learning_rate": 0.00048756268806419257, "loss": 3.6694, "theoretical_loss": 4.685449015644537, "tokens_seen": 114294784 }, { "epoch": 0.03, "learning_rate": 0.0004875526579739218, "loss": 3.552, "theoretical_loss": 4.685083648471835, "tokens_seen": 114360320 }, { "epoch": 0.03, "learning_rate": 0.00048754262788365093, "loss": 3.4919, "theoretical_loss": 4.684718549206607, "tokens_seen": 114425856 }, { "epoch": 0.03, "learning_rate": 0.00048753259779338017, "loss": 3.549, "theoretical_loss": 4.6843537174991345, "tokens_seen": 114491392 }, { "epoch": 0.03, "learning_rate": 0.00048752256770310935, "loss": 3.5419, "theoretical_loss": 4.6839891530003595, "tokens_seen": 114556928 }, { "epoch": 0.03, "learning_rate": 0.00048751253761283853, "loss": 3.7236, "theoretical_loss": 4.683624855361876, "tokens_seen": 114622464 }, { "epoch": 0.03, "objective/train/docs_used": 212300, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7778286933898926, "objective/train/theoretical_loss": 4.68326082423593, "objective/train/tokens_used": 135148000, "theoretical_loss": 4.68326082423593, "tokens_seen": 114688000 }, { "epoch": 0.03, "learning_rate": 0.0004875025075225677, "loss": 3.6086, "theoretical_loss": 4.68326082423593, "tokens_seen": 114688000 }, { "epoch": 0.03, "learning_rate": 0.0004874924774322969, "loss": 3.6847, "theoretical_loss": 4.682897059275422, "tokens_seen": 114753536 }, { "epoch": 0.03, "learning_rate": 0.00048748244734202607, "loss": 3.6613, "theoretical_loss": 4.682533560133901, "tokens_seen": 114819072 }, { "epoch": 0.03, "learning_rate": 0.0004874724172517553, "loss": 3.6984, "theoretical_loss": 4.682170326465565, "tokens_seen": 114884608 }, { "epoch": 0.03, "learning_rate": 0.00048746238716148443, "loss": 3.5294, "theoretical_loss": 4.681807357925257, "tokens_seen": 114950144 }, { "epoch": 0.03, "learning_rate": 0.00048745235707121367, "loss": 3.6764, "theoretical_loss": 4.681444654168468, "tokens_seen": 115015680 }, { "epoch": 0.03, "learning_rate": 0.0004874423269809428, "loss": 3.5603, "theoretical_loss": 4.68108221485133, "tokens_seen": 115081216 }, { "epoch": 0.03, "learning_rate": 0.00048743229689067203, "loss": 3.4418, "theoretical_loss": 4.680720039630617, "tokens_seen": 115146752 }, { "epoch": 0.03, "learning_rate": 0.0004874222668004012, "loss": 3.6714, "theoretical_loss": 4.680358128163747, "tokens_seen": 115212288 }, { "epoch": 0.03, "learning_rate": 0.0004874122367101304, "loss": 3.7728, "theoretical_loss": 4.679996480108773, "tokens_seen": 115277824 }, { "epoch": 0.03, "learning_rate": 0.0004874022066198596, "loss": 3.4824, "theoretical_loss": 4.6796350951243895, "tokens_seen": 115343360 }, { "epoch": 0.03, "learning_rate": 0.00048739217652958876, "loss": 3.8099, "theoretical_loss": 4.679273972869922, "tokens_seen": 115408896 }, { "epoch": 0.03, "learning_rate": 0.00048738214643931794, "loss": 3.6727, "theoretical_loss": 4.678913113005333, "tokens_seen": 115474432 }, { "epoch": 0.04, "learning_rate": 0.00048737211634904717, "loss": 3.4843, "theoretical_loss": 4.6785525151912175, "tokens_seen": 115539968 }, { "epoch": 0.04, "learning_rate": 0.0004873620862587763, "loss": 3.8763, "theoretical_loss": 4.678192179088802, "tokens_seen": 115605504 }, { "epoch": 0.04, "learning_rate": 0.00048735205616850553, "loss": 3.666, "theoretical_loss": 4.6778321043599425, "tokens_seen": 115671040 }, { "epoch": 0.04, "learning_rate": 0.0004873420260782347, "loss": 3.7276, "theoretical_loss": 4.677472290667122, "tokens_seen": 115736576 }, { "epoch": 0.04, "learning_rate": 0.0004873319959879639, "loss": 3.6673, "theoretical_loss": 4.677112737673453, "tokens_seen": 115802112 }, { "epoch": 0.04, "learning_rate": 0.0004873219658976931, "loss": 3.5775, "theoretical_loss": 4.676753445042669, "tokens_seen": 115867648 }, { "epoch": 0.04, "learning_rate": 0.00048731193580742226, "loss": 3.7468, "theoretical_loss": 4.676394412439132, "tokens_seen": 115933184 }, { "epoch": 0.04, "learning_rate": 0.00048730190571715144, "loss": 3.7257, "theoretical_loss": 4.6760356395278215, "tokens_seen": 115998720 }, { "epoch": 0.04, "learning_rate": 0.0004872918756268807, "loss": 3.614, "theoretical_loss": 4.675677125974339, "tokens_seen": 116064256 }, { "epoch": 0.04, "learning_rate": 0.0004872818455366098, "loss": 3.699, "theoretical_loss": 4.675318871444908, "tokens_seen": 116129792 }, { "epoch": 0.04, "learning_rate": 0.00048727181544633904, "loss": 3.67, "theoretical_loss": 4.674960875606366, "tokens_seen": 116195328 }, { "epoch": 0.04, "learning_rate": 0.00048726178535606816, "loss": 3.7152, "theoretical_loss": 4.674603138126168, "tokens_seen": 116260864 }, { "epoch": 0.04, "objective/train/docs_used": 215181, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.492262363433838, "objective/train/theoretical_loss": 4.674245658672382, "objective/train/tokens_used": 136786400, "theoretical_loss": 4.674245658672382, "tokens_seen": 116326400 }, { "epoch": 0.04, "learning_rate": 0.0004872517552657974, "loss": 3.634, "theoretical_loss": 4.674245658672382, "tokens_seen": 116326400 }, { "epoch": 0.04, "learning_rate": 0.0004872417251755266, "loss": 3.5239, "theoretical_loss": 4.673888436913694, "tokens_seen": 116391936 }, { "epoch": 0.04, "learning_rate": 0.00048723169508525576, "loss": 3.6449, "theoretical_loss": 4.673531472519397, "tokens_seen": 116457472 }, { "epoch": 0.04, "learning_rate": 0.000487221664994985, "loss": 3.55, "theoretical_loss": 4.673174765159393, "tokens_seen": 116523008 }, { "epoch": 0.04, "learning_rate": 0.0004872116349047141, "loss": 3.4948, "theoretical_loss": 4.672818314504198, "tokens_seen": 116588544 }, { "epoch": 0.04, "learning_rate": 0.00048720160481444336, "loss": 3.6017, "theoretical_loss": 4.6724621202249335, "tokens_seen": 116654080 }, { "epoch": 0.04, "learning_rate": 0.00048719157472417254, "loss": 3.7208, "theoretical_loss": 4.672106181993324, "tokens_seen": 116719616 }, { "epoch": 0.04, "learning_rate": 0.0004871815446339017, "loss": 3.6289, "theoretical_loss": 4.6717504994817, "tokens_seen": 116785152 }, { "epoch": 0.04, "learning_rate": 0.0004871715145436309, "loss": 3.6821, "theoretical_loss": 4.671395072362996, "tokens_seen": 116850688 }, { "epoch": 0.04, "learning_rate": 0.00048716148445336014, "loss": 3.4851, "theoretical_loss": 4.671039900310747, "tokens_seen": 116916224 }, { "epoch": 0.04, "learning_rate": 0.00048715145436308927, "loss": 3.5472, "theoretical_loss": 4.670684982999088, "tokens_seen": 116981760 }, { "epoch": 0.04, "learning_rate": 0.0004871414242728185, "loss": 3.669, "theoretical_loss": 4.670330320102753, "tokens_seen": 117047296 }, { "epoch": 0.04, "learning_rate": 0.00048713139418254763, "loss": 3.451, "theoretical_loss": 4.669975911297072, "tokens_seen": 117112832 }, { "epoch": 0.04, "learning_rate": 0.00048712136409227686, "loss": 3.3952, "theoretical_loss": 4.669621756257971, "tokens_seen": 117178368 }, { "epoch": 0.04, "learning_rate": 0.00048711133400200604, "loss": 3.6349, "theoretical_loss": 4.669267854661973, "tokens_seen": 117243904 }, { "epoch": 0.04, "learning_rate": 0.0004871013039117352, "loss": 3.6134, "theoretical_loss": 4.668914206186189, "tokens_seen": 117309440 }, { "epoch": 0.04, "learning_rate": 0.0004870912738214644, "loss": 3.6608, "theoretical_loss": 4.6685608105083265, "tokens_seen": 117374976 }, { "epoch": 0.04, "learning_rate": 0.0004870812437311936, "loss": 3.5169, "theoretical_loss": 4.66820766730668, "tokens_seen": 117440512 }, { "epoch": 0.04, "learning_rate": 0.00048707121364092277, "loss": 3.8038, "theoretical_loss": 4.667854776260132, "tokens_seen": 117506048 }, { "epoch": 0.04, "learning_rate": 0.000487061183550652, "loss": 3.6941, "theoretical_loss": 4.667502137048155, "tokens_seen": 117571584 }, { "epoch": 0.04, "learning_rate": 0.00048705115346038113, "loss": 3.6261, "theoretical_loss": 4.667149749350805, "tokens_seen": 117637120 }, { "epoch": 0.04, "learning_rate": 0.00048704112337011037, "loss": 3.5565, "theoretical_loss": 4.666797612848723, "tokens_seen": 117702656 }, { "epoch": 0.04, "learning_rate": 0.00048703109327983955, "loss": 3.7011, "theoretical_loss": 4.666445727223134, "tokens_seen": 117768192 }, { "epoch": 0.04, "learning_rate": 0.00048702106318956873, "loss": 3.5257, "theoretical_loss": 4.666094092155843, "tokens_seen": 117833728 }, { "epoch": 0.04, "learning_rate": 0.0004870110330992979, "loss": 3.7037, "theoretical_loss": 4.665742707329238, "tokens_seen": 117899264 }, { "epoch": 0.04, "objective/train/docs_used": 218087, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.9211204051971436, "objective/train/theoretical_loss": 4.665391572426282, "objective/train/tokens_used": 138424800, "theoretical_loss": 4.665391572426282, "tokens_seen": 117964800 }, { "epoch": 0.04, "learning_rate": 0.0004870010030090271, "loss": 3.5114, "theoretical_loss": 4.665391572426282, "tokens_seen": 117964800 }, { "epoch": 0.04, "learning_rate": 0.00048699097291875627, "loss": 3.4907, "theoretical_loss": 4.665040687130518, "tokens_seen": 118030336 }, { "epoch": 0.04, "learning_rate": 0.0004869809428284855, "loss": 3.423, "theoretical_loss": 4.664690051126065, "tokens_seen": 118095872 }, { "epoch": 0.04, "learning_rate": 0.00048697091273821463, "loss": 3.74, "theoretical_loss": 4.664339664097617, "tokens_seen": 118161408 }, { "epoch": 0.04, "learning_rate": 0.00048696088264794387, "loss": 3.7841, "theoretical_loss": 4.66398952573044, "tokens_seen": 118226944 }, { "epoch": 0.04, "learning_rate": 0.000486950852557673, "loss": 3.6146, "theoretical_loss": 4.663639635710373, "tokens_seen": 118292480 }, { "epoch": 0.04, "learning_rate": 0.00048694082246740223, "loss": 3.4494, "theoretical_loss": 4.663289993723826, "tokens_seen": 118358016 }, { "epoch": 0.04, "learning_rate": 0.0004869307923771314, "loss": 3.4861, "theoretical_loss": 4.662940599457777, "tokens_seen": 118423552 }, { "epoch": 0.04, "learning_rate": 0.0004869207622868606, "loss": 3.8032, "theoretical_loss": 4.662591452599774, "tokens_seen": 118489088 }, { "epoch": 0.04, "learning_rate": 0.0004869107321965898, "loss": 3.5821, "theoretical_loss": 4.662242552837929, "tokens_seen": 118554624 }, { "epoch": 0.04, "learning_rate": 0.00048690070210631896, "loss": 3.7023, "theoretical_loss": 4.661893899860923, "tokens_seen": 118620160 }, { "epoch": 0.04, "learning_rate": 0.00048689067201604814, "loss": 3.6291, "theoretical_loss": 4.6615454933579965, "tokens_seen": 118685696 }, { "epoch": 0.04, "learning_rate": 0.00048688064192577737, "loss": 3.5807, "theoretical_loss": 4.661197333018957, "tokens_seen": 118751232 }, { "epoch": 0.04, "learning_rate": 0.0004868706118355065, "loss": 3.7368, "theoretical_loss": 4.66084941853417, "tokens_seen": 118816768 }, { "epoch": 0.04, "learning_rate": 0.00048686058174523573, "loss": 3.6181, "theoretical_loss": 4.6605017495945615, "tokens_seen": 118882304 }, { "epoch": 0.04, "learning_rate": 0.0004868505516549649, "loss": 3.7314, "theoretical_loss": 4.660154325891618, "tokens_seen": 118947840 }, { "epoch": 0.04, "learning_rate": 0.0004868405215646941, "loss": 3.4913, "theoretical_loss": 4.659807147117382, "tokens_seen": 119013376 }, { "epoch": 0.04, "learning_rate": 0.0004868304914744233, "loss": 3.5727, "theoretical_loss": 4.6594602129644525, "tokens_seen": 119078912 }, { "epoch": 0.04, "learning_rate": 0.00048682046138415246, "loss": 3.4882, "theoretical_loss": 4.659113523125981, "tokens_seen": 119144448 }, { "epoch": 0.04, "learning_rate": 0.00048681043129388164, "loss": 3.6714, "theoretical_loss": 4.6587670772956775, "tokens_seen": 119209984 }, { "epoch": 0.04, "learning_rate": 0.0004868004012036109, "loss": 3.551, "theoretical_loss": 4.658420875167799, "tokens_seen": 119275520 }, { "epoch": 0.04, "learning_rate": 0.00048679037111334, "loss": 3.4185, "theoretical_loss": 4.658074916437155, "tokens_seen": 119341056 }, { "epoch": 0.04, "learning_rate": 0.00048678034102306924, "loss": 3.8074, "theoretical_loss": 4.657729200799105, "tokens_seen": 119406592 }, { "epoch": 0.04, "learning_rate": 0.00048677031093279836, "loss": 3.6782, "theoretical_loss": 4.657383727949558, "tokens_seen": 119472128 }, { "epoch": 0.04, "learning_rate": 0.0004867602808425276, "loss": 3.6155, "theoretical_loss": 4.657038497584967, "tokens_seen": 119537664 }, { "epoch": 0.04, "objective/train/docs_used": 220671, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7515311241149902, "objective/train/theoretical_loss": 4.656693509402331, "objective/train/tokens_used": 140063200, "theoretical_loss": 4.656693509402331, "tokens_seen": 119603200 }, { "epoch": 0.04, "learning_rate": 0.0004867502507522568, "loss": 3.7864, "theoretical_loss": 4.656693509402331, "tokens_seen": 119603200 }, { "epoch": 0.04, "learning_rate": 0.00048674022066198596, "loss": 3.5852, "theoretical_loss": 4.6563487630991975, "tokens_seen": 119668736 }, { "epoch": 0.04, "learning_rate": 0.00048673019057171514, "loss": 3.5044, "theoretical_loss": 4.656004258373651, "tokens_seen": 119734272 }, { "epoch": 0.04, "learning_rate": 0.0004867201604814443, "loss": 3.6466, "theoretical_loss": 4.655659994924323, "tokens_seen": 119799808 }, { "epoch": 0.04, "learning_rate": 0.0004867101303911735, "loss": 3.4824, "theoretical_loss": 4.655315972450383, "tokens_seen": 119865344 }, { "epoch": 0.04, "learning_rate": 0.00048670010030090274, "loss": 3.4399, "theoretical_loss": 4.65497219065154, "tokens_seen": 119930880 }, { "epoch": 0.04, "learning_rate": 0.00048669007021063187, "loss": 3.4929, "theoretical_loss": 4.654628649228041, "tokens_seen": 119996416 }, { "epoch": 0.04, "learning_rate": 0.0004866800401203611, "loss": 3.6167, "theoretical_loss": 4.654285347880672, "tokens_seen": 120061952 }, { "epoch": 0.04, "learning_rate": 0.0004866700100300903, "loss": 3.6075, "theoretical_loss": 4.653942286310749, "tokens_seen": 120127488 }, { "epoch": 0.04, "learning_rate": 0.00048665997993981947, "loss": 3.6987, "theoretical_loss": 4.653599464220129, "tokens_seen": 120193024 }, { "epoch": 0.04, "learning_rate": 0.00048664994984954865, "loss": 3.7139, "theoretical_loss": 4.653256881311198, "tokens_seen": 120258560 }, { "epoch": 0.04, "learning_rate": 0.00048663991975927783, "loss": 3.7446, "theoretical_loss": 4.6529145372868745, "tokens_seen": 120324096 }, { "epoch": 0.04, "learning_rate": 0.000486629889669007, "loss": 3.6858, "theoretical_loss": 4.652572431850608, "tokens_seen": 120389632 }, { "epoch": 0.04, "learning_rate": 0.00048661985957873624, "loss": 3.5636, "theoretical_loss": 4.652230564706377, "tokens_seen": 120455168 }, { "epoch": 0.04, "learning_rate": 0.00048660982948846537, "loss": 3.5917, "theoretical_loss": 4.651888935558688, "tokens_seen": 120520704 }, { "epoch": 0.04, "learning_rate": 0.0004865997993981946, "loss": 3.7002, "theoretical_loss": 4.651547544112575, "tokens_seen": 120586240 }, { "epoch": 0.04, "learning_rate": 0.00048658976930792373, "loss": 3.3296, "theoretical_loss": 4.651206390073597, "tokens_seen": 120651776 }, { "epoch": 0.04, "learning_rate": 0.00048657973921765297, "loss": 3.7888, "theoretical_loss": 4.650865473147837, "tokens_seen": 120717312 }, { "epoch": 0.04, "learning_rate": 0.00048656970912738215, "loss": 3.5227, "theoretical_loss": 4.650524793041903, "tokens_seen": 120782848 }, { "epoch": 0.04, "learning_rate": 0.00048655967903711133, "loss": 3.7916, "theoretical_loss": 4.650184349462922, "tokens_seen": 120848384 }, { "epoch": 0.04, "learning_rate": 0.0004865496489468405, "loss": 3.7223, "theoretical_loss": 4.649844142118544, "tokens_seen": 120913920 }, { "epoch": 0.04, "learning_rate": 0.00048653961885656975, "loss": 3.5667, "theoretical_loss": 4.6495041707169396, "tokens_seen": 120979456 }, { "epoch": 0.04, "learning_rate": 0.0004865295887662989, "loss": 3.7564, "theoretical_loss": 4.649164434966794, "tokens_seen": 121044992 }, { "epoch": 0.04, "learning_rate": 0.0004865195586760281, "loss": 3.6653, "theoretical_loss": 4.648824934577313, "tokens_seen": 121110528 }, { "epoch": 0.04, "learning_rate": 0.00048650952858575724, "loss": 3.547, "theoretical_loss": 4.648485669258216, "tokens_seen": 121176064 }, { "epoch": 0.04, "objective/train/docs_used": 223562, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.404118537902832, "objective/train/theoretical_loss": 4.648146638719739, "objective/train/tokens_used": 141701600, "theoretical_loss": 4.648146638719739, "tokens_seen": 121241600 }, { "epoch": 0.04, "learning_rate": 0.00048649949849548647, "loss": 3.477, "theoretical_loss": 4.648146638719739, "tokens_seen": 121241600 }, { "epoch": 0.04, "learning_rate": 0.00048648946840521565, "loss": 3.6442, "theoretical_loss": 4.647807842672631, "tokens_seen": 121307136 }, { "epoch": 0.04, "learning_rate": 0.00048647943831494483, "loss": 3.399, "theoretical_loss": 4.647469280828153, "tokens_seen": 121372672 }, { "epoch": 0.04, "learning_rate": 0.00048646940822467407, "loss": 3.5529, "theoretical_loss": 4.647130952898077, "tokens_seen": 121438208 }, { "epoch": 0.04, "learning_rate": 0.0004864593781344032, "loss": 3.3999, "theoretical_loss": 4.646792858594686, "tokens_seen": 121503744 }, { "epoch": 0.04, "learning_rate": 0.00048644934804413243, "loss": 3.7765, "theoretical_loss": 4.64645499763077, "tokens_seen": 121569280 }, { "epoch": 0.04, "learning_rate": 0.0004864393179538616, "loss": 3.9141, "theoretical_loss": 4.646117369719629, "tokens_seen": 121634816 }, { "epoch": 0.04, "learning_rate": 0.0004864292878635908, "loss": 3.5722, "theoretical_loss": 4.645779974575069, "tokens_seen": 121700352 }, { "epoch": 0.04, "learning_rate": 0.00048641925777332, "loss": 3.6885, "theoretical_loss": 4.6454428119113995, "tokens_seen": 121765888 }, { "epoch": 0.04, "learning_rate": 0.00048640922768304916, "loss": 3.7057, "theoretical_loss": 4.6451058814434365, "tokens_seen": 121831424 }, { "epoch": 0.04, "learning_rate": 0.00048639919759277834, "loss": 3.5169, "theoretical_loss": 4.644769182886495, "tokens_seen": 121896960 }, { "epoch": 0.04, "learning_rate": 0.00048638916750250757, "loss": 3.5991, "theoretical_loss": 4.644432715956399, "tokens_seen": 121962496 }, { "epoch": 0.04, "learning_rate": 0.0004863791374122367, "loss": 3.6623, "theoretical_loss": 4.644096480369466, "tokens_seen": 122028032 }, { "epoch": 0.04, "learning_rate": 0.00048636910732196593, "loss": 3.6837, "theoretical_loss": 4.643760475842518, "tokens_seen": 122093568 }, { "epoch": 0.04, "learning_rate": 0.0004863590772316951, "loss": 3.6267, "theoretical_loss": 4.6434247020928705, "tokens_seen": 122159104 }, { "epoch": 0.04, "learning_rate": 0.0004863490471414243, "loss": 3.6542, "theoretical_loss": 4.643089158838341, "tokens_seen": 122224640 }, { "epoch": 0.04, "learning_rate": 0.0004863390170511535, "loss": 3.6438, "theoretical_loss": 4.642753845797243, "tokens_seen": 122290176 }, { "epoch": 0.04, "learning_rate": 0.00048632898696088266, "loss": 3.5258, "theoretical_loss": 4.642418762688379, "tokens_seen": 122355712 }, { "epoch": 0.04, "learning_rate": 0.00048631895687061184, "loss": 3.6071, "theoretical_loss": 4.642083909231053, "tokens_seen": 122421248 }, { "epoch": 0.04, "learning_rate": 0.0004863089267803411, "loss": 3.6373, "theoretical_loss": 4.641749285145057, "tokens_seen": 122486784 }, { "epoch": 0.04, "learning_rate": 0.0004862988966900702, "loss": 3.5639, "theoretical_loss": 4.641414890150675, "tokens_seen": 122552320 }, { "epoch": 0.04, "learning_rate": 0.00048628886659979944, "loss": 3.5941, "theoretical_loss": 4.641080723968684, "tokens_seen": 122617856 }, { "epoch": 0.04, "learning_rate": 0.00048627883650952857, "loss": 3.6494, "theoretical_loss": 4.6407467863203475, "tokens_seen": 122683392 }, { "epoch": 0.04, "learning_rate": 0.0004862688064192578, "loss": 3.625, "theoretical_loss": 4.640413076927418, "tokens_seen": 122748928 }, { "epoch": 0.04, "learning_rate": 0.000486258776328987, "loss": 3.7251, "theoretical_loss": 4.6400795955121374, "tokens_seen": 122814464 }, { "epoch": 0.04, "objective/train/docs_used": 226297, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.526075601577759, "objective/train/theoretical_loss": 4.639746341797229, "objective/train/tokens_used": 143340000, "theoretical_loss": 4.639746341797229, "tokens_seen": 122880000 }, { "epoch": 0.04, "learning_rate": 0.00048624874623871616, "loss": 3.3561, "theoretical_loss": 4.639746341797229, "tokens_seen": 122880000 }, { "epoch": 0.04, "learning_rate": 0.00048623871614844534, "loss": 3.4239, "theoretical_loss": 4.639413315505905, "tokens_seen": 122945536 }, { "epoch": 0.04, "learning_rate": 0.0004862286860581745, "loss": 3.6265, "theoretical_loss": 4.639080516361861, "tokens_seen": 123011072 }, { "epoch": 0.04, "learning_rate": 0.0004862186559679037, "loss": 3.5569, "theoretical_loss": 4.638747944089273, "tokens_seen": 123076608 }, { "epoch": 0.04, "learning_rate": 0.00048620862587763294, "loss": 3.7956, "theoretical_loss": 4.638415598412799, "tokens_seen": 123142144 }, { "epoch": 0.04, "learning_rate": 0.00048619859578736207, "loss": 3.7779, "theoretical_loss": 4.638083479057579, "tokens_seen": 123207680 }, { "epoch": 0.04, "learning_rate": 0.0004861885656970913, "loss": 3.8056, "theoretical_loss": 4.637751585749234, "tokens_seen": 123273216 }, { "epoch": 0.04, "learning_rate": 0.0004861785356068205, "loss": 3.712, "theoretical_loss": 4.6374199182138565, "tokens_seen": 123338752 }, { "epoch": 0.04, "learning_rate": 0.00048616850551654967, "loss": 3.5023, "theoretical_loss": 4.637088476178025, "tokens_seen": 123404288 }, { "epoch": 0.04, "learning_rate": 0.00048615847542627885, "loss": 3.7364, "theoretical_loss": 4.636757259368787, "tokens_seen": 123469824 }, { "epoch": 0.04, "learning_rate": 0.00048614844533600803, "loss": 3.5289, "theoretical_loss": 4.636426267513668, "tokens_seen": 123535360 }, { "epoch": 0.04, "learning_rate": 0.0004861384152457372, "loss": 3.8272, "theoretical_loss": 4.636095500340669, "tokens_seen": 123600896 }, { "epoch": 0.04, "learning_rate": 0.00048612838515546644, "loss": 3.6238, "theoretical_loss": 4.635764957578261, "tokens_seen": 123666432 }, { "epoch": 0.04, "learning_rate": 0.00048611835506519557, "loss": 3.6063, "theoretical_loss": 4.635434638955388, "tokens_seen": 123731968 }, { "epoch": 0.04, "learning_rate": 0.0004861083249749248, "loss": 3.6831, "theoretical_loss": 4.635104544201465, "tokens_seen": 123797504 }, { "epoch": 0.04, "learning_rate": 0.00048609829488465393, "loss": 3.7495, "theoretical_loss": 4.634774673046376, "tokens_seen": 123863040 }, { "epoch": 0.04, "learning_rate": 0.00048608826479438317, "loss": 3.6716, "theoretical_loss": 4.634445025220475, "tokens_seen": 123928576 }, { "epoch": 0.04, "learning_rate": 0.00048607823470411235, "loss": 3.5948, "theoretical_loss": 4.634115600454582, "tokens_seen": 123994112 }, { "epoch": 0.04, "learning_rate": 0.00048606820461384153, "loss": 3.5728, "theoretical_loss": 4.633786398479983, "tokens_seen": 124059648 }, { "epoch": 0.04, "learning_rate": 0.0004860581745235707, "loss": 3.6127, "theoretical_loss": 4.6334574190284314, "tokens_seen": 124125184 }, { "epoch": 0.04, "learning_rate": 0.00048604814443329995, "loss": 3.7216, "theoretical_loss": 4.633128661832145, "tokens_seen": 124190720 }, { "epoch": 0.04, "learning_rate": 0.0004860381143430291, "loss": 3.6773, "theoretical_loss": 4.632800126623803, "tokens_seen": 124256256 }, { "epoch": 0.04, "learning_rate": 0.0004860280842527583, "loss": 3.497, "theoretical_loss": 4.632471813136547, "tokens_seen": 124321792 }, { "epoch": 0.04, "learning_rate": 0.00048601805416248744, "loss": 3.6151, "theoretical_loss": 4.632143721103983, "tokens_seen": 124387328 }, { "epoch": 0.04, "learning_rate": 0.00048600802407221667, "loss": 3.5141, "theoretical_loss": 4.631815850260173, "tokens_seen": 124452864 }, { "epoch": 0.04, "objective/train/docs_used": 229052, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.752659320831299, "objective/train/theoretical_loss": 4.631488200339643, "objective/train/tokens_used": 144978400, "theoretical_loss": 4.631488200339643, "tokens_seen": 124518400 }, { "epoch": 0.04, "learning_rate": 0.00048599799398194585, "loss": 3.7009, "theoretical_loss": 4.631488200339643, "tokens_seen": 124518400 }, { "epoch": 0.04, "learning_rate": 0.00048598796389167503, "loss": 3.6085, "theoretical_loss": 4.63116077107737, "tokens_seen": 124583936 }, { "epoch": 0.04, "learning_rate": 0.0004859779338014042, "loss": 3.6043, "theoretical_loss": 4.630833562208797, "tokens_seen": 124649472 }, { "epoch": 0.04, "learning_rate": 0.0004859679037111334, "loss": 3.6545, "theoretical_loss": 4.630506573469815, "tokens_seen": 124715008 }, { "epoch": 0.04, "learning_rate": 0.0004859578736208626, "loss": 3.6577, "theoretical_loss": 4.630179804596775, "tokens_seen": 124780544 }, { "epoch": 0.04, "learning_rate": 0.0004859478435305918, "loss": 3.5495, "theoretical_loss": 4.629853255326481, "tokens_seen": 124846080 }, { "epoch": 0.04, "learning_rate": 0.00048593781344032094, "loss": 3.824, "theoretical_loss": 4.629526925396189, "tokens_seen": 124911616 }, { "epoch": 0.04, "learning_rate": 0.0004859277833500502, "loss": 3.6638, "theoretical_loss": 4.6292008145436085, "tokens_seen": 124977152 }, { "epoch": 0.04, "learning_rate": 0.0004859177532597793, "loss": 3.6086, "theoretical_loss": 4.628874922506897, "tokens_seen": 125042688 }, { "epoch": 0.04, "learning_rate": 0.00048590772316950854, "loss": 3.6279, "theoretical_loss": 4.628549249024666, "tokens_seen": 125108224 }, { "epoch": 0.04, "learning_rate": 0.0004858976930792377, "loss": 3.7239, "theoretical_loss": 4.628223793835975, "tokens_seen": 125173760 }, { "epoch": 0.04, "learning_rate": 0.0004858876629889669, "loss": 3.4833, "theoretical_loss": 4.627898556680327, "tokens_seen": 125239296 }, { "epoch": 0.04, "learning_rate": 0.0004858776328986961, "loss": 3.7573, "theoretical_loss": 4.627573537297678, "tokens_seen": 125304832 }, { "epoch": 0.04, "learning_rate": 0.0004858676028084253, "loss": 3.5416, "theoretical_loss": 4.627248735428427, "tokens_seen": 125370368 }, { "epoch": 0.04, "learning_rate": 0.00048585757271815444, "loss": 3.5974, "theoretical_loss": 4.6269241508134185, "tokens_seen": 125435904 }, { "epoch": 0.04, "learning_rate": 0.0004858475426278837, "loss": 3.7367, "theoretical_loss": 4.6265997831939405, "tokens_seen": 125501440 }, { "epoch": 0.04, "learning_rate": 0.0004858375125376128, "loss": 3.8225, "theoretical_loss": 4.6262756323117245, "tokens_seen": 125566976 }, { "epoch": 0.04, "learning_rate": 0.00048582748244734204, "loss": 3.6754, "theoretical_loss": 4.625951697908944, "tokens_seen": 125632512 }, { "epoch": 0.04, "learning_rate": 0.0004858174523570712, "loss": 3.5835, "theoretical_loss": 4.625627979728212, "tokens_seen": 125698048 }, { "epoch": 0.04, "learning_rate": 0.0004858074222668004, "loss": 3.6256, "theoretical_loss": 4.625304477512584, "tokens_seen": 125763584 }, { "epoch": 0.04, "learning_rate": 0.0004857973921765296, "loss": 3.4816, "theoretical_loss": 4.624981191005554, "tokens_seen": 125829120 }, { "epoch": 0.04, "learning_rate": 0.00048578736208625877, "loss": 3.4264, "theoretical_loss": 4.624658119951052, "tokens_seen": 125894656 }, { "epoch": 0.04, "learning_rate": 0.00048577733199598795, "loss": 3.5838, "theoretical_loss": 4.624335264093447, "tokens_seen": 125960192 }, { "epoch": 0.04, "learning_rate": 0.0004857673019057172, "loss": 3.4387, "theoretical_loss": 4.624012623177544, "tokens_seen": 126025728 }, { "epoch": 0.04, "learning_rate": 0.0004857572718154463, "loss": 3.6018, "theoretical_loss": 4.623690196948582, "tokens_seen": 126091264 }, { "epoch": 0.04, "objective/train/docs_used": 230495, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4762017726898193, "objective/train/theoretical_loss": 4.623367985152234, "objective/train/tokens_used": 146616800, "theoretical_loss": 4.623367985152234, "tokens_seen": 126156800 }, { "epoch": 0.04, "learning_rate": 0.00048574724172517554, "loss": 3.6307, "theoretical_loss": 4.623367985152234, "tokens_seen": 126156800 }, { "epoch": 0.04, "learning_rate": 0.00048573721163490467, "loss": 3.6309, "theoretical_loss": 4.623045987534609, "tokens_seen": 126222336 }, { "epoch": 0.04, "learning_rate": 0.0004857271815446339, "loss": 3.5099, "theoretical_loss": 4.622724203842246, "tokens_seen": 126287872 }, { "epoch": 0.04, "learning_rate": 0.00048571715145436314, "loss": 3.5249, "theoretical_loss": 4.622402633822114, "tokens_seen": 126353408 }, { "epoch": 0.04, "learning_rate": 0.00048570712136409227, "loss": 3.5069, "theoretical_loss": 4.622081277221616, "tokens_seen": 126418944 }, { "epoch": 0.04, "learning_rate": 0.0004856970912738215, "loss": 3.6259, "theoretical_loss": 4.62176013378858, "tokens_seen": 126484480 }, { "epoch": 0.04, "learning_rate": 0.0004856870611835507, "loss": 3.6217, "theoretical_loss": 4.621439203271267, "tokens_seen": 126550016 }, { "epoch": 0.04, "learning_rate": 0.00048567703109327987, "loss": 3.4785, "theoretical_loss": 4.621118485418362, "tokens_seen": 126615552 }, { "epoch": 0.04, "learning_rate": 0.00048566700100300905, "loss": 3.5592, "theoretical_loss": 4.620797979978978, "tokens_seen": 126681088 }, { "epoch": 0.04, "learning_rate": 0.00048565697091273823, "loss": 3.4764, "theoretical_loss": 4.620477686702651, "tokens_seen": 126746624 }, { "epoch": 0.04, "learning_rate": 0.0004856469408224674, "loss": 3.6324, "theoretical_loss": 4.620157605339347, "tokens_seen": 126812160 }, { "epoch": 0.04, "learning_rate": 0.00048563691073219664, "loss": 3.6155, "theoretical_loss": 4.619837735639452, "tokens_seen": 126877696 }, { "epoch": 0.04, "learning_rate": 0.00048562688064192577, "loss": 3.7319, "theoretical_loss": 4.619518077353776, "tokens_seen": 126943232 }, { "epoch": 0.04, "learning_rate": 0.000485616850551655, "loss": 3.6162, "theoretical_loss": 4.619198630233547, "tokens_seen": 127008768 }, { "epoch": 0.04, "learning_rate": 0.00048560682046138413, "loss": 3.4532, "theoretical_loss": 4.6188793940304205, "tokens_seen": 127074304 }, { "epoch": 0.04, "learning_rate": 0.00048559679037111337, "loss": 3.6952, "theoretical_loss": 4.618560368496466, "tokens_seen": 127139840 }, { "epoch": 0.04, "learning_rate": 0.00048558676028084255, "loss": 3.6854, "theoretical_loss": 4.618241553384175, "tokens_seen": 127205376 }, { "epoch": 0.04, "learning_rate": 0.00048557673019057173, "loss": 3.6855, "theoretical_loss": 4.617922948446459, "tokens_seen": 127270912 }, { "epoch": 0.04, "learning_rate": 0.0004855667001003009, "loss": 3.5326, "theoretical_loss": 4.617604553436642, "tokens_seen": 127336448 }, { "epoch": 0.04, "learning_rate": 0.00048555667001003015, "loss": 3.4579, "theoretical_loss": 4.617286368108466, "tokens_seen": 127401984 }, { "epoch": 0.04, "learning_rate": 0.0004855466399197593, "loss": 3.5475, "theoretical_loss": 4.6169683922160925, "tokens_seen": 127467520 }, { "epoch": 0.04, "learning_rate": 0.0004855366098294885, "loss": 3.6926, "theoretical_loss": 4.616650625514091, "tokens_seen": 127533056 }, { "epoch": 0.04, "learning_rate": 0.00048552657973921764, "loss": 3.694, "theoretical_loss": 4.616333067757449, "tokens_seen": 127598592 }, { "epoch": 0.04, "learning_rate": 0.00048551654964894687, "loss": 3.5438, "theoretical_loss": 4.616015718701563, "tokens_seen": 127664128 }, { "epoch": 0.04, "learning_rate": 0.00048550651955867605, "loss": 3.3761, "theoretical_loss": 4.615698578102245, "tokens_seen": 127729664 }, { "epoch": 0.04, "objective/train/docs_used": 233438, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7482805252075195, "objective/train/theoretical_loss": 4.615381645715717, "objective/train/tokens_used": 148255200, "theoretical_loss": 4.615381645715717, "tokens_seen": 127795200 }, { "epoch": 0.04, "learning_rate": 0.00048549648946840523, "loss": 3.6499, "theoretical_loss": 4.615381645715717, "tokens_seen": 127795200 }, { "epoch": 0.04, "learning_rate": 0.0004854864593781344, "loss": 3.4341, "theoretical_loss": 4.615064921298608, "tokens_seen": 127860736 }, { "epoch": 0.04, "learning_rate": 0.0004854764292878636, "loss": 3.5659, "theoretical_loss": 4.61474840460796, "tokens_seen": 127926272 }, { "epoch": 0.04, "learning_rate": 0.0004854663991975928, "loss": 3.438, "theoretical_loss": 4.614432095401219, "tokens_seen": 127991808 }, { "epoch": 0.04, "learning_rate": 0.000485456369107322, "loss": 3.4932, "theoretical_loss": 4.614115993436242, "tokens_seen": 128057344 }, { "epoch": 0.04, "learning_rate": 0.00048544633901705114, "loss": 3.5606, "theoretical_loss": 4.613800098471291, "tokens_seen": 128122880 }, { "epoch": 0.04, "learning_rate": 0.0004854363089267804, "loss": 3.4407, "theoretical_loss": 4.613484410265032, "tokens_seen": 128188416 }, { "epoch": 0.04, "learning_rate": 0.0004854262788365095, "loss": 3.56, "theoretical_loss": 4.613168928576538, "tokens_seen": 128253952 }, { "epoch": 0.04, "learning_rate": 0.00048541624874623874, "loss": 3.5589, "theoretical_loss": 4.612853653165283, "tokens_seen": 128319488 }, { "epoch": 0.04, "learning_rate": 0.0004854062186559679, "loss": 3.518, "theoretical_loss": 4.612538583791146, "tokens_seen": 128385024 }, { "epoch": 0.04, "learning_rate": 0.0004853961885656971, "loss": 3.7216, "theoretical_loss": 4.612223720214407, "tokens_seen": 128450560 }, { "epoch": 0.04, "learning_rate": 0.0004853861584754263, "loss": 3.6757, "theoretical_loss": 4.611909062195749, "tokens_seen": 128516096 }, { "epoch": 0.04, "learning_rate": 0.0004853761283851555, "loss": 3.7147, "theoretical_loss": 4.61159460949625, "tokens_seen": 128581632 }, { "epoch": 0.04, "learning_rate": 0.00048536609829488464, "loss": 3.4663, "theoretical_loss": 4.611280361877393, "tokens_seen": 128647168 }, { "epoch": 0.04, "learning_rate": 0.0004853560682046139, "loss": 3.7565, "theoretical_loss": 4.610966319101056, "tokens_seen": 128712704 }, { "epoch": 0.04, "learning_rate": 0.000485346038114343, "loss": 3.4374, "theoretical_loss": 4.610652480929515, "tokens_seen": 128778240 }, { "epoch": 0.04, "learning_rate": 0.00048533600802407224, "loss": 3.659, "theoretical_loss": 4.610338847125445, "tokens_seen": 128843776 }, { "epoch": 0.04, "learning_rate": 0.0004853259779338014, "loss": 3.588, "theoretical_loss": 4.610025417451913, "tokens_seen": 128909312 }, { "epoch": 0.04, "learning_rate": 0.0004853159478435306, "loss": 3.7241, "theoretical_loss": 4.6097121916723856, "tokens_seen": 128974848 }, { "epoch": 0.04, "learning_rate": 0.0004853059177532598, "loss": 3.7008, "theoretical_loss": 4.609399169550718, "tokens_seen": 129040384 }, { "epoch": 0.04, "learning_rate": 0.00048529588766298897, "loss": 3.5876, "theoretical_loss": 4.609086350851165, "tokens_seen": 129105920 }, { "epoch": 0.04, "learning_rate": 0.00048528585757271815, "loss": 3.5207, "theoretical_loss": 4.6087737353383655, "tokens_seen": 129171456 }, { "epoch": 0.04, "learning_rate": 0.0004852758274824474, "loss": 3.5304, "theoretical_loss": 4.6084613227773605, "tokens_seen": 129236992 }, { "epoch": 0.04, "learning_rate": 0.0004852657973921765, "loss": 3.5929, "theoretical_loss": 4.608149112933571, "tokens_seen": 129302528 }, { "epoch": 0.04, "learning_rate": 0.00048525576730190574, "loss": 3.513, "theoretical_loss": 4.607837105572816, "tokens_seen": 129368064 }, { "epoch": 0.04, "objective/train/docs_used": 235763, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2372939586639404, "objective/train/theoretical_loss": 4.607525300461299, "objective/train/tokens_used": 149893600, "theoretical_loss": 4.607525300461299, "tokens_seen": 129433600 }, { "epoch": 0.04, "learning_rate": 0.00048524573721163487, "loss": 3.5517, "theoretical_loss": 4.607525300461299, "tokens_seen": 129433600 }, { "epoch": 0.04, "learning_rate": 0.0004852357071213641, "loss": 3.4671, "theoretical_loss": 4.607213697365613, "tokens_seen": 129499136 }, { "epoch": 0.04, "learning_rate": 0.0004852256770310933, "loss": 3.5538, "theoretical_loss": 4.606902296052739, "tokens_seen": 129564672 }, { "epoch": 0.04, "learning_rate": 0.00048521564694082247, "loss": 3.5203, "theoretical_loss": 4.6065910962900425, "tokens_seen": 129630208 }, { "epoch": 0.04, "learning_rate": 0.00048520561685055165, "loss": 3.5423, "theoretical_loss": 4.606280097845277, "tokens_seen": 129695744 }, { "epoch": 0.04, "learning_rate": 0.0004851955867602809, "loss": 3.5919, "theoretical_loss": 4.60596930048658, "tokens_seen": 129761280 }, { "epoch": 0.04, "learning_rate": 0.00048518555667001, "loss": 3.6533, "theoretical_loss": 4.605658703982471, "tokens_seen": 129826816 }, { "epoch": 0.04, "learning_rate": 0.00048517552657973925, "loss": 3.5534, "theoretical_loss": 4.6053483081018545, "tokens_seen": 129892352 }, { "epoch": 0.04, "learning_rate": 0.0004851654964894684, "loss": 3.5125, "theoretical_loss": 4.605038112614018, "tokens_seen": 129957888 }, { "epoch": 0.04, "learning_rate": 0.0004851554663991976, "loss": 3.5594, "theoretical_loss": 4.604728117288631, "tokens_seen": 130023424 }, { "epoch": 0.04, "learning_rate": 0.0004851454363089268, "loss": 3.6311, "theoretical_loss": 4.604418321895739, "tokens_seen": 130088960 }, { "epoch": 0.04, "learning_rate": 0.00048513540621865597, "loss": 3.5425, "theoretical_loss": 4.604108726205774, "tokens_seen": 130154496 }, { "epoch": 0.04, "learning_rate": 0.00048512537612838515, "loss": 3.5202, "theoretical_loss": 4.60379932998954, "tokens_seen": 130220032 }, { "epoch": 0.04, "learning_rate": 0.00048511534603811433, "loss": 3.774, "theoretical_loss": 4.6034901330182265, "tokens_seen": 130285568 }, { "epoch": 0.04, "learning_rate": 0.0004851053159478435, "loss": 3.7324, "theoretical_loss": 4.603181135063394, "tokens_seen": 130351104 }, { "epoch": 0.04, "learning_rate": 0.00048509528585757275, "loss": 3.5637, "theoretical_loss": 4.6028723358969845, "tokens_seen": 130416640 }, { "epoch": 0.04, "learning_rate": 0.0004850852557673019, "loss": 3.4938, "theoretical_loss": 4.602563735291312, "tokens_seen": 130482176 }, { "epoch": 0.04, "learning_rate": 0.0004850752256770311, "loss": 3.6637, "theoretical_loss": 4.602255333019068, "tokens_seen": 130547712 }, { "epoch": 0.04, "learning_rate": 0.00048506519558676024, "loss": 3.4088, "theoretical_loss": 4.6019471288533165, "tokens_seen": 130613248 }, { "epoch": 0.04, "learning_rate": 0.0004850551654964895, "loss": 3.6385, "theoretical_loss": 4.601639122567497, "tokens_seen": 130678784 }, { "epoch": 0.04, "learning_rate": 0.00048504513540621866, "loss": 3.6205, "theoretical_loss": 4.601331313935418, "tokens_seen": 130744320 }, { "epoch": 0.04, "learning_rate": 0.00048503510531594784, "loss": 3.4816, "theoretical_loss": 4.601023702731264, "tokens_seen": 130809856 }, { "epoch": 0.04, "learning_rate": 0.000485025075225677, "loss": 3.7126, "theoretical_loss": 4.600716288729587, "tokens_seen": 130875392 }, { "epoch": 0.04, "learning_rate": 0.00048501504513540625, "loss": 3.7579, "theoretical_loss": 4.600409071705312, "tokens_seen": 130940928 }, { "epoch": 0.04, "learning_rate": 0.0004850050150451354, "loss": 3.4944, "theoretical_loss": 4.60010205143373, "tokens_seen": 131006464 }, { "epoch": 0.04, "objective/train/docs_used": 238624, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.702207088470459, "objective/train/theoretical_loss": 4.599795227690505, "objective/train/tokens_used": 151532000, "theoretical_loss": 4.599795227690505, "tokens_seen": 131072000 }, { "epoch": 0.04, "learning_rate": 0.0004849949849548646, "loss": 3.5658, "theoretical_loss": 4.599795227690505, "tokens_seen": 131072000 }, { "epoch": 0.04, "learning_rate": 0.0004849849548645938, "loss": 3.5025, "theoretical_loss": 4.5994886002516635, "tokens_seen": 131137536 }, { "epoch": 0.04, "learning_rate": 0.000484974924774323, "loss": 3.7124, "theoretical_loss": 4.599182168893604, "tokens_seen": 131203072 }, { "epoch": 0.04, "learning_rate": 0.0004849648946840522, "loss": 3.7309, "theoretical_loss": 4.598875933393089, "tokens_seen": 131268608 }, { "epoch": 0.04, "learning_rate": 0.00048495486459378134, "loss": 3.4793, "theoretical_loss": 4.5985698935272445, "tokens_seen": 131334144 }, { "epoch": 0.04, "learning_rate": 0.0004849448345035106, "loss": 3.5696, "theoretical_loss": 4.598264049073565, "tokens_seen": 131399680 }, { "epoch": 0.04, "learning_rate": 0.0004849348044132397, "loss": 3.4882, "theoretical_loss": 4.597958399809908, "tokens_seen": 131465216 }, { "epoch": 0.04, "learning_rate": 0.00048492477432296894, "loss": 3.4212, "theoretical_loss": 4.59765294551449, "tokens_seen": 131530752 }, { "epoch": 0.04, "learning_rate": 0.0004849147442326981, "loss": 3.4403, "theoretical_loss": 4.597347685965897, "tokens_seen": 131596288 }, { "epoch": 0.04, "learning_rate": 0.0004849047141424273, "loss": 3.5257, "theoretical_loss": 4.597042620943069, "tokens_seen": 131661824 }, { "epoch": 0.04, "learning_rate": 0.0004848946840521565, "loss": 3.6052, "theoretical_loss": 4.596737750225311, "tokens_seen": 131727360 }, { "epoch": 0.04, "learning_rate": 0.0004848846539618857, "loss": 3.4987, "theoretical_loss": 4.596433073592289, "tokens_seen": 131792896 }, { "epoch": 0.04, "learning_rate": 0.00048487462387161484, "loss": 3.4929, "theoretical_loss": 4.596128590824026, "tokens_seen": 131858432 }, { "epoch": 0.04, "learning_rate": 0.0004848645937813441, "loss": 3.6371, "theoretical_loss": 4.595824301700904, "tokens_seen": 131923968 }, { "epoch": 0.04, "learning_rate": 0.0004848545636910732, "loss": 3.6243, "theoretical_loss": 4.595520206003663, "tokens_seen": 131989504 }, { "epoch": 0.04, "learning_rate": 0.00048484453360080244, "loss": 3.5233, "theoretical_loss": 4.595216303513399, "tokens_seen": 132055040 }, { "epoch": 0.04, "learning_rate": 0.0004848345035105316, "loss": 3.4362, "theoretical_loss": 4.594912594011566, "tokens_seen": 132120576 }, { "epoch": 0.04, "learning_rate": 0.0004848244734202608, "loss": 3.4556, "theoretical_loss": 4.594609077279973, "tokens_seen": 132186112 }, { "epoch": 0.04, "learning_rate": 0.00048481444332999, "loss": 3.7096, "theoretical_loss": 4.594305753100782, "tokens_seen": 132251648 }, { "epoch": 0.04, "learning_rate": 0.00048480441323971917, "loss": 3.5165, "theoretical_loss": 4.594002621256511, "tokens_seen": 132317184 }, { "epoch": 0.04, "learning_rate": 0.00048479438314944835, "loss": 3.3414, "theoretical_loss": 4.59369968153003, "tokens_seen": 132382720 }, { "epoch": 0.04, "learning_rate": 0.0004847843530591776, "loss": 3.5603, "theoretical_loss": 4.593396933704562, "tokens_seen": 132448256 }, { "epoch": 0.04, "learning_rate": 0.0004847743229689067, "loss": 3.647, "theoretical_loss": 4.593094377563681, "tokens_seen": 132513792 }, { "epoch": 0.04, "learning_rate": 0.00048476429287863594, "loss": 3.594, "theoretical_loss": 4.592792012891314, "tokens_seen": 132579328 }, { "epoch": 0.04, "learning_rate": 0.00048475426278836507, "loss": 3.561, "theoretical_loss": 4.592489839471735, "tokens_seen": 132644864 }, { "epoch": 0.04, "objective/train/docs_used": 241377, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.649860382080078, "objective/train/theoretical_loss": 4.592187857089571, "objective/train/tokens_used": 153170400, "theoretical_loss": 4.592187857089571, "tokens_seen": 132710400 }, { "epoch": 0.04, "learning_rate": 0.0004847442326980943, "loss": 3.5422, "theoretical_loss": 4.592187857089571, "tokens_seen": 132710400 }, { "epoch": 0.04, "learning_rate": 0.0004847342026078235, "loss": 3.7167, "theoretical_loss": 4.591886065529795, "tokens_seen": 132775936 }, { "epoch": 0.04, "learning_rate": 0.00048472417251755267, "loss": 3.6444, "theoretical_loss": 4.591584464577728, "tokens_seen": 132841472 }, { "epoch": 0.04, "learning_rate": 0.00048471414242728185, "loss": 3.5954, "theoretical_loss": 4.591283054019041, "tokens_seen": 132907008 }, { "epoch": 0.04, "learning_rate": 0.0004847041123370111, "loss": 3.6201, "theoretical_loss": 4.5909818336397485, "tokens_seen": 132972544 }, { "epoch": 0.04, "learning_rate": 0.0004846940822467402, "loss": 3.6329, "theoretical_loss": 4.590680803226213, "tokens_seen": 133038080 }, { "epoch": 0.04, "learning_rate": 0.00048468405215646945, "loss": 3.4109, "theoretical_loss": 4.590379962565141, "tokens_seen": 133103616 }, { "epoch": 0.04, "learning_rate": 0.0004846740220661986, "loss": 3.5289, "theoretical_loss": 4.590079311443583, "tokens_seen": 133169152 }, { "epoch": 0.04, "learning_rate": 0.0004846639919759278, "loss": 3.6759, "theoretical_loss": 4.589778849648934, "tokens_seen": 133234688 }, { "epoch": 0.04, "learning_rate": 0.000484653961885657, "loss": 3.5951, "theoretical_loss": 4.589478576968932, "tokens_seen": 133300224 }, { "epoch": 0.04, "learning_rate": 0.00048464393179538617, "loss": 3.6574, "theoretical_loss": 4.589178493191655, "tokens_seen": 133365760 }, { "epoch": 0.04, "learning_rate": 0.00048463390170511535, "loss": 3.7804, "theoretical_loss": 4.588878598105527, "tokens_seen": 133431296 }, { "epoch": 0.04, "learning_rate": 0.00048462387161484453, "loss": 3.4497, "theoretical_loss": 4.588578891499308, "tokens_seen": 133496832 }, { "epoch": 0.04, "learning_rate": 0.0004846138415245737, "loss": 3.6114, "theoretical_loss": 4.588279373162101, "tokens_seen": 133562368 }, { "epoch": 0.04, "learning_rate": 0.00048460381143430295, "loss": 3.6677, "theoretical_loss": 4.587980042883347, "tokens_seen": 133627904 }, { "epoch": 0.04, "learning_rate": 0.0004845937813440321, "loss": 3.5913, "theoretical_loss": 4.587680900452824, "tokens_seen": 133693440 }, { "epoch": 0.04, "learning_rate": 0.0004845837512537613, "loss": 3.5159, "theoretical_loss": 4.587381945660653, "tokens_seen": 133758976 }, { "epoch": 0.04, "learning_rate": 0.00048457372116349044, "loss": 3.5661, "theoretical_loss": 4.587083178297288, "tokens_seen": 133824512 }, { "epoch": 0.04, "learning_rate": 0.0004845636910732197, "loss": 3.5724, "theoretical_loss": 4.5867845981535185, "tokens_seen": 133890048 }, { "epoch": 0.04, "learning_rate": 0.00048455366098294886, "loss": 3.5006, "theoretical_loss": 4.586486205020474, "tokens_seen": 133955584 }, { "epoch": 0.04, "learning_rate": 0.00048454363089267804, "loss": 3.5603, "theoretical_loss": 4.586187998689616, "tokens_seen": 134021120 }, { "epoch": 0.04, "learning_rate": 0.0004845336008024072, "loss": 3.5667, "theoretical_loss": 4.585889978952741, "tokens_seen": 134086656 }, { "epoch": 0.04, "learning_rate": 0.00048452357071213645, "loss": 3.672, "theoretical_loss": 4.58559214560198, "tokens_seen": 134152192 }, { "epoch": 0.04, "learning_rate": 0.0004845135406218656, "loss": 3.6974, "theoretical_loss": 4.585294498429796, "tokens_seen": 134217728 }, { "epoch": 0.04, "learning_rate": 0.0004845035105315948, "loss": 3.7061, "theoretical_loss": 4.584997037228986, "tokens_seen": 134283264 }, { "epoch": 0.04, "objective/train/docs_used": 244333, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7445924282073975, "objective/train/theoretical_loss": 4.584699761792674, "objective/train/tokens_used": 154808800, "theoretical_loss": 4.584699761792674, "tokens_seen": 134348800 }, { "epoch": 0.04, "learning_rate": 0.00048449348044132394, "loss": 3.7804, "theoretical_loss": 4.584699761792674, "tokens_seen": 134348800 }, { "epoch": 0.04, "learning_rate": 0.0004844834503510532, "loss": 3.5979, "theoretical_loss": 4.5844026719143205, "tokens_seen": 134414336 }, { "epoch": 0.04, "learning_rate": 0.00048447342026078236, "loss": 3.5811, "theoretical_loss": 4.5841057673877135, "tokens_seen": 134479872 }, { "epoch": 0.04, "learning_rate": 0.00048446339017051154, "loss": 3.5271, "theoretical_loss": 4.5838090480069695, "tokens_seen": 134545408 }, { "epoch": 0.04, "learning_rate": 0.0004844533600802407, "loss": 3.55, "theoretical_loss": 4.5835125135665375, "tokens_seen": 134610944 }, { "epoch": 0.04, "learning_rate": 0.0004844433299899699, "loss": 3.5203, "theoretical_loss": 4.583216163861191, "tokens_seen": 134676480 }, { "epoch": 0.04, "learning_rate": 0.0004844332998996991, "loss": 3.4122, "theoretical_loss": 4.58291999868603, "tokens_seen": 134742016 }, { "epoch": 0.04, "learning_rate": 0.0004844232698094283, "loss": 3.2912, "theoretical_loss": 4.582624017836489, "tokens_seen": 134807552 }, { "epoch": 0.04, "learning_rate": 0.00048441323971915745, "loss": 3.6541, "theoretical_loss": 4.582328221108318, "tokens_seen": 134873088 }, { "epoch": 0.04, "learning_rate": 0.0004844032096288867, "loss": 3.5847, "theoretical_loss": 4.5820326082976, "tokens_seen": 134938624 }, { "epoch": 0.04, "learning_rate": 0.00048439317953861586, "loss": 3.6278, "theoretical_loss": 4.581737179200739, "tokens_seen": 135004160 }, { "epoch": 0.04, "learning_rate": 0.00048438314944834504, "loss": 3.5601, "theoretical_loss": 4.581441933614466, "tokens_seen": 135069696 }, { "epoch": 0.04, "learning_rate": 0.0004843731193580742, "loss": 3.4999, "theoretical_loss": 4.581146871335832, "tokens_seen": 135135232 }, { "epoch": 0.04, "learning_rate": 0.0004843630892678034, "loss": 3.5877, "theoretical_loss": 4.580851992162214, "tokens_seen": 135200768 }, { "epoch": 0.04, "learning_rate": 0.0004843530591775326, "loss": 3.6998, "theoretical_loss": 4.5805572958913086, "tokens_seen": 135266304 }, { "epoch": 0.04, "learning_rate": 0.0004843430290872618, "loss": 3.5848, "theoretical_loss": 4.580262782321135, "tokens_seen": 135331840 }, { "epoch": 0.04, "learning_rate": 0.00048433299899699095, "loss": 3.6249, "theoretical_loss": 4.579968451250032, "tokens_seen": 135397376 }, { "epoch": 0.04, "learning_rate": 0.0004843229689067202, "loss": 3.6525, "theoretical_loss": 4.579674302476661, "tokens_seen": 135462912 }, { "epoch": 0.04, "learning_rate": 0.0004843129388164493, "loss": 3.586, "theoretical_loss": 4.579380335800001, "tokens_seen": 135528448 }, { "epoch": 0.04, "learning_rate": 0.00048430290872617855, "loss": 3.6201, "theoretical_loss": 4.579086551019348, "tokens_seen": 135593984 }, { "epoch": 0.04, "learning_rate": 0.00048429287863590773, "loss": 3.4843, "theoretical_loss": 4.5787929479343195, "tokens_seen": 135659520 }, { "epoch": 0.04, "learning_rate": 0.0004842828485456369, "loss": 3.5944, "theoretical_loss": 4.578499526344848, "tokens_seen": 135725056 }, { "epoch": 0.04, "learning_rate": 0.0004842728184553661, "loss": 3.4993, "theoretical_loss": 4.578206286051184, "tokens_seen": 135790592 }, { "epoch": 0.04, "learning_rate": 0.00048426278836509527, "loss": 3.5829, "theoretical_loss": 4.5779132268538945, "tokens_seen": 135856128 }, { "epoch": 0.04, "learning_rate": 0.00048425275827482445, "loss": 3.5099, "theoretical_loss": 4.577620348553859, "tokens_seen": 135921664 }, { "epoch": 0.04, "objective/train/docs_used": 247335, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.489936351776123, "objective/train/theoretical_loss": 4.577327650952276, "objective/train/tokens_used": 156447200, "theoretical_loss": 4.577327650952276, "tokens_seen": 135987200 }, { "epoch": 0.04, "learning_rate": 0.0004842427281845537, "loss": 3.5891, "theoretical_loss": 4.577327650952276, "tokens_seen": 135987200 }, { "epoch": 0.04, "learning_rate": 0.00048423269809428287, "loss": 3.5763, "theoretical_loss": 4.5770351338506545, "tokens_seen": 136052736 }, { "epoch": 0.04, "learning_rate": 0.00048422266800401205, "loss": 3.383, "theoretical_loss": 4.57674279705082, "tokens_seen": 136118272 }, { "epoch": 0.04, "learning_rate": 0.0004842126379137413, "loss": 3.7113, "theoretical_loss": 4.57645064035491, "tokens_seen": 136183808 }, { "epoch": 0.04, "learning_rate": 0.0004842026078234704, "loss": 3.4664, "theoretical_loss": 4.576158663565371, "tokens_seen": 136249344 }, { "epoch": 0.04, "learning_rate": 0.00048419257773319965, "loss": 3.4926, "theoretical_loss": 4.575866866484967, "tokens_seen": 136314880 }, { "epoch": 0.04, "learning_rate": 0.0004841825476429288, "loss": 3.5476, "theoretical_loss": 4.575575248916767, "tokens_seen": 136380416 }, { "epoch": 0.04, "learning_rate": 0.000484172517552658, "loss": 3.505, "theoretical_loss": 4.575283810664155, "tokens_seen": 136445952 }, { "epoch": 0.04, "learning_rate": 0.0004841624874623872, "loss": 3.526, "theoretical_loss": 4.574992551530822, "tokens_seen": 136511488 }, { "epoch": 0.04, "learning_rate": 0.00048415245737211637, "loss": 3.4687, "theoretical_loss": 4.574701471320768, "tokens_seen": 136577024 }, { "epoch": 0.04, "learning_rate": 0.00048414242728184555, "loss": 3.5346, "theoretical_loss": 4.574410569838304, "tokens_seen": 136642560 }, { "epoch": 0.04, "learning_rate": 0.00048413239719157473, "loss": 3.7621, "theoretical_loss": 4.574119846888045, "tokens_seen": 136708096 }, { "epoch": 0.04, "learning_rate": 0.0004841223671013039, "loss": 3.551, "theoretical_loss": 4.573829302274915, "tokens_seen": 136773632 }, { "epoch": 0.04, "learning_rate": 0.00048411233701103315, "loss": 3.523, "theoretical_loss": 4.573538935804146, "tokens_seen": 136839168 }, { "epoch": 0.04, "learning_rate": 0.0004841023069207623, "loss": 3.5407, "theoretical_loss": 4.573248747281273, "tokens_seen": 136904704 }, { "epoch": 0.04, "learning_rate": 0.0004840922768304915, "loss": 3.4714, "theoretical_loss": 4.5729587365121365, "tokens_seen": 136970240 }, { "epoch": 0.04, "learning_rate": 0.00048408224674022064, "loss": 3.6406, "theoretical_loss": 4.572668903302886, "tokens_seen": 137035776 }, { "epoch": 0.04, "learning_rate": 0.0004840722166499499, "loss": 3.3561, "theoretical_loss": 4.572379247459969, "tokens_seen": 137101312 }, { "epoch": 0.04, "learning_rate": 0.00048406218655967906, "loss": 3.6194, "theoretical_loss": 4.57208976879014, "tokens_seen": 137166848 }, { "epoch": 0.04, "learning_rate": 0.00048405215646940824, "loss": 3.6152, "theoretical_loss": 4.571800467100456, "tokens_seen": 137232384 }, { "epoch": 0.04, "learning_rate": 0.0004840421263791374, "loss": 3.6048, "theoretical_loss": 4.5715113421982725, "tokens_seen": 137297920 }, { "epoch": 0.04, "learning_rate": 0.00048403209628886665, "loss": 3.6566, "theoretical_loss": 4.571222393891253, "tokens_seen": 137363456 }, { "epoch": 0.04, "learning_rate": 0.0004840220661985958, "loss": 3.3516, "theoretical_loss": 4.570933621987356, "tokens_seen": 137428992 }, { "epoch": 0.04, "learning_rate": 0.000484012036108325, "loss": 3.5162, "theoretical_loss": 4.570645026294844, "tokens_seen": 137494528 }, { "epoch": 0.04, "learning_rate": 0.00048400200601805414, "loss": 3.5923, "theoretical_loss": 4.570356606622278, "tokens_seen": 137560064 }, { "epoch": 0.04, "objective/train/docs_used": 249317, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.451021909713745, "objective/train/theoretical_loss": 4.570068362778516, "objective/train/tokens_used": 158085600, "theoretical_loss": 4.570068362778516, "tokens_seen": 137625600 }, { "epoch": 0.04, "learning_rate": 0.0004839919759277834, "loss": 3.4821, "theoretical_loss": 4.570068362778516, "tokens_seen": 137625600 }, { "epoch": 0.04, "learning_rate": 0.00048398194583751256, "loss": 3.6324, "theoretical_loss": 4.569780294572718, "tokens_seen": 137691136 }, { "epoch": 0.04, "learning_rate": 0.00048397191574724174, "loss": 3.5473, "theoretical_loss": 4.569492401814339, "tokens_seen": 137756672 }, { "epoch": 0.04, "learning_rate": 0.0004839618856569709, "loss": 3.5585, "theoretical_loss": 4.569204684313133, "tokens_seen": 137822208 }, { "epoch": 0.04, "learning_rate": 0.0004839518555667001, "loss": 3.4574, "theoretical_loss": 4.568917141879149, "tokens_seen": 137887744 }, { "epoch": 0.04, "learning_rate": 0.0004839418254764293, "loss": 3.3014, "theoretical_loss": 4.568629774322736, "tokens_seen": 137953280 }, { "epoch": 0.04, "learning_rate": 0.0004839317953861585, "loss": 3.5883, "theoretical_loss": 4.568342581454532, "tokens_seen": 138018816 }, { "epoch": 0.04, "learning_rate": 0.00048392176529588765, "loss": 3.5122, "theoretical_loss": 4.568055563085476, "tokens_seen": 138084352 }, { "epoch": 0.04, "learning_rate": 0.0004839117352056169, "loss": 3.699, "theoretical_loss": 4.567768719026797, "tokens_seen": 138149888 }, { "epoch": 0.04, "learning_rate": 0.00048390170511534606, "loss": 3.4775, "theoretical_loss": 4.567482049090019, "tokens_seen": 138215424 }, { "epoch": 0.04, "learning_rate": 0.00048389167502507524, "loss": 3.4237, "theoretical_loss": 4.567195553086961, "tokens_seen": 138280960 }, { "epoch": 0.04, "learning_rate": 0.0004838816449348044, "loss": 3.4686, "theoretical_loss": 4.566909230829729, "tokens_seen": 138346496 }, { "epoch": 0.04, "learning_rate": 0.0004838716148445336, "loss": 3.4138, "theoretical_loss": 4.566623082130729, "tokens_seen": 138412032 }, { "epoch": 0.04, "learning_rate": 0.0004838615847542628, "loss": 3.5461, "theoretical_loss": 4.566337106802651, "tokens_seen": 138477568 }, { "epoch": 0.04, "learning_rate": 0.000483851554663992, "loss": 3.6355, "theoretical_loss": 4.56605130465848, "tokens_seen": 138543104 }, { "epoch": 0.04, "learning_rate": 0.00048384152457372115, "loss": 3.7033, "theoretical_loss": 4.565765675511487, "tokens_seen": 138608640 }, { "epoch": 0.04, "learning_rate": 0.0004838314944834504, "loss": 3.647, "theoretical_loss": 4.565480219175237, "tokens_seen": 138674176 }, { "epoch": 0.04, "learning_rate": 0.0004838214643931795, "loss": 3.6162, "theoretical_loss": 4.56519493546358, "tokens_seen": 138739712 }, { "epoch": 0.04, "learning_rate": 0.00048381143430290875, "loss": 3.6186, "theoretical_loss": 4.56490982419066, "tokens_seen": 138805248 }, { "epoch": 0.04, "learning_rate": 0.00048380140421263793, "loss": 3.5797, "theoretical_loss": 4.564624885170902, "tokens_seen": 138870784 }, { "epoch": 0.04, "learning_rate": 0.0004837913741223671, "loss": 3.4108, "theoretical_loss": 4.564340118219022, "tokens_seen": 138936320 }, { "epoch": 0.04, "learning_rate": 0.0004837813440320963, "loss": 3.4539, "theoretical_loss": 4.56405552315002, "tokens_seen": 139001856 }, { "epoch": 0.04, "learning_rate": 0.00048377131394182547, "loss": 3.4717, "theoretical_loss": 4.563771099779187, "tokens_seen": 139067392 }, { "epoch": 0.04, "learning_rate": 0.00048376128385155465, "loss": 3.5063, "theoretical_loss": 4.563486847922093, "tokens_seen": 139132928 }, { "epoch": 0.04, "learning_rate": 0.0004837512537612839, "loss": 3.4314, "theoretical_loss": 4.563202767394597, "tokens_seen": 139198464 }, { "epoch": 0.04, "objective/train/docs_used": 252191, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4365386962890625, "objective/train/theoretical_loss": 4.562918858012843, "objective/train/tokens_used": 159724000, "theoretical_loss": 4.562918858012843, "tokens_seen": 139264000 }, { "epoch": 0.04, "learning_rate": 0.000483741223671013, "loss": 3.491, "theoretical_loss": 4.562918858012843, "tokens_seen": 139264000 }, { "epoch": 0.04, "learning_rate": 0.00048373119358074225, "loss": 3.4835, "theoretical_loss": 4.562635119593255, "tokens_seen": 139329536 }, { "epoch": 0.04, "learning_rate": 0.00048372116349047143, "loss": 3.6067, "theoretical_loss": 4.562351551952542, "tokens_seen": 139395072 }, { "epoch": 0.04, "learning_rate": 0.0004837111334002006, "loss": 3.5454, "theoretical_loss": 4.5620681549076965, "tokens_seen": 139460608 }, { "epoch": 0.04, "learning_rate": 0.0004837011033099298, "loss": 3.457, "theoretical_loss": 4.561784928275992, "tokens_seen": 139526144 }, { "epoch": 0.04, "learning_rate": 0.000483691073219659, "loss": 3.4872, "theoretical_loss": 4.561501871874984, "tokens_seen": 139591680 }, { "epoch": 0.04, "learning_rate": 0.00048368104312938816, "loss": 3.4405, "theoretical_loss": 4.561218985522507, "tokens_seen": 139657216 }, { "epoch": 0.04, "learning_rate": 0.0004836710130391174, "loss": 3.5879, "theoretical_loss": 4.560936269036679, "tokens_seen": 139722752 }, { "epoch": 0.04, "learning_rate": 0.0004836609829488465, "loss": 3.5717, "theoretical_loss": 4.560653722235895, "tokens_seen": 139788288 }, { "epoch": 0.04, "learning_rate": 0.00048365095285857575, "loss": 3.6366, "theoretical_loss": 4.560371344938831, "tokens_seen": 139853824 }, { "epoch": 0.04, "learning_rate": 0.0004836409227683049, "loss": 3.5944, "theoretical_loss": 4.560089136964439, "tokens_seen": 139919360 }, { "epoch": 0.04, "learning_rate": 0.0004836308926780341, "loss": 3.3183, "theoretical_loss": 4.559807098131953, "tokens_seen": 139984896 }, { "epoch": 0.04, "learning_rate": 0.0004836208625877633, "loss": 3.3801, "theoretical_loss": 4.559525228260882, "tokens_seen": 140050432 }, { "epoch": 0.04, "learning_rate": 0.0004836108324974925, "loss": 3.3804, "theoretical_loss": 4.559243527171011, "tokens_seen": 140115968 }, { "epoch": 0.04, "learning_rate": 0.00048360080240722166, "loss": 3.5342, "theoretical_loss": 4.558961994682403, "tokens_seen": 140181504 }, { "epoch": 0.04, "learning_rate": 0.00048359077231695084, "loss": 3.4742, "theoretical_loss": 4.558680630615397, "tokens_seen": 140247040 }, { "epoch": 0.04, "learning_rate": 0.00048358074222668, "loss": 3.5603, "theoretical_loss": 4.558399434790607, "tokens_seen": 140312576 }, { "epoch": 0.04, "learning_rate": 0.00048357071213640926, "loss": 3.5842, "theoretical_loss": 4.558118407028921, "tokens_seen": 140378112 }, { "epoch": 0.04, "learning_rate": 0.0004835606820461384, "loss": 3.2524, "theoretical_loss": 4.557837547151502, "tokens_seen": 140443648 }, { "epoch": 0.04, "learning_rate": 0.0004835506519558676, "loss": 3.3036, "theoretical_loss": 4.557556854979786, "tokens_seen": 140509184 }, { "epoch": 0.04, "learning_rate": 0.0004835406218655968, "loss": 3.3987, "theoretical_loss": 4.5572763303354815, "tokens_seen": 140574720 }, { "epoch": 0.04, "learning_rate": 0.000483530591775326, "loss": 3.5147, "theoretical_loss": 4.556995973040574, "tokens_seen": 140640256 }, { "epoch": 0.04, "learning_rate": 0.00048352056168505516, "loss": 3.4556, "theoretical_loss": 4.556715782917314, "tokens_seen": 140705792 }, { "epoch": 0.04, "learning_rate": 0.00048351053159478434, "loss": 3.5449, "theoretical_loss": 4.556435759788229, "tokens_seen": 140771328 }, { "epoch": 0.04, "learning_rate": 0.0004835005015045135, "loss": 3.6471, "theoretical_loss": 4.556155903476114, "tokens_seen": 140836864 }, { "epoch": 0.04, "objective/train/docs_used": 255032, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.713993549346924, "objective/train/theoretical_loss": 4.555876213804037, "objective/train/tokens_used": 161362400, "theoretical_loss": 4.555876213804037, "tokens_seen": 140902400 }, { "epoch": 0.04, "learning_rate": 0.00048349047141424276, "loss": 3.6805, "theoretical_loss": 4.555876213804037, "tokens_seen": 140902400 }, { "epoch": 0.04, "learning_rate": 0.00048348044132397194, "loss": 3.4832, "theoretical_loss": 4.555596690595333, "tokens_seen": 140967936 }, { "epoch": 0.04, "learning_rate": 0.0004834704112337011, "loss": 3.4922, "theoretical_loss": 4.555317333673611, "tokens_seen": 141033472 }, { "epoch": 0.04, "learning_rate": 0.0004834603811434303, "loss": 3.6061, "theoretical_loss": 4.555038142862742, "tokens_seen": 141099008 }, { "epoch": 0.04, "learning_rate": 0.0004834503510531595, "loss": 3.6062, "theoretical_loss": 4.5547591179868725, "tokens_seen": 141164544 }, { "epoch": 0.04, "learning_rate": 0.0004834403209628887, "loss": 3.5637, "theoretical_loss": 4.554480258870409, "tokens_seen": 141230080 }, { "epoch": 0.04, "learning_rate": 0.00048343029087261785, "loss": 3.4913, "theoretical_loss": 4.554201565338033, "tokens_seen": 141295616 }, { "epoch": 0.04, "learning_rate": 0.0004834202607823471, "loss": 3.7421, "theoretical_loss": 4.5539230372146875, "tokens_seen": 141361152 }, { "epoch": 0.04, "learning_rate": 0.00048341023069207626, "loss": 3.3526, "theoretical_loss": 4.553644674325584, "tokens_seen": 141426688 }, { "epoch": 0.04, "learning_rate": 0.00048340020060180544, "loss": 3.6671, "theoretical_loss": 4.553366476496198, "tokens_seen": 141492224 }, { "epoch": 0.04, "learning_rate": 0.0004833901705115346, "loss": 3.5561, "theoretical_loss": 4.553088443552269, "tokens_seen": 141557760 }, { "epoch": 0.04, "learning_rate": 0.0004833801404212638, "loss": 3.623, "theoretical_loss": 4.552810575319806, "tokens_seen": 141623296 }, { "epoch": 0.04, "learning_rate": 0.000483370110330993, "loss": 3.3913, "theoretical_loss": 4.552532871625077, "tokens_seen": 141688832 }, { "epoch": 0.04, "learning_rate": 0.0004833600802407222, "loss": 3.354, "theoretical_loss": 4.5522553322946155, "tokens_seen": 141754368 }, { "epoch": 0.04, "learning_rate": 0.00048335005015045135, "loss": 3.4717, "theoretical_loss": 4.551977957155217, "tokens_seen": 141819904 }, { "epoch": 0.04, "learning_rate": 0.0004833400200601806, "loss": 3.5283, "theoretical_loss": 4.5517007460339425, "tokens_seen": 141885440 }, { "epoch": 0.04, "learning_rate": 0.0004833299899699097, "loss": 3.5349, "theoretical_loss": 4.551423698758111, "tokens_seen": 141950976 }, { "epoch": 0.04, "learning_rate": 0.00048331995987963895, "loss": 3.427, "theoretical_loss": 4.551146815155304, "tokens_seen": 142016512 }, { "epoch": 0.04, "learning_rate": 0.00048330992978936813, "loss": 3.5229, "theoretical_loss": 4.550870095053366, "tokens_seen": 142082048 }, { "epoch": 0.04, "learning_rate": 0.0004832998996990973, "loss": 3.4194, "theoretical_loss": 4.550593538280398, "tokens_seen": 142147584 }, { "epoch": 0.04, "learning_rate": 0.0004832898696088265, "loss": 3.5581, "theoretical_loss": 4.550317144664766, "tokens_seen": 142213120 }, { "epoch": 0.04, "learning_rate": 0.00048327983951855567, "loss": 3.5286, "theoretical_loss": 4.55004091403509, "tokens_seen": 142278656 }, { "epoch": 0.04, "learning_rate": 0.00048326980942828485, "loss": 3.4976, "theoretical_loss": 4.5497648462202545, "tokens_seen": 142344192 }, { "epoch": 0.04, "learning_rate": 0.0004832597793380141, "loss": 3.3311, "theoretical_loss": 4.549488941049397, "tokens_seen": 142409728 }, { "epoch": 0.04, "learning_rate": 0.0004832497492477432, "loss": 3.5573, "theoretical_loss": 4.549213198351914, "tokens_seen": 142475264 }, { "epoch": 0.04, "objective/train/docs_used": 257777, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.8072094917297363, "objective/train/theoretical_loss": 4.548937617957463, "objective/train/tokens_used": 163000800, "theoretical_loss": 4.548937617957463, "tokens_seen": 142540800 }, { "epoch": 0.04, "learning_rate": 0.00048323971915747245, "loss": 3.4805, "theoretical_loss": 4.548937617957463, "tokens_seen": 142540800 }, { "epoch": 0.04, "learning_rate": 0.00048322968906720163, "loss": 3.6851, "theoretical_loss": 4.548662199695954, "tokens_seen": 142606336 }, { "epoch": 0.04, "learning_rate": 0.0004832196589769308, "loss": 3.589, "theoretical_loss": 4.548386943397556, "tokens_seen": 142671872 }, { "epoch": 0.04, "learning_rate": 0.00048320962888666, "loss": 3.1768, "theoretical_loss": 4.548111848892693, "tokens_seen": 142737408 }, { "epoch": 0.04, "learning_rate": 0.0004831995987963892, "loss": 3.4447, "theoretical_loss": 4.547836916012042, "tokens_seen": 142802944 }, { "epoch": 0.04, "learning_rate": 0.00048318956870611836, "loss": 3.5021, "theoretical_loss": 4.547562144586539, "tokens_seen": 142868480 }, { "epoch": 0.04, "learning_rate": 0.0004831795386158476, "loss": 3.5031, "theoretical_loss": 4.547287534447372, "tokens_seen": 142934016 }, { "epoch": 0.04, "learning_rate": 0.0004831695085255767, "loss": 3.4233, "theoretical_loss": 4.5470130854259825, "tokens_seen": 142999552 }, { "epoch": 0.04, "learning_rate": 0.00048315947843530595, "loss": 3.5398, "theoretical_loss": 4.546738797354065, "tokens_seen": 143065088 }, { "epoch": 0.04, "learning_rate": 0.0004831494483450351, "loss": 3.5082, "theoretical_loss": 4.546464670063569, "tokens_seen": 143130624 }, { "epoch": 0.04, "learning_rate": 0.0004831394182547643, "loss": 3.5977, "theoretical_loss": 4.546190703386695, "tokens_seen": 143196160 }, { "epoch": 0.04, "learning_rate": 0.0004831293881644935, "loss": 3.4213, "theoretical_loss": 4.545916897155894, "tokens_seen": 143261696 }, { "epoch": 0.04, "learning_rate": 0.0004831193580742227, "loss": 3.5149, "theoretical_loss": 4.54564325120387, "tokens_seen": 143327232 }, { "epoch": 0.04, "learning_rate": 0.00048310932798395186, "loss": 3.6573, "theoretical_loss": 4.545369765363578, "tokens_seen": 143392768 }, { "epoch": 0.04, "learning_rate": 0.00048309929789368104, "loss": 3.4663, "theoretical_loss": 4.545096439468223, "tokens_seen": 143458304 }, { "epoch": 0.04, "learning_rate": 0.0004830892678034102, "loss": 3.5112, "theoretical_loss": 4.544823273351257, "tokens_seen": 143523840 }, { "epoch": 0.04, "learning_rate": 0.00048307923771313946, "loss": 3.4416, "theoretical_loss": 4.544550266846388, "tokens_seen": 143589376 }, { "epoch": 0.04, "learning_rate": 0.0004830692076228686, "loss": 3.4746, "theoretical_loss": 4.544277419787566, "tokens_seen": 143654912 }, { "epoch": 0.04, "learning_rate": 0.0004830591775325978, "loss": 3.5954, "theoretical_loss": 4.544004732008993, "tokens_seen": 143720448 }, { "epoch": 0.04, "learning_rate": 0.000483049147442327, "loss": 3.5367, "theoretical_loss": 4.543732203345119, "tokens_seen": 143785984 }, { "epoch": 0.04, "learning_rate": 0.0004830391173520562, "loss": 3.6476, "theoretical_loss": 4.543459833630639, "tokens_seen": 143851520 }, { "epoch": 0.04, "learning_rate": 0.00048302908726178536, "loss": 3.3349, "theoretical_loss": 4.543187622700497, "tokens_seen": 143917056 }, { "epoch": 0.04, "learning_rate": 0.00048301905717151454, "loss": 3.6606, "theoretical_loss": 4.542915570389884, "tokens_seen": 143982592 }, { "epoch": 0.04, "learning_rate": 0.0004830090270812437, "loss": 3.4543, "theoretical_loss": 4.542643676534234, "tokens_seen": 144048128 }, { "epoch": 0.04, "learning_rate": 0.00048299899699097296, "loss": 3.5477, "theoretical_loss": 4.542371940969231, "tokens_seen": 144113664 }, { "epoch": 0.04, "objective/train/docs_used": 259161, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1481668949127197, "objective/train/theoretical_loss": 4.542100363530799, "objective/train/tokens_used": 164639200, "theoretical_loss": 4.542100363530799, "tokens_seen": 144179200 }, { "epoch": 0.04, "learning_rate": 0.0004829889669007021, "loss": 3.4614, "theoretical_loss": 4.542100363530799, "tokens_seen": 144179200 }, { "epoch": 0.04, "learning_rate": 0.0004829789368104313, "loss": 3.5839, "theoretical_loss": 4.54182894405511, "tokens_seen": 144244736 }, { "epoch": 0.04, "learning_rate": 0.00048296890672016045, "loss": 3.4031, "theoretical_loss": 4.5415576823785795, "tokens_seen": 144310272 }, { "epoch": 0.04, "learning_rate": 0.0004829588766298897, "loss": 3.5012, "theoretical_loss": 4.541286578337866, "tokens_seen": 144375808 }, { "epoch": 0.04, "learning_rate": 0.00048294884653961887, "loss": 3.5403, "theoretical_loss": 4.541015631769872, "tokens_seen": 144441344 }, { "epoch": 0.04, "learning_rate": 0.00048293881644934805, "loss": 3.6976, "theoretical_loss": 4.5407448425117405, "tokens_seen": 144506880 }, { "epoch": 0.04, "learning_rate": 0.00048292878635907723, "loss": 3.5247, "theoretical_loss": 4.540474210400859, "tokens_seen": 144572416 }, { "epoch": 0.04, "learning_rate": 0.00048291875626880646, "loss": 3.6311, "theoretical_loss": 4.540203735274855, "tokens_seen": 144637952 }, { "epoch": 0.04, "learning_rate": 0.0004829087261785356, "loss": 3.5692, "theoretical_loss": 4.5399334169716, "tokens_seen": 144703488 }, { "epoch": 0.04, "learning_rate": 0.0004828986960882648, "loss": 3.6652, "theoretical_loss": 4.539663255329202, "tokens_seen": 144769024 }, { "epoch": 0.04, "learning_rate": 0.00048288866599799395, "loss": 3.4826, "theoretical_loss": 4.539393250186015, "tokens_seen": 144834560 }, { "epoch": 0.04, "learning_rate": 0.0004828786359077232, "loss": 3.373, "theoretical_loss": 4.539123401380625, "tokens_seen": 144900096 }, { "epoch": 0.04, "learning_rate": 0.00048286860581745237, "loss": 3.6064, "theoretical_loss": 4.538853708751866, "tokens_seen": 144965632 }, { "epoch": 0.04, "learning_rate": 0.00048285857572718155, "loss": 3.4889, "theoretical_loss": 4.538584172138804, "tokens_seen": 145031168 }, { "epoch": 0.04, "learning_rate": 0.00048284854563691073, "loss": 3.5176, "theoretical_loss": 4.538314791380748, "tokens_seen": 145096704 }, { "epoch": 0.04, "learning_rate": 0.0004828385155466399, "loss": 3.342, "theoretical_loss": 4.538045566317242, "tokens_seen": 145162240 }, { "epoch": 0.04, "learning_rate": 0.0004828284854563691, "loss": 3.4041, "theoretical_loss": 4.537776496788071, "tokens_seen": 145227776 }, { "epoch": 0.04, "learning_rate": 0.00048281845536609833, "loss": 3.656, "theoretical_loss": 4.537507582633253, "tokens_seen": 145293312 }, { "epoch": 0.04, "learning_rate": 0.00048280842527582746, "loss": 3.6063, "theoretical_loss": 4.537238823693045, "tokens_seen": 145358848 }, { "epoch": 0.04, "learning_rate": 0.0004827983951855567, "loss": 3.4952, "theoretical_loss": 4.536970219807939, "tokens_seen": 145424384 }, { "epoch": 0.04, "learning_rate": 0.0004827883650952858, "loss": 3.4596, "theoretical_loss": 4.536701770818665, "tokens_seen": 145489920 }, { "epoch": 0.04, "learning_rate": 0.00048277833500501505, "loss": 3.6225, "theoretical_loss": 4.536433476566185, "tokens_seen": 145555456 }, { "epoch": 0.04, "learning_rate": 0.00048276830491474423, "loss": 3.681, "theoretical_loss": 4.536165336891699, "tokens_seen": 145620992 }, { "epoch": 0.04, "learning_rate": 0.0004827582748244734, "loss": 3.427, "theoretical_loss": 4.535897351636638, "tokens_seen": 145686528 }, { "epoch": 0.04, "learning_rate": 0.0004827482447342026, "loss": 3.3131, "theoretical_loss": 4.53562952064267, "tokens_seen": 145752064 }, { "epoch": 0.04, "objective/train/docs_used": 261926, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7494444847106934, "objective/train/theoretical_loss": 4.535361843751696, "objective/train/tokens_used": 166277600, "theoretical_loss": 4.535361843751696, "tokens_seen": 145817600 }, { "epoch": 0.04, "learning_rate": 0.00048273821464393183, "loss": 3.5548, "theoretical_loss": 4.535361843751696, "tokens_seen": 145817600 }, { "epoch": 0.04, "learning_rate": 0.000482728184553661, "loss": 3.6512, "theoretical_loss": 4.535094320805847, "tokens_seen": 145883136 }, { "epoch": 0.04, "learning_rate": 0.0004827181544633902, "loss": 3.6332, "theoretical_loss": 4.534826951647489, "tokens_seen": 145948672 }, { "epoch": 0.04, "learning_rate": 0.0004827081243731194, "loss": 3.4451, "theoretical_loss": 4.5345597361192205, "tokens_seen": 146014208 }, { "epoch": 0.04, "learning_rate": 0.00048269809428284856, "loss": 3.4831, "theoretical_loss": 4.53429267406387, "tokens_seen": 146079744 }, { "epoch": 0.04, "learning_rate": 0.0004826880641925778, "loss": 3.6239, "theoretical_loss": 4.5340257653244995, "tokens_seen": 146145280 }, { "epoch": 0.04, "learning_rate": 0.0004826780341023069, "loss": 3.4369, "theoretical_loss": 4.5337590097444, "tokens_seen": 146210816 }, { "epoch": 0.04, "learning_rate": 0.00048266800401203615, "loss": 3.4717, "theoretical_loss": 4.533492407167093, "tokens_seen": 146276352 }, { "epoch": 0.04, "learning_rate": 0.0004826579739217653, "loss": 3.4534, "theoretical_loss": 4.53322595743633, "tokens_seen": 146341888 }, { "epoch": 0.04, "learning_rate": 0.0004826479438314945, "loss": 3.3769, "theoretical_loss": 4.5329596603960916, "tokens_seen": 146407424 }, { "epoch": 0.04, "learning_rate": 0.0004826379137412237, "loss": 3.6591, "theoretical_loss": 4.53269351589059, "tokens_seen": 146472960 }, { "epoch": 0.04, "learning_rate": 0.0004826278836509529, "loss": 3.2819, "theoretical_loss": 4.532427523764261, "tokens_seen": 146538496 }, { "epoch": 0.04, "learning_rate": 0.00048261785356068206, "loss": 3.6957, "theoretical_loss": 4.532161683861773, "tokens_seen": 146604032 }, { "epoch": 0.04, "learning_rate": 0.00048260782347041124, "loss": 3.6494, "theoretical_loss": 4.5318959960280205, "tokens_seen": 146669568 }, { "epoch": 0.04, "learning_rate": 0.0004825977933801404, "loss": 3.5571, "theoretical_loss": 4.531630460108125, "tokens_seen": 146735104 }, { "epoch": 0.04, "learning_rate": 0.00048258776328986966, "loss": 3.5653, "theoretical_loss": 4.531365075947434, "tokens_seen": 146800640 }, { "epoch": 0.04, "learning_rate": 0.0004825777331995988, "loss": 3.4759, "theoretical_loss": 4.531099843391524, "tokens_seen": 146866176 }, { "epoch": 0.04, "learning_rate": 0.000482567703109328, "loss": 3.5165, "theoretical_loss": 4.5308347622861955, "tokens_seen": 146931712 }, { "epoch": 0.04, "learning_rate": 0.0004825576730190572, "loss": 3.5252, "theoretical_loss": 4.5305698324774735, "tokens_seen": 146997248 }, { "epoch": 0.04, "learning_rate": 0.0004825476429287864, "loss": 3.4658, "theoretical_loss": 4.530305053811611, "tokens_seen": 147062784 }, { "epoch": 0.04, "learning_rate": 0.00048253761283851556, "loss": 3.4348, "theoretical_loss": 4.530040426135084, "tokens_seen": 147128320 }, { "epoch": 0.04, "learning_rate": 0.00048252758274824474, "loss": 3.6148, "theoretical_loss": 4.529775949294593, "tokens_seen": 147193856 }, { "epoch": 0.04, "learning_rate": 0.0004825175526579739, "loss": 3.4774, "theoretical_loss": 4.529511623137061, "tokens_seen": 147259392 }, { "epoch": 0.04, "learning_rate": 0.00048250752256770316, "loss": 3.6315, "theoretical_loss": 4.529247447509637, "tokens_seen": 147324928 }, { "epoch": 0.04, "learning_rate": 0.0004824974924774323, "loss": 3.4554, "theoretical_loss": 4.528983422259691, "tokens_seen": 147390464 }, { "epoch": 0.04, "objective/train/docs_used": 264882, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4082305431365967, "objective/train/theoretical_loss": 4.528719547234816, "objective/train/tokens_used": 167916000, "theoretical_loss": 4.528719547234816, "tokens_seen": 147456000 }, { "epoch": 0.04, "learning_rate": 0.0004824874623871615, "loss": 3.3936, "theoretical_loss": 4.528719547234816, "tokens_seen": 147456000 }, { "epoch": 0.04, "learning_rate": 0.00048247743229689065, "loss": 3.4728, "theoretical_loss": 4.528455822282828, "tokens_seen": 147521536 }, { "epoch": 0.04, "learning_rate": 0.0004824674022066199, "loss": 3.5183, "theoretical_loss": 4.528192247251763, "tokens_seen": 147587072 }, { "epoch": 0.04, "learning_rate": 0.00048245737211634907, "loss": 3.5562, "theoretical_loss": 4.52792882198988, "tokens_seen": 147652608 }, { "epoch": 0.04, "learning_rate": 0.00048244734202607825, "loss": 3.5825, "theoretical_loss": 4.527665546345656, "tokens_seen": 147718144 }, { "epoch": 0.04, "learning_rate": 0.00048243731193580743, "loss": 3.6563, "theoretical_loss": 4.5274024201677925, "tokens_seen": 147783680 }, { "epoch": 0.04, "learning_rate": 0.00048242728184553666, "loss": 3.5426, "theoretical_loss": 4.527139443305209, "tokens_seen": 147849216 }, { "epoch": 0.04, "learning_rate": 0.0004824172517552658, "loss": 3.5514, "theoretical_loss": 4.526876615607042, "tokens_seen": 147914752 }, { "epoch": 0.04, "learning_rate": 0.000482407221664995, "loss": 3.396, "theoretical_loss": 4.526613936922654, "tokens_seen": 147980288 }, { "epoch": 0.04, "learning_rate": 0.00048239719157472415, "loss": 3.4025, "theoretical_loss": 4.526351407101618, "tokens_seen": 148045824 }, { "epoch": 0.04, "learning_rate": 0.0004823871614844534, "loss": 3.5241, "theoretical_loss": 4.526089025993732, "tokens_seen": 148111360 }, { "epoch": 0.04, "learning_rate": 0.00048237713139418257, "loss": 3.5164, "theoretical_loss": 4.525826793449008, "tokens_seen": 148176896 }, { "epoch": 0.04, "learning_rate": 0.00048236710130391175, "loss": 3.5274, "theoretical_loss": 4.525564709317678, "tokens_seen": 148242432 }, { "epoch": 0.04, "learning_rate": 0.00048235707121364093, "loss": 3.5387, "theoretical_loss": 4.525302773450187, "tokens_seen": 148307968 }, { "epoch": 0.04, "learning_rate": 0.0004823470411233701, "loss": 3.4173, "theoretical_loss": 4.525040985697203, "tokens_seen": 148373504 }, { "epoch": 0.04, "learning_rate": 0.0004823370110330993, "loss": 3.4491, "theoretical_loss": 4.524779345909604, "tokens_seen": 148439040 }, { "epoch": 0.05, "learning_rate": 0.00048232698094282853, "loss": 3.1996, "theoretical_loss": 4.524517853938489, "tokens_seen": 148504576 }, { "epoch": 0.05, "learning_rate": 0.00048231695085255766, "loss": 3.4526, "theoretical_loss": 4.524256509635169, "tokens_seen": 148570112 }, { "epoch": 0.05, "learning_rate": 0.0004823069207622869, "loss": 3.5323, "theoretical_loss": 4.523995312851174, "tokens_seen": 148635648 }, { "epoch": 0.05, "learning_rate": 0.000482296890672016, "loss": 3.4914, "theoretical_loss": 4.523734263438241, "tokens_seen": 148701184 }, { "epoch": 0.05, "learning_rate": 0.00048228686058174525, "loss": 3.656, "theoretical_loss": 4.52347336124833, "tokens_seen": 148766720 }, { "epoch": 0.05, "learning_rate": 0.00048227683049147443, "loss": 3.6255, "theoretical_loss": 4.52321260613361, "tokens_seen": 148832256 }, { "epoch": 0.05, "learning_rate": 0.0004822668004012036, "loss": 3.4395, "theoretical_loss": 4.522951997946466, "tokens_seen": 148897792 }, { "epoch": 0.05, "learning_rate": 0.0004822567703109328, "loss": 3.5195, "theoretical_loss": 4.522691536539492, "tokens_seen": 148963328 }, { "epoch": 0.05, "learning_rate": 0.00048224674022066203, "loss": 3.5602, "theoretical_loss": 4.522431221765498, "tokens_seen": 149028864 }, { "epoch": 0.05, "objective/train/docs_used": 267653, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.507631778717041, "objective/train/theoretical_loss": 4.522171053477507, "objective/train/tokens_used": 169554400, "theoretical_loss": 4.522171053477507, "tokens_seen": 149094400 }, { "epoch": 0.05, "learning_rate": 0.00048223671013039116, "loss": 3.4595, "theoretical_loss": 4.522171053477507, "tokens_seen": 149094400 }, { "epoch": 0.05, "learning_rate": 0.0004822266800401204, "loss": 3.4166, "theoretical_loss": 4.5219110315287505, "tokens_seen": 149159936 }, { "epoch": 0.05, "learning_rate": 0.0004822166499498495, "loss": 3.5747, "theoretical_loss": 4.521651155772675, "tokens_seen": 149225472 }, { "epoch": 0.05, "learning_rate": 0.00048220661985957876, "loss": 3.4243, "theoretical_loss": 4.521391426062934, "tokens_seen": 149291008 }, { "epoch": 0.05, "learning_rate": 0.00048219658976930794, "loss": 3.4486, "theoretical_loss": 4.521131842253396, "tokens_seen": 149356544 }, { "epoch": 0.05, "learning_rate": 0.0004821865596790371, "loss": 3.5287, "theoretical_loss": 4.520872404198139, "tokens_seen": 149422080 }, { "epoch": 0.05, "learning_rate": 0.0004821765295887663, "loss": 3.5761, "theoretical_loss": 4.520613111751445, "tokens_seen": 149487616 }, { "epoch": 0.05, "learning_rate": 0.0004821664994984955, "loss": 3.4929, "theoretical_loss": 4.520353964767814, "tokens_seen": 149553152 }, { "epoch": 0.05, "learning_rate": 0.00048215646940822466, "loss": 3.4981, "theoretical_loss": 4.5200949631019505, "tokens_seen": 149618688 }, { "epoch": 0.05, "learning_rate": 0.0004821464393179539, "loss": 3.3487, "theoretical_loss": 4.519836106608768, "tokens_seen": 149684224 }, { "epoch": 0.05, "learning_rate": 0.000482136409227683, "loss": 3.5654, "theoretical_loss": 4.519577395143388, "tokens_seen": 149749760 }, { "epoch": 0.05, "learning_rate": 0.00048212637913741226, "loss": 3.5136, "theoretical_loss": 4.519318828561142, "tokens_seen": 149815296 }, { "epoch": 0.05, "learning_rate": 0.0004821163490471414, "loss": 3.4044, "theoretical_loss": 4.519060406717565, "tokens_seen": 149880832 }, { "epoch": 0.05, "learning_rate": 0.0004821063189568706, "loss": 3.4915, "theoretical_loss": 4.518802129468405, "tokens_seen": 149946368 }, { "epoch": 0.05, "learning_rate": 0.0004820962888665998, "loss": 3.5795, "theoretical_loss": 4.51854399666961, "tokens_seen": 150011904 }, { "epoch": 0.05, "learning_rate": 0.000482086258776329, "loss": 3.6472, "theoretical_loss": 4.518286008177341, "tokens_seen": 150077440 }, { "epoch": 0.05, "learning_rate": 0.00048207622868605816, "loss": 3.5436, "theoretical_loss": 4.51802816384796, "tokens_seen": 150142976 }, { "epoch": 0.05, "learning_rate": 0.0004820661985957874, "loss": 3.3985, "theoretical_loss": 4.517770463538038, "tokens_seen": 150208512 }, { "epoch": 0.05, "learning_rate": 0.00048205616850551653, "loss": 3.5173, "theoretical_loss": 4.517512907104347, "tokens_seen": 150274048 }, { "epoch": 0.05, "learning_rate": 0.00048204613841524576, "loss": 3.5958, "theoretical_loss": 4.517255494403868, "tokens_seen": 150339584 }, { "epoch": 0.05, "learning_rate": 0.0004820361083249749, "loss": 3.5705, "theoretical_loss": 4.516998225293785, "tokens_seen": 150405120 }, { "epoch": 0.05, "learning_rate": 0.0004820260782347041, "loss": 3.3924, "theoretical_loss": 4.516741099631485, "tokens_seen": 150470656 }, { "epoch": 0.05, "learning_rate": 0.0004820160481444333, "loss": 3.4862, "theoretical_loss": 4.51648411727456, "tokens_seen": 150536192 }, { "epoch": 0.05, "learning_rate": 0.0004820060180541625, "loss": 3.5485, "theoretical_loss": 4.5162272780808035, "tokens_seen": 150601728 }, { "epoch": 0.05, "learning_rate": 0.00048199598796389167, "loss": 3.4926, "theoretical_loss": 4.515970581908216, "tokens_seen": 150667264 }, { "epoch": 0.05, "objective/train/docs_used": 270469, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6817948818206787, "objective/train/theoretical_loss": 4.515714028614996, "objective/train/tokens_used": 171192800, "theoretical_loss": 4.515714028614996, "tokens_seen": 150732800 }, { "epoch": 0.05, "learning_rate": 0.00048198595787362085, "loss": 3.525, "theoretical_loss": 4.515714028614996, "tokens_seen": 150732800 }, { "epoch": 0.05, "learning_rate": 0.0004819759277833501, "loss": 3.5014, "theoretical_loss": 4.515457618059546, "tokens_seen": 150798336 }, { "epoch": 0.05, "learning_rate": 0.00048196589769307927, "loss": 3.5235, "theoretical_loss": 4.515201350100471, "tokens_seen": 150863872 }, { "epoch": 0.05, "learning_rate": 0.00048195586760280845, "loss": 3.219, "theoretical_loss": 4.514945224596577, "tokens_seen": 150929408 }, { "epoch": 0.05, "learning_rate": 0.00048194583751253763, "loss": 3.3252, "theoretical_loss": 4.5146892414068684, "tokens_seen": 150994944 }, { "epoch": 0.05, "learning_rate": 0.00048193580742226686, "loss": 3.3152, "theoretical_loss": 4.514433400390554, "tokens_seen": 151060480 }, { "epoch": 0.05, "learning_rate": 0.000481925777331996, "loss": 3.5686, "theoretical_loss": 4.514177701407042, "tokens_seen": 151126016 }, { "epoch": 0.05, "learning_rate": 0.0004819157472417252, "loss": 3.5379, "theoretical_loss": 4.51392214431594, "tokens_seen": 151191552 }, { "epoch": 0.05, "learning_rate": 0.00048190571715145435, "loss": 3.6698, "theoretical_loss": 4.513666728977054, "tokens_seen": 151257088 }, { "epoch": 0.05, "learning_rate": 0.0004818956870611836, "loss": 3.4031, "theoretical_loss": 4.51341145525039, "tokens_seen": 151322624 }, { "epoch": 0.05, "learning_rate": 0.00048188565697091277, "loss": 3.2406, "theoretical_loss": 4.513156322996155, "tokens_seen": 151388160 }, { "epoch": 0.05, "learning_rate": 0.00048187562688064195, "loss": 3.5046, "theoretical_loss": 4.512901332074751, "tokens_seen": 151453696 }, { "epoch": 0.05, "learning_rate": 0.00048186559679037113, "loss": 3.4518, "theoretical_loss": 4.5126464823467805, "tokens_seen": 151519232 }, { "epoch": 0.05, "learning_rate": 0.0004818555667001003, "loss": 3.451, "theoretical_loss": 4.512391773673042, "tokens_seen": 151584768 }, { "epoch": 0.05, "learning_rate": 0.0004818455366098295, "loss": 3.5618, "theoretical_loss": 4.5121372059145335, "tokens_seen": 151650304 }, { "epoch": 0.05, "learning_rate": 0.00048183550651955873, "loss": 3.5633, "theoretical_loss": 4.511882778932447, "tokens_seen": 151715840 }, { "epoch": 0.05, "learning_rate": 0.00048182547642928786, "loss": 3.5978, "theoretical_loss": 4.511628492588174, "tokens_seen": 151781376 }, { "epoch": 0.05, "learning_rate": 0.0004818154463390171, "loss": 3.5553, "theoretical_loss": 4.5113743467433, "tokens_seen": 151846912 }, { "epoch": 0.05, "learning_rate": 0.0004818054162487462, "loss": 3.603, "theoretical_loss": 4.511120341259608, "tokens_seen": 151912448 }, { "epoch": 0.05, "learning_rate": 0.00048179538615847545, "loss": 3.595, "theoretical_loss": 4.510866475999077, "tokens_seen": 151977984 }, { "epoch": 0.05, "learning_rate": 0.00048178535606820463, "loss": 3.4817, "theoretical_loss": 4.510612750823878, "tokens_seen": 152043520 }, { "epoch": 0.05, "learning_rate": 0.0004817753259779338, "loss": 3.6731, "theoretical_loss": 4.5103591655963795, "tokens_seen": 152109056 }, { "epoch": 0.05, "learning_rate": 0.000481765295887663, "loss": 3.4322, "theoretical_loss": 4.510105720179144, "tokens_seen": 152174592 }, { "epoch": 0.05, "learning_rate": 0.00048175526579739223, "loss": 3.6885, "theoretical_loss": 4.5098524144349295, "tokens_seen": 152240128 }, { "epoch": 0.05, "learning_rate": 0.00048174523570712136, "loss": 3.5445, "theoretical_loss": 4.509599248226683, "tokens_seen": 152305664 }, { "epoch": 0.05, "objective/train/docs_used": 273356, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.537525177001953, "objective/train/theoretical_loss": 4.509346221417552, "objective/train/tokens_used": 172831200, "theoretical_loss": 4.509346221417552, "tokens_seen": 152371200 }, { "epoch": 0.05, "learning_rate": 0.0004817352056168506, "loss": 3.4736, "theoretical_loss": 4.509346221417552, "tokens_seen": 152371200 }, { "epoch": 0.05, "learning_rate": 0.0004817251755265797, "loss": 3.6076, "theoretical_loss": 4.509093333870869, "tokens_seen": 152436736 }, { "epoch": 0.05, "learning_rate": 0.00048171514543630896, "loss": 3.5971, "theoretical_loss": 4.508840585450166, "tokens_seen": 152502272 }, { "epoch": 0.05, "learning_rate": 0.00048170511534603814, "loss": 3.3995, "theoretical_loss": 4.508587976019164, "tokens_seen": 152567808 }, { "epoch": 0.05, "learning_rate": 0.0004816950852557673, "loss": 3.4475, "theoretical_loss": 4.508335505441774, "tokens_seen": 152633344 }, { "epoch": 0.05, "learning_rate": 0.0004816850551654965, "loss": 3.4884, "theoretical_loss": 4.508083173582105, "tokens_seen": 152698880 }, { "epoch": 0.05, "learning_rate": 0.0004816750250752257, "loss": 3.5685, "theoretical_loss": 4.507830980304451, "tokens_seen": 152764416 }, { "epoch": 0.05, "learning_rate": 0.00048166499498495486, "loss": 3.4457, "theoretical_loss": 4.5075789254733, "tokens_seen": 152829952 }, { "epoch": 0.05, "learning_rate": 0.0004816549648946841, "loss": 3.6273, "theoretical_loss": 4.507327008953329, "tokens_seen": 152895488 }, { "epoch": 0.05, "learning_rate": 0.0004816449348044132, "loss": 3.351, "theoretical_loss": 4.507075230609407, "tokens_seen": 152961024 }, { "epoch": 0.05, "learning_rate": 0.00048163490471414246, "loss": 3.4969, "theoretical_loss": 4.506823590306591, "tokens_seen": 153026560 }, { "epoch": 0.05, "learning_rate": 0.0004816248746238716, "loss": 3.5263, "theoretical_loss": 4.506572087910127, "tokens_seen": 153092096 }, { "epoch": 0.05, "learning_rate": 0.0004816148445336008, "loss": 3.5666, "theoretical_loss": 4.506320723285455, "tokens_seen": 153157632 }, { "epoch": 0.05, "learning_rate": 0.00048160481444333, "loss": 3.5577, "theoretical_loss": 4.506069496298198, "tokens_seen": 153223168 }, { "epoch": 0.05, "learning_rate": 0.0004815947843530592, "loss": 3.6173, "theoretical_loss": 4.5058184068141705, "tokens_seen": 153288704 }, { "epoch": 0.05, "learning_rate": 0.00048158475426278837, "loss": 3.5079, "theoretical_loss": 4.505567454699373, "tokens_seen": 153354240 }, { "epoch": 0.05, "learning_rate": 0.0004815747241725176, "loss": 3.581, "theoretical_loss": 4.505316639819997, "tokens_seen": 153419776 }, { "epoch": 0.05, "learning_rate": 0.00048156469408224673, "loss": 3.552, "theoretical_loss": 4.505065962042418, "tokens_seen": 153485312 }, { "epoch": 0.05, "learning_rate": 0.00048155466399197596, "loss": 3.5518, "theoretical_loss": 4.504815421233202, "tokens_seen": 153550848 }, { "epoch": 0.05, "learning_rate": 0.0004815446339017051, "loss": 3.6896, "theoretical_loss": 4.504565017259097, "tokens_seen": 153616384 }, { "epoch": 0.05, "learning_rate": 0.0004815346038114343, "loss": 3.5187, "theoretical_loss": 4.504314749987044, "tokens_seen": 153681920 }, { "epoch": 0.05, "learning_rate": 0.0004815245737211635, "loss": 3.6987, "theoretical_loss": 4.504064619284163, "tokens_seen": 153747456 }, { "epoch": 0.05, "learning_rate": 0.0004815145436308927, "loss": 3.524, "theoretical_loss": 4.503814625017766, "tokens_seen": 153812992 }, { "epoch": 0.05, "learning_rate": 0.00048150451354062187, "loss": 3.5269, "theoretical_loss": 4.5035647670553445, "tokens_seen": 153878528 }, { "epoch": 0.05, "learning_rate": 0.00048149448345035105, "loss": 3.5689, "theoretical_loss": 4.503315045264581, "tokens_seen": 153944064 }, { "epoch": 0.05, "objective/train/docs_used": 275667, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.527536630630493, "objective/train/theoretical_loss": 4.503065459513339, "objective/train/tokens_used": 174469600, "theoretical_loss": 4.503065459513339, "tokens_seen": 154009600 }, { "epoch": 0.05, "learning_rate": 0.00048148445336008023, "loss": 3.5969, "theoretical_loss": 4.503065459513339, "tokens_seen": 154009600 }, { "epoch": 0.05, "learning_rate": 0.00048147442326980947, "loss": 3.4633, "theoretical_loss": 4.502816009669665, "tokens_seen": 154075136 }, { "epoch": 0.05, "learning_rate": 0.0004814643931795386, "loss": 3.6145, "theoretical_loss": 4.502566695601795, "tokens_seen": 154140672 }, { "epoch": 0.05, "learning_rate": 0.00048145436308926783, "loss": 3.5786, "theoretical_loss": 4.502317517178142, "tokens_seen": 154206208 }, { "epoch": 0.05, "learning_rate": 0.00048144433299899696, "loss": 3.4865, "theoretical_loss": 4.502068474267309, "tokens_seen": 154271744 }, { "epoch": 0.05, "learning_rate": 0.0004814343029087262, "loss": 3.3871, "theoretical_loss": 4.501819566738076, "tokens_seen": 154337280 }, { "epoch": 0.05, "learning_rate": 0.00048142427281845537, "loss": 3.4128, "theoretical_loss": 4.501570794459411, "tokens_seen": 154402816 }, { "epoch": 0.05, "learning_rate": 0.00048141424272818455, "loss": 3.5619, "theoretical_loss": 4.501322157300461, "tokens_seen": 154468352 }, { "epoch": 0.05, "learning_rate": 0.00048140421263791373, "loss": 3.6098, "theoretical_loss": 4.501073655130554, "tokens_seen": 154533888 }, { "epoch": 0.05, "learning_rate": 0.00048139418254764297, "loss": 3.4412, "theoretical_loss": 4.500825287819205, "tokens_seen": 154599424 }, { "epoch": 0.05, "learning_rate": 0.0004813841524573721, "loss": 3.5811, "theoretical_loss": 4.500577055236104, "tokens_seen": 154664960 }, { "epoch": 0.05, "learning_rate": 0.00048137412236710133, "loss": 3.5355, "theoretical_loss": 4.500328957251128, "tokens_seen": 154730496 }, { "epoch": 0.05, "learning_rate": 0.00048136409227683046, "loss": 3.5202, "theoretical_loss": 4.500080993734329, "tokens_seen": 154796032 }, { "epoch": 0.05, "learning_rate": 0.0004813540621865597, "loss": 3.5863, "theoretical_loss": 4.499833164555944, "tokens_seen": 154861568 }, { "epoch": 0.05, "learning_rate": 0.0004813440320962889, "loss": 3.5478, "theoretical_loss": 4.499585469586387, "tokens_seen": 154927104 }, { "epoch": 0.05, "learning_rate": 0.00048133400200601806, "loss": 3.3553, "theoretical_loss": 4.499337908696255, "tokens_seen": 154992640 }, { "epoch": 0.05, "learning_rate": 0.00048132397191574724, "loss": 3.5104, "theoretical_loss": 4.499090481756321, "tokens_seen": 155058176 }, { "epoch": 0.05, "learning_rate": 0.0004813139418254764, "loss": 3.5378, "theoretical_loss": 4.498843188637538, "tokens_seen": 155123712 }, { "epoch": 0.05, "learning_rate": 0.0004813039117352056, "loss": 3.5246, "theoretical_loss": 4.498596029211041, "tokens_seen": 155189248 }, { "epoch": 0.05, "learning_rate": 0.00048129388164493483, "loss": 3.472, "theoretical_loss": 4.498349003348137, "tokens_seen": 155254784 }, { "epoch": 0.05, "learning_rate": 0.00048128385155466396, "loss": 3.4848, "theoretical_loss": 4.4981021109203185, "tokens_seen": 155320320 }, { "epoch": 0.05, "learning_rate": 0.0004812738214643932, "loss": 3.5309, "theoretical_loss": 4.49785535179925, "tokens_seen": 155385856 }, { "epoch": 0.05, "learning_rate": 0.0004812637913741223, "loss": 3.3388, "theoretical_loss": 4.497608725856776, "tokens_seen": 155451392 }, { "epoch": 0.05, "learning_rate": 0.00048125376128385156, "loss": 3.3656, "theoretical_loss": 4.497362232964919, "tokens_seen": 155516928 }, { "epoch": 0.05, "learning_rate": 0.00048124373119358074, "loss": 3.3279, "theoretical_loss": 4.497115872995876, "tokens_seen": 155582464 }, { "epoch": 0.05, "objective/train/docs_used": 278643, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.639112710952759, "objective/train/theoretical_loss": 4.496869645822022, "objective/train/tokens_used": 176108000, "theoretical_loss": 4.496869645822022, "tokens_seen": 155648000 }, { "epoch": 0.05, "learning_rate": 0.0004812337011033099, "loss": 3.4811, "theoretical_loss": 4.496869645822022, "tokens_seen": 155648000 }, { "epoch": 0.05, "learning_rate": 0.00048122367101303916, "loss": 3.5257, "theoretical_loss": 4.496623551315908, "tokens_seen": 155713536 }, { "epoch": 0.05, "learning_rate": 0.00048121364092276834, "loss": 3.5039, "theoretical_loss": 4.496377589350261, "tokens_seen": 155779072 }, { "epoch": 0.05, "learning_rate": 0.0004812036108324975, "loss": 3.3332, "theoretical_loss": 4.496131759797984, "tokens_seen": 155844608 }, { "epoch": 0.05, "learning_rate": 0.0004811935807422267, "loss": 3.4615, "theoretical_loss": 4.495886062532153, "tokens_seen": 155910144 }, { "epoch": 0.05, "learning_rate": 0.0004811835506519559, "loss": 3.3883, "theoretical_loss": 4.495640497426023, "tokens_seen": 155975680 }, { "epoch": 0.05, "learning_rate": 0.00048117352056168506, "loss": 3.4164, "theoretical_loss": 4.495395064353019, "tokens_seen": 156041216 }, { "epoch": 0.05, "learning_rate": 0.0004811634904714143, "loss": 3.4754, "theoretical_loss": 4.4951497631867445, "tokens_seen": 156106752 }, { "epoch": 0.05, "learning_rate": 0.0004811534603811434, "loss": 3.5938, "theoretical_loss": 4.494904593800973, "tokens_seen": 156172288 }, { "epoch": 0.05, "learning_rate": 0.00048114343029087266, "loss": 3.5742, "theoretical_loss": 4.4946595560696565, "tokens_seen": 156237824 }, { "epoch": 0.05, "learning_rate": 0.0004811334002006018, "loss": 3.3737, "theoretical_loss": 4.494414649866915, "tokens_seen": 156303360 }, { "epoch": 0.05, "learning_rate": 0.000481123370110331, "loss": 3.446, "theoretical_loss": 4.494169875067046, "tokens_seen": 156368896 }, { "epoch": 0.05, "learning_rate": 0.0004811133400200602, "loss": 3.609, "theoretical_loss": 4.493925231544516, "tokens_seen": 156434432 }, { "epoch": 0.05, "learning_rate": 0.0004811033099297894, "loss": 3.4418, "theoretical_loss": 4.493680719173968, "tokens_seen": 156499968 }, { "epoch": 0.05, "learning_rate": 0.00048109327983951857, "loss": 3.6007, "theoretical_loss": 4.4934363378302145, "tokens_seen": 156565504 }, { "epoch": 0.05, "learning_rate": 0.0004810832497492478, "loss": 3.5822, "theoretical_loss": 4.493192087388239, "tokens_seen": 156631040 }, { "epoch": 0.05, "learning_rate": 0.00048107321965897693, "loss": 3.3224, "theoretical_loss": 4.4929479677232, "tokens_seen": 156696576 }, { "epoch": 0.05, "learning_rate": 0.00048106318956870616, "loss": 3.5168, "theoretical_loss": 4.4927039787104235, "tokens_seen": 156762112 }, { "epoch": 0.05, "learning_rate": 0.0004810531594784353, "loss": 3.3339, "theoretical_loss": 4.4924601202254095, "tokens_seen": 156827648 }, { "epoch": 0.05, "learning_rate": 0.0004810431293881645, "loss": 3.4399, "theoretical_loss": 4.492216392143826, "tokens_seen": 156893184 }, { "epoch": 0.05, "learning_rate": 0.0004810330992978937, "loss": 3.5788, "theoretical_loss": 4.491972794341514, "tokens_seen": 156958720 }, { "epoch": 0.05, "learning_rate": 0.0004810230692076229, "loss": 3.6369, "theoretical_loss": 4.49172932669448, "tokens_seen": 157024256 }, { "epoch": 0.05, "learning_rate": 0.00048101303911735207, "loss": 3.4469, "theoretical_loss": 4.491485989078906, "tokens_seen": 157089792 }, { "epoch": 0.05, "learning_rate": 0.00048100300902708125, "loss": 3.5772, "theoretical_loss": 4.491242781371138, "tokens_seen": 157155328 }, { "epoch": 0.05, "learning_rate": 0.00048099297893681043, "loss": 3.534, "theoretical_loss": 4.490999703447697, "tokens_seen": 157220864 }, { "epoch": 0.05, "objective/train/docs_used": 281475, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.716078519821167, "objective/train/theoretical_loss": 4.4907567551852665, "objective/train/tokens_used": 177746400, "theoretical_loss": 4.4907567551852665, "tokens_seen": 157286400 }, { "epoch": 0.05, "learning_rate": 0.00048098294884653967, "loss": 3.5845, "theoretical_loss": 4.4907567551852665, "tokens_seen": 157286400 }, { "epoch": 0.05, "learning_rate": 0.0004809729187562688, "loss": 3.4622, "theoretical_loss": 4.490513936460702, "tokens_seen": 157351936 }, { "epoch": 0.05, "learning_rate": 0.00048096288866599803, "loss": 3.5452, "theoretical_loss": 4.490271247151027, "tokens_seen": 157417472 }, { "epoch": 0.05, "learning_rate": 0.00048095285857572716, "loss": 3.5515, "theoretical_loss": 4.490028687133432, "tokens_seen": 157483008 }, { "epoch": 0.05, "learning_rate": 0.0004809428284854564, "loss": 3.5545, "theoretical_loss": 4.489786256285276, "tokens_seen": 157548544 }, { "epoch": 0.05, "learning_rate": 0.00048093279839518557, "loss": 3.5207, "theoretical_loss": 4.489543954484084, "tokens_seen": 157614080 }, { "epoch": 0.05, "learning_rate": 0.00048092276830491475, "loss": 3.5213, "theoretical_loss": 4.489301781607551, "tokens_seen": 157679616 }, { "epoch": 0.05, "learning_rate": 0.00048091273821464393, "loss": 3.7733, "theoretical_loss": 4.489059737533534, "tokens_seen": 157745152 }, { "epoch": 0.05, "learning_rate": 0.00048090270812437317, "loss": 3.3295, "theoretical_loss": 4.48881782214006, "tokens_seen": 157810688 }, { "epoch": 0.05, "learning_rate": 0.0004808926780341023, "loss": 3.5712, "theoretical_loss": 4.48857603530532, "tokens_seen": 157876224 }, { "epoch": 0.05, "learning_rate": 0.00048088264794383153, "loss": 3.5069, "theoretical_loss": 4.488334376907673, "tokens_seen": 157941760 }, { "epoch": 0.05, "learning_rate": 0.00048087261785356066, "loss": 3.4878, "theoretical_loss": 4.4880928468256425, "tokens_seen": 158007296 }, { "epoch": 0.05, "learning_rate": 0.0004808625877632899, "loss": 3.3649, "theoretical_loss": 4.487851444937916, "tokens_seen": 158072832 }, { "epoch": 0.05, "learning_rate": 0.0004808525576730191, "loss": 3.6218, "theoretical_loss": 4.487610171123347, "tokens_seen": 158138368 }, { "epoch": 0.05, "learning_rate": 0.00048084252758274826, "loss": 3.4751, "theoretical_loss": 4.487369025260954, "tokens_seen": 158203904 }, { "epoch": 0.05, "learning_rate": 0.00048083249749247744, "loss": 3.5188, "theoretical_loss": 4.48712800722992, "tokens_seen": 158269440 }, { "epoch": 0.05, "learning_rate": 0.0004808224674022066, "loss": 3.4831, "theoretical_loss": 4.48688711690959, "tokens_seen": 158334976 }, { "epoch": 0.05, "learning_rate": 0.0004808124373119358, "loss": 3.5493, "theoretical_loss": 4.486646354179475, "tokens_seen": 158400512 }, { "epoch": 0.05, "learning_rate": 0.00048080240722166503, "loss": 3.4127, "theoretical_loss": 4.48640571891925, "tokens_seen": 158466048 }, { "epoch": 0.05, "learning_rate": 0.00048079237713139416, "loss": 3.5111, "theoretical_loss": 4.48616521100875, "tokens_seen": 158531584 }, { "epoch": 0.05, "learning_rate": 0.0004807823470411234, "loss": 3.5255, "theoretical_loss": 4.485924830327974, "tokens_seen": 158597120 }, { "epoch": 0.05, "learning_rate": 0.0004807723169508525, "loss": 3.7461, "theoretical_loss": 4.485684576757087, "tokens_seen": 158662656 }, { "epoch": 0.05, "learning_rate": 0.00048076228686058176, "loss": 3.3743, "theoretical_loss": 4.485444450176413, "tokens_seen": 158728192 }, { "epoch": 0.05, "learning_rate": 0.00048075225677031094, "loss": 3.584, "theoretical_loss": 4.485204450466437, "tokens_seen": 158793728 }, { "epoch": 0.05, "learning_rate": 0.0004807422266800401, "loss": 3.5131, "theoretical_loss": 4.484964577507808, "tokens_seen": 158859264 }, { "epoch": 0.05, "objective/train/docs_used": 282978, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.556464672088623, "objective/train/theoretical_loss": 4.484724831181337, "objective/train/tokens_used": 179384800, "theoretical_loss": 4.484724831181337, "tokens_seen": 158924800 }, { "epoch": 0.05, "learning_rate": 0.0004807321965897693, "loss": 3.6099, "theoretical_loss": 4.484724831181337, "tokens_seen": 158924800 }, { "epoch": 0.05, "learning_rate": 0.00048072216649949854, "loss": 3.3585, "theoretical_loss": 4.4844852113679945, "tokens_seen": 158990336 }, { "epoch": 0.05, "learning_rate": 0.00048071213640922766, "loss": 3.4415, "theoretical_loss": 4.484245717948913, "tokens_seen": 159055872 }, { "epoch": 0.05, "learning_rate": 0.0004807021063189569, "loss": 3.5175, "theoretical_loss": 4.484006350805385, "tokens_seen": 159121408 }, { "epoch": 0.05, "learning_rate": 0.000480692076228686, "loss": 3.5754, "theoretical_loss": 4.483767109818862, "tokens_seen": 159186944 }, { "epoch": 0.05, "learning_rate": 0.00048068204613841526, "loss": 3.4372, "theoretical_loss": 4.483527994870958, "tokens_seen": 159252480 }, { "epoch": 0.05, "learning_rate": 0.00048067201604814444, "loss": 3.4834, "theoretical_loss": 4.483289005843445, "tokens_seen": 159318016 }, { "epoch": 0.05, "learning_rate": 0.0004806619859578736, "loss": 3.6163, "theoretical_loss": 4.483050142618255, "tokens_seen": 159383552 }, { "epoch": 0.05, "learning_rate": 0.0004806519558676028, "loss": 3.3963, "theoretical_loss": 4.482811405077482, "tokens_seen": 159449088 }, { "epoch": 0.05, "learning_rate": 0.000480641925777332, "loss": 3.3799, "theoretical_loss": 4.482572793103373, "tokens_seen": 159514624 }, { "epoch": 0.05, "learning_rate": 0.00048063189568706117, "loss": 3.4791, "theoretical_loss": 4.482334306578339, "tokens_seen": 159580160 }, { "epoch": 0.05, "learning_rate": 0.0004806218655967904, "loss": 3.4591, "theoretical_loss": 4.482095945384946, "tokens_seen": 159645696 }, { "epoch": 0.05, "learning_rate": 0.00048061183550651953, "loss": 3.6484, "theoretical_loss": 4.481857709405919, "tokens_seen": 159711232 }, { "epoch": 0.05, "learning_rate": 0.00048060180541624877, "loss": 3.6014, "theoretical_loss": 4.4816195985241425, "tokens_seen": 159776768 }, { "epoch": 0.05, "learning_rate": 0.00048059177532597795, "loss": 3.4958, "theoretical_loss": 4.481381612622657, "tokens_seen": 159842304 }, { "epoch": 0.05, "learning_rate": 0.00048058174523570713, "loss": 3.4882, "theoretical_loss": 4.481143751584659, "tokens_seen": 159907840 }, { "epoch": 0.05, "learning_rate": 0.0004805717151454363, "loss": 3.5333, "theoretical_loss": 4.480906015293505, "tokens_seen": 159973376 }, { "epoch": 0.05, "learning_rate": 0.0004805616850551655, "loss": 3.3813, "theoretical_loss": 4.480668403632706, "tokens_seen": 160038912 }, { "epoch": 0.05, "learning_rate": 0.00048055165496489467, "loss": 3.5336, "theoretical_loss": 4.480430916485929, "tokens_seen": 160104448 }, { "epoch": 0.05, "learning_rate": 0.0004805416248746239, "loss": 3.4503, "theoretical_loss": 4.480193553736999, "tokens_seen": 160169984 }, { "epoch": 0.05, "learning_rate": 0.00048053159478435303, "loss": 3.343, "theoretical_loss": 4.479956315269897, "tokens_seen": 160235520 }, { "epoch": 0.05, "learning_rate": 0.00048052156469408227, "loss": 3.6896, "theoretical_loss": 4.479719200968757, "tokens_seen": 160301056 }, { "epoch": 0.05, "learning_rate": 0.0004805115346038114, "loss": 3.4818, "theoretical_loss": 4.479482210717871, "tokens_seen": 160366592 }, { "epoch": 0.05, "learning_rate": 0.00048050150451354063, "loss": 3.4458, "theoretical_loss": 4.479245344401685, "tokens_seen": 160432128 }, { "epoch": 0.05, "learning_rate": 0.0004804914744232698, "loss": 3.5544, "theoretical_loss": 4.479008601904798, "tokens_seen": 160497664 }, { "epoch": 0.05, "objective/train/docs_used": 286119, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5713136196136475, "objective/train/theoretical_loss": 4.478771983111967, "objective/train/tokens_used": 181023200, "theoretical_loss": 4.478771983111967, "tokens_seen": 160563200 }, { "epoch": 0.05, "learning_rate": 0.000480481444332999, "loss": 3.6139, "theoretical_loss": 4.478771983111967, "tokens_seen": 160563200 }, { "epoch": 0.05, "learning_rate": 0.00048047141424272823, "loss": 3.4249, "theoretical_loss": 4.478535487908101, "tokens_seen": 160628736 }, { "epoch": 0.05, "learning_rate": 0.00048046138415245736, "loss": 3.4692, "theoretical_loss": 4.478299116178265, "tokens_seen": 160694272 }, { "epoch": 0.05, "learning_rate": 0.0004804513540621866, "loss": 3.5089, "theoretical_loss": 4.478062867807674, "tokens_seen": 160759808 }, { "epoch": 0.05, "learning_rate": 0.00048044132397191577, "loss": 3.4447, "theoretical_loss": 4.4778267426817, "tokens_seen": 160825344 }, { "epoch": 0.05, "learning_rate": 0.00048043129388164495, "loss": 3.4133, "theoretical_loss": 4.477590740685867, "tokens_seen": 160890880 }, { "epoch": 0.05, "learning_rate": 0.00048042126379137413, "loss": 3.5118, "theoretical_loss": 4.47735486170585, "tokens_seen": 160956416 }, { "epoch": 0.05, "learning_rate": 0.00048041123370110337, "loss": 3.5597, "theoretical_loss": 4.47711910562748, "tokens_seen": 161021952 }, { "epoch": 0.05, "learning_rate": 0.0004804012036108325, "loss": 3.4674, "theoretical_loss": 4.4768834723367394, "tokens_seen": 161087488 }, { "epoch": 0.05, "learning_rate": 0.00048039117352056173, "loss": 3.5716, "theoretical_loss": 4.4766479617197605, "tokens_seen": 161153024 }, { "epoch": 0.05, "learning_rate": 0.00048038114343029086, "loss": 3.647, "theoretical_loss": 4.476412573662829, "tokens_seen": 161218560 }, { "epoch": 0.05, "learning_rate": 0.0004803711133400201, "loss": 3.5136, "theoretical_loss": 4.4761773080523835, "tokens_seen": 161284096 }, { "epoch": 0.05, "learning_rate": 0.0004803610832497493, "loss": 3.4, "theoretical_loss": 4.475942164775013, "tokens_seen": 161349632 }, { "epoch": 0.05, "learning_rate": 0.00048035105315947846, "loss": 3.3939, "theoretical_loss": 4.475707143717455, "tokens_seen": 161415168 }, { "epoch": 0.05, "learning_rate": 0.00048034102306920764, "loss": 3.4068, "theoretical_loss": 4.475472244766601, "tokens_seen": 161480704 }, { "epoch": 0.05, "learning_rate": 0.0004803309929789368, "loss": 3.5167, "theoretical_loss": 4.475237467809492, "tokens_seen": 161546240 }, { "epoch": 0.05, "learning_rate": 0.000480320962888666, "loss": 3.4987, "theoretical_loss": 4.47500281273332, "tokens_seen": 161611776 }, { "epoch": 0.05, "learning_rate": 0.00048031093279839523, "loss": 3.3375, "theoretical_loss": 4.474768279425424, "tokens_seen": 161677312 }, { "epoch": 0.05, "learning_rate": 0.00048030090270812436, "loss": 3.3941, "theoretical_loss": 4.474533867773299, "tokens_seen": 161742848 }, { "epoch": 0.05, "learning_rate": 0.0004802908726178536, "loss": 3.6084, "theoretical_loss": 4.474299577664581, "tokens_seen": 161808384 }, { "epoch": 0.05, "learning_rate": 0.0004802808425275827, "loss": 3.4215, "theoretical_loss": 4.474065408987063, "tokens_seen": 161873920 }, { "epoch": 0.05, "learning_rate": 0.00048027081243731196, "loss": 3.4874, "theoretical_loss": 4.473831361628682, "tokens_seen": 161939456 }, { "epoch": 0.05, "learning_rate": 0.00048026078234704114, "loss": 3.4402, "theoretical_loss": 4.473597435477526, "tokens_seen": 162004992 }, { "epoch": 0.05, "learning_rate": 0.0004802507522567703, "loss": 3.5673, "theoretical_loss": 4.473363630421831, "tokens_seen": 162070528 }, { "epoch": 0.05, "learning_rate": 0.0004802407221664995, "loss": 3.5023, "theoretical_loss": 4.473129946349982, "tokens_seen": 162136064 }, { "epoch": 0.05, "objective/train/docs_used": 288885, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4012951850891113, "objective/train/theoretical_loss": 4.472896383150508, "objective/train/tokens_used": 182661600, "theoretical_loss": 4.472896383150508, "tokens_seen": 162201600 }, { "epoch": 0.05, "learning_rate": 0.00048023069207622874, "loss": 3.4786, "theoretical_loss": 4.472896383150508, "tokens_seen": 162201600 }, { "epoch": 0.05, "learning_rate": 0.00048022066198595786, "loss": 3.4887, "theoretical_loss": 4.472662940712091, "tokens_seen": 162267136 }, { "epoch": 0.05, "learning_rate": 0.0004802106318956871, "loss": 3.443, "theoretical_loss": 4.472429618923558, "tokens_seen": 162332672 }, { "epoch": 0.05, "learning_rate": 0.00048020060180541623, "loss": 3.2582, "theoretical_loss": 4.472196417673883, "tokens_seen": 162398208 }, { "epoch": 0.05, "learning_rate": 0.00048019057171514546, "loss": 3.4676, "theoretical_loss": 4.471963336852187, "tokens_seen": 162463744 }, { "epoch": 0.05, "learning_rate": 0.00048018054162487464, "loss": 3.5424, "theoretical_loss": 4.471730376347738, "tokens_seen": 162529280 }, { "epoch": 0.05, "learning_rate": 0.0004801705115346038, "loss": 3.4035, "theoretical_loss": 4.4714975360499505, "tokens_seen": 162594816 }, { "epoch": 0.05, "learning_rate": 0.000480160481444333, "loss": 3.6257, "theoretical_loss": 4.471264815848384, "tokens_seen": 162660352 }, { "epoch": 0.05, "learning_rate": 0.0004801504513540622, "loss": 3.4879, "theoretical_loss": 4.471032215632746, "tokens_seen": 162725888 }, { "epoch": 0.05, "learning_rate": 0.00048014042126379137, "loss": 3.5214, "theoretical_loss": 4.470799735292889, "tokens_seen": 162791424 }, { "epoch": 0.05, "learning_rate": 0.0004801303911735206, "loss": 3.6083, "theoretical_loss": 4.470567374718808, "tokens_seen": 162856960 }, { "epoch": 0.05, "learning_rate": 0.00048012036108324973, "loss": 3.4556, "theoretical_loss": 4.470335133800649, "tokens_seen": 162922496 }, { "epoch": 0.05, "learning_rate": 0.00048011033099297897, "loss": 3.5857, "theoretical_loss": 4.470103012428696, "tokens_seen": 162988032 }, { "epoch": 0.05, "learning_rate": 0.00048010030090270815, "loss": 3.5754, "theoretical_loss": 4.469871010493383, "tokens_seen": 163053568 }, { "epoch": 0.05, "learning_rate": 0.00048009027081243733, "loss": 3.344, "theoretical_loss": 4.469639127885287, "tokens_seen": 163119104 }, { "epoch": 0.05, "learning_rate": 0.0004800802407221665, "loss": 3.5455, "theoretical_loss": 4.4694073644951295, "tokens_seen": 163184640 }, { "epoch": 0.05, "learning_rate": 0.0004800702106318957, "loss": 3.4763, "theoretical_loss": 4.469175720213771, "tokens_seen": 163250176 }, { "epoch": 0.05, "learning_rate": 0.00048006018054162487, "loss": 3.5228, "theoretical_loss": 4.468944194932225, "tokens_seen": 163315712 }, { "epoch": 0.05, "learning_rate": 0.0004800501504513541, "loss": 3.4963, "theoretical_loss": 4.468712788541639, "tokens_seen": 163381248 }, { "epoch": 0.05, "learning_rate": 0.00048004012036108323, "loss": 3.3741, "theoretical_loss": 4.46848150093331, "tokens_seen": 163446784 }, { "epoch": 0.05, "learning_rate": 0.00048003009027081247, "loss": 3.5376, "theoretical_loss": 4.468250331998676, "tokens_seen": 163512320 }, { "epoch": 0.05, "learning_rate": 0.0004800200601805416, "loss": 3.4213, "theoretical_loss": 4.468019281629316, "tokens_seen": 163577856 }, { "epoch": 0.05, "learning_rate": 0.00048001003009027083, "loss": 3.3944, "theoretical_loss": 4.467788349716955, "tokens_seen": 163643392 }, { "epoch": 0.05, "learning_rate": 0.00048, "loss": 3.5617, "theoretical_loss": 4.467557536153457, "tokens_seen": 163708928 }, { "epoch": 0.05, "learning_rate": 0.0004799899699097292, "loss": 3.4445, "theoretical_loss": 4.467326840830829, "tokens_seen": 163774464 }, { "debugging/Self-BLEU-5": 0.6166572850713502, "debugging/distinct-1-grams": 0.7807065688056567, "debugging/distinct-2-grams": 0.9605986951783266, "debugging/entropy-1-grams": 6.462556435978332, "debugging/entropy-2-grams": 7.641264963293784, "debugging/length": 524.741935483871, "debugging/num_segments": 31, "epoch": 0.05, "objective/train/docs_used": 291020, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2950470447540283, "objective/train/theoretical_loss": 4.467096263641219, "objective/train/tokens_used": 184300000, "theoretical_loss": 4.467096263641219, "tokens_seen": 163840000 }, { "epoch": 0.05, "learning_rate": 0.0004799799398194584, "loss": 3.141, "theoretical_loss": 4.467096263641219, "tokens_seen": 163840000 }, { "epoch": 0.05, "learning_rate": 0.00047996990972918756, "loss": 3.5036, "theoretical_loss": 4.466865804476919, "tokens_seen": 163905536 }, { "epoch": 0.05, "learning_rate": 0.00047995987963891674, "loss": 3.5242, "theoretical_loss": 4.466635463230359, "tokens_seen": 163971072 }, { "epoch": 0.05, "learning_rate": 0.00047994984954864597, "loss": 3.4027, "theoretical_loss": 4.466405239794113, "tokens_seen": 164036608 }, { "epoch": 0.05, "learning_rate": 0.0004799398194583751, "loss": 3.5311, "theoretical_loss": 4.466175134060894, "tokens_seen": 164102144 }, { "epoch": 0.05, "learning_rate": 0.00047992978936810433, "loss": 3.5162, "theoretical_loss": 4.465945145923554, "tokens_seen": 164167680 }, { "epoch": 0.05, "learning_rate": 0.0004799197592778335, "loss": 3.2934, "theoretical_loss": 4.4657152752750875, "tokens_seen": 164233216 }, { "epoch": 0.05, "learning_rate": 0.0004799097291875627, "loss": 3.2895, "theoretical_loss": 4.465485522008629, "tokens_seen": 164298752 }, { "epoch": 0.05, "learning_rate": 0.0004798996990972919, "loss": 3.4597, "theoretical_loss": 4.465255886017452, "tokens_seen": 164364288 }, { "epoch": 0.05, "learning_rate": 0.00047988966900702106, "loss": 3.6717, "theoretical_loss": 4.465026367194971, "tokens_seen": 164429824 }, { "epoch": 0.05, "learning_rate": 0.00047987963891675024, "loss": 3.5381, "theoretical_loss": 4.464796965434738, "tokens_seen": 164495360 }, { "epoch": 0.05, "learning_rate": 0.0004798696088264795, "loss": 3.5381, "theoretical_loss": 4.464567680630443, "tokens_seen": 164560896 }, { "epoch": 0.05, "learning_rate": 0.0004798595787362086, "loss": 3.5591, "theoretical_loss": 4.464338512675919, "tokens_seen": 164626432 }, { "epoch": 0.05, "learning_rate": 0.00047984954864593784, "loss": 3.424, "theoretical_loss": 4.464109461465133, "tokens_seen": 164691968 }, { "epoch": 0.05, "learning_rate": 0.00047983951855566696, "loss": 3.5228, "theoretical_loss": 4.4638805268921935, "tokens_seen": 164757504 }, { "epoch": 0.05, "learning_rate": 0.0004798294884653962, "loss": 3.5595, "theoretical_loss": 4.463651708851346, "tokens_seen": 164823040 }, { "epoch": 0.05, "learning_rate": 0.0004798194583751254, "loss": 3.5127, "theoretical_loss": 4.463423007236974, "tokens_seen": 164888576 }, { "epoch": 0.05, "learning_rate": 0.00047980942828485456, "loss": 3.5878, "theoretical_loss": 4.4631944219436, "tokens_seen": 164954112 }, { "epoch": 0.05, "learning_rate": 0.00047979939819458374, "loss": 3.516, "theoretical_loss": 4.462965952865879, "tokens_seen": 165019648 }, { "epoch": 0.05, "learning_rate": 0.0004797893681043129, "loss": 3.4038, "theoretical_loss": 4.46273759989861, "tokens_seen": 165085184 }, { "epoch": 0.05, "learning_rate": 0.0004797793380140421, "loss": 3.4682, "theoretical_loss": 4.462509362936723, "tokens_seen": 165150720 }, { "epoch": 0.05, "learning_rate": 0.00047976930792377134, "loss": 3.4582, "theoretical_loss": 4.46228124187529, "tokens_seen": 165216256 }, { "epoch": 0.05, "learning_rate": 0.00047975927783350047, "loss": 3.4865, "theoretical_loss": 4.462053236609516, "tokens_seen": 165281792 }, { "epoch": 0.05, "learning_rate": 0.0004797492477432297, "loss": 3.4924, "theoretical_loss": 4.461825347034742, "tokens_seen": 165347328 }, { "epoch": 0.05, "learning_rate": 0.0004797392176529589, "loss": 3.5146, "theoretical_loss": 4.461597573046449, "tokens_seen": 165412864 }, { "epoch": 0.05, "objective/train/docs_used": 293925, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.468921422958374, "objective/train/theoretical_loss": 4.461369914540247, "objective/train/tokens_used": 185938400, "theoretical_loss": 4.461369914540247, "tokens_seen": 165478400 }, { "epoch": 0.05, "learning_rate": 0.00047972918756268807, "loss": 3.596, "theoretical_loss": 4.461369914540247, "tokens_seen": 165478400 }, { "epoch": 0.05, "learning_rate": 0.0004797191574724173, "loss": 3.4403, "theoretical_loss": 4.4611423714118885, "tokens_seen": 165543936 }, { "epoch": 0.05, "learning_rate": 0.00047970912738214643, "loss": 3.5107, "theoretical_loss": 4.460914943557256, "tokens_seen": 165609472 }, { "epoch": 0.05, "learning_rate": 0.00047969909729187566, "loss": 3.4693, "theoretical_loss": 4.460687630872371, "tokens_seen": 165675008 }, { "epoch": 0.05, "learning_rate": 0.00047968906720160484, "loss": 3.5294, "theoretical_loss": 4.46046043325339, "tokens_seen": 165740544 }, { "epoch": 0.05, "learning_rate": 0.000479679037111334, "loss": 3.536, "theoretical_loss": 4.460233350596599, "tokens_seen": 165806080 }, { "epoch": 0.05, "learning_rate": 0.0004796690070210632, "loss": 3.3684, "theoretical_loss": 4.460006382798425, "tokens_seen": 165871616 }, { "epoch": 0.05, "learning_rate": 0.0004796589769307924, "loss": 3.4247, "theoretical_loss": 4.459779529755423, "tokens_seen": 165937152 }, { "epoch": 0.05, "learning_rate": 0.00047964894684052157, "loss": 3.4478, "theoretical_loss": 4.459552791364288, "tokens_seen": 166002688 }, { "epoch": 0.05, "learning_rate": 0.0004796389167502508, "loss": 3.6883, "theoretical_loss": 4.459326167521844, "tokens_seen": 166068224 }, { "epoch": 0.05, "learning_rate": 0.00047962888665997993, "loss": 3.3391, "theoretical_loss": 4.4590996581250515, "tokens_seen": 166133760 }, { "epoch": 0.05, "learning_rate": 0.00047961885656970917, "loss": 3.587, "theoretical_loss": 4.458873263071002, "tokens_seen": 166199296 }, { "epoch": 0.05, "learning_rate": 0.00047960882647943835, "loss": 3.5647, "theoretical_loss": 4.458646982256921, "tokens_seen": 166264832 }, { "epoch": 0.05, "learning_rate": 0.00047959879638916753, "loss": 3.5652, "theoretical_loss": 4.458420815580169, "tokens_seen": 166330368 }, { "epoch": 0.05, "learning_rate": 0.0004795887662988967, "loss": 3.4148, "theoretical_loss": 4.458194762938234, "tokens_seen": 166395904 }, { "epoch": 0.05, "learning_rate": 0.0004795787362086259, "loss": 3.4387, "theoretical_loss": 4.457968824228743, "tokens_seen": 166461440 }, { "epoch": 0.05, "learning_rate": 0.00047956870611835507, "loss": 3.5228, "theoretical_loss": 4.457742999349449, "tokens_seen": 166526976 }, { "epoch": 0.05, "learning_rate": 0.0004795586760280843, "loss": 3.5523, "theoretical_loss": 4.4575172881982414, "tokens_seen": 166592512 }, { "epoch": 0.05, "learning_rate": 0.00047954864593781343, "loss": 3.3092, "theoretical_loss": 4.457291690673139, "tokens_seen": 166658048 }, { "epoch": 0.05, "learning_rate": 0.00047953861584754267, "loss": 3.5879, "theoretical_loss": 4.457066206672291, "tokens_seen": 166723584 }, { "epoch": 0.05, "learning_rate": 0.0004795285857572718, "loss": 3.5197, "theoretical_loss": 4.456840836093983, "tokens_seen": 166789120 }, { "epoch": 0.05, "learning_rate": 0.00047951855566700103, "loss": 3.5638, "theoretical_loss": 4.456615578836625, "tokens_seen": 166854656 }, { "epoch": 0.05, "learning_rate": 0.0004795085255767302, "loss": 3.4903, "theoretical_loss": 4.456390434798762, "tokens_seen": 166920192 }, { "epoch": 0.05, "learning_rate": 0.0004794984954864594, "loss": 3.5508, "theoretical_loss": 4.45616540387907, "tokens_seen": 166985728 }, { "epoch": 0.05, "learning_rate": 0.0004794884653961886, "loss": 3.3855, "theoretical_loss": 4.4559404859763525, "tokens_seen": 167051264 }, { "epoch": 0.05, "objective/train/docs_used": 296587, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5752499103546143, "objective/train/theoretical_loss": 4.455715680989545, "objective/train/tokens_used": 187576800, "theoretical_loss": 4.455715680989545, "tokens_seen": 167116800 }, { "epoch": 0.05, "learning_rate": 0.00047947843530591776, "loss": 3.4208, "theoretical_loss": 4.455715680989545, "tokens_seen": 167116800 }, { "epoch": 0.05, "learning_rate": 0.00047946840521564694, "loss": 3.4917, "theoretical_loss": 4.455490988817713, "tokens_seen": 167182336 }, { "epoch": 0.05, "learning_rate": 0.00047945837512537617, "loss": 3.5511, "theoretical_loss": 4.4552664093600525, "tokens_seen": 167247872 }, { "epoch": 0.05, "learning_rate": 0.0004794483450351053, "loss": 3.5321, "theoretical_loss": 4.455041942515887, "tokens_seen": 167313408 }, { "epoch": 0.05, "learning_rate": 0.00047943831494483453, "loss": 3.3446, "theoretical_loss": 4.454817588184669, "tokens_seen": 167378944 }, { "epoch": 0.05, "learning_rate": 0.0004794282848545637, "loss": 3.6442, "theoretical_loss": 4.454593346265984, "tokens_seen": 167444480 }, { "epoch": 0.05, "learning_rate": 0.0004794182547642929, "loss": 3.6149, "theoretical_loss": 4.454369216659542, "tokens_seen": 167510016 }, { "epoch": 0.05, "learning_rate": 0.0004794082246740221, "loss": 3.4652, "theoretical_loss": 4.454145199265183, "tokens_seen": 167575552 }, { "epoch": 0.05, "learning_rate": 0.00047939819458375126, "loss": 3.4917, "theoretical_loss": 4.453921293982877, "tokens_seen": 167641088 }, { "epoch": 0.05, "learning_rate": 0.00047938816449348044, "loss": 3.3547, "theoretical_loss": 4.453697500712722, "tokens_seen": 167706624 }, { "epoch": 0.05, "learning_rate": 0.0004793781344032097, "loss": 3.5033, "theoretical_loss": 4.453473819354942, "tokens_seen": 167772160 }, { "epoch": 0.05, "learning_rate": 0.0004793681043129388, "loss": 3.5428, "theoretical_loss": 4.453250249809889, "tokens_seen": 167837696 }, { "epoch": 0.05, "learning_rate": 0.00047935807422266804, "loss": 3.6955, "theoretical_loss": 4.453026791978045, "tokens_seen": 167903232 }, { "epoch": 0.05, "learning_rate": 0.00047934804413239716, "loss": 3.4914, "theoretical_loss": 4.4528034457600185, "tokens_seen": 167968768 }, { "epoch": 0.05, "learning_rate": 0.0004793380140421264, "loss": 3.4161, "theoretical_loss": 4.452580211056542, "tokens_seen": 168034304 }, { "epoch": 0.05, "learning_rate": 0.0004793279839518556, "loss": 3.587, "theoretical_loss": 4.452357087768481, "tokens_seen": 168099840 }, { "epoch": 0.05, "learning_rate": 0.00047931795386158476, "loss": 3.6165, "theoretical_loss": 4.45213407579682, "tokens_seen": 168165376 }, { "epoch": 0.05, "learning_rate": 0.00047930792377131394, "loss": 3.406, "theoretical_loss": 4.451911175042679, "tokens_seen": 168230912 }, { "epoch": 0.05, "learning_rate": 0.0004792978936810431, "loss": 3.5402, "theoretical_loss": 4.451688385407296, "tokens_seen": 168296448 }, { "epoch": 0.05, "learning_rate": 0.0004792878635907723, "loss": 3.5501, "theoretical_loss": 4.451465706792041, "tokens_seen": 168361984 }, { "epoch": 0.05, "learning_rate": 0.00047927783350050154, "loss": 3.4771, "theoretical_loss": 4.4512431390984055, "tokens_seen": 168427520 }, { "epoch": 0.05, "learning_rate": 0.00047926780341023067, "loss": 3.5284, "theoretical_loss": 4.451020682228011, "tokens_seen": 168493056 }, { "epoch": 0.05, "learning_rate": 0.0004792577733199599, "loss": 3.5752, "theoretical_loss": 4.450798336082601, "tokens_seen": 168558592 }, { "epoch": 0.05, "learning_rate": 0.0004792477432296891, "loss": 3.5683, "theoretical_loss": 4.450576100564046, "tokens_seen": 168624128 }, { "epoch": 0.05, "learning_rate": 0.00047923771313941827, "loss": 3.3967, "theoretical_loss": 4.450353975574341, "tokens_seen": 168689664 }, { "epoch": 0.05, "objective/train/docs_used": 299406, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5390167236328125, "objective/train/theoretical_loss": 4.450131961015606, "objective/train/tokens_used": 189215200, "theoretical_loss": 4.450131961015606, "tokens_seen": 168755200 }, { "epoch": 0.05, "learning_rate": 0.00047922768304914745, "loss": 3.5708, "theoretical_loss": 4.450131961015606, "tokens_seen": 168755200 }, { "epoch": 0.05, "learning_rate": 0.00047921765295887663, "loss": 3.5653, "theoretical_loss": 4.449910056790086, "tokens_seen": 168820736 }, { "epoch": 0.05, "learning_rate": 0.0004792076228686058, "loss": 3.4991, "theoretical_loss": 4.44968826280015, "tokens_seen": 168886272 }, { "epoch": 0.05, "learning_rate": 0.00047919759277833504, "loss": 3.6578, "theoretical_loss": 4.4494665789482895, "tokens_seen": 168951808 }, { "epoch": 0.05, "learning_rate": 0.00047918756268806417, "loss": 3.4498, "theoretical_loss": 4.449245005137125, "tokens_seen": 169017344 }, { "epoch": 0.05, "learning_rate": 0.0004791775325977934, "loss": 3.3428, "theoretical_loss": 4.449023541269395, "tokens_seen": 169082880 }, { "epoch": 0.05, "learning_rate": 0.00047916750250752253, "loss": 3.626, "theoretical_loss": 4.448802187247966, "tokens_seen": 169148416 }, { "epoch": 0.05, "learning_rate": 0.00047915747241725177, "loss": 3.5652, "theoretical_loss": 4.448580942975825, "tokens_seen": 169213952 }, { "epoch": 0.05, "learning_rate": 0.00047914744232698095, "loss": 3.544, "theoretical_loss": 4.448359808356084, "tokens_seen": 169279488 }, { "epoch": 0.05, "learning_rate": 0.00047913741223671013, "loss": 3.5013, "theoretical_loss": 4.448138783291979, "tokens_seen": 169345024 }, { "epoch": 0.05, "learning_rate": 0.0004791273821464393, "loss": 3.6148, "theoretical_loss": 4.447917867686863, "tokens_seen": 169410560 }, { "epoch": 0.05, "learning_rate": 0.00047911735205616855, "loss": 3.6161, "theoretical_loss": 4.44769706144422, "tokens_seen": 169476096 }, { "epoch": 0.05, "learning_rate": 0.0004791073219658977, "loss": 3.5421, "theoretical_loss": 4.44747636446765, "tokens_seen": 169541632 }, { "epoch": 0.05, "learning_rate": 0.0004790972918756269, "loss": 3.5565, "theoretical_loss": 4.447255776660878, "tokens_seen": 169607168 }, { "epoch": 0.05, "learning_rate": 0.00047908726178535604, "loss": 3.5758, "theoretical_loss": 4.44703529792775, "tokens_seen": 169672704 }, { "epoch": 0.05, "learning_rate": 0.00047907723169508527, "loss": 3.4412, "theoretical_loss": 4.446814928172234, "tokens_seen": 169738240 }, { "epoch": 0.05, "learning_rate": 0.00047906720160481445, "loss": 3.7087, "theoretical_loss": 4.446594667298421, "tokens_seen": 169803776 }, { "epoch": 0.05, "learning_rate": 0.00047905717151454363, "loss": 3.4537, "theoretical_loss": 4.446374515210521, "tokens_seen": 169869312 }, { "epoch": 0.05, "learning_rate": 0.0004790471414242728, "loss": 3.6721, "theoretical_loss": 4.446154471812866, "tokens_seen": 169934848 }, { "epoch": 0.05, "learning_rate": 0.000479037111334002, "loss": 3.587, "theoretical_loss": 4.445934537009911, "tokens_seen": 170000384 }, { "epoch": 0.05, "learning_rate": 0.0004790270812437312, "loss": 3.6467, "theoretical_loss": 4.445714710706228, "tokens_seen": 170065920 }, { "epoch": 0.05, "learning_rate": 0.0004790170511534604, "loss": 3.4983, "theoretical_loss": 4.445494992806513, "tokens_seen": 170131456 }, { "epoch": 0.05, "learning_rate": 0.00047900702106318954, "loss": 3.4989, "theoretical_loss": 4.44527538321558, "tokens_seen": 170196992 }, { "epoch": 0.05, "learning_rate": 0.0004789969909729188, "loss": 3.6873, "theoretical_loss": 4.445055881838365, "tokens_seen": 170262528 }, { "epoch": 0.05, "learning_rate": 0.0004789869608826479, "loss": 3.5946, "theoretical_loss": 4.444836488579924, "tokens_seen": 170328064 }, { "epoch": 0.05, "objective/train/docs_used": 302236, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7417047023773193, "objective/train/theoretical_loss": 4.44461720334543, "objective/train/tokens_used": 190853600, "theoretical_loss": 4.44461720334543, "tokens_seen": 170393600 }, { "epoch": 0.05, "learning_rate": 0.00047897693079237714, "loss": 3.5777, "theoretical_loss": 4.44461720334543, "tokens_seen": 170393600 }, { "epoch": 0.05, "learning_rate": 0.00047896690070210637, "loss": 3.6929, "theoretical_loss": 4.444398026040179, "tokens_seen": 170459136 }, { "epoch": 0.05, "learning_rate": 0.0004789568706118355, "loss": 3.573, "theoretical_loss": 4.444178956569585, "tokens_seen": 170524672 }, { "epoch": 0.05, "learning_rate": 0.00047894684052156473, "loss": 3.4602, "theoretical_loss": 4.443959994839181, "tokens_seen": 170590208 }, { "epoch": 0.05, "learning_rate": 0.0004789368104312939, "loss": 3.6511, "theoretical_loss": 4.44374114075462, "tokens_seen": 170655744 }, { "epoch": 0.05, "learning_rate": 0.0004789267803410231, "loss": 3.5249, "theoretical_loss": 4.443522394221671, "tokens_seen": 170721280 }, { "epoch": 0.05, "learning_rate": 0.0004789167502507523, "loss": 3.4477, "theoretical_loss": 4.443303755146225, "tokens_seen": 170786816 }, { "epoch": 0.05, "learning_rate": 0.00047890672016048146, "loss": 3.4037, "theoretical_loss": 4.443085223434291, "tokens_seen": 170852352 }, { "epoch": 0.05, "learning_rate": 0.00047889669007021064, "loss": 3.3888, "theoretical_loss": 4.442866798991993, "tokens_seen": 170917888 }, { "epoch": 0.05, "learning_rate": 0.0004788866599799399, "loss": 3.5984, "theoretical_loss": 4.442648481725577, "tokens_seen": 170983424 }, { "epoch": 0.05, "learning_rate": 0.000478876629889669, "loss": 3.4863, "theoretical_loss": 4.442430271541404, "tokens_seen": 171048960 }, { "epoch": 0.05, "learning_rate": 0.00047886659979939824, "loss": 3.3684, "theoretical_loss": 4.442212168345956, "tokens_seen": 171114496 }, { "epoch": 0.05, "learning_rate": 0.00047885656970912736, "loss": 3.4039, "theoretical_loss": 4.4419941720458285, "tokens_seen": 171180032 }, { "epoch": 0.05, "learning_rate": 0.0004788465396188566, "loss": 3.506, "theoretical_loss": 4.441776282547736, "tokens_seen": 171245568 }, { "epoch": 0.05, "learning_rate": 0.0004788365095285858, "loss": 3.5294, "theoretical_loss": 4.441558499758511, "tokens_seen": 171311104 }, { "epoch": 0.05, "learning_rate": 0.00047882647943831496, "loss": 3.5767, "theoretical_loss": 4.441340823585101, "tokens_seen": 171376640 }, { "epoch": 0.05, "learning_rate": 0.00047881644934804414, "loss": 3.5449, "theoretical_loss": 4.441123253934572, "tokens_seen": 171442176 }, { "epoch": 0.05, "learning_rate": 0.0004788064192577733, "loss": 3.5683, "theoretical_loss": 4.440905790714105, "tokens_seen": 171507712 }, { "epoch": 0.05, "learning_rate": 0.0004787963891675025, "loss": 3.4973, "theoretical_loss": 4.440688433830999, "tokens_seen": 171573248 }, { "epoch": 0.05, "learning_rate": 0.00047878635907723174, "loss": 3.3215, "theoretical_loss": 4.440471183192667, "tokens_seen": 171638784 }, { "epoch": 0.05, "learning_rate": 0.00047877632898696087, "loss": 3.5681, "theoretical_loss": 4.440254038706639, "tokens_seen": 171704320 }, { "epoch": 0.05, "learning_rate": 0.0004787662988966901, "loss": 3.4664, "theoretical_loss": 4.440037000280561, "tokens_seen": 171769856 }, { "epoch": 0.05, "learning_rate": 0.0004787562688064193, "loss": 3.4892, "theoretical_loss": 4.439820067822195, "tokens_seen": 171835392 }, { "epoch": 0.05, "learning_rate": 0.00047874623871614847, "loss": 3.4314, "theoretical_loss": 4.439603241239416, "tokens_seen": 171900928 }, { "epoch": 0.05, "learning_rate": 0.00047873620862587765, "loss": 3.4433, "theoretical_loss": 4.439386520440218, "tokens_seen": 171966464 }, { "epoch": 0.05, "objective/train/docs_used": 304754, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6857874393463135, "objective/train/theoretical_loss": 4.439169905332706, "objective/train/tokens_used": 192492000, "theoretical_loss": 4.439169905332706, "tokens_seen": 172032000 }, { "epoch": 0.05, "learning_rate": 0.00047872617853560683, "loss": 3.5816, "theoretical_loss": 4.439169905332706, "tokens_seen": 172032000 }, { "epoch": 0.05, "learning_rate": 0.000478716148445336, "loss": 3.5198, "theoretical_loss": 4.438953395825102, "tokens_seen": 172097536 }, { "epoch": 0.05, "learning_rate": 0.00047870611835506524, "loss": 3.3825, "theoretical_loss": 4.438736991825744, "tokens_seen": 172163072 }, { "epoch": 0.05, "learning_rate": 0.00047869608826479437, "loss": 3.4973, "theoretical_loss": 4.438520693243079, "tokens_seen": 172228608 }, { "epoch": 0.05, "learning_rate": 0.0004786860581745236, "loss": 3.6539, "theoretical_loss": 4.4383044999856756, "tokens_seen": 172294144 }, { "epoch": 0.05, "learning_rate": 0.00047867602808425273, "loss": 3.6168, "theoretical_loss": 4.438088411962211, "tokens_seen": 172359680 }, { "epoch": 0.05, "learning_rate": 0.00047866599799398197, "loss": 3.3971, "theoretical_loss": 4.437872429081477, "tokens_seen": 172425216 }, { "epoch": 0.05, "learning_rate": 0.00047865596790371115, "loss": 3.5248, "theoretical_loss": 4.437656551252381, "tokens_seen": 172490752 }, { "epoch": 0.05, "learning_rate": 0.00047864593781344033, "loss": 3.4377, "theoretical_loss": 4.4374407783839445, "tokens_seen": 172556288 }, { "epoch": 0.05, "learning_rate": 0.0004786359077231695, "loss": 3.4251, "theoretical_loss": 4.437225110385297, "tokens_seen": 172621824 }, { "epoch": 0.05, "learning_rate": 0.00047862587763289875, "loss": 3.5513, "theoretical_loss": 4.4370095471656885, "tokens_seen": 172687360 }, { "epoch": 0.05, "learning_rate": 0.0004786158475426279, "loss": 3.4964, "theoretical_loss": 4.436794088634477, "tokens_seen": 172752896 }, { "epoch": 0.05, "learning_rate": 0.0004786058174523571, "loss": 3.2791, "theoretical_loss": 4.4365787347011345, "tokens_seen": 172818432 }, { "epoch": 0.05, "learning_rate": 0.00047859578736208624, "loss": 3.5172, "theoretical_loss": 4.436363485275246, "tokens_seen": 172883968 }, { "epoch": 0.05, "learning_rate": 0.00047858575727181547, "loss": 3.5968, "theoretical_loss": 4.436148340266508, "tokens_seen": 172949504 }, { "epoch": 0.05, "learning_rate": 0.00047857572718154465, "loss": 3.5778, "theoretical_loss": 4.435933299584729, "tokens_seen": 173015040 }, { "epoch": 0.05, "learning_rate": 0.00047856569709127383, "loss": 3.6211, "theoretical_loss": 4.4357183631398325, "tokens_seen": 173080576 }, { "epoch": 0.05, "learning_rate": 0.000478555667001003, "loss": 3.4173, "theoretical_loss": 4.435503530841849, "tokens_seen": 173146112 }, { "epoch": 0.05, "learning_rate": 0.0004785456369107322, "loss": 3.5009, "theoretical_loss": 4.435288802600926, "tokens_seen": 173211648 }, { "epoch": 0.05, "learning_rate": 0.0004785356068204614, "loss": 3.4044, "theoretical_loss": 4.4350741783273175, "tokens_seen": 173277184 }, { "epoch": 0.05, "learning_rate": 0.0004785255767301906, "loss": 3.4302, "theoretical_loss": 4.434859657931392, "tokens_seen": 173342720 }, { "epoch": 0.05, "learning_rate": 0.00047851554663991974, "loss": 3.591, "theoretical_loss": 4.434645241323629, "tokens_seen": 173408256 }, { "epoch": 0.05, "learning_rate": 0.000478505516549649, "loss": 3.3573, "theoretical_loss": 4.434430928414617, "tokens_seen": 173473792 }, { "epoch": 0.05, "learning_rate": 0.0004784954864593781, "loss": 3.6336, "theoretical_loss": 4.434216719115057, "tokens_seen": 173539328 }, { "epoch": 0.05, "learning_rate": 0.00047848545636910734, "loss": 3.5888, "theoretical_loss": 4.43400261333576, "tokens_seen": 173604864 }, { "epoch": 0.05, "objective/train/docs_used": 307553, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.355226993560791, "objective/train/theoretical_loss": 4.433788610987646, "objective/train/tokens_used": 194130400, "theoretical_loss": 4.433788610987646, "tokens_seen": 173670400 }, { "epoch": 0.05, "learning_rate": 0.0004784754262788365, "loss": 3.4879, "theoretical_loss": 4.433788610987646, "tokens_seen": 173670400 }, { "epoch": 0.05, "learning_rate": 0.0004784653961885657, "loss": 3.5222, "theoretical_loss": 4.433574711981749, "tokens_seen": 173735936 }, { "epoch": 0.05, "learning_rate": 0.0004784553660982949, "loss": 3.4322, "theoretical_loss": 4.433360916229209, "tokens_seen": 173801472 }, { "epoch": 0.05, "learning_rate": 0.0004784453360080241, "loss": 3.3733, "theoretical_loss": 4.433147223641278, "tokens_seen": 173867008 }, { "epoch": 0.05, "learning_rate": 0.00047843530591775324, "loss": 3.5238, "theoretical_loss": 4.432933634129318, "tokens_seen": 173932544 }, { "epoch": 0.05, "learning_rate": 0.0004784252758274825, "loss": 3.6365, "theoretical_loss": 4.4327201476047975, "tokens_seen": 173998080 }, { "epoch": 0.05, "learning_rate": 0.0004784152457372116, "loss": 3.4856, "theoretical_loss": 4.432506763979299, "tokens_seen": 174063616 }, { "epoch": 0.05, "learning_rate": 0.00047840521564694084, "loss": 3.4319, "theoretical_loss": 4.432293483164512, "tokens_seen": 174129152 }, { "epoch": 0.05, "learning_rate": 0.00047839518555667, "loss": 3.4063, "theoretical_loss": 4.432080305072233, "tokens_seen": 174194688 }, { "epoch": 0.05, "learning_rate": 0.0004783851554663992, "loss": 3.3679, "theoretical_loss": 4.43186722961437, "tokens_seen": 174260224 }, { "epoch": 0.05, "learning_rate": 0.0004783751253761284, "loss": 3.4075, "theoretical_loss": 4.431654256702938, "tokens_seen": 174325760 }, { "epoch": 0.05, "learning_rate": 0.00047836509528585756, "loss": 3.4335, "theoretical_loss": 4.431441386250063, "tokens_seen": 174391296 }, { "epoch": 0.05, "learning_rate": 0.00047835506519558675, "loss": 3.3891, "theoretical_loss": 4.4312286181679745, "tokens_seen": 174456832 }, { "epoch": 0.05, "learning_rate": 0.000478345035105316, "loss": 3.5069, "theoretical_loss": 4.431015952369016, "tokens_seen": 174522368 }, { "epoch": 0.05, "learning_rate": 0.0004783350050150451, "loss": 3.2068, "theoretical_loss": 4.430803388765636, "tokens_seen": 174587904 }, { "epoch": 0.05, "learning_rate": 0.00047832497492477434, "loss": 3.421, "theoretical_loss": 4.430590927270388, "tokens_seen": 174653440 }, { "epoch": 0.05, "learning_rate": 0.00047831494483450347, "loss": 3.5374, "theoretical_loss": 4.430378567795938, "tokens_seen": 174718976 }, { "epoch": 0.05, "learning_rate": 0.0004783049147442327, "loss": 3.5975, "theoretical_loss": 4.430166310255057, "tokens_seen": 174784512 }, { "epoch": 0.05, "learning_rate": 0.0004782948846539619, "loss": 3.2509, "theoretical_loss": 4.429954154560624, "tokens_seen": 174850048 }, { "epoch": 0.05, "learning_rate": 0.00047828485456369107, "loss": 3.6145, "theoretical_loss": 4.429742100625624, "tokens_seen": 174915584 }, { "epoch": 0.05, "learning_rate": 0.00047827482447342025, "loss": 3.4669, "theoretical_loss": 4.429530148363151, "tokens_seen": 174981120 }, { "epoch": 0.05, "learning_rate": 0.0004782647943831495, "loss": 3.5169, "theoretical_loss": 4.429318297686402, "tokens_seen": 175046656 }, { "epoch": 0.05, "learning_rate": 0.0004782547642928786, "loss": 3.2779, "theoretical_loss": 4.429106548508685, "tokens_seen": 175112192 }, { "epoch": 0.05, "learning_rate": 0.00047824473420260785, "loss": 3.4859, "theoretical_loss": 4.428894900743411, "tokens_seen": 175177728 }, { "epoch": 0.05, "learning_rate": 0.000478234704112337, "loss": 3.526, "theoretical_loss": 4.428683354304098, "tokens_seen": 175243264 }, { "epoch": 0.05, "objective/train/docs_used": 308941, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.644310712814331, "objective/train/theoretical_loss": 4.428471909104372, "objective/train/tokens_used": 195768800, "theoretical_loss": 4.428471909104372, "tokens_seen": 175308800 }, { "epoch": 0.05, "learning_rate": 0.0004782246740220662, "loss": 3.4211, "theoretical_loss": 4.428471909104372, "tokens_seen": 175308800 }, { "epoch": 0.05, "learning_rate": 0.00047821464393179544, "loss": 3.3844, "theoretical_loss": 4.428260565057964, "tokens_seen": 175374336 }, { "epoch": 0.05, "learning_rate": 0.00047820461384152457, "loss": 3.5499, "theoretical_loss": 4.428049322078708, "tokens_seen": 175439872 }, { "epoch": 0.05, "learning_rate": 0.0004781945837512538, "loss": 3.2756, "theoretical_loss": 4.427838180080547, "tokens_seen": 175505408 }, { "epoch": 0.05, "learning_rate": 0.00047818455366098293, "loss": 3.3089, "theoretical_loss": 4.4276271389775275, "tokens_seen": 175570944 }, { "epoch": 0.05, "learning_rate": 0.00047817452357071217, "loss": 3.6094, "theoretical_loss": 4.427416198683803, "tokens_seen": 175636480 }, { "epoch": 0.05, "learning_rate": 0.00047816449348044135, "loss": 3.5686, "theoretical_loss": 4.427205359113629, "tokens_seen": 175702016 }, { "epoch": 0.05, "learning_rate": 0.00047815446339017053, "loss": 3.435, "theoretical_loss": 4.42699462018137, "tokens_seen": 175767552 }, { "epoch": 0.05, "learning_rate": 0.0004781444332998997, "loss": 3.3441, "theoretical_loss": 4.42678398180149, "tokens_seen": 175833088 }, { "epoch": 0.05, "learning_rate": 0.00047813440320962895, "loss": 3.3721, "theoretical_loss": 4.426573443888563, "tokens_seen": 175898624 }, { "epoch": 0.05, "learning_rate": 0.0004781243731193581, "loss": 3.3331, "theoretical_loss": 4.426363006357263, "tokens_seen": 175964160 }, { "epoch": 0.05, "learning_rate": 0.0004781143430290873, "loss": 3.6108, "theoretical_loss": 4.426152669122374, "tokens_seen": 176029696 }, { "epoch": 0.05, "learning_rate": 0.00047810431293881644, "loss": 3.5267, "theoretical_loss": 4.425942432098774, "tokens_seen": 176095232 }, { "epoch": 0.05, "learning_rate": 0.00047809428284854567, "loss": 3.1678, "theoretical_loss": 4.425732295201455, "tokens_seen": 176160768 }, { "epoch": 0.05, "learning_rate": 0.00047808425275827485, "loss": 3.5357, "theoretical_loss": 4.425522258345508, "tokens_seen": 176226304 }, { "epoch": 0.05, "learning_rate": 0.00047807422266800403, "loss": 3.4027, "theoretical_loss": 4.425312321446127, "tokens_seen": 176291840 }, { "epoch": 0.05, "learning_rate": 0.0004780641925777332, "loss": 3.464, "theoretical_loss": 4.425102484418613, "tokens_seen": 176357376 }, { "epoch": 0.05, "learning_rate": 0.0004780541624874624, "loss": 3.3002, "theoretical_loss": 4.424892747178365, "tokens_seen": 176422912 }, { "epoch": 0.05, "learning_rate": 0.0004780441323971916, "loss": 3.4841, "theoretical_loss": 4.42468310964089, "tokens_seen": 176488448 }, { "epoch": 0.05, "learning_rate": 0.0004780341023069208, "loss": 3.5805, "theoretical_loss": 4.424473571721794, "tokens_seen": 176553984 }, { "epoch": 0.05, "learning_rate": 0.00047802407221664994, "loss": 3.4713, "theoretical_loss": 4.42426413333679, "tokens_seen": 176619520 }, { "epoch": 0.05, "learning_rate": 0.0004780140421263792, "loss": 3.282, "theoretical_loss": 4.424054794401689, "tokens_seen": 176685056 }, { "epoch": 0.05, "learning_rate": 0.0004780040120361083, "loss": 3.5103, "theoretical_loss": 4.423845554832406, "tokens_seen": 176750592 }, { "epoch": 0.05, "learning_rate": 0.00047799398194583754, "loss": 3.4553, "theoretical_loss": 4.42363641454496, "tokens_seen": 176816128 }, { "epoch": 0.05, "learning_rate": 0.0004779839518555667, "loss": 3.3999, "theoretical_loss": 4.423427373455471, "tokens_seen": 176881664 }, { "epoch": 0.05, "objective/train/docs_used": 311765, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5463764667510986, "objective/train/theoretical_loss": 4.42321843148016, "objective/train/tokens_used": 197407200, "theoretical_loss": 4.42321843148016, "tokens_seen": 176947200 }, { "epoch": 0.05, "learning_rate": 0.0004779739217652959, "loss": 3.5138, "theoretical_loss": 4.42321843148016, "tokens_seen": 176947200 }, { "epoch": 0.05, "learning_rate": 0.0004779638916750251, "loss": 3.3527, "theoretical_loss": 4.423009588535351, "tokens_seen": 177012736 }, { "epoch": 0.05, "learning_rate": 0.0004779538615847543, "loss": 3.3753, "theoretical_loss": 4.422800844537466, "tokens_seen": 177078272 }, { "epoch": 0.05, "learning_rate": 0.00047794383149448344, "loss": 3.3885, "theoretical_loss": 4.422592199403036, "tokens_seen": 177143808 }, { "epoch": 0.05, "learning_rate": 0.0004779338014042127, "loss": 3.6073, "theoretical_loss": 4.422383653048685, "tokens_seen": 177209344 }, { "epoch": 0.05, "learning_rate": 0.0004779237713139418, "loss": 3.4067, "theoretical_loss": 4.422175205391145, "tokens_seen": 177274880 }, { "epoch": 0.05, "learning_rate": 0.00047791374122367104, "loss": 3.4665, "theoretical_loss": 4.421966856347243, "tokens_seen": 177340416 }, { "epoch": 0.05, "learning_rate": 0.0004779037111334002, "loss": 3.3743, "theoretical_loss": 4.421758605833912, "tokens_seen": 177405952 }, { "epoch": 0.05, "learning_rate": 0.0004778936810431294, "loss": 3.6342, "theoretical_loss": 4.421550453768181, "tokens_seen": 177471488 }, { "epoch": 0.05, "learning_rate": 0.0004778836509528586, "loss": 3.4869, "theoretical_loss": 4.421342400067183, "tokens_seen": 177537024 }, { "epoch": 0.05, "learning_rate": 0.00047787362086258776, "loss": 3.3341, "theoretical_loss": 4.42113444464815, "tokens_seen": 177602560 }, { "epoch": 0.05, "learning_rate": 0.00047786359077231695, "loss": 3.5124, "theoretical_loss": 4.420926587428411, "tokens_seen": 177668096 }, { "epoch": 0.05, "learning_rate": 0.0004778535606820462, "loss": 3.3416, "theoretical_loss": 4.420718828325403, "tokens_seen": 177733632 }, { "epoch": 0.05, "learning_rate": 0.0004778435305917753, "loss": 3.4264, "theoretical_loss": 4.420511167256656, "tokens_seen": 177799168 }, { "epoch": 0.05, "learning_rate": 0.00047783350050150454, "loss": 3.3182, "theoretical_loss": 4.4203036041398, "tokens_seen": 177864704 }, { "epoch": 0.05, "learning_rate": 0.00047782347041123367, "loss": 3.4804, "theoretical_loss": 4.420096138892568, "tokens_seen": 177930240 }, { "epoch": 0.05, "learning_rate": 0.0004778134403209629, "loss": 3.5218, "theoretical_loss": 4.419888771432789, "tokens_seen": 177995776 }, { "epoch": 0.05, "learning_rate": 0.0004778034102306921, "loss": 3.4943, "theoretical_loss": 4.419681501678395, "tokens_seen": 178061312 }, { "epoch": 0.05, "learning_rate": 0.00047779338014042127, "loss": 3.6212, "theoretical_loss": 4.419474329547413, "tokens_seen": 178126848 }, { "epoch": 0.05, "learning_rate": 0.00047778335005015045, "loss": 3.4504, "theoretical_loss": 4.419267254957971, "tokens_seen": 178192384 }, { "epoch": 0.05, "learning_rate": 0.0004777733199598797, "loss": 3.492, "theoretical_loss": 4.419060277828295, "tokens_seen": 178257920 }, { "epoch": 0.05, "learning_rate": 0.0004777632898696088, "loss": 3.5074, "theoretical_loss": 4.41885339807671, "tokens_seen": 178323456 }, { "epoch": 0.05, "learning_rate": 0.00047775325977933805, "loss": 3.4359, "theoretical_loss": 4.4186466156216415, "tokens_seen": 178388992 }, { "epoch": 0.05, "learning_rate": 0.0004777432296890672, "loss": 3.5823, "theoretical_loss": 4.418439930381609, "tokens_seen": 178454528 }, { "epoch": 0.05, "learning_rate": 0.0004777331995987964, "loss": 3.3413, "theoretical_loss": 4.418233342275233, "tokens_seen": 178520064 }, { "epoch": 0.05, "objective/train/docs_used": 314648, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6916651725769043, "objective/train/theoretical_loss": 4.418026851221231, "objective/train/tokens_used": 199045600, "theoretical_loss": 4.418026851221231, "tokens_seen": 178585600 }, { "epoch": 0.05, "learning_rate": 0.0004777231695085256, "loss": 3.5202, "theoretical_loss": 4.418026851221231, "tokens_seen": 178585600 }, { "epoch": 0.05, "learning_rate": 0.00047771313941825477, "loss": 3.3874, "theoretical_loss": 4.4178204571384185, "tokens_seen": 178651136 }, { "epoch": 0.05, "learning_rate": 0.00047770310932798395, "loss": 3.4658, "theoretical_loss": 4.41761415994571, "tokens_seen": 178716672 }, { "epoch": 0.05, "learning_rate": 0.00047769307923771313, "loss": 3.4383, "theoretical_loss": 4.417407959562116, "tokens_seen": 178782208 }, { "epoch": 0.05, "learning_rate": 0.0004776830491474423, "loss": 3.6053, "theoretical_loss": 4.417201855906742, "tokens_seen": 178847744 }, { "epoch": 0.05, "learning_rate": 0.00047767301905717155, "loss": 3.3997, "theoretical_loss": 4.416995848898797, "tokens_seen": 178913280 }, { "epoch": 0.05, "learning_rate": 0.0004776629889669007, "loss": 3.2264, "theoretical_loss": 4.4167899384575815, "tokens_seen": 178978816 }, { "epoch": 0.05, "learning_rate": 0.0004776529588766299, "loss": 3.5317, "theoretical_loss": 4.416584124502495, "tokens_seen": 179044352 }, { "epoch": 0.05, "learning_rate": 0.00047764292878635904, "loss": 3.4043, "theoretical_loss": 4.416378406953033, "tokens_seen": 179109888 }, { "epoch": 0.05, "learning_rate": 0.0004776328986960883, "loss": 3.4367, "theoretical_loss": 4.41617278572879, "tokens_seen": 179175424 }, { "epoch": 0.05, "learning_rate": 0.00047762286860581746, "loss": 3.3725, "theoretical_loss": 4.4159672607494524, "tokens_seen": 179240960 }, { "epoch": 0.05, "learning_rate": 0.00047761283851554664, "loss": 3.6123, "theoretical_loss": 4.415761831934808, "tokens_seen": 179306496 }, { "epoch": 0.05, "learning_rate": 0.0004776028084252758, "loss": 3.3352, "theoretical_loss": 4.415556499204737, "tokens_seen": 179372032 }, { "epoch": 0.05, "learning_rate": 0.00047759277833500505, "loss": 3.4415, "theoretical_loss": 4.415351262479216, "tokens_seen": 179437568 }, { "epoch": 0.05, "learning_rate": 0.0004775827482447342, "loss": 3.2507, "theoretical_loss": 4.415146121678321, "tokens_seen": 179503104 }, { "epoch": 0.05, "learning_rate": 0.0004775727181544634, "loss": 3.4488, "theoretical_loss": 4.414941076722219, "tokens_seen": 179568640 }, { "epoch": 0.05, "learning_rate": 0.00047756268806419254, "loss": 3.3088, "theoretical_loss": 4.4147361275311745, "tokens_seen": 179634176 }, { "epoch": 0.05, "learning_rate": 0.0004775526579739218, "loss": 3.5145, "theoretical_loss": 4.414531274025548, "tokens_seen": 179699712 }, { "epoch": 0.05, "learning_rate": 0.00047754262788365096, "loss": 3.5104, "theoretical_loss": 4.414326516125795, "tokens_seen": 179765248 }, { "epoch": 0.05, "learning_rate": 0.00047753259779338014, "loss": 3.3663, "theoretical_loss": 4.414121853752466, "tokens_seen": 179830784 }, { "epoch": 0.05, "learning_rate": 0.0004775225677031093, "loss": 3.5145, "theoretical_loss": 4.413917286826205, "tokens_seen": 179896320 }, { "epoch": 0.05, "learning_rate": 0.0004775125376128385, "loss": 3.4654, "theoretical_loss": 4.413712815267752, "tokens_seen": 179961856 }, { "epoch": 0.05, "learning_rate": 0.0004775025075225677, "loss": 3.4731, "theoretical_loss": 4.413508438997944, "tokens_seen": 180027392 }, { "epoch": 0.05, "learning_rate": 0.0004774924774322969, "loss": 3.5194, "theoretical_loss": 4.4133041579377075, "tokens_seen": 180092928 }, { "epoch": 0.05, "learning_rate": 0.00047748244734202605, "loss": 3.4778, "theoretical_loss": 4.413099972008068, "tokens_seen": 180158464 }, { "epoch": 0.05, "objective/train/docs_used": 317009, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.7935869693756104, "objective/train/theoretical_loss": 4.412895881130142, "objective/train/tokens_used": 200684000, "theoretical_loss": 4.412895881130142, "tokens_seen": 180224000 }, { "epoch": 0.05, "learning_rate": 0.0004774724172517553, "loss": 3.3358, "theoretical_loss": 4.412895881130142, "tokens_seen": 180224000 }, { "epoch": 0.05, "learning_rate": 0.0004774623871614845, "loss": 3.4082, "theoretical_loss": 4.412691885225141, "tokens_seen": 180289536 }, { "epoch": 0.05, "learning_rate": 0.00047745235707121364, "loss": 3.3752, "theoretical_loss": 4.412487984214373, "tokens_seen": 180355072 }, { "epoch": 0.05, "learning_rate": 0.0004774423269809429, "loss": 3.4927, "theoretical_loss": 4.412284178019235, "tokens_seen": 180420608 }, { "epoch": 0.05, "learning_rate": 0.000477432296890672, "loss": 3.307, "theoretical_loss": 4.412080466561221, "tokens_seen": 180486144 }, { "epoch": 0.05, "learning_rate": 0.00047742226680040124, "loss": 3.5782, "theoretical_loss": 4.411876849761917, "tokens_seen": 180551680 }, { "epoch": 0.05, "learning_rate": 0.0004774122367101304, "loss": 3.629, "theoretical_loss": 4.411673327543005, "tokens_seen": 180617216 }, { "epoch": 0.05, "learning_rate": 0.0004774022066198596, "loss": 3.5315, "theoretical_loss": 4.4114698998262565, "tokens_seen": 180682752 }, { "epoch": 0.05, "learning_rate": 0.0004773921765295888, "loss": 3.5319, "theoretical_loss": 4.411266566533539, "tokens_seen": 180748288 }, { "epoch": 0.05, "learning_rate": 0.00047738214643931797, "loss": 3.4153, "theoretical_loss": 4.41106332758681, "tokens_seen": 180813824 }, { "epoch": 0.05, "learning_rate": 0.00047737211634904715, "loss": 3.5485, "theoretical_loss": 4.41086018290812, "tokens_seen": 180879360 }, { "epoch": 0.05, "learning_rate": 0.0004773620862587764, "loss": 3.5068, "theoretical_loss": 4.410657132419617, "tokens_seen": 180944896 }, { "epoch": 0.05, "learning_rate": 0.0004773520561685055, "loss": 3.5254, "theoretical_loss": 4.410454176043537, "tokens_seen": 181010432 }, { "epoch": 0.05, "learning_rate": 0.00047734202607823474, "loss": 3.3199, "theoretical_loss": 4.410251313702208, "tokens_seen": 181075968 }, { "epoch": 0.05, "learning_rate": 0.00047733199598796387, "loss": 3.4552, "theoretical_loss": 4.410048545318052, "tokens_seen": 181141504 }, { "epoch": 0.05, "learning_rate": 0.0004773219658976931, "loss": 3.4009, "theoretical_loss": 4.409845870813582, "tokens_seen": 181207040 }, { "epoch": 0.05, "learning_rate": 0.0004773119358074223, "loss": 3.3785, "theoretical_loss": 4.409643290111404, "tokens_seen": 181272576 }, { "epoch": 0.05, "learning_rate": 0.00047730190571715147, "loss": 3.5304, "theoretical_loss": 4.409440803134215, "tokens_seen": 181338112 }, { "epoch": 0.05, "learning_rate": 0.00047729187562688065, "loss": 3.4502, "theoretical_loss": 4.409238409804804, "tokens_seen": 181403648 }, { "epoch": 0.05, "learning_rate": 0.0004772818455366099, "loss": 3.3637, "theoretical_loss": 4.409036110046051, "tokens_seen": 181469184 }, { "epoch": 0.06, "learning_rate": 0.000477271815446339, "loss": 3.4601, "theoretical_loss": 4.408833903780926, "tokens_seen": 181534720 }, { "epoch": 0.06, "learning_rate": 0.00047726178535606825, "loss": 3.5468, "theoretical_loss": 4.408631790932494, "tokens_seen": 181600256 }, { "epoch": 0.06, "learning_rate": 0.0004772517552657974, "loss": 3.332, "theoretical_loss": 4.408429771423909, "tokens_seen": 181665792 }, { "epoch": 0.06, "learning_rate": 0.0004772417251755266, "loss": 3.3655, "theoretical_loss": 4.408227845178414, "tokens_seen": 181731328 }, { "epoch": 0.06, "learning_rate": 0.0004772316950852558, "loss": 3.4816, "theoretical_loss": 4.408026012119344, "tokens_seen": 181796864 }, { "epoch": 0.06, "objective/train/docs_used": 319880, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4623184204101562, "objective/train/theoretical_loss": 4.407824272170128, "objective/train/tokens_used": 202322400, "theoretical_loss": 4.407824272170128, "tokens_seen": 181862400 }, { "epoch": 0.06, "learning_rate": 0.00047722166499498497, "loss": 3.4163, "theoretical_loss": 4.407824272170128, "tokens_seen": 181862400 }, { "epoch": 0.06, "learning_rate": 0.00047721163490471415, "loss": 3.2812, "theoretical_loss": 4.407622625254279, "tokens_seen": 181927936 }, { "epoch": 0.06, "learning_rate": 0.00047720160481444333, "loss": 3.6037, "theoretical_loss": 4.407421071295406, "tokens_seen": 181993472 }, { "epoch": 0.06, "learning_rate": 0.0004771915747241725, "loss": 3.5599, "theoretical_loss": 4.407219610217206, "tokens_seen": 182059008 }, { "epoch": 0.06, "learning_rate": 0.00047718154463390175, "loss": 3.1789, "theoretical_loss": 4.407018241943467, "tokens_seen": 182124544 }, { "epoch": 0.06, "learning_rate": 0.0004771715145436309, "loss": 3.4999, "theoretical_loss": 4.406816966398064, "tokens_seen": 182190080 }, { "epoch": 0.06, "learning_rate": 0.0004771614844533601, "loss": 3.3004, "theoretical_loss": 4.406615783504965, "tokens_seen": 182255616 }, { "epoch": 0.06, "learning_rate": 0.00047715145436308924, "loss": 3.614, "theoretical_loss": 4.4064146931882275, "tokens_seen": 182321152 }, { "epoch": 0.06, "learning_rate": 0.0004771414242728185, "loss": 3.4806, "theoretical_loss": 4.406213695371996, "tokens_seen": 182386688 }, { "epoch": 0.06, "learning_rate": 0.00047713139418254766, "loss": 3.4724, "theoretical_loss": 4.406012789980506, "tokens_seen": 182452224 }, { "epoch": 0.06, "learning_rate": 0.00047712136409227684, "loss": 3.3861, "theoretical_loss": 4.405811976938084, "tokens_seen": 182517760 }, { "epoch": 0.06, "learning_rate": 0.000477111334002006, "loss": 3.5195, "theoretical_loss": 4.405611256169143, "tokens_seen": 182583296 }, { "epoch": 0.06, "learning_rate": 0.00047710130391173525, "loss": 3.3626, "theoretical_loss": 4.405410627598185, "tokens_seen": 182648832 }, { "epoch": 0.06, "learning_rate": 0.0004770912738214644, "loss": 3.4743, "theoretical_loss": 4.405210091149802, "tokens_seen": 182714368 }, { "epoch": 0.06, "learning_rate": 0.0004770812437311936, "loss": 3.5629, "theoretical_loss": 4.405009646748674, "tokens_seen": 182779904 }, { "epoch": 0.06, "learning_rate": 0.00047707121364092274, "loss": 3.506, "theoretical_loss": 4.404809294319572, "tokens_seen": 182845440 }, { "epoch": 0.06, "learning_rate": 0.000477061183550652, "loss": 3.3131, "theoretical_loss": 4.40460903378735, "tokens_seen": 182910976 }, { "epoch": 0.06, "learning_rate": 0.00047705115346038116, "loss": 3.6122, "theoretical_loss": 4.404408865076955, "tokens_seen": 182976512 }, { "epoch": 0.06, "learning_rate": 0.00047704112337011034, "loss": 3.4927, "theoretical_loss": 4.404208788113422, "tokens_seen": 183042048 }, { "epoch": 0.06, "learning_rate": 0.0004770310932798395, "loss": 3.5265, "theoretical_loss": 4.404008802821871, "tokens_seen": 183107584 }, { "epoch": 0.06, "learning_rate": 0.0004770210631895687, "loss": 3.4492, "theoretical_loss": 4.4038089091275125, "tokens_seen": 183173120 }, { "epoch": 0.06, "learning_rate": 0.0004770110330992979, "loss": 3.4915, "theoretical_loss": 4.403609106955645, "tokens_seen": 183238656 }, { "epoch": 0.06, "learning_rate": 0.0004770010030090271, "loss": 3.5957, "theoretical_loss": 4.403409396231651, "tokens_seen": 183304192 }, { "epoch": 0.06, "learning_rate": 0.00047699097291875625, "loss": 3.3896, "theoretical_loss": 4.403209776881004, "tokens_seen": 183369728 }, { "epoch": 0.06, "learning_rate": 0.0004769809428284855, "loss": 3.4793, "theoretical_loss": 4.403010248829265, "tokens_seen": 183435264 }, { "epoch": 0.06, "objective/train/docs_used": 322732, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3233094215393066, "objective/train/theoretical_loss": 4.4028108120020795, "objective/train/tokens_used": 203960800, "theoretical_loss": 4.4028108120020795, "tokens_seen": 183500800 }, { "epoch": 0.06, "learning_rate": 0.00047697091273821466, "loss": 3.413, "theoretical_loss": 4.4028108120020795, "tokens_seen": 183500800 }, { "epoch": 0.06, "learning_rate": 0.00047696088264794384, "loss": 3.3942, "theoretical_loss": 4.402611466325182, "tokens_seen": 183566336 }, { "epoch": 0.06, "learning_rate": 0.000476950852557673, "loss": 3.504, "theoretical_loss": 4.4024122117243945, "tokens_seen": 183631872 }, { "epoch": 0.06, "learning_rate": 0.0004769408224674022, "loss": 3.2974, "theoretical_loss": 4.402213048125624, "tokens_seen": 183697408 }, { "epoch": 0.06, "learning_rate": 0.0004769307923771314, "loss": 3.4921, "theoretical_loss": 4.4020139754548655, "tokens_seen": 183762944 }, { "epoch": 0.06, "learning_rate": 0.0004769207622868606, "loss": 3.3395, "theoretical_loss": 4.401814993638199, "tokens_seen": 183828480 }, { "epoch": 0.06, "learning_rate": 0.00047691073219658975, "loss": 3.3293, "theoretical_loss": 4.4016161026017935, "tokens_seen": 183894016 }, { "epoch": 0.06, "learning_rate": 0.000476900702106319, "loss": 3.329, "theoretical_loss": 4.401417302271902, "tokens_seen": 183959552 }, { "epoch": 0.06, "learning_rate": 0.0004768906720160481, "loss": 3.3762, "theoretical_loss": 4.401218592574865, "tokens_seen": 184025088 }, { "epoch": 0.06, "learning_rate": 0.00047688064192577735, "loss": 3.3754, "theoretical_loss": 4.401019973437108, "tokens_seen": 184090624 }, { "epoch": 0.06, "learning_rate": 0.00047687061183550653, "loss": 3.3238, "theoretical_loss": 4.400821444785143, "tokens_seen": 184156160 }, { "epoch": 0.06, "learning_rate": 0.0004768605817452357, "loss": 3.2853, "theoretical_loss": 4.400623006545567, "tokens_seen": 184221696 }, { "epoch": 0.06, "learning_rate": 0.0004768505516549649, "loss": 3.4366, "theoretical_loss": 4.400424658645065, "tokens_seen": 184287232 }, { "epoch": 0.06, "learning_rate": 0.00047684052156469407, "loss": 3.3863, "theoretical_loss": 4.400226401010404, "tokens_seen": 184352768 }, { "epoch": 0.06, "learning_rate": 0.00047683049147442325, "loss": 3.5199, "theoretical_loss": 4.40002823356844, "tokens_seen": 184418304 }, { "epoch": 0.06, "learning_rate": 0.0004768204613841525, "loss": 3.4097, "theoretical_loss": 4.39983015624611, "tokens_seen": 184483840 }, { "epoch": 0.06, "learning_rate": 0.0004768104312938816, "loss": 3.2923, "theoretical_loss": 4.39963216897044, "tokens_seen": 184549376 }, { "epoch": 0.06, "learning_rate": 0.00047680040120361085, "loss": 3.3514, "theoretical_loss": 4.3994342716685395, "tokens_seen": 184614912 }, { "epoch": 0.06, "learning_rate": 0.00047679037111334003, "loss": 3.4307, "theoretical_loss": 4.399236464267602, "tokens_seen": 184680448 }, { "epoch": 0.06, "learning_rate": 0.0004767803410230692, "loss": 3.6546, "theoretical_loss": 4.399038746694908, "tokens_seen": 184745984 }, { "epoch": 0.06, "learning_rate": 0.0004767703109327984, "loss": 3.4967, "theoretical_loss": 4.398841118877819, "tokens_seen": 184811520 }, { "epoch": 0.06, "learning_rate": 0.0004767602808425276, "loss": 3.3676, "theoretical_loss": 4.398643580743785, "tokens_seen": 184877056 }, { "epoch": 0.06, "learning_rate": 0.00047675025075225676, "loss": 3.2635, "theoretical_loss": 4.398446132220338, "tokens_seen": 184942592 }, { "epoch": 0.06, "learning_rate": 0.000476740220661986, "loss": 3.2288, "theoretical_loss": 4.3982487732350934, "tokens_seen": 185008128 }, { "epoch": 0.06, "learning_rate": 0.0004767301905717151, "loss": 3.366, "theoretical_loss": 4.398051503715753, "tokens_seen": 185073664 }, { "epoch": 0.06, "objective/train/docs_used": 325427, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.109199285507202, "objective/train/theoretical_loss": 4.397854323590102, "objective/train/tokens_used": 205599200, "theoretical_loss": 4.397854323590102, "tokens_seen": 185139200 }, { "epoch": 0.06, "learning_rate": 0.00047672016048144435, "loss": 3.3711, "theoretical_loss": 4.397854323590102, "tokens_seen": 185139200 }, { "epoch": 0.06, "learning_rate": 0.00047671013039117353, "loss": 3.4974, "theoretical_loss": 4.397657232786008, "tokens_seen": 185204736 }, { "epoch": 0.06, "learning_rate": 0.0004767001003009027, "loss": 3.2722, "theoretical_loss": 4.397460231231424, "tokens_seen": 185270272 }, { "epoch": 0.06, "learning_rate": 0.00047669007021063195, "loss": 3.4263, "theoretical_loss": 4.397263318854384, "tokens_seen": 185335808 }, { "epoch": 0.06, "learning_rate": 0.0004766800401203611, "loss": 3.3948, "theoretical_loss": 4.39706649558301, "tokens_seen": 185401344 }, { "epoch": 0.06, "learning_rate": 0.0004766700100300903, "loss": 3.2296, "theoretical_loss": 4.396869761345503, "tokens_seen": 185466880 }, { "epoch": 0.06, "learning_rate": 0.00047665997993981944, "loss": 3.4077, "theoretical_loss": 4.396673116070147, "tokens_seen": 185532416 }, { "epoch": 0.06, "learning_rate": 0.0004766499498495487, "loss": 3.3954, "theoretical_loss": 4.396476559685315, "tokens_seen": 185597952 }, { "epoch": 0.06, "learning_rate": 0.00047663991975927786, "loss": 3.3999, "theoretical_loss": 4.396280092119455, "tokens_seen": 185663488 }, { "epoch": 0.06, "learning_rate": 0.00047662988966900704, "loss": 3.4481, "theoretical_loss": 4.3960837133011035, "tokens_seen": 185729024 }, { "epoch": 0.06, "learning_rate": 0.0004766198595787362, "loss": 3.5451, "theoretical_loss": 4.395887423158877, "tokens_seen": 185794560 }, { "epoch": 0.06, "learning_rate": 0.00047660982948846545, "loss": 3.4626, "theoretical_loss": 4.395691221621476, "tokens_seen": 185860096 }, { "epoch": 0.06, "learning_rate": 0.0004765997993981946, "loss": 3.5543, "theoretical_loss": 4.395495108617682, "tokens_seen": 185925632 }, { "epoch": 0.06, "learning_rate": 0.0004765897693079238, "loss": 3.3749, "theoretical_loss": 4.39529908407636, "tokens_seen": 185991168 }, { "epoch": 0.06, "learning_rate": 0.00047657973921765294, "loss": 3.4035, "theoretical_loss": 4.3951031479264575, "tokens_seen": 186056704 }, { "epoch": 0.06, "learning_rate": 0.0004765697091273822, "loss": 3.3592, "theoretical_loss": 4.394907300097002, "tokens_seen": 186122240 }, { "epoch": 0.06, "learning_rate": 0.00047655967903711136, "loss": 3.297, "theoretical_loss": 4.394711540517106, "tokens_seen": 186187776 }, { "epoch": 0.06, "learning_rate": 0.00047654964894684054, "loss": 3.3343, "theoretical_loss": 4.39451586911596, "tokens_seen": 186253312 }, { "epoch": 0.06, "learning_rate": 0.0004765396188565697, "loss": 3.437, "theoretical_loss": 4.39432028582284, "tokens_seen": 186318848 }, { "epoch": 0.06, "learning_rate": 0.0004765295887662989, "loss": 3.5463, "theoretical_loss": 4.394124790567101, "tokens_seen": 186384384 }, { "epoch": 0.06, "learning_rate": 0.0004765195586760281, "loss": 3.5518, "theoretical_loss": 4.3939293832781825, "tokens_seen": 186449920 }, { "epoch": 0.06, "learning_rate": 0.0004765095285857573, "loss": 3.5526, "theoretical_loss": 4.393734063885599, "tokens_seen": 186515456 }, { "epoch": 0.06, "learning_rate": 0.00047649949849548645, "loss": 3.5388, "theoretical_loss": 4.3935388323189555, "tokens_seen": 186580992 }, { "epoch": 0.06, "learning_rate": 0.0004764894684052157, "loss": 3.4658, "theoretical_loss": 4.39334368850793, "tokens_seen": 186646528 }, { "epoch": 0.06, "learning_rate": 0.00047647943831494486, "loss": 3.3955, "theoretical_loss": 4.3931486323822835, "tokens_seen": 186712064 }, { "epoch": 0.06, "objective/train/docs_used": 328230, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.675567388534546, "objective/train/theoretical_loss": 4.392953663871862, "objective/train/tokens_used": 207237600, "theoretical_loss": 4.392953663871862, "tokens_seen": 186777600 }, { "epoch": 0.06, "learning_rate": 0.00047646940822467404, "loss": 3.5666, "theoretical_loss": 4.392953663871862, "tokens_seen": 186777600 }, { "epoch": 0.06, "learning_rate": 0.0004764593781344032, "loss": 3.2495, "theoretical_loss": 4.392758782906586, "tokens_seen": 186843136 }, { "epoch": 0.06, "learning_rate": 0.0004764493480441324, "loss": 3.459, "theoretical_loss": 4.392563989416462, "tokens_seen": 186908672 }, { "epoch": 0.06, "learning_rate": 0.0004764393179538616, "loss": 3.3613, "theoretical_loss": 4.392369283331574, "tokens_seen": 186974208 }, { "epoch": 0.06, "learning_rate": 0.0004764292878635908, "loss": 3.4318, "theoretical_loss": 4.392174664582085, "tokens_seen": 187039744 }, { "epoch": 0.06, "learning_rate": 0.00047641925777331995, "loss": 3.1486, "theoretical_loss": 4.391980133098244, "tokens_seen": 187105280 }, { "epoch": 0.06, "learning_rate": 0.0004764092276830492, "loss": 3.5893, "theoretical_loss": 4.391785688810373, "tokens_seen": 187170816 }, { "epoch": 0.06, "learning_rate": 0.0004763991975927783, "loss": 3.4881, "theoretical_loss": 4.391591331648879, "tokens_seen": 187236352 }, { "epoch": 0.06, "learning_rate": 0.00047638916750250755, "loss": 3.4528, "theoretical_loss": 4.391397061544247, "tokens_seen": 187301888 }, { "epoch": 0.06, "learning_rate": 0.00047637913741223673, "loss": 3.3992, "theoretical_loss": 4.391202878427042, "tokens_seen": 187367424 }, { "epoch": 0.06, "learning_rate": 0.0004763691073219659, "loss": 3.4703, "theoretical_loss": 4.3910087822279085, "tokens_seen": 187432960 }, { "epoch": 0.06, "learning_rate": 0.0004763590772316951, "loss": 3.5077, "theoretical_loss": 4.390814772877571, "tokens_seen": 187498496 }, { "epoch": 0.06, "learning_rate": 0.00047634904714142427, "loss": 3.2267, "theoretical_loss": 4.390620850306832, "tokens_seen": 187564032 }, { "epoch": 0.06, "learning_rate": 0.00047633901705115345, "loss": 3.6392, "theoretical_loss": 4.390427014446575, "tokens_seen": 187629568 }, { "epoch": 0.06, "learning_rate": 0.0004763289869608827, "loss": 3.3717, "theoretical_loss": 4.390233265227764, "tokens_seen": 187695104 }, { "epoch": 0.06, "learning_rate": 0.0004763189568706118, "loss": 3.496, "theoretical_loss": 4.390039602581437, "tokens_seen": 187760640 }, { "epoch": 0.06, "learning_rate": 0.00047630892678034105, "loss": 3.5921, "theoretical_loss": 4.389846026438715, "tokens_seen": 187826176 }, { "epoch": 0.06, "learning_rate": 0.00047629889669007023, "loss": 3.3335, "theoretical_loss": 4.3896525367307975, "tokens_seen": 187891712 }, { "epoch": 0.06, "learning_rate": 0.0004762888665997994, "loss": 3.3289, "theoretical_loss": 4.389459133388962, "tokens_seen": 187957248 }, { "epoch": 0.06, "learning_rate": 0.0004762788365095286, "loss": 3.6506, "theoretical_loss": 4.3892658163445635, "tokens_seen": 188022784 }, { "epoch": 0.06, "learning_rate": 0.0004762688064192578, "loss": 3.5205, "theoretical_loss": 4.389072585529037, "tokens_seen": 188088320 }, { "epoch": 0.06, "learning_rate": 0.00047625877632898696, "loss": 3.5344, "theoretical_loss": 4.388879440873897, "tokens_seen": 188153856 }, { "epoch": 0.06, "learning_rate": 0.0004762487462387162, "loss": 3.4977, "theoretical_loss": 4.388686382310732, "tokens_seen": 188219392 }, { "epoch": 0.06, "learning_rate": 0.0004762387161484453, "loss": 3.5253, "theoretical_loss": 4.388493409771213, "tokens_seen": 188284928 }, { "epoch": 0.06, "learning_rate": 0.00047622868605817455, "loss": 3.4185, "theoretical_loss": 4.388300523187087, "tokens_seen": 188350464 }, { "epoch": 0.06, "objective/train/docs_used": 330800, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9801299571990967, "objective/train/theoretical_loss": 4.3881077224901786, "objective/train/tokens_used": 208876000, "theoretical_loss": 4.3881077224901786, "tokens_seen": 188416000 }, { "epoch": 0.06, "learning_rate": 0.0004762186559679037, "loss": 3.4347, "theoretical_loss": 4.3881077224901786, "tokens_seen": 188416000 }, { "epoch": 0.06, "learning_rate": 0.0004762086258776329, "loss": 3.4261, "theoretical_loss": 4.38791500761239, "tokens_seen": 188481536 }, { "epoch": 0.06, "learning_rate": 0.0004761985957873621, "loss": 3.3204, "theoretical_loss": 4.387722378485703, "tokens_seen": 188547072 }, { "epoch": 0.06, "learning_rate": 0.0004761885656970913, "loss": 3.4121, "theoretical_loss": 4.3875298350421765, "tokens_seen": 188612608 }, { "epoch": 0.06, "learning_rate": 0.00047617853560682046, "loss": 3.5114, "theoretical_loss": 4.387337377213943, "tokens_seen": 188678144 }, { "epoch": 0.06, "learning_rate": 0.00047616850551654964, "loss": 3.7438, "theoretical_loss": 4.387145004933218, "tokens_seen": 188743680 }, { "epoch": 0.06, "learning_rate": 0.0004761584754262788, "loss": 3.4074, "theoretical_loss": 4.38695271813229, "tokens_seen": 188809216 }, { "epoch": 0.06, "learning_rate": 0.00047614844533600806, "loss": 3.4627, "theoretical_loss": 4.386760516743526, "tokens_seen": 188874752 }, { "epoch": 0.06, "learning_rate": 0.0004761384152457372, "loss": 3.3464, "theoretical_loss": 4.38656840069937, "tokens_seen": 188940288 }, { "epoch": 0.06, "learning_rate": 0.0004761283851554664, "loss": 3.3674, "theoretical_loss": 4.386376369932344, "tokens_seen": 189005824 }, { "epoch": 0.06, "learning_rate": 0.0004761183550651956, "loss": 3.3807, "theoretical_loss": 4.386184424375044, "tokens_seen": 189071360 }, { "epoch": 0.06, "learning_rate": 0.0004761083249749248, "loss": 3.3437, "theoretical_loss": 4.385992563960145, "tokens_seen": 189136896 }, { "epoch": 0.06, "learning_rate": 0.00047609829488465396, "loss": 3.359, "theoretical_loss": 4.385800788620397, "tokens_seen": 189202432 }, { "epoch": 0.06, "learning_rate": 0.00047608826479438314, "loss": 3.5368, "theoretical_loss": 4.385609098288628, "tokens_seen": 189267968 }, { "epoch": 0.06, "learning_rate": 0.0004760782347041123, "loss": 3.5303, "theoretical_loss": 4.385417492897741, "tokens_seen": 189333504 }, { "epoch": 0.06, "learning_rate": 0.00047606820461384156, "loss": 3.534, "theoretical_loss": 4.385225972380715, "tokens_seen": 189399040 }, { "epoch": 0.06, "learning_rate": 0.0004760581745235707, "loss": 3.3872, "theoretical_loss": 4.385034536670606, "tokens_seen": 189464576 }, { "epoch": 0.06, "learning_rate": 0.0004760481444332999, "loss": 3.4469, "theoretical_loss": 4.384843185700544, "tokens_seen": 189530112 }, { "epoch": 0.06, "learning_rate": 0.00047603811434302905, "loss": 3.2936, "theoretical_loss": 4.384651919403739, "tokens_seen": 189595648 }, { "epoch": 0.06, "learning_rate": 0.0004760280842527583, "loss": 3.3289, "theoretical_loss": 4.384460737713471, "tokens_seen": 189661184 }, { "epoch": 0.06, "learning_rate": 0.00047601805416248746, "loss": 3.4676, "theoretical_loss": 4.384269640563101, "tokens_seen": 189726720 }, { "epoch": 0.06, "learning_rate": 0.00047600802407221665, "loss": 3.5975, "theoretical_loss": 4.384078627886062, "tokens_seen": 189792256 }, { "epoch": 0.06, "learning_rate": 0.0004759979939819458, "loss": 3.3608, "theoretical_loss": 4.383887699615863, "tokens_seen": 189857792 }, { "epoch": 0.06, "learning_rate": 0.00047598796389167506, "loss": 3.3944, "theoretical_loss": 4.38369685568609, "tokens_seen": 189923328 }, { "epoch": 0.06, "learning_rate": 0.0004759779338014042, "loss": 3.4298, "theoretical_loss": 4.383506096030401, "tokens_seen": 189988864 }, { "epoch": 0.06, "objective/train/docs_used": 333650, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3905723094940186, "objective/train/theoretical_loss": 4.383315420582533, "objective/train/tokens_used": 210514400, "theoretical_loss": 4.383315420582533, "tokens_seen": 190054400 }, { "epoch": 0.06, "learning_rate": 0.0004759679037111334, "loss": 3.4421, "theoretical_loss": 4.383315420582533, "tokens_seen": 190054400 }, { "epoch": 0.06, "learning_rate": 0.0004759578736208626, "loss": 3.2576, "theoretical_loss": 4.383124829276294, "tokens_seen": 190119936 }, { "epoch": 0.06, "learning_rate": 0.0004759478435305918, "loss": 3.4529, "theoretical_loss": 4.38293432204557, "tokens_seen": 190185472 }, { "epoch": 0.06, "learning_rate": 0.000475937813440321, "loss": 3.3987, "theoretical_loss": 4.382743898824321, "tokens_seen": 190251008 }, { "epoch": 0.06, "learning_rate": 0.00047592778335005015, "loss": 3.3349, "theoretical_loss": 4.3825535595465785, "tokens_seen": 190316544 }, { "epoch": 0.06, "learning_rate": 0.0004759177532597794, "loss": 3.4103, "theoretical_loss": 4.382363304146453, "tokens_seen": 190382080 }, { "epoch": 0.06, "learning_rate": 0.0004759077231695085, "loss": 3.3401, "theoretical_loss": 4.382173132558126, "tokens_seen": 190447616 }, { "epoch": 0.06, "learning_rate": 0.00047589769307923775, "loss": 3.5074, "theoretical_loss": 4.381983044715856, "tokens_seen": 190513152 }, { "epoch": 0.06, "learning_rate": 0.00047588766298896693, "loss": 3.4211, "theoretical_loss": 4.381793040553973, "tokens_seen": 190578688 }, { "epoch": 0.06, "learning_rate": 0.0004758776328986961, "loss": 3.553, "theoretical_loss": 4.381603120006883, "tokens_seen": 190644224 }, { "epoch": 0.06, "learning_rate": 0.0004758676028084253, "loss": 3.3657, "theoretical_loss": 4.381413283009065, "tokens_seen": 190709760 }, { "epoch": 0.06, "learning_rate": 0.00047585757271815447, "loss": 3.5879, "theoretical_loss": 4.381223529495073, "tokens_seen": 190775296 }, { "epoch": 0.06, "learning_rate": 0.00047584754262788365, "loss": 3.4274, "theoretical_loss": 4.381033859399532, "tokens_seen": 190840832 }, { "epoch": 0.06, "learning_rate": 0.0004758375125376129, "loss": 3.2747, "theoretical_loss": 4.380844272657145, "tokens_seen": 190906368 }, { "epoch": 0.06, "learning_rate": 0.000475827482447342, "loss": 3.4379, "theoretical_loss": 4.380654769202683, "tokens_seen": 190971904 }, { "epoch": 0.06, "learning_rate": 0.00047581745235707125, "loss": 3.4268, "theoretical_loss": 4.380465348970995, "tokens_seen": 191037440 }, { "epoch": 0.06, "learning_rate": 0.00047580742226680043, "loss": 3.429, "theoretical_loss": 4.380276011897003, "tokens_seen": 191102976 }, { "epoch": 0.06, "learning_rate": 0.0004757973921765296, "loss": 3.4451, "theoretical_loss": 4.380086757915698, "tokens_seen": 191168512 }, { "epoch": 0.06, "learning_rate": 0.0004757873620862588, "loss": 3.4912, "theoretical_loss": 4.379897586962148, "tokens_seen": 191234048 }, { "epoch": 0.06, "learning_rate": 0.000475777331995988, "loss": 3.5713, "theoretical_loss": 4.379708498971494, "tokens_seen": 191299584 }, { "epoch": 0.06, "learning_rate": 0.00047576730190571716, "loss": 3.3917, "theoretical_loss": 4.379519493878948, "tokens_seen": 191365120 }, { "epoch": 0.06, "learning_rate": 0.0004757572718154464, "loss": 3.1626, "theoretical_loss": 4.379330571619795, "tokens_seen": 191430656 }, { "epoch": 0.06, "learning_rate": 0.0004757472417251755, "loss": 3.2857, "theoretical_loss": 4.379141732129394, "tokens_seen": 191496192 }, { "epoch": 0.06, "learning_rate": 0.00047573721163490475, "loss": 3.5271, "theoretical_loss": 4.378952975343175, "tokens_seen": 191561728 }, { "epoch": 0.06, "learning_rate": 0.0004757271815446339, "loss": 3.337, "theoretical_loss": 4.378764301196642, "tokens_seen": 191627264 }, { "epoch": 0.06, "objective/train/docs_used": 334982, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.534613847732544, "objective/train/theoretical_loss": 4.37857570962537, "objective/train/tokens_used": 212152800, "theoretical_loss": 4.37857570962537, "tokens_seen": 191692800 }, { "epoch": 0.06, "learning_rate": 0.0004757171514543631, "loss": 3.4024, "theoretical_loss": 4.37857570962537, "tokens_seen": 191692800 }, { "epoch": 0.06, "learning_rate": 0.0004757071213640923, "loss": 3.4732, "theoretical_loss": 4.378387200565006, "tokens_seen": 191758336 }, { "epoch": 0.06, "learning_rate": 0.0004756970912738215, "loss": 3.2767, "theoretical_loss": 4.378198773951272, "tokens_seen": 191823872 }, { "epoch": 0.06, "learning_rate": 0.00047568706118355066, "loss": 3.3449, "theoretical_loss": 4.378010429719957, "tokens_seen": 191889408 }, { "epoch": 0.06, "learning_rate": 0.00047567703109327984, "loss": 3.523, "theoretical_loss": 4.377822167806928, "tokens_seen": 191954944 }, { "epoch": 0.06, "learning_rate": 0.000475667001003009, "loss": 3.3455, "theoretical_loss": 4.377633988148117, "tokens_seen": 192020480 }, { "epoch": 0.06, "learning_rate": 0.00047565697091273826, "loss": 3.3683, "theoretical_loss": 4.377445890679534, "tokens_seen": 192086016 }, { "epoch": 0.06, "learning_rate": 0.0004756469408224674, "loss": 3.5404, "theoretical_loss": 4.377257875337257, "tokens_seen": 192151552 }, { "epoch": 0.06, "learning_rate": 0.0004756369107321966, "loss": 3.476, "theoretical_loss": 4.377069942057436, "tokens_seen": 192217088 }, { "epoch": 0.06, "learning_rate": 0.0004756268806419258, "loss": 3.4115, "theoretical_loss": 4.376882090776293, "tokens_seen": 192282624 }, { "epoch": 0.06, "learning_rate": 0.000475616850551655, "loss": 3.5594, "theoretical_loss": 4.376694321430121, "tokens_seen": 192348160 }, { "epoch": 0.06, "learning_rate": 0.00047560682046138416, "loss": 3.4661, "theoretical_loss": 4.376506633955286, "tokens_seen": 192413696 }, { "epoch": 0.06, "learning_rate": 0.00047559679037111334, "loss": 3.3045, "theoretical_loss": 4.376319028288219, "tokens_seen": 192479232 }, { "epoch": 0.06, "learning_rate": 0.0004755867602808425, "loss": 3.5353, "theoretical_loss": 4.37613150436543, "tokens_seen": 192544768 }, { "epoch": 0.06, "learning_rate": 0.00047557673019057176, "loss": 3.6184, "theoretical_loss": 4.375944062123496, "tokens_seen": 192610304 }, { "epoch": 0.06, "learning_rate": 0.0004755667001003009, "loss": 3.5661, "theoretical_loss": 4.375756701499063, "tokens_seen": 192675840 }, { "epoch": 0.06, "learning_rate": 0.0004755566700100301, "loss": 3.447, "theoretical_loss": 4.3755694224288515, "tokens_seen": 192741376 }, { "epoch": 0.06, "learning_rate": 0.00047554663991975925, "loss": 3.5665, "theoretical_loss": 4.375382224849648, "tokens_seen": 192806912 }, { "epoch": 0.06, "learning_rate": 0.0004755366098294885, "loss": 3.3279, "theoretical_loss": 4.375195108698316, "tokens_seen": 192872448 }, { "epoch": 0.06, "learning_rate": 0.00047552657973921766, "loss": 3.368, "theoretical_loss": 4.375008073911781, "tokens_seen": 192937984 }, { "epoch": 0.06, "learning_rate": 0.00047551654964894685, "loss": 3.4481, "theoretical_loss": 4.374821120427047, "tokens_seen": 193003520 }, { "epoch": 0.06, "learning_rate": 0.00047550651955867603, "loss": 3.4416, "theoretical_loss": 4.374634248181182, "tokens_seen": 193069056 }, { "epoch": 0.06, "learning_rate": 0.00047549648946840526, "loss": 3.3657, "theoretical_loss": 4.3744474571113265, "tokens_seen": 193134592 }, { "epoch": 0.06, "learning_rate": 0.0004754864593781344, "loss": 3.4649, "theoretical_loss": 4.374260747154692, "tokens_seen": 193200128 }, { "epoch": 0.06, "learning_rate": 0.0004754764292878636, "loss": 3.426, "theoretical_loss": 4.374074118248559, "tokens_seen": 193265664 }, { "epoch": 0.06, "objective/train/docs_used": 337587, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.365250825881958, "objective/train/theoretical_loss": 4.373887570330275, "objective/train/tokens_used": 213791200, "theoretical_loss": 4.373887570330275, "tokens_seen": 193331200 }, { "epoch": 0.06, "learning_rate": 0.00047546639919759275, "loss": 3.4069, "theoretical_loss": 4.373887570330275, "tokens_seen": 193331200 }, { "epoch": 0.06, "learning_rate": 0.000475456369107322, "loss": 3.5389, "theoretical_loss": 4.373701103337263, "tokens_seen": 193396736 }, { "epoch": 0.06, "learning_rate": 0.00047544633901705117, "loss": 3.4074, "theoretical_loss": 4.373514717207009, "tokens_seen": 193462272 }, { "epoch": 0.06, "learning_rate": 0.00047543630892678035, "loss": 3.4023, "theoretical_loss": 4.373328411877073, "tokens_seen": 193527808 }, { "epoch": 0.06, "learning_rate": 0.00047542627883650953, "loss": 3.3, "theoretical_loss": 4.373142187285083, "tokens_seen": 193593344 }, { "epoch": 0.06, "learning_rate": 0.0004754162487462387, "loss": 3.251, "theoretical_loss": 4.372956043368736, "tokens_seen": 193658880 }, { "epoch": 0.06, "learning_rate": 0.0004754062186559679, "loss": 3.5406, "theoretical_loss": 4.372769980065797, "tokens_seen": 193724416 }, { "epoch": 0.06, "learning_rate": 0.00047539618856569713, "loss": 3.4364, "theoretical_loss": 4.372583997314104, "tokens_seen": 193789952 }, { "epoch": 0.06, "learning_rate": 0.00047538615847542625, "loss": 3.4157, "theoretical_loss": 4.372398095051559, "tokens_seen": 193855488 }, { "epoch": 0.06, "learning_rate": 0.0004753761283851555, "loss": 3.3112, "theoretical_loss": 4.372212273216136, "tokens_seen": 193921024 }, { "epoch": 0.06, "learning_rate": 0.0004753660982948846, "loss": 3.4596, "theoretical_loss": 4.372026531745877, "tokens_seen": 193986560 }, { "epoch": 0.06, "learning_rate": 0.00047535606820461385, "loss": 3.1901, "theoretical_loss": 4.371840870578891, "tokens_seen": 194052096 }, { "epoch": 0.06, "learning_rate": 0.00047534603811434303, "loss": 3.3717, "theoretical_loss": 4.37165528965336, "tokens_seen": 194117632 }, { "epoch": 0.06, "learning_rate": 0.0004753360080240722, "loss": 3.3591, "theoretical_loss": 4.371469788907529, "tokens_seen": 194183168 }, { "epoch": 0.06, "learning_rate": 0.0004753259779338014, "loss": 3.3507, "theoretical_loss": 4.371284368279714, "tokens_seen": 194248704 }, { "epoch": 0.06, "learning_rate": 0.00047531594784353063, "loss": 3.4693, "theoretical_loss": 4.3710990277083, "tokens_seen": 194314240 }, { "epoch": 0.06, "learning_rate": 0.00047530591775325976, "loss": 3.37, "theoretical_loss": 4.3709137671317375, "tokens_seen": 194379776 }, { "epoch": 0.06, "learning_rate": 0.000475295887662989, "loss": 3.4069, "theoretical_loss": 4.37072858648855, "tokens_seen": 194445312 }, { "epoch": 0.06, "learning_rate": 0.0004752858575727181, "loss": 3.3965, "theoretical_loss": 4.370543485717322, "tokens_seen": 194510848 }, { "epoch": 0.06, "learning_rate": 0.00047527582748244736, "loss": 3.5525, "theoretical_loss": 4.370358464756713, "tokens_seen": 194576384 }, { "epoch": 0.06, "learning_rate": 0.00047526579739217654, "loss": 3.4976, "theoretical_loss": 4.370173523545443, "tokens_seen": 194641920 }, { "epoch": 0.06, "learning_rate": 0.0004752557673019057, "loss": 3.3773, "theoretical_loss": 4.3699886620223065, "tokens_seen": 194707456 }, { "epoch": 0.06, "learning_rate": 0.0004752457372116349, "loss": 3.2843, "theoretical_loss": 4.369803880126162, "tokens_seen": 194772992 }, { "epoch": 0.06, "learning_rate": 0.0004752357071213641, "loss": 3.4958, "theoretical_loss": 4.3696191777959354, "tokens_seen": 194838528 }, { "epoch": 0.06, "learning_rate": 0.00047522567703109326, "loss": 3.2915, "theoretical_loss": 4.369434554970621, "tokens_seen": 194904064 }, { "epoch": 0.06, "objective/train/docs_used": 340579, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.084949254989624, "objective/train/theoretical_loss": 4.369250011589279, "objective/train/tokens_used": 215429600, "theoretical_loss": 4.369250011589279, "tokens_seen": 194969600 }, { "epoch": 0.06, "learning_rate": 0.0004752156469408225, "loss": 3.4379, "theoretical_loss": 4.369250011589279, "tokens_seen": 194969600 }, { "epoch": 0.06, "learning_rate": 0.0004752056168505517, "loss": 3.3558, "theoretical_loss": 4.369065547591038, "tokens_seen": 195035136 }, { "epoch": 0.06, "learning_rate": 0.00047519558676028086, "loss": 3.2065, "theoretical_loss": 4.368881162915095, "tokens_seen": 195100672 }, { "epoch": 0.06, "learning_rate": 0.00047518555667001004, "loss": 3.3748, "theoretical_loss": 4.36869685750071, "tokens_seen": 195166208 }, { "epoch": 0.06, "learning_rate": 0.0004751755265797392, "loss": 3.3971, "theoretical_loss": 4.3685126312872145, "tokens_seen": 195231744 }, { "epoch": 0.06, "learning_rate": 0.00047516549648946846, "loss": 3.3746, "theoretical_loss": 4.368328484214002, "tokens_seen": 195297280 }, { "epoch": 0.06, "learning_rate": 0.0004751554663991976, "loss": 3.4419, "theoretical_loss": 4.368144416220538, "tokens_seen": 195362816 }, { "epoch": 0.06, "learning_rate": 0.0004751454363089268, "loss": 3.4109, "theoretical_loss": 4.3679604272463495, "tokens_seen": 195428352 }, { "epoch": 0.06, "learning_rate": 0.000475135406218656, "loss": 3.358, "theoretical_loss": 4.367776517231033, "tokens_seen": 195493888 }, { "epoch": 0.06, "learning_rate": 0.0004751253761283852, "loss": 3.3704, "theoretical_loss": 4.367592686114252, "tokens_seen": 195559424 }, { "epoch": 0.06, "learning_rate": 0.00047511534603811436, "loss": 3.4582, "theoretical_loss": 4.367408933835733, "tokens_seen": 195624960 }, { "epoch": 0.06, "learning_rate": 0.00047510531594784354, "loss": 3.4547, "theoretical_loss": 4.367225260335272, "tokens_seen": 195690496 }, { "epoch": 0.06, "learning_rate": 0.0004750952858575727, "loss": 3.3865, "theoretical_loss": 4.36704166555273, "tokens_seen": 195756032 }, { "epoch": 0.06, "learning_rate": 0.00047508525576730196, "loss": 3.3687, "theoretical_loss": 4.366858149428032, "tokens_seen": 195821568 }, { "epoch": 0.06, "learning_rate": 0.0004750752256770311, "loss": 3.3282, "theoretical_loss": 4.366674711901173, "tokens_seen": 195887104 }, { "epoch": 0.06, "learning_rate": 0.0004750651955867603, "loss": 3.3629, "theoretical_loss": 4.366491352912211, "tokens_seen": 195952640 }, { "epoch": 0.06, "learning_rate": 0.00047505516549648945, "loss": 3.4919, "theoretical_loss": 4.366308072401271, "tokens_seen": 196018176 }, { "epoch": 0.06, "learning_rate": 0.0004750451354062187, "loss": 3.3628, "theoretical_loss": 4.366124870308541, "tokens_seen": 196083712 }, { "epoch": 0.06, "learning_rate": 0.00047503510531594787, "loss": 3.4995, "theoretical_loss": 4.365941746574278, "tokens_seen": 196149248 }, { "epoch": 0.06, "learning_rate": 0.00047502507522567705, "loss": 3.6404, "theoretical_loss": 4.3657587011388035, "tokens_seen": 196214784 }, { "epoch": 0.06, "learning_rate": 0.00047501504513540623, "loss": 3.4557, "theoretical_loss": 4.365575733942503, "tokens_seen": 196280320 }, { "epoch": 0.06, "learning_rate": 0.00047500501504513546, "loss": 3.2961, "theoretical_loss": 4.365392844925829, "tokens_seen": 196345856 }, { "epoch": 0.06, "learning_rate": 0.0004749949849548646, "loss": 3.3312, "theoretical_loss": 4.365210034029298, "tokens_seen": 196411392 }, { "epoch": 0.06, "learning_rate": 0.0004749849548645938, "loss": 3.2653, "theoretical_loss": 4.365027301193491, "tokens_seen": 196476928 }, { "epoch": 0.06, "learning_rate": 0.00047497492477432295, "loss": 3.3798, "theoretical_loss": 4.364844646359056, "tokens_seen": 196542464 }, { "epoch": 0.06, "objective/train/docs_used": 343377, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2384772300720215, "objective/train/theoretical_loss": 4.364662069466704, "objective/train/tokens_used": 217068000, "theoretical_loss": 4.364662069466704, "tokens_seen": 196608000 }, { "epoch": 0.06, "learning_rate": 0.0004749648946840522, "loss": 3.2355, "theoretical_loss": 4.364662069466704, "tokens_seen": 196608000 }, { "epoch": 0.06, "learning_rate": 0.00047495486459378137, "loss": 3.4884, "theoretical_loss": 4.364479570457213, "tokens_seen": 196673536 }, { "epoch": 0.06, "learning_rate": 0.00047494483450351055, "loss": 3.471, "theoretical_loss": 4.364297149271423, "tokens_seen": 196739072 }, { "epoch": 0.06, "learning_rate": 0.00047493480441323973, "loss": 3.5063, "theoretical_loss": 4.3641148058502415, "tokens_seen": 196804608 }, { "epoch": 0.06, "learning_rate": 0.0004749247743229689, "loss": 3.3303, "theoretical_loss": 4.363932540134638, "tokens_seen": 196870144 }, { "epoch": 0.06, "learning_rate": 0.0004749147442326981, "loss": 3.4744, "theoretical_loss": 4.363750352065647, "tokens_seen": 196935680 }, { "epoch": 0.06, "learning_rate": 0.00047490471414242733, "loss": 3.5005, "theoretical_loss": 4.363568241584368, "tokens_seen": 197001216 }, { "epoch": 0.06, "learning_rate": 0.00047489468405215646, "loss": 3.2747, "theoretical_loss": 4.363386208631966, "tokens_seen": 197066752 }, { "epoch": 0.06, "learning_rate": 0.0004748846539618857, "loss": 3.5472, "theoretical_loss": 4.363204253149667, "tokens_seen": 197132288 }, { "epoch": 0.06, "learning_rate": 0.0004748746238716148, "loss": 3.3243, "theoretical_loss": 4.3630223750787644, "tokens_seen": 197197824 }, { "epoch": 0.06, "learning_rate": 0.00047486459378134405, "loss": 3.5055, "theoretical_loss": 4.362840574360612, "tokens_seen": 197263360 }, { "epoch": 0.06, "learning_rate": 0.00047485456369107323, "loss": 3.4225, "theoretical_loss": 4.362658850936631, "tokens_seen": 197328896 }, { "epoch": 0.06, "learning_rate": 0.0004748445336008024, "loss": 3.3399, "theoretical_loss": 4.362477204748305, "tokens_seen": 197394432 }, { "epoch": 0.06, "learning_rate": 0.0004748345035105316, "loss": 3.3257, "theoretical_loss": 4.362295635737179, "tokens_seen": 197459968 }, { "epoch": 0.06, "learning_rate": 0.00047482447342026083, "loss": 3.3412, "theoretical_loss": 4.362114143844867, "tokens_seen": 197525504 }, { "epoch": 0.06, "learning_rate": 0.00047481444332998996, "loss": 3.403, "theoretical_loss": 4.3619327290130405, "tokens_seen": 197591040 }, { "epoch": 0.06, "learning_rate": 0.0004748044132397192, "loss": 3.4901, "theoretical_loss": 4.3617513911834385, "tokens_seen": 197656576 }, { "epoch": 0.06, "learning_rate": 0.0004747943831494483, "loss": 3.3831, "theoretical_loss": 4.361570130297863, "tokens_seen": 197722112 }, { "epoch": 0.06, "learning_rate": 0.00047478435305917756, "loss": 3.2849, "theoretical_loss": 4.3613889462981765, "tokens_seen": 197787648 }, { "epoch": 0.06, "learning_rate": 0.00047477432296890674, "loss": 3.406, "theoretical_loss": 4.361207839126308, "tokens_seen": 197853184 }, { "epoch": 0.06, "learning_rate": 0.0004747642928786359, "loss": 3.5375, "theoretical_loss": 4.361026808724247, "tokens_seen": 197918720 }, { "epoch": 0.06, "learning_rate": 0.0004747542627883651, "loss": 3.4531, "theoretical_loss": 4.360845855034049, "tokens_seen": 197984256 }, { "epoch": 0.06, "learning_rate": 0.0004747442326980943, "loss": 3.5207, "theoretical_loss": 4.360664977997828, "tokens_seen": 198049792 }, { "epoch": 0.06, "learning_rate": 0.00047473420260782346, "loss": 3.5428, "theoretical_loss": 4.360484177557766, "tokens_seen": 198115328 }, { "epoch": 0.06, "learning_rate": 0.0004747241725175527, "loss": 3.3178, "theoretical_loss": 4.360303453656103, "tokens_seen": 198180864 }, { "epoch": 0.06, "objective/train/docs_used": 346182, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6043739318847656, "objective/train/theoretical_loss": 4.360122806235145, "objective/train/tokens_used": 218706400, "theoretical_loss": 4.360122806235145, "tokens_seen": 198246400 }, { "epoch": 0.06, "learning_rate": 0.0004747141424272818, "loss": 3.4787, "theoretical_loss": 4.360122806235145, "tokens_seen": 198246400 }, { "epoch": 0.06, "learning_rate": 0.00047470411233701106, "loss": 3.4616, "theoretical_loss": 4.359942235237257, "tokens_seen": 198311936 }, { "epoch": 0.06, "learning_rate": 0.0004746940822467402, "loss": 3.4203, "theoretical_loss": 4.359761740604871, "tokens_seen": 198377472 }, { "epoch": 0.06, "learning_rate": 0.0004746840521564694, "loss": 3.2979, "theoretical_loss": 4.359581322280479, "tokens_seen": 198443008 }, { "epoch": 0.06, "learning_rate": 0.0004746740220661986, "loss": 3.4387, "theoretical_loss": 4.359400980206634, "tokens_seen": 198508544 }, { "epoch": 0.06, "learning_rate": 0.0004746639919759278, "loss": 3.5383, "theoretical_loss": 4.359220714325954, "tokens_seen": 198574080 }, { "epoch": 0.06, "learning_rate": 0.00047465396188565696, "loss": 3.3242, "theoretical_loss": 4.359040524581116, "tokens_seen": 198639616 }, { "epoch": 0.06, "learning_rate": 0.0004746439317953862, "loss": 3.3109, "theoretical_loss": 4.358860410914861, "tokens_seen": 198705152 }, { "epoch": 0.06, "learning_rate": 0.0004746339017051153, "loss": 3.2386, "theoretical_loss": 4.358680373269993, "tokens_seen": 198770688 }, { "epoch": 0.06, "learning_rate": 0.00047462387161484456, "loss": 3.3547, "theoretical_loss": 4.358500411589375, "tokens_seen": 198836224 }, { "epoch": 0.06, "learning_rate": 0.0004746138415245737, "loss": 3.4355, "theoretical_loss": 4.358320525815934, "tokens_seen": 198901760 }, { "epoch": 0.06, "learning_rate": 0.0004746038114343029, "loss": 3.4575, "theoretical_loss": 4.358140715892658, "tokens_seen": 198967296 }, { "epoch": 0.06, "learning_rate": 0.0004745937813440321, "loss": 3.2661, "theoretical_loss": 4.357960981762595, "tokens_seen": 199032832 }, { "epoch": 0.06, "learning_rate": 0.0004745837512537613, "loss": 3.4258, "theoretical_loss": 4.357781323368857, "tokens_seen": 199098368 }, { "epoch": 0.06, "learning_rate": 0.00047457372116349047, "loss": 3.2711, "theoretical_loss": 4.357601740654617, "tokens_seen": 199163904 }, { "epoch": 0.06, "learning_rate": 0.00047456369107321965, "loss": 3.3552, "theoretical_loss": 4.357422233563106, "tokens_seen": 199229440 }, { "epoch": 0.06, "learning_rate": 0.00047455366098294883, "loss": 3.3961, "theoretical_loss": 4.357242802037623, "tokens_seen": 199294976 }, { "epoch": 0.06, "learning_rate": 0.00047454363089267807, "loss": 3.3907, "theoretical_loss": 4.35706344602152, "tokens_seen": 199360512 }, { "epoch": 0.06, "learning_rate": 0.0004745336008024072, "loss": 3.262, "theoretical_loss": 4.356884165458217, "tokens_seen": 199426048 }, { "epoch": 0.06, "learning_rate": 0.00047452357071213643, "loss": 3.334, "theoretical_loss": 4.356704960291191, "tokens_seen": 199491584 }, { "epoch": 0.06, "learning_rate": 0.00047451354062186555, "loss": 3.3842, "theoretical_loss": 4.35652583046398, "tokens_seen": 199557120 }, { "epoch": 0.06, "learning_rate": 0.0004745035105315948, "loss": 3.5208, "theoretical_loss": 4.356346775920185, "tokens_seen": 199622656 }, { "epoch": 0.06, "learning_rate": 0.00047449348044132397, "loss": 3.3992, "theoretical_loss": 4.356167796603467, "tokens_seen": 199688192 }, { "epoch": 0.06, "learning_rate": 0.00047448345035105315, "loss": 3.4458, "theoretical_loss": 4.355988892457546, "tokens_seen": 199753728 }, { "epoch": 0.06, "learning_rate": 0.0004744734202607824, "loss": 3.5178, "theoretical_loss": 4.355810063426204, "tokens_seen": 199819264 }, { "epoch": 0.06, "objective/train/docs_used": 348879, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2729251384735107, "objective/train/theoretical_loss": 4.355631309453283, "objective/train/tokens_used": 220344800, "theoretical_loss": 4.355631309453283, "tokens_seen": 199884800 }, { "epoch": 0.06, "learning_rate": 0.00047446339017051157, "loss": 3.4035, "theoretical_loss": 4.355631309453283, "tokens_seen": 199884800 }, { "epoch": 0.06, "learning_rate": 0.00047445336008024075, "loss": 3.3777, "theoretical_loss": 4.355452630482685, "tokens_seen": 199950336 }, { "epoch": 0.06, "learning_rate": 0.00047444332998996993, "loss": 3.3681, "theoretical_loss": 4.355274026458375, "tokens_seen": 200015872 }, { "epoch": 0.06, "learning_rate": 0.0004744332998996991, "loss": 3.2506, "theoretical_loss": 4.355095497324373, "tokens_seen": 200081408 }, { "epoch": 0.06, "learning_rate": 0.0004744232698094283, "loss": 3.312, "theoretical_loss": 4.354917043024765, "tokens_seen": 200146944 }, { "epoch": 0.06, "learning_rate": 0.00047441323971915753, "loss": 3.4773, "theoretical_loss": 4.354738663503692, "tokens_seen": 200212480 }, { "epoch": 0.06, "learning_rate": 0.00047440320962888666, "loss": 3.3712, "theoretical_loss": 4.354560358705358, "tokens_seen": 200278016 }, { "epoch": 0.06, "learning_rate": 0.0004743931795386159, "loss": 3.2298, "theoretical_loss": 4.354382128574027, "tokens_seen": 200343552 }, { "epoch": 0.06, "learning_rate": 0.000474383149448345, "loss": 3.388, "theoretical_loss": 4.35420397305402, "tokens_seen": 200409088 }, { "epoch": 0.06, "learning_rate": 0.00047437311935807425, "loss": 3.5506, "theoretical_loss": 4.35402589208972, "tokens_seen": 200474624 }, { "epoch": 0.06, "learning_rate": 0.00047436308926780343, "loss": 3.3399, "theoretical_loss": 4.353847885625571, "tokens_seen": 200540160 }, { "epoch": 0.06, "learning_rate": 0.0004743530591775326, "loss": 3.4503, "theoretical_loss": 4.353669953606072, "tokens_seen": 200605696 }, { "epoch": 0.06, "learning_rate": 0.0004743430290872618, "loss": 3.5637, "theoretical_loss": 4.353492095975787, "tokens_seen": 200671232 }, { "epoch": 0.06, "learning_rate": 0.00047433299899699103, "loss": 3.4721, "theoretical_loss": 4.353314312679333, "tokens_seen": 200736768 }, { "epoch": 0.06, "learning_rate": 0.00047432296890672016, "loss": 3.3351, "theoretical_loss": 4.353136603661392, "tokens_seen": 200802304 }, { "epoch": 0.06, "learning_rate": 0.0004743129388164494, "loss": 3.1327, "theoretical_loss": 4.352958968866704, "tokens_seen": 200867840 }, { "epoch": 0.06, "learning_rate": 0.0004743029087261785, "loss": 3.2352, "theoretical_loss": 4.352781408240065, "tokens_seen": 200933376 }, { "epoch": 0.06, "learning_rate": 0.00047429287863590776, "loss": 3.3332, "theoretical_loss": 4.352603921726334, "tokens_seen": 200998912 }, { "epoch": 0.06, "learning_rate": 0.00047428284854563694, "loss": 3.331, "theoretical_loss": 4.352426509270425, "tokens_seen": 201064448 }, { "epoch": 0.06, "learning_rate": 0.0004742728184553661, "loss": 3.3779, "theoretical_loss": 4.352249170817315, "tokens_seen": 201129984 }, { "epoch": 0.06, "learning_rate": 0.0004742627883650953, "loss": 3.4228, "theoretical_loss": 4.352071906312037, "tokens_seen": 201195520 }, { "epoch": 0.06, "learning_rate": 0.0004742527582748245, "loss": 3.3727, "theoretical_loss": 4.351894715699684, "tokens_seen": 201261056 }, { "epoch": 0.06, "learning_rate": 0.00047424272818455366, "loss": 3.4211, "theoretical_loss": 4.351717598925406, "tokens_seen": 201326592 }, { "epoch": 0.06, "learning_rate": 0.0004742326980942829, "loss": 3.3283, "theoretical_loss": 4.351540555934414, "tokens_seen": 201392128 }, { "epoch": 0.06, "learning_rate": 0.000474222668004012, "loss": 3.4601, "theoretical_loss": 4.351363586671976, "tokens_seen": 201457664 }, { "epoch": 0.06, "objective/train/docs_used": 350389, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.709861993789673, "objective/train/theoretical_loss": 4.351186691083417, "objective/train/tokens_used": 221983200, "theoretical_loss": 4.351186691083417, "tokens_seen": 201523200 }, { "epoch": 0.06, "learning_rate": 0.00047421263791374126, "loss": 3.3399, "theoretical_loss": 4.351186691083417, "tokens_seen": 201523200 }, { "epoch": 0.06, "learning_rate": 0.0004742026078234704, "loss": 3.3409, "theoretical_loss": 4.351009869114124, "tokens_seen": 201588736 }, { "epoch": 0.06, "learning_rate": 0.0004741925777331996, "loss": 3.3503, "theoretical_loss": 4.350833120709539, "tokens_seen": 201654272 }, { "epoch": 0.06, "learning_rate": 0.0004741825476429288, "loss": 3.3645, "theoretical_loss": 4.350656445815164, "tokens_seen": 201719808 }, { "epoch": 0.06, "learning_rate": 0.000474172517552658, "loss": 3.5298, "theoretical_loss": 4.350479844376557, "tokens_seen": 201785344 }, { "epoch": 0.06, "learning_rate": 0.00047416248746238716, "loss": 3.5122, "theoretical_loss": 4.350303316339337, "tokens_seen": 201850880 }, { "epoch": 0.06, "learning_rate": 0.0004741524573721164, "loss": 3.3112, "theoretical_loss": 4.350126861649178, "tokens_seen": 201916416 }, { "epoch": 0.06, "learning_rate": 0.0004741424272818455, "loss": 3.3335, "theoretical_loss": 4.349950480251813, "tokens_seen": 201981952 }, { "epoch": 0.06, "learning_rate": 0.00047413239719157476, "loss": 3.3025, "theoretical_loss": 4.349774172093033, "tokens_seen": 202047488 }, { "epoch": 0.06, "learning_rate": 0.0004741223671013039, "loss": 3.4355, "theoretical_loss": 4.349597937118687, "tokens_seen": 202113024 }, { "epoch": 0.06, "learning_rate": 0.0004741123370110331, "loss": 3.4259, "theoretical_loss": 4.3494217752746795, "tokens_seen": 202178560 }, { "epoch": 0.06, "learning_rate": 0.0004741023069207623, "loss": 3.5165, "theoretical_loss": 4.349245686506976, "tokens_seen": 202244096 }, { "epoch": 0.06, "learning_rate": 0.0004740922768304915, "loss": 3.4515, "theoretical_loss": 4.349069670761597, "tokens_seen": 202309632 }, { "epoch": 0.06, "learning_rate": 0.00047408224674022067, "loss": 3.4797, "theoretical_loss": 4.348893727984619, "tokens_seen": 202375168 }, { "epoch": 0.06, "learning_rate": 0.00047407221664994985, "loss": 3.377, "theoretical_loss": 4.348717858122178, "tokens_seen": 202440704 }, { "epoch": 0.06, "learning_rate": 0.00047406218655967903, "loss": 3.3521, "theoretical_loss": 4.348542061120469, "tokens_seen": 202506240 }, { "epoch": 0.06, "learning_rate": 0.00047405215646940827, "loss": 3.6393, "theoretical_loss": 4.348366336925739, "tokens_seen": 202571776 }, { "epoch": 0.06, "learning_rate": 0.0004740421263791374, "loss": 3.3479, "theoretical_loss": 4.3481906854842975, "tokens_seen": 202637312 }, { "epoch": 0.06, "learning_rate": 0.00047403209628886663, "loss": 3.3331, "theoretical_loss": 4.348015106742507, "tokens_seen": 202702848 }, { "epoch": 0.06, "learning_rate": 0.00047402206619859575, "loss": 3.1932, "theoretical_loss": 4.347839600646786, "tokens_seen": 202768384 }, { "epoch": 0.06, "learning_rate": 0.000474012036108325, "loss": 3.5205, "theoretical_loss": 4.347664167143615, "tokens_seen": 202833920 }, { "epoch": 0.06, "learning_rate": 0.00047400200601805417, "loss": 3.2618, "theoretical_loss": 4.347488806179528, "tokens_seen": 202899456 }, { "epoch": 0.06, "learning_rate": 0.00047399197592778335, "loss": 3.4394, "theoretical_loss": 4.347313517701114, "tokens_seen": 202964992 }, { "epoch": 0.06, "learning_rate": 0.00047398194583751253, "loss": 3.5791, "theoretical_loss": 4.347138301655021, "tokens_seen": 203030528 }, { "epoch": 0.06, "learning_rate": 0.00047397191574724177, "loss": 3.4387, "theoretical_loss": 4.346963157987954, "tokens_seen": 203096064 }, { "epoch": 0.06, "objective/train/docs_used": 353190, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5552213191986084, "objective/train/theoretical_loss": 4.346788086646671, "objective/train/tokens_used": 223621600, "theoretical_loss": 4.346788086646671, "tokens_seen": 203161600 }, { "epoch": 0.06, "learning_rate": 0.0004739618856569709, "loss": 3.3936, "theoretical_loss": 4.346788086646671, "tokens_seen": 203161600 }, { "epoch": 0.06, "learning_rate": 0.00047395185556670013, "loss": 3.3138, "theoretical_loss": 4.346613087577991, "tokens_seen": 203227136 }, { "epoch": 0.06, "learning_rate": 0.00047394182547642926, "loss": 3.3129, "theoretical_loss": 4.346438160728785, "tokens_seen": 203292672 }, { "epoch": 0.06, "learning_rate": 0.0004739317953861585, "loss": 3.2784, "theoretical_loss": 4.346263306045983, "tokens_seen": 203358208 }, { "epoch": 0.06, "learning_rate": 0.0004739217652958877, "loss": 3.6088, "theoretical_loss": 4.346088523476569, "tokens_seen": 203423744 }, { "epoch": 0.06, "learning_rate": 0.00047391173520561686, "loss": 3.4508, "theoretical_loss": 4.345913812967584, "tokens_seen": 203489280 }, { "epoch": 0.06, "learning_rate": 0.00047390170511534604, "loss": 3.503, "theoretical_loss": 4.345739174466127, "tokens_seen": 203554816 }, { "epoch": 0.06, "learning_rate": 0.0004738916750250752, "loss": 3.4109, "theoretical_loss": 4.345564607919348, "tokens_seen": 203620352 }, { "epoch": 0.06, "learning_rate": 0.0004738816449348044, "loss": 3.3443, "theoretical_loss": 4.3453901132744575, "tokens_seen": 203685888 }, { "epoch": 0.06, "learning_rate": 0.00047387161484453363, "loss": 3.3638, "theoretical_loss": 4.345215690478719, "tokens_seen": 203751424 }, { "epoch": 0.06, "learning_rate": 0.00047386158475426276, "loss": 3.4944, "theoretical_loss": 4.345041339479453, "tokens_seen": 203816960 }, { "epoch": 0.06, "learning_rate": 0.000473851554663992, "loss": 3.5365, "theoretical_loss": 4.3448670602240345, "tokens_seen": 203882496 }, { "epoch": 0.06, "learning_rate": 0.0004738415245737211, "loss": 3.4562, "theoretical_loss": 4.344692852659895, "tokens_seen": 203948032 }, { "epoch": 0.06, "learning_rate": 0.00047383149448345036, "loss": 3.2597, "theoretical_loss": 4.34451871673452, "tokens_seen": 204013568 }, { "epoch": 0.06, "learning_rate": 0.00047382146439317954, "loss": 3.3557, "theoretical_loss": 4.344344652395451, "tokens_seen": 204079104 }, { "epoch": 0.06, "learning_rate": 0.0004738114343029087, "loss": 3.3327, "theoretical_loss": 4.3441706595902865, "tokens_seen": 204144640 }, { "epoch": 0.06, "learning_rate": 0.0004738014042126379, "loss": 3.3027, "theoretical_loss": 4.343996738266677, "tokens_seen": 204210176 }, { "epoch": 0.06, "learning_rate": 0.00047379137412236714, "loss": 3.3285, "theoretical_loss": 4.343822888372331, "tokens_seen": 204275712 }, { "epoch": 0.06, "learning_rate": 0.00047378134403209626, "loss": 3.2589, "theoretical_loss": 4.343649109855009, "tokens_seen": 204341248 }, { "epoch": 0.06, "learning_rate": 0.0004737713139418255, "loss": 3.4624, "theoretical_loss": 4.343475402662529, "tokens_seen": 204406784 }, { "epoch": 0.06, "learning_rate": 0.0004737612838515546, "loss": 3.3659, "theoretical_loss": 4.343301766742763, "tokens_seen": 204472320 }, { "epoch": 0.06, "learning_rate": 0.00047375125376128386, "loss": 3.513, "theoretical_loss": 4.343128202043638, "tokens_seen": 204537856 }, { "epoch": 0.06, "learning_rate": 0.00047374122367101304, "loss": 3.2756, "theoretical_loss": 4.342954708513136, "tokens_seen": 204603392 }, { "epoch": 0.06, "learning_rate": 0.0004737311935807422, "loss": 3.5225, "theoretical_loss": 4.342781286099291, "tokens_seen": 204668928 }, { "epoch": 0.06, "learning_rate": 0.00047372116349047146, "loss": 3.4782, "theoretical_loss": 4.3426079347501965, "tokens_seen": 204734464 }, { "epoch": 0.06, "objective/train/docs_used": 356001, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.50378155708313, "objective/train/theoretical_loss": 4.342434654413995, "objective/train/tokens_used": 225260000, "theoretical_loss": 4.342434654413995, "tokens_seen": 204800000 }, { "epoch": 0.06, "learning_rate": 0.0004737111334002006, "loss": 3.2307, "theoretical_loss": 4.342434654413995, "tokens_seen": 204800000 }, { "epoch": 0.06, "learning_rate": 0.0004737011033099298, "loss": 3.5245, "theoretical_loss": 4.342261445038888, "tokens_seen": 204865536 }, { "epoch": 0.06, "learning_rate": 0.000473691073219659, "loss": 3.4964, "theoretical_loss": 4.342088306573128, "tokens_seen": 204931072 }, { "epoch": 0.06, "learning_rate": 0.0004736810431293882, "loss": 3.355, "theoretical_loss": 4.341915238965026, "tokens_seen": 204996608 }, { "epoch": 0.06, "learning_rate": 0.00047367101303911736, "loss": 3.2963, "theoretical_loss": 4.34174224216294, "tokens_seen": 205062144 }, { "epoch": 0.06, "learning_rate": 0.0004736609829488466, "loss": 3.4381, "theoretical_loss": 4.34156931611529, "tokens_seen": 205127680 }, { "epoch": 0.06, "learning_rate": 0.0004736509528585757, "loss": 3.4113, "theoretical_loss": 4.341396460770547, "tokens_seen": 205193216 }, { "epoch": 0.06, "learning_rate": 0.00047364092276830496, "loss": 3.3941, "theoretical_loss": 4.341223676077232, "tokens_seen": 205258752 }, { "epoch": 0.06, "learning_rate": 0.0004736308926780341, "loss": 3.4282, "theoretical_loss": 4.341050961983926, "tokens_seen": 205324288 }, { "epoch": 0.06, "learning_rate": 0.0004736208625877633, "loss": 3.5454, "theoretical_loss": 4.340878318439261, "tokens_seen": 205389824 }, { "epoch": 0.06, "learning_rate": 0.0004736108324974925, "loss": 3.6049, "theoretical_loss": 4.340705745391922, "tokens_seen": 205455360 }, { "epoch": 0.06, "learning_rate": 0.0004736008024072217, "loss": 3.3693, "theoretical_loss": 4.3405332427906504, "tokens_seen": 205520896 }, { "epoch": 0.06, "learning_rate": 0.00047359077231695087, "loss": 3.4021, "theoretical_loss": 4.340360810584238, "tokens_seen": 205586432 }, { "epoch": 0.06, "learning_rate": 0.00047358074222668005, "loss": 3.3438, "theoretical_loss": 4.340188448721532, "tokens_seen": 205651968 }, { "epoch": 0.06, "learning_rate": 0.00047357071213640923, "loss": 3.4539, "theoretical_loss": 4.3400161571514335, "tokens_seen": 205717504 }, { "epoch": 0.06, "learning_rate": 0.00047356068204613847, "loss": 3.402, "theoretical_loss": 4.339843935822895, "tokens_seen": 205783040 }, { "epoch": 0.06, "learning_rate": 0.0004735506519558676, "loss": 3.3385, "theoretical_loss": 4.339671784684923, "tokens_seen": 205848576 }, { "epoch": 0.06, "learning_rate": 0.00047354062186559683, "loss": 3.4456, "theoretical_loss": 4.339499703686579, "tokens_seen": 205914112 }, { "epoch": 0.06, "learning_rate": 0.00047353059177532595, "loss": 3.257, "theoretical_loss": 4.339327692776977, "tokens_seen": 205979648 }, { "epoch": 0.06, "learning_rate": 0.0004735205616850552, "loss": 3.3043, "theoretical_loss": 4.339155751905282, "tokens_seen": 206045184 }, { "epoch": 0.06, "learning_rate": 0.00047351053159478437, "loss": 3.4501, "theoretical_loss": 4.338983881020713, "tokens_seen": 206110720 }, { "epoch": 0.06, "learning_rate": 0.00047350050150451355, "loss": 3.5054, "theoretical_loss": 4.338812080072545, "tokens_seen": 206176256 }, { "epoch": 0.06, "learning_rate": 0.00047349047141424273, "loss": 3.5371, "theoretical_loss": 4.338640349010101, "tokens_seen": 206241792 }, { "epoch": 0.06, "learning_rate": 0.00047348044132397197, "loss": 3.4002, "theoretical_loss": 4.3384686877827585, "tokens_seen": 206307328 }, { "epoch": 0.06, "learning_rate": 0.0004734704112337011, "loss": 3.3856, "theoretical_loss": 4.338297096339951, "tokens_seen": 206372864 }, { "epoch": 0.06, "objective/train/docs_used": 358602, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.6710610389709473, "objective/train/theoretical_loss": 4.33812557463116, "objective/train/tokens_used": 226898400, "theoretical_loss": 4.33812557463116, "tokens_seen": 206438400 }, { "epoch": 0.06, "learning_rate": 0.00047346038114343033, "loss": 3.5245, "theoretical_loss": 4.33812557463116, "tokens_seen": 206438400 }, { "epoch": 0.06, "learning_rate": 0.00047345035105315946, "loss": 3.3877, "theoretical_loss": 4.3379541226059235, "tokens_seen": 206503936 }, { "epoch": 0.06, "learning_rate": 0.0004734403209628887, "loss": 3.3836, "theoretical_loss": 4.337782740213827, "tokens_seen": 206569472 }, { "epoch": 0.06, "learning_rate": 0.0004734302908726179, "loss": 3.4075, "theoretical_loss": 4.337611427404514, "tokens_seen": 206635008 }, { "epoch": 0.06, "learning_rate": 0.00047342026078234706, "loss": 3.3387, "theoretical_loss": 4.337440184127679, "tokens_seen": 206700544 }, { "epoch": 0.06, "learning_rate": 0.00047341023069207624, "loss": 3.4968, "theoretical_loss": 4.337269010333065, "tokens_seen": 206766080 }, { "epoch": 0.06, "learning_rate": 0.0004734002006018054, "loss": 3.4979, "theoretical_loss": 4.337097905970471, "tokens_seen": 206831616 }, { "epoch": 0.06, "learning_rate": 0.0004733901705115346, "loss": 3.3039, "theoretical_loss": 4.336926870989748, "tokens_seen": 206897152 }, { "epoch": 0.06, "learning_rate": 0.00047338014042126383, "loss": 3.5214, "theoretical_loss": 4.336755905340797, "tokens_seen": 206962688 }, { "epoch": 0.06, "learning_rate": 0.00047337011033099296, "loss": 3.394, "theoretical_loss": 4.336585008973573, "tokens_seen": 207028224 }, { "epoch": 0.06, "learning_rate": 0.0004733600802407222, "loss": 3.3401, "theoretical_loss": 4.336414181838082, "tokens_seen": 207093760 }, { "epoch": 0.06, "learning_rate": 0.0004733500501504513, "loss": 3.2972, "theoretical_loss": 4.336243423884382, "tokens_seen": 207159296 }, { "epoch": 0.06, "learning_rate": 0.00047334002006018056, "loss": 3.4078, "theoretical_loss": 4.336072735062583, "tokens_seen": 207224832 }, { "epoch": 0.06, "learning_rate": 0.00047332998996990974, "loss": 3.5474, "theoretical_loss": 4.335902115322847, "tokens_seen": 207290368 }, { "epoch": 0.06, "learning_rate": 0.0004733199598796389, "loss": 3.4689, "theoretical_loss": 4.335731564615387, "tokens_seen": 207355904 }, { "epoch": 0.06, "learning_rate": 0.0004733099297893681, "loss": 3.4008, "theoretical_loss": 4.335561082890468, "tokens_seen": 207421440 }, { "epoch": 0.06, "learning_rate": 0.00047329989969909734, "loss": 3.385, "theoretical_loss": 4.335390670098407, "tokens_seen": 207486976 }, { "epoch": 0.06, "learning_rate": 0.00047328986960882646, "loss": 3.4314, "theoretical_loss": 4.335220326189571, "tokens_seen": 207552512 }, { "epoch": 0.06, "learning_rate": 0.0004732798395185557, "loss": 3.4463, "theoretical_loss": 4.335050051114379, "tokens_seen": 207618048 }, { "epoch": 0.06, "learning_rate": 0.0004732698094282848, "loss": 3.4, "theoretical_loss": 4.334879844823304, "tokens_seen": 207683584 }, { "epoch": 0.06, "learning_rate": 0.00047325977933801406, "loss": 3.2869, "theoretical_loss": 4.334709707266865, "tokens_seen": 207749120 }, { "epoch": 0.06, "learning_rate": 0.00047324974924774324, "loss": 3.5459, "theoretical_loss": 4.334539638395636, "tokens_seen": 207814656 }, { "epoch": 0.06, "learning_rate": 0.0004732397191574724, "loss": 3.4227, "theoretical_loss": 4.334369638160242, "tokens_seen": 207880192 }, { "epoch": 0.06, "learning_rate": 0.0004732296890672016, "loss": 3.3837, "theoretical_loss": 4.334199706511358, "tokens_seen": 207945728 }, { "epoch": 0.06, "learning_rate": 0.0004732196589769308, "loss": 3.3105, "theoretical_loss": 4.334029843399709, "tokens_seen": 208011264 }, { "epoch": 0.06, "objective/train/docs_used": 361329, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3980507850646973, "objective/train/theoretical_loss": 4.333860048776074, "objective/train/tokens_used": 228536800, "theoretical_loss": 4.333860048776074, "tokens_seen": 208076800 }, { "epoch": 0.06, "learning_rate": 0.00047320962888665997, "loss": 3.3882, "theoretical_loss": 4.333860048776074, "tokens_seen": 208076800 }, { "epoch": 0.06, "learning_rate": 0.0004731995987963892, "loss": 3.344, "theoretical_loss": 4.33369032259128, "tokens_seen": 208142336 }, { "epoch": 0.06, "learning_rate": 0.00047318956870611833, "loss": 3.3845, "theoretical_loss": 4.333520664796206, "tokens_seen": 208207872 }, { "epoch": 0.06, "learning_rate": 0.00047317953861584757, "loss": 3.4634, "theoretical_loss": 4.33335107534178, "tokens_seen": 208273408 }, { "epoch": 0.06, "learning_rate": 0.00047316950852557675, "loss": 3.2808, "theoretical_loss": 4.333181554178985, "tokens_seen": 208338944 }, { "epoch": 0.06, "learning_rate": 0.00047315947843530593, "loss": 3.4263, "theoretical_loss": 4.3330121012588485, "tokens_seen": 208404480 }, { "epoch": 0.06, "learning_rate": 0.0004731494483450351, "loss": 3.3816, "theoretical_loss": 4.332842716532454, "tokens_seen": 208470016 }, { "epoch": 0.06, "learning_rate": 0.0004731394182547643, "loss": 3.331, "theoretical_loss": 4.332673399950932, "tokens_seen": 208535552 }, { "epoch": 0.06, "learning_rate": 0.00047312938816449347, "loss": 3.3621, "theoretical_loss": 4.332504151465464, "tokens_seen": 208601088 }, { "epoch": 0.06, "learning_rate": 0.0004731193580742227, "loss": 3.3856, "theoretical_loss": 4.332334971027284, "tokens_seen": 208666624 }, { "epoch": 0.06, "learning_rate": 0.00047310932798395183, "loss": 3.4142, "theoretical_loss": 4.332165858587672, "tokens_seen": 208732160 }, { "epoch": 0.06, "learning_rate": 0.00047309929789368107, "loss": 3.2692, "theoretical_loss": 4.331996814097963, "tokens_seen": 208797696 }, { "epoch": 0.06, "learning_rate": 0.0004730892678034102, "loss": 3.3793, "theoretical_loss": 4.331827837509538, "tokens_seen": 208863232 }, { "epoch": 0.06, "learning_rate": 0.00047307923771313943, "loss": 3.4114, "theoretical_loss": 4.331658928773831, "tokens_seen": 208928768 }, { "epoch": 0.06, "learning_rate": 0.0004730692076228686, "loss": 3.4714, "theoretical_loss": 4.331490087842324, "tokens_seen": 208994304 }, { "epoch": 0.06, "learning_rate": 0.0004730591775325978, "loss": 3.1764, "theoretical_loss": 4.33132131466655, "tokens_seen": 209059840 }, { "epoch": 0.06, "learning_rate": 0.000473049147442327, "loss": 3.305, "theoretical_loss": 4.3311526091980905, "tokens_seen": 209125376 }, { "epoch": 0.06, "learning_rate": 0.00047303911735205615, "loss": 3.4726, "theoretical_loss": 4.330983971388578, "tokens_seen": 209190912 }, { "epoch": 0.06, "learning_rate": 0.00047302908726178534, "loss": 3.3491, "theoretical_loss": 4.330815401189695, "tokens_seen": 209256448 }, { "epoch": 0.06, "learning_rate": 0.00047301905717151457, "loss": 3.5199, "theoretical_loss": 4.330646898553173, "tokens_seen": 209321984 }, { "epoch": 0.06, "learning_rate": 0.0004730090270812437, "loss": 3.3113, "theoretical_loss": 4.330478463430792, "tokens_seen": 209387520 }, { "epoch": 0.06, "learning_rate": 0.00047299899699097293, "loss": 3.6016, "theoretical_loss": 4.330310095774383, "tokens_seen": 209453056 }, { "epoch": 0.06, "learning_rate": 0.0004729889669007021, "loss": 3.4474, "theoretical_loss": 4.330141795535828, "tokens_seen": 209518592 }, { "epoch": 0.06, "learning_rate": 0.0004729789368104313, "loss": 3.3037, "theoretical_loss": 4.329973562667053, "tokens_seen": 209584128 }, { "epoch": 0.06, "learning_rate": 0.00047296890672016053, "loss": 3.3967, "theoretical_loss": 4.3298053971200385, "tokens_seen": 209649664 }, { "epoch": 0.06, "objective/train/docs_used": 364064, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5781326293945312, "objective/train/theoretical_loss": 4.329637298846812, "objective/train/tokens_used": 230175200, "theoretical_loss": 4.329637298846812, "tokens_seen": 209715200 }, { "epoch": 0.06, "learning_rate": 0.00047295887662988966, "loss": 3.4824, "theoretical_loss": 4.329637298846812, "tokens_seen": 209715200 }, { "epoch": 0.06, "learning_rate": 0.0004729488465396189, "loss": 3.3608, "theoretical_loss": 4.329469267799451, "tokens_seen": 209780736 }, { "epoch": 0.06, "learning_rate": 0.0004729388164493481, "loss": 3.5526, "theoretical_loss": 4.32930130393008, "tokens_seen": 209846272 }, { "epoch": 0.06, "learning_rate": 0.00047292878635907726, "loss": 3.4636, "theoretical_loss": 4.329133407190876, "tokens_seen": 209911808 }, { "epoch": 0.06, "learning_rate": 0.00047291875626880644, "loss": 3.3267, "theoretical_loss": 4.3289655775340625, "tokens_seen": 209977344 }, { "epoch": 0.06, "learning_rate": 0.0004729087261785356, "loss": 3.3005, "theoretical_loss": 4.328797814911912, "tokens_seen": 210042880 }, { "epoch": 0.06, "learning_rate": 0.0004728986960882648, "loss": 3.3252, "theoretical_loss": 4.328630119276747, "tokens_seen": 210108416 }, { "epoch": 0.06, "learning_rate": 0.00047288866599799403, "loss": 3.5262, "theoretical_loss": 4.328462490580938, "tokens_seen": 210173952 }, { "epoch": 0.06, "learning_rate": 0.00047287863590772316, "loss": 3.4278, "theoretical_loss": 4.328294928776903, "tokens_seen": 210239488 }, { "epoch": 0.06, "learning_rate": 0.0004728686058174524, "loss": 3.3637, "theoretical_loss": 4.328127433817112, "tokens_seen": 210305024 }, { "epoch": 0.06, "learning_rate": 0.0004728585757271815, "loss": 3.3306, "theoretical_loss": 4.327960005654081, "tokens_seen": 210370560 }, { "epoch": 0.06, "learning_rate": 0.00047284854563691076, "loss": 3.1524, "theoretical_loss": 4.327792644240374, "tokens_seen": 210436096 }, { "epoch": 0.06, "learning_rate": 0.00047283851554663994, "loss": 3.1952, "theoretical_loss": 4.327625349528605, "tokens_seen": 210501632 }, { "epoch": 0.06, "learning_rate": 0.0004728284854563691, "loss": 3.4627, "theoretical_loss": 4.327458121471436, "tokens_seen": 210567168 }, { "epoch": 0.06, "learning_rate": 0.0004728184553660983, "loss": 3.3024, "theoretical_loss": 4.3272909600215765, "tokens_seen": 210632704 }, { "epoch": 0.06, "learning_rate": 0.00047280842527582754, "loss": 3.3528, "theoretical_loss": 4.327123865131786, "tokens_seen": 210698240 }, { "epoch": 0.06, "learning_rate": 0.00047279839518555666, "loss": 3.3965, "theoretical_loss": 4.326956836754871, "tokens_seen": 210763776 }, { "epoch": 0.06, "learning_rate": 0.0004727883650952859, "loss": 3.5304, "theoretical_loss": 4.326789874843685, "tokens_seen": 210829312 }, { "epoch": 0.06, "learning_rate": 0.000472778335005015, "loss": 3.4066, "theoretical_loss": 4.326622979351132, "tokens_seen": 210894848 }, { "epoch": 0.06, "learning_rate": 0.00047276830491474426, "loss": 3.3694, "theoretical_loss": 4.326456150230163, "tokens_seen": 210960384 }, { "epoch": 0.06, "learning_rate": 0.00047275827482447344, "loss": 3.4539, "theoretical_loss": 4.326289387433776, "tokens_seen": 211025920 }, { "epoch": 0.06, "learning_rate": 0.0004727482447342026, "loss": 3.4263, "theoretical_loss": 4.326122690915017, "tokens_seen": 211091456 }, { "epoch": 0.06, "learning_rate": 0.0004727382146439318, "loss": 3.4499, "theoretical_loss": 4.325956060626982, "tokens_seen": 211156992 }, { "epoch": 0.06, "learning_rate": 0.000472728184553661, "loss": 3.4524, "theoretical_loss": 4.325789496522812, "tokens_seen": 211222528 }, { "epoch": 0.06, "learning_rate": 0.00047271815446339017, "loss": 3.3068, "theoretical_loss": 4.325622998555697, "tokens_seen": 211288064 }, { "epoch": 0.06, "objective/train/docs_used": 367034, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1220436096191406, "objective/train/theoretical_loss": 4.3254565666788745, "objective/train/tokens_used": 231813600, "theoretical_loss": 4.3254565666788745, "tokens_seen": 211353600 }, { "epoch": 0.06, "learning_rate": 0.0004727081243731194, "loss": 3.334, "theoretical_loss": 4.3254565666788745, "tokens_seen": 211353600 }, { "epoch": 0.06, "learning_rate": 0.00047269809428284853, "loss": 3.055, "theoretical_loss": 4.325290200845629, "tokens_seen": 211419136 }, { "epoch": 0.06, "learning_rate": 0.00047268806419257777, "loss": 3.3824, "theoretical_loss": 4.3251239010092934, "tokens_seen": 211484672 }, { "epoch": 0.06, "learning_rate": 0.00047267803410230695, "loss": 3.4823, "theoretical_loss": 4.324957667123249, "tokens_seen": 211550208 }, { "epoch": 0.06, "learning_rate": 0.00047266800401203613, "loss": 3.2993, "theoretical_loss": 4.32479149914092, "tokens_seen": 211615744 }, { "epoch": 0.06, "learning_rate": 0.0004726579739217653, "loss": 3.2258, "theoretical_loss": 4.324625397015783, "tokens_seen": 211681280 }, { "epoch": 0.06, "learning_rate": 0.0004726479438314945, "loss": 3.3235, "theoretical_loss": 4.3244593607013595, "tokens_seen": 211746816 }, { "epoch": 0.06, "learning_rate": 0.00047263791374122367, "loss": 3.4842, "theoretical_loss": 4.324293390151218, "tokens_seen": 211812352 }, { "epoch": 0.06, "learning_rate": 0.0004726278836509529, "loss": 3.4303, "theoretical_loss": 4.324127485318975, "tokens_seen": 211877888 }, { "epoch": 0.06, "learning_rate": 0.00047261785356068203, "loss": 3.3173, "theoretical_loss": 4.323961646158294, "tokens_seen": 211943424 }, { "epoch": 0.06, "learning_rate": 0.00047260782347041127, "loss": 3.365, "theoretical_loss": 4.323795872622884, "tokens_seen": 212008960 }, { "epoch": 0.06, "learning_rate": 0.0004725977933801404, "loss": 3.237, "theoretical_loss": 4.323630164666502, "tokens_seen": 212074496 }, { "epoch": 0.06, "learning_rate": 0.00047258776328986963, "loss": 3.3657, "theoretical_loss": 4.323464522242954, "tokens_seen": 212140032 }, { "epoch": 0.06, "learning_rate": 0.0004725777331995988, "loss": 3.3546, "theoretical_loss": 4.323298945306089, "tokens_seen": 212205568 }, { "epoch": 0.06, "learning_rate": 0.000472567703109328, "loss": 3.2106, "theoretical_loss": 4.3231334338098035, "tokens_seen": 212271104 }, { "epoch": 0.06, "learning_rate": 0.0004725576730190572, "loss": 3.3985, "theoretical_loss": 4.322967987708043, "tokens_seen": 212336640 }, { "epoch": 0.06, "learning_rate": 0.00047254764292878636, "loss": 3.2259, "theoretical_loss": 4.322802606954798, "tokens_seen": 212402176 }, { "epoch": 0.06, "learning_rate": 0.00047253761283851554, "loss": 3.2795, "theoretical_loss": 4.322637291504106, "tokens_seen": 212467712 }, { "epoch": 0.06, "learning_rate": 0.00047252758274824477, "loss": 3.4519, "theoretical_loss": 4.32247204131005, "tokens_seen": 212533248 }, { "epoch": 0.06, "learning_rate": 0.0004725175526579739, "loss": 3.2544, "theoretical_loss": 4.322306856326761, "tokens_seen": 212598784 }, { "epoch": 0.06, "learning_rate": 0.00047250752256770313, "loss": 3.4026, "theoretical_loss": 4.322141736508415, "tokens_seen": 212664320 }, { "epoch": 0.06, "learning_rate": 0.0004724974924774323, "loss": 3.2488, "theoretical_loss": 4.321976681809236, "tokens_seen": 212729856 }, { "epoch": 0.06, "learning_rate": 0.0004724874623871615, "loss": 3.3714, "theoretical_loss": 4.321811692183491, "tokens_seen": 212795392 }, { "epoch": 0.06, "learning_rate": 0.0004724774322968907, "loss": 3.2587, "theoretical_loss": 4.321646767585497, "tokens_seen": 212860928 }, { "epoch": 0.06, "learning_rate": 0.00047246740220661986, "loss": 3.2841, "theoretical_loss": 4.3214819079696145, "tokens_seen": 212926464 }, { "epoch": 0.06, "objective/train/docs_used": 368444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.645022392272949, "objective/train/theoretical_loss": 4.321317113290252, "objective/train/tokens_used": 233452000, "theoretical_loss": 4.321317113290252, "tokens_seen": 212992000 }, { "epoch": 0.06, "learning_rate": 0.00047245737211634904, "loss": 3.3884, "theoretical_loss": 4.321317113290252, "tokens_seen": 212992000 }, { "epoch": 0.06, "learning_rate": 0.0004724473420260783, "loss": 3.2881, "theoretical_loss": 4.321152383501863, "tokens_seen": 213057536 }, { "epoch": 0.06, "learning_rate": 0.0004724373119358074, "loss": 3.3384, "theoretical_loss": 4.320987718558945, "tokens_seen": 213123072 }, { "epoch": 0.06, "learning_rate": 0.00047242728184553664, "loss": 3.3661, "theoretical_loss": 4.320823118416046, "tokens_seen": 213188608 }, { "epoch": 0.06, "learning_rate": 0.00047241725175526576, "loss": 3.2545, "theoretical_loss": 4.320658583027755, "tokens_seen": 213254144 }, { "epoch": 0.06, "learning_rate": 0.000472407221664995, "loss": 3.4435, "theoretical_loss": 4.32049411234871, "tokens_seen": 213319680 }, { "epoch": 0.06, "learning_rate": 0.0004723971915747242, "loss": 3.5344, "theoretical_loss": 4.3203297063335935, "tokens_seen": 213385216 }, { "epoch": 0.06, "learning_rate": 0.00047238716148445336, "loss": 3.1711, "theoretical_loss": 4.320165364937134, "tokens_seen": 213450752 }, { "epoch": 0.06, "learning_rate": 0.00047237713139418254, "loss": 3.5762, "theoretical_loss": 4.320001088114105, "tokens_seen": 213516288 }, { "epoch": 0.06, "learning_rate": 0.0004723671013039117, "loss": 3.5211, "theoretical_loss": 4.319836875819325, "tokens_seen": 213581824 }, { "epoch": 0.06, "learning_rate": 0.0004723570712136409, "loss": 3.2912, "theoretical_loss": 4.31967272800766, "tokens_seen": 213647360 }, { "epoch": 0.06, "learning_rate": 0.00047234704112337014, "loss": 3.3871, "theoretical_loss": 4.319508644634021, "tokens_seen": 213712896 }, { "epoch": 0.06, "learning_rate": 0.00047233701103309927, "loss": 3.494, "theoretical_loss": 4.319344625653361, "tokens_seen": 213778432 }, { "epoch": 0.06, "learning_rate": 0.0004723269809428285, "loss": 3.4014, "theoretical_loss": 4.319180671020684, "tokens_seen": 213843968 }, { "epoch": 0.06, "learning_rate": 0.0004723169508525577, "loss": 3.44, "theoretical_loss": 4.319016780691033, "tokens_seen": 213909504 }, { "epoch": 0.06, "learning_rate": 0.00047230692076228686, "loss": 3.4235, "theoretical_loss": 4.318852954619501, "tokens_seen": 213975040 }, { "epoch": 0.06, "learning_rate": 0.00047229689067201605, "loss": 3.3656, "theoretical_loss": 4.318689192761225, "tokens_seen": 214040576 }, { "epoch": 0.06, "learning_rate": 0.0004722868605817452, "loss": 3.4526, "theoretical_loss": 4.318525495071385, "tokens_seen": 214106112 }, { "epoch": 0.06, "learning_rate": 0.0004722768304914744, "loss": 3.3912, "theoretical_loss": 4.318361861505207, "tokens_seen": 214171648 }, { "epoch": 0.06, "learning_rate": 0.00047226680040120364, "loss": 3.3522, "theoretical_loss": 4.318198292017964, "tokens_seen": 214237184 }, { "epoch": 0.06, "learning_rate": 0.00047225677031093277, "loss": 3.4424, "theoretical_loss": 4.318034786564971, "tokens_seen": 214302720 }, { "epoch": 0.06, "learning_rate": 0.000472246740220662, "loss": 3.251, "theoretical_loss": 4.31787134510159, "tokens_seen": 214368256 }, { "epoch": 0.06, "learning_rate": 0.00047223671013039113, "loss": 3.1358, "theoretical_loss": 4.3177079675832255, "tokens_seen": 214433792 }, { "epoch": 0.06, "learning_rate": 0.00047222668004012037, "loss": 3.3946, "theoretical_loss": 4.317544653965329, "tokens_seen": 214499328 }, { "epoch": 0.07, "learning_rate": 0.0004722166499498496, "loss": 3.5822, "theoretical_loss": 4.3173814042033944, "tokens_seen": 214564864 }, { "epoch": 0.07, "objective/train/docs_used": 371361, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2276971340179443, "objective/train/theoretical_loss": 4.317218218252963, "objective/train/tokens_used": 235090400, "theoretical_loss": 4.317218218252963, "tokens_seen": 214630400 }, { "epoch": 0.07, "learning_rate": 0.00047220661985957873, "loss": 3.305, "theoretical_loss": 4.317218218252963, "tokens_seen": 214630400 }, { "epoch": 0.07, "learning_rate": 0.00047219658976930797, "loss": 3.2718, "theoretical_loss": 4.317055096069618, "tokens_seen": 214695936 }, { "epoch": 0.07, "learning_rate": 0.00047218655967903715, "loss": 3.3142, "theoretical_loss": 4.316892037608987, "tokens_seen": 214761472 }, { "epoch": 0.07, "learning_rate": 0.00047217652958876633, "loss": 3.2232, "theoretical_loss": 4.316729042826745, "tokens_seen": 214827008 }, { "epoch": 0.07, "learning_rate": 0.0004721664994984955, "loss": 3.3066, "theoretical_loss": 4.316566111678609, "tokens_seen": 214892544 }, { "epoch": 0.07, "learning_rate": 0.0004721564694082247, "loss": 3.3105, "theoretical_loss": 4.316403244120339, "tokens_seen": 214958080 }, { "epoch": 0.07, "learning_rate": 0.00047214643931795387, "loss": 3.3791, "theoretical_loss": 4.3162404401077445, "tokens_seen": 215023616 }, { "epoch": 0.07, "learning_rate": 0.0004721364092276831, "loss": 3.2819, "theoretical_loss": 4.316077699596671, "tokens_seen": 215089152 }, { "epoch": 0.07, "learning_rate": 0.00047212637913741223, "loss": 3.3973, "theoretical_loss": 4.315915022543016, "tokens_seen": 215154688 }, { "epoch": 0.07, "learning_rate": 0.00047211634904714147, "loss": 3.5675, "theoretical_loss": 4.315752408902716, "tokens_seen": 215220224 }, { "epoch": 0.07, "learning_rate": 0.0004721063189568706, "loss": 3.4498, "theoretical_loss": 4.315589858631755, "tokens_seen": 215285760 }, { "epoch": 0.07, "learning_rate": 0.00047209628886659983, "loss": 3.3734, "theoretical_loss": 4.315427371686157, "tokens_seen": 215351296 }, { "epoch": 0.07, "learning_rate": 0.000472086258776329, "loss": 3.2779, "theoretical_loss": 4.315264948021994, "tokens_seen": 215416832 }, { "epoch": 0.07, "learning_rate": 0.0004720762286860582, "loss": 3.307, "theoretical_loss": 4.315102587595379, "tokens_seen": 215482368 }, { "epoch": 0.07, "learning_rate": 0.0004720661985957874, "loss": 3.2385, "theoretical_loss": 4.31494029036247, "tokens_seen": 215547904 }, { "epoch": 0.07, "learning_rate": 0.00047205616850551656, "loss": 3.4308, "theoretical_loss": 4.314778056279468, "tokens_seen": 215613440 }, { "epoch": 0.07, "learning_rate": 0.00047204613841524574, "loss": 3.3682, "theoretical_loss": 4.314615885302619, "tokens_seen": 215678976 }, { "epoch": 0.07, "learning_rate": 0.00047203610832497497, "loss": 3.4469, "theoretical_loss": 4.314453777388209, "tokens_seen": 215744512 }, { "epoch": 0.07, "learning_rate": 0.0004720260782347041, "loss": 3.254, "theoretical_loss": 4.314291732492573, "tokens_seen": 215810048 }, { "epoch": 0.07, "learning_rate": 0.00047201604814443333, "loss": 3.385, "theoretical_loss": 4.314129750572087, "tokens_seen": 215875584 }, { "epoch": 0.07, "learning_rate": 0.0004720060180541625, "loss": 3.4436, "theoretical_loss": 4.3139678315831675, "tokens_seen": 215941120 }, { "epoch": 0.07, "learning_rate": 0.0004719959879638917, "loss": 3.321, "theoretical_loss": 4.313805975482278, "tokens_seen": 216006656 }, { "epoch": 0.07, "learning_rate": 0.0004719859578736209, "loss": 3.3788, "theoretical_loss": 4.313644182225926, "tokens_seen": 216072192 }, { "epoch": 0.07, "learning_rate": 0.00047197592778335006, "loss": 3.4589, "theoretical_loss": 4.313482451770659, "tokens_seen": 216137728 }, { "epoch": 0.07, "learning_rate": 0.00047196589769307924, "loss": 3.1937, "theoretical_loss": 4.313320784073069, "tokens_seen": 216203264 }, { "epoch": 0.07, "objective/train/docs_used": 373872, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.527864933013916, "objective/train/theoretical_loss": 4.3131591790897925, "objective/train/tokens_used": 236728800, "theoretical_loss": 4.3131591790897925, "tokens_seen": 216268800 }, { "epoch": 0.07, "learning_rate": 0.0004719558676028085, "loss": 3.4104, "theoretical_loss": 4.3131591790897925, "tokens_seen": 216268800 }, { "epoch": 0.07, "learning_rate": 0.0004719458375125376, "loss": 3.5124, "theoretical_loss": 4.3129976367775065, "tokens_seen": 216334336 }, { "epoch": 0.07, "learning_rate": 0.00047193580742226684, "loss": 3.3647, "theoretical_loss": 4.312836157092934, "tokens_seen": 216399872 }, { "epoch": 0.07, "learning_rate": 0.00047192577733199596, "loss": 3.3762, "theoretical_loss": 4.312674739992839, "tokens_seen": 216465408 }, { "epoch": 0.07, "learning_rate": 0.0004719157472417252, "loss": 3.5008, "theoretical_loss": 4.31251338543403, "tokens_seen": 216530944 }, { "epoch": 0.07, "learning_rate": 0.0004719057171514544, "loss": 3.3981, "theoretical_loss": 4.312352093373354, "tokens_seen": 216596480 }, { "epoch": 0.07, "learning_rate": 0.00047189568706118356, "loss": 3.4092, "theoretical_loss": 4.312190863767708, "tokens_seen": 216662016 }, { "epoch": 0.07, "learning_rate": 0.00047188565697091274, "loss": 3.4016, "theoretical_loss": 4.312029696574027, "tokens_seen": 216727552 }, { "epoch": 0.07, "learning_rate": 0.0004718756268806419, "loss": 3.3583, "theoretical_loss": 4.311868591749287, "tokens_seen": 216793088 }, { "epoch": 0.07, "learning_rate": 0.0004718655967903711, "loss": 3.3424, "theoretical_loss": 4.311707549250514, "tokens_seen": 216858624 }, { "epoch": 0.07, "learning_rate": 0.00047185556670010034, "loss": 3.2557, "theoretical_loss": 4.311546569034767, "tokens_seen": 216924160 }, { "epoch": 0.07, "learning_rate": 0.00047184553660982947, "loss": 3.2043, "theoretical_loss": 4.311385651059155, "tokens_seen": 216989696 }, { "epoch": 0.07, "learning_rate": 0.0004718355065195587, "loss": 3.2298, "theoretical_loss": 4.311224795280825, "tokens_seen": 217055232 }, { "epoch": 0.07, "learning_rate": 0.0004718254764292879, "loss": 3.4558, "theoretical_loss": 4.3110640016569715, "tokens_seen": 217120768 }, { "epoch": 0.07, "learning_rate": 0.00047181544633901706, "loss": 3.2275, "theoretical_loss": 4.310903270144825, "tokens_seen": 217186304 }, { "epoch": 0.07, "learning_rate": 0.00047180541624874625, "loss": 3.3978, "theoretical_loss": 4.310742600701664, "tokens_seen": 217251840 }, { "epoch": 0.07, "learning_rate": 0.0004717953861584754, "loss": 3.2859, "theoretical_loss": 4.310581993284805, "tokens_seen": 217317376 }, { "epoch": 0.07, "learning_rate": 0.0004717853560682046, "loss": 3.2828, "theoretical_loss": 4.310421447851609, "tokens_seen": 217382912 }, { "epoch": 0.07, "learning_rate": 0.00047177532597793384, "loss": 3.558, "theoretical_loss": 4.310260964359479, "tokens_seen": 217448448 }, { "epoch": 0.07, "learning_rate": 0.00047176529588766297, "loss": 3.3885, "theoretical_loss": 4.310100542765858, "tokens_seen": 217513984 }, { "epoch": 0.07, "learning_rate": 0.0004717552657973922, "loss": 3.2239, "theoretical_loss": 4.309940183028236, "tokens_seen": 217579520 }, { "epoch": 0.07, "learning_rate": 0.00047174523570712133, "loss": 3.323, "theoretical_loss": 4.309779885104139, "tokens_seen": 217645056 }, { "epoch": 0.07, "learning_rate": 0.00047173520561685057, "loss": 3.5429, "theoretical_loss": 4.309619648951139, "tokens_seen": 217710592 }, { "epoch": 0.07, "learning_rate": 0.00047172517552657975, "loss": 3.2623, "theoretical_loss": 4.3094594745268475, "tokens_seen": 217776128 }, { "epoch": 0.07, "learning_rate": 0.00047171514543630893, "loss": 3.416, "theoretical_loss": 4.30929936178892, "tokens_seen": 217841664 }, { "epoch": 0.07, "objective/train/docs_used": 376678, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1398980617523193, "objective/train/theoretical_loss": 4.309139310695053, "objective/train/tokens_used": 238367200, "theoretical_loss": 4.309139310695053, "tokens_seen": 217907200 }, { "epoch": 0.07, "learning_rate": 0.0004717051153460381, "loss": 3.332, "theoretical_loss": 4.309139310695053, "tokens_seen": 217907200 }, { "epoch": 0.07, "learning_rate": 0.00047169508525576735, "loss": 3.4118, "theoretical_loss": 4.308979321202983, "tokens_seen": 217972736 }, { "epoch": 0.07, "learning_rate": 0.0004716850551654965, "loss": 3.2532, "theoretical_loss": 4.308819393270491, "tokens_seen": 218038272 }, { "epoch": 0.07, "learning_rate": 0.0004716750250752257, "loss": 3.3134, "theoretical_loss": 4.308659526855396, "tokens_seen": 218103808 }, { "epoch": 0.07, "learning_rate": 0.00047166499498495484, "loss": 3.4797, "theoretical_loss": 4.308499721915563, "tokens_seen": 218169344 }, { "epoch": 0.07, "learning_rate": 0.00047165496489468407, "loss": 3.2982, "theoretical_loss": 4.308339978408897, "tokens_seen": 218234880 }, { "epoch": 0.07, "learning_rate": 0.00047164493480441325, "loss": 3.4844, "theoretical_loss": 4.308180296293341, "tokens_seen": 218300416 }, { "epoch": 0.07, "learning_rate": 0.00047163490471414243, "loss": 3.2677, "theoretical_loss": 4.308020675526883, "tokens_seen": 218365952 }, { "epoch": 0.07, "learning_rate": 0.0004716248746238716, "loss": 3.1504, "theoretical_loss": 4.307861116067554, "tokens_seen": 218431488 }, { "epoch": 0.07, "learning_rate": 0.0004716148445336008, "loss": 3.5423, "theoretical_loss": 4.30770161787342, "tokens_seen": 218497024 }, { "epoch": 0.07, "learning_rate": 0.00047160481444333, "loss": 3.3352, "theoretical_loss": 4.307542180902594, "tokens_seen": 218562560 }, { "epoch": 0.07, "learning_rate": 0.0004715947843530592, "loss": 3.4815, "theoretical_loss": 4.307382805113228, "tokens_seen": 218628096 }, { "epoch": 0.07, "learning_rate": 0.00047158475426278834, "loss": 3.3257, "theoretical_loss": 4.307223490463516, "tokens_seen": 218693632 }, { "epoch": 0.07, "learning_rate": 0.0004715747241725176, "loss": 3.4674, "theoretical_loss": 4.307064236911692, "tokens_seen": 218759168 }, { "epoch": 0.07, "learning_rate": 0.0004715646940822467, "loss": 3.4006, "theoretical_loss": 4.30690504441603, "tokens_seen": 218824704 }, { "epoch": 0.07, "learning_rate": 0.00047155466399197594, "loss": 3.2173, "theoretical_loss": 4.306745912934849, "tokens_seen": 218890240 }, { "epoch": 0.07, "learning_rate": 0.0004715446339017051, "loss": 3.3985, "theoretical_loss": 4.306586842426504, "tokens_seen": 218955776 }, { "epoch": 0.07, "learning_rate": 0.0004715346038114343, "loss": 3.2482, "theoretical_loss": 4.306427832849394, "tokens_seen": 219021312 }, { "epoch": 0.07, "learning_rate": 0.0004715245737211635, "loss": 3.4141, "theoretical_loss": 4.306268884161959, "tokens_seen": 219086848 }, { "epoch": 0.07, "learning_rate": 0.0004715145436308927, "loss": 3.3153, "theoretical_loss": 4.306109996322679, "tokens_seen": 219152384 }, { "epoch": 0.07, "learning_rate": 0.00047150451354062184, "loss": 3.4532, "theoretical_loss": 4.305951169290073, "tokens_seen": 219217920 }, { "epoch": 0.07, "learning_rate": 0.0004714944834503511, "loss": 3.2605, "theoretical_loss": 4.305792403022703, "tokens_seen": 219283456 }, { "epoch": 0.07, "learning_rate": 0.0004714844533600802, "loss": 3.4034, "theoretical_loss": 4.305633697479171, "tokens_seen": 219348992 }, { "epoch": 0.07, "learning_rate": 0.00047147442326980944, "loss": 3.3452, "theoretical_loss": 4.305475052618119, "tokens_seen": 219414528 }, { "epoch": 0.07, "learning_rate": 0.0004714643931795387, "loss": 3.3603, "theoretical_loss": 4.30531646839823, "tokens_seen": 219480064 }, { "epoch": 0.07, "objective/train/docs_used": 379444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.240506172180176, "objective/train/theoretical_loss": 4.305157944778228, "objective/train/tokens_used": 240005600, "theoretical_loss": 4.305157944778228, "tokens_seen": 219545600 }, { "epoch": 0.07, "learning_rate": 0.0004714543630892678, "loss": 3.2283, "theoretical_loss": 4.305157944778228, "tokens_seen": 219545600 }, { "epoch": 0.07, "learning_rate": 0.00047144433299899704, "loss": 3.4373, "theoretical_loss": 4.304999481716876, "tokens_seen": 219611136 }, { "epoch": 0.07, "learning_rate": 0.00047143430290872616, "loss": 3.32, "theoretical_loss": 4.304841079172979, "tokens_seen": 219676672 }, { "epoch": 0.07, "learning_rate": 0.0004714242728184554, "loss": 3.2323, "theoretical_loss": 4.30468273710538, "tokens_seen": 219742208 }, { "epoch": 0.07, "learning_rate": 0.0004714142427281846, "loss": 3.5193, "theoretical_loss": 4.304524455472965, "tokens_seen": 219807744 }, { "epoch": 0.07, "learning_rate": 0.00047140421263791376, "loss": 3.4341, "theoretical_loss": 4.304366234234659, "tokens_seen": 219873280 }, { "epoch": 0.07, "learning_rate": 0.00047139418254764294, "loss": 3.3078, "theoretical_loss": 4.304208073349426, "tokens_seen": 219938816 }, { "epoch": 0.07, "learning_rate": 0.0004713841524573721, "loss": 3.3199, "theoretical_loss": 4.304049972776271, "tokens_seen": 220004352 }, { "epoch": 0.07, "learning_rate": 0.0004713741223671013, "loss": 3.5086, "theoretical_loss": 4.30389193247424, "tokens_seen": 220069888 }, { "epoch": 0.07, "learning_rate": 0.00047136409227683054, "loss": 3.366, "theoretical_loss": 4.303733952402419, "tokens_seen": 220135424 }, { "epoch": 0.07, "learning_rate": 0.00047135406218655967, "loss": 3.3814, "theoretical_loss": 4.303576032519931, "tokens_seen": 220200960 }, { "epoch": 0.07, "learning_rate": 0.0004713440320962889, "loss": 3.1313, "theoretical_loss": 4.303418172785943, "tokens_seen": 220266496 }, { "epoch": 0.07, "learning_rate": 0.0004713340020060181, "loss": 3.3322, "theoretical_loss": 4.303260373159659, "tokens_seen": 220332032 }, { "epoch": 0.07, "learning_rate": 0.00047132397191574726, "loss": 3.2583, "theoretical_loss": 4.303102633600322, "tokens_seen": 220397568 }, { "epoch": 0.07, "learning_rate": 0.00047131394182547645, "loss": 3.3567, "theoretical_loss": 4.30294495406722, "tokens_seen": 220463104 }, { "epoch": 0.07, "learning_rate": 0.00047130391173520563, "loss": 3.5088, "theoretical_loss": 4.3027873345196745, "tokens_seen": 220528640 }, { "epoch": 0.07, "learning_rate": 0.0004712938816449348, "loss": 3.3243, "theoretical_loss": 4.302629774917049, "tokens_seen": 220594176 }, { "epoch": 0.07, "learning_rate": 0.00047128385155466404, "loss": 3.3814, "theoretical_loss": 4.302472275218748, "tokens_seen": 220659712 }, { "epoch": 0.07, "learning_rate": 0.00047127382146439317, "loss": 3.3138, "theoretical_loss": 4.302314835384214, "tokens_seen": 220725248 }, { "epoch": 0.07, "learning_rate": 0.0004712637913741224, "loss": 3.3943, "theoretical_loss": 4.30215745537293, "tokens_seen": 220790784 }, { "epoch": 0.07, "learning_rate": 0.00047125376128385153, "loss": 3.4101, "theoretical_loss": 4.302000135144416, "tokens_seen": 220856320 }, { "epoch": 0.07, "learning_rate": 0.00047124373119358077, "loss": 3.3554, "theoretical_loss": 4.301842874658235, "tokens_seen": 220921856 }, { "epoch": 0.07, "learning_rate": 0.00047123370110330995, "loss": 3.4482, "theoretical_loss": 4.301685673873987, "tokens_seen": 220987392 }, { "epoch": 0.07, "learning_rate": 0.00047122367101303913, "loss": 3.2897, "theoretical_loss": 4.301528532751312, "tokens_seen": 221052928 }, { "epoch": 0.07, "learning_rate": 0.0004712136409227683, "loss": 3.2168, "theoretical_loss": 4.301371451249888, "tokens_seen": 221118464 }, { "epoch": 0.07, "objective/train/docs_used": 382238, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.402163505554199, "objective/train/theoretical_loss": 4.301214429329433, "objective/train/tokens_used": 241644000, "theoretical_loss": 4.301214429329433, "tokens_seen": 221184000 }, { "epoch": 0.07, "learning_rate": 0.00047120361083249755, "loss": 3.3688, "theoretical_loss": 4.301214429329433, "tokens_seen": 221184000 }, { "epoch": 0.07, "learning_rate": 0.0004711935807422267, "loss": 3.3468, "theoretical_loss": 4.301057466949707, "tokens_seen": 221249536 }, { "epoch": 0.07, "learning_rate": 0.0004711835506519559, "loss": 3.3963, "theoretical_loss": 4.300900564070504, "tokens_seen": 221315072 }, { "epoch": 0.07, "learning_rate": 0.00047117352056168504, "loss": 3.2847, "theoretical_loss": 4.30074372065166, "tokens_seen": 221380608 }, { "epoch": 0.07, "learning_rate": 0.00047116349047141427, "loss": 3.3806, "theoretical_loss": 4.300586936653049, "tokens_seen": 221446144 }, { "epoch": 0.07, "learning_rate": 0.00047115346038114345, "loss": 3.2935, "theoretical_loss": 4.300430212034587, "tokens_seen": 221511680 }, { "epoch": 0.07, "learning_rate": 0.00047114343029087263, "loss": 3.4121, "theoretical_loss": 4.300273546756223, "tokens_seen": 221577216 }, { "epoch": 0.07, "learning_rate": 0.0004711334002006018, "loss": 3.3818, "theoretical_loss": 4.300116940777951, "tokens_seen": 221642752 }, { "epoch": 0.07, "learning_rate": 0.000471123370110331, "loss": 3.3542, "theoretical_loss": 4.299960394059799, "tokens_seen": 221708288 }, { "epoch": 0.07, "learning_rate": 0.0004711133400200602, "loss": 3.2989, "theoretical_loss": 4.299803906561835, "tokens_seen": 221773824 }, { "epoch": 0.07, "learning_rate": 0.0004711033099297894, "loss": 3.2897, "theoretical_loss": 4.29964747824417, "tokens_seen": 221839360 }, { "epoch": 0.07, "learning_rate": 0.00047109327983951854, "loss": 3.2081, "theoretical_loss": 4.299491109066947, "tokens_seen": 221904896 }, { "epoch": 0.07, "learning_rate": 0.0004710832497492478, "loss": 3.4808, "theoretical_loss": 4.299334798990351, "tokens_seen": 221970432 }, { "epoch": 0.07, "learning_rate": 0.0004710732196589769, "loss": 3.3801, "theoretical_loss": 4.2991785479746065, "tokens_seen": 222035968 }, { "epoch": 0.07, "learning_rate": 0.00047106318956870614, "loss": 3.3562, "theoretical_loss": 4.299022355979974, "tokens_seen": 222101504 }, { "epoch": 0.07, "learning_rate": 0.0004710531594784353, "loss": 3.4946, "theoretical_loss": 4.298866222966755, "tokens_seen": 222167040 }, { "epoch": 0.07, "learning_rate": 0.0004710431293881645, "loss": 3.2386, "theoretical_loss": 4.298710148895286, "tokens_seen": 222232576 }, { "epoch": 0.07, "learning_rate": 0.0004710330992978937, "loss": 3.3829, "theoretical_loss": 4.298554133725946, "tokens_seen": 222298112 }, { "epoch": 0.07, "learning_rate": 0.0004710230692076229, "loss": 3.341, "theoretical_loss": 4.298398177419149, "tokens_seen": 222363648 }, { "epoch": 0.07, "learning_rate": 0.00047101303911735204, "loss": 3.4955, "theoretical_loss": 4.298242279935349, "tokens_seen": 222429184 }, { "epoch": 0.07, "learning_rate": 0.0004710030090270813, "loss": 3.2498, "theoretical_loss": 4.2980864412350375, "tokens_seen": 222494720 }, { "epoch": 0.07, "learning_rate": 0.0004709929789368104, "loss": 3.2604, "theoretical_loss": 4.297930661278745, "tokens_seen": 222560256 }, { "epoch": 0.07, "learning_rate": 0.00047098294884653964, "loss": 3.2939, "theoretical_loss": 4.297774940027038, "tokens_seen": 222625792 }, { "epoch": 0.07, "learning_rate": 0.0004709729187562688, "loss": 3.3322, "theoretical_loss": 4.297619277440523, "tokens_seen": 222691328 }, { "epoch": 0.07, "learning_rate": 0.000470962888665998, "loss": 3.2764, "theoretical_loss": 4.297463673479846, "tokens_seen": 222756864 }, { "epoch": 0.07, "objective/train/docs_used": 385121, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4129462242126465, "objective/train/theoretical_loss": 4.297308128105687, "objective/train/tokens_used": 243282400, "theoretical_loss": 4.297308128105687, "tokens_seen": 222822400 }, { "epoch": 0.07, "learning_rate": 0.0004709528585757272, "loss": 3.3831, "theoretical_loss": 4.297308128105687, "tokens_seen": 222822400 }, { "epoch": 0.07, "learning_rate": 0.00047094282848545636, "loss": 3.3724, "theoretical_loss": 4.297152641278767, "tokens_seen": 222887936 }, { "epoch": 0.07, "learning_rate": 0.00047093279839518555, "loss": 3.2961, "theoretical_loss": 4.296997212959842, "tokens_seen": 222953472 }, { "epoch": 0.07, "learning_rate": 0.0004709227683049148, "loss": 3.521, "theoretical_loss": 4.296841843109711, "tokens_seen": 223019008 }, { "epoch": 0.07, "learning_rate": 0.0004709127382146439, "loss": 3.3597, "theoretical_loss": 4.296686531689204, "tokens_seen": 223084544 }, { "epoch": 0.07, "learning_rate": 0.00047090270812437314, "loss": 3.3755, "theoretical_loss": 4.296531278659193, "tokens_seen": 223150080 }, { "epoch": 0.07, "learning_rate": 0.00047089267803410227, "loss": 3.3058, "theoretical_loss": 4.296376083980589, "tokens_seen": 223215616 }, { "epoch": 0.07, "learning_rate": 0.0004708826479438315, "loss": 3.3455, "theoretical_loss": 4.296220947614337, "tokens_seen": 223281152 }, { "epoch": 0.07, "learning_rate": 0.0004708726178535607, "loss": 3.3322, "theoretical_loss": 4.296065869521421, "tokens_seen": 223346688 }, { "epoch": 0.07, "learning_rate": 0.00047086258776328987, "loss": 3.4259, "theoretical_loss": 4.295910849662862, "tokens_seen": 223412224 }, { "epoch": 0.07, "learning_rate": 0.00047085255767301905, "loss": 3.2823, "theoretical_loss": 4.2957558879997215, "tokens_seen": 223477760 }, { "epoch": 0.07, "learning_rate": 0.0004708425275827483, "loss": 3.3467, "theoretical_loss": 4.295600984493093, "tokens_seen": 223543296 }, { "epoch": 0.07, "learning_rate": 0.0004708324974924774, "loss": 3.3608, "theoretical_loss": 4.295446139104112, "tokens_seen": 223608832 }, { "epoch": 0.07, "learning_rate": 0.00047082246740220665, "loss": 3.4156, "theoretical_loss": 4.295291351793951, "tokens_seen": 223674368 }, { "epoch": 0.07, "learning_rate": 0.0004708124373119358, "loss": 3.4316, "theoretical_loss": 4.295136622523817, "tokens_seen": 223739904 }, { "epoch": 0.07, "learning_rate": 0.000470802407221665, "loss": 3.28, "theoretical_loss": 4.294981951254956, "tokens_seen": 223805440 }, { "epoch": 0.07, "learning_rate": 0.0004707923771313942, "loss": 3.1746, "theoretical_loss": 4.294827337948651, "tokens_seen": 223870976 }, { "epoch": 0.07, "learning_rate": 0.00047078234704112337, "loss": 3.2904, "theoretical_loss": 4.294672782566224, "tokens_seen": 223936512 }, { "epoch": 0.07, "learning_rate": 0.00047077231695085255, "loss": 3.2774, "theoretical_loss": 4.29451828506903, "tokens_seen": 224002048 }, { "epoch": 0.07, "learning_rate": 0.00047076228686058173, "loss": 3.3111, "theoretical_loss": 4.294363845418465, "tokens_seen": 224067584 }, { "epoch": 0.07, "learning_rate": 0.0004707522567703109, "loss": 3.3969, "theoretical_loss": 4.29420946357596, "tokens_seen": 224133120 }, { "epoch": 0.07, "learning_rate": 0.00047074222668004015, "loss": 3.2707, "theoretical_loss": 4.294055139502985, "tokens_seen": 224198656 }, { "epoch": 0.07, "learning_rate": 0.0004707321965897693, "loss": 3.26, "theoretical_loss": 4.293900873161043, "tokens_seen": 224264192 }, { "epoch": 0.07, "learning_rate": 0.0004707221664994985, "loss": 3.3865, "theoretical_loss": 4.293746664511678, "tokens_seen": 224329728 }, { "epoch": 0.07, "learning_rate": 0.00047071213640922775, "loss": 3.2402, "theoretical_loss": 4.293592513516469, "tokens_seen": 224395264 }, { "epoch": 0.07, "objective/train/docs_used": 387710, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9744443893432617, "objective/train/theoretical_loss": 4.293438420137031, "objective/train/tokens_used": 244920800, "theoretical_loss": 4.293438420137031, "tokens_seen": 224460800 }, { "epoch": 0.07, "learning_rate": 0.0004707021063189569, "loss": 3.293, "theoretical_loss": 4.293438420137031, "tokens_seen": 224460800 }, { "epoch": 0.07, "learning_rate": 0.0004706920762286861, "loss": 3.3204, "theoretical_loss": 4.293284384335017, "tokens_seen": 224526336 }, { "epoch": 0.07, "learning_rate": 0.00047068204613841524, "loss": 3.4014, "theoretical_loss": 4.293130406072118, "tokens_seen": 224591872 }, { "epoch": 0.07, "learning_rate": 0.00047067201604814447, "loss": 3.2468, "theoretical_loss": 4.292976485310057, "tokens_seen": 224657408 }, { "epoch": 0.07, "learning_rate": 0.00047066198595787365, "loss": 3.1612, "theoretical_loss": 4.2928226220106005, "tokens_seen": 224722944 }, { "epoch": 0.07, "learning_rate": 0.00047065195586760283, "loss": 3.2405, "theoretical_loss": 4.292668816135545, "tokens_seen": 224788480 }, { "epoch": 0.07, "learning_rate": 0.000470641925777332, "loss": 3.0516, "theoretical_loss": 4.292515067646727, "tokens_seen": 224854016 }, { "epoch": 0.07, "learning_rate": 0.0004706318956870612, "loss": 3.3477, "theoretical_loss": 4.29236137650602, "tokens_seen": 224919552 }, { "epoch": 0.07, "learning_rate": 0.0004706218655967904, "loss": 3.2916, "theoretical_loss": 4.2922077426753305, "tokens_seen": 224985088 }, { "epoch": 0.07, "learning_rate": 0.0004706118355065196, "loss": 3.354, "theoretical_loss": 4.292054166116605, "tokens_seen": 225050624 }, { "epoch": 0.07, "learning_rate": 0.00047060180541624874, "loss": 3.3479, "theoretical_loss": 4.291900646791825, "tokens_seen": 225116160 }, { "epoch": 0.07, "learning_rate": 0.000470591775325978, "loss": 3.1503, "theoretical_loss": 4.2917471846630075, "tokens_seen": 225181696 }, { "epoch": 0.07, "learning_rate": 0.0004705817452357071, "loss": 3.3027, "theoretical_loss": 4.291593779692207, "tokens_seen": 225247232 }, { "epoch": 0.07, "learning_rate": 0.00047057171514543634, "loss": 3.3392, "theoretical_loss": 4.291440431841513, "tokens_seen": 225312768 }, { "epoch": 0.07, "learning_rate": 0.0004705616850551655, "loss": 3.3007, "theoretical_loss": 4.291287141073053, "tokens_seen": 225378304 }, { "epoch": 0.07, "learning_rate": 0.0004705516549648947, "loss": 3.3589, "theoretical_loss": 4.291133907348989, "tokens_seen": 225443840 }, { "epoch": 0.07, "learning_rate": 0.0004705416248746239, "loss": 3.3094, "theoretical_loss": 4.29098073063152, "tokens_seen": 225509376 }, { "epoch": 0.07, "learning_rate": 0.0004705315947843531, "loss": 3.3703, "theoretical_loss": 4.29082761088288, "tokens_seen": 225574912 }, { "epoch": 0.07, "learning_rate": 0.00047052156469408224, "loss": 3.2828, "theoretical_loss": 4.290674548065338, "tokens_seen": 225640448 }, { "epoch": 0.07, "learning_rate": 0.0004705115346038115, "loss": 3.2581, "theoretical_loss": 4.290521542141203, "tokens_seen": 225705984 }, { "epoch": 0.07, "learning_rate": 0.0004705015045135406, "loss": 3.1311, "theoretical_loss": 4.290368593072817, "tokens_seen": 225771520 }, { "epoch": 0.07, "learning_rate": 0.00047049147442326984, "loss": 3.2967, "theoretical_loss": 4.290215700822556, "tokens_seen": 225837056 }, { "epoch": 0.07, "learning_rate": 0.000470481444332999, "loss": 3.4585, "theoretical_loss": 4.290062865352837, "tokens_seen": 225902592 }, { "epoch": 0.07, "learning_rate": 0.0004704714142427282, "loss": 3.3981, "theoretical_loss": 4.289910086626108, "tokens_seen": 225968128 }, { "epoch": 0.07, "learning_rate": 0.0004704613841524574, "loss": 3.2362, "theoretical_loss": 4.289757364604855, "tokens_seen": 226033664 }, { "epoch": 0.07, "objective/train/docs_used": 389112, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.214838981628418, "objective/train/theoretical_loss": 4.2896046992515995, "objective/train/tokens_used": 246559200, "theoretical_loss": 4.2896046992515995, "tokens_seen": 226099200 }, { "epoch": 0.07, "learning_rate": 0.00047045135406218656, "loss": 3.2334, "theoretical_loss": 4.2896046992515995, "tokens_seen": 226099200 }, { "epoch": 0.07, "learning_rate": 0.00047044132397191575, "loss": 3.4242, "theoretical_loss": 4.289452090528897, "tokens_seen": 226164736 }, { "epoch": 0.07, "learning_rate": 0.000470431293881645, "loss": 3.4775, "theoretical_loss": 4.289299538399341, "tokens_seen": 226230272 }, { "epoch": 0.07, "learning_rate": 0.0004704212637913741, "loss": 3.3707, "theoretical_loss": 4.28914704282556, "tokens_seen": 226295808 }, { "epoch": 0.07, "learning_rate": 0.00047041123370110334, "loss": 3.3761, "theoretical_loss": 4.288994603770215, "tokens_seen": 226361344 }, { "epoch": 0.07, "learning_rate": 0.00047040120361083247, "loss": 3.3698, "theoretical_loss": 4.288842221196007, "tokens_seen": 226426880 }, { "epoch": 0.07, "learning_rate": 0.0004703911735205617, "loss": 3.2773, "theoretical_loss": 4.28868989506567, "tokens_seen": 226492416 }, { "epoch": 0.07, "learning_rate": 0.0004703811434302909, "loss": 3.3228, "theoretical_loss": 4.288537625341974, "tokens_seen": 226557952 }, { "epoch": 0.07, "learning_rate": 0.00047037111334002007, "loss": 3.4612, "theoretical_loss": 4.288385411987722, "tokens_seen": 226623488 }, { "epoch": 0.07, "learning_rate": 0.00047036108324974925, "loss": 3.4314, "theoretical_loss": 4.288233254965755, "tokens_seen": 226689024 }, { "epoch": 0.07, "learning_rate": 0.0004703510531594785, "loss": 3.49, "theoretical_loss": 4.2880811542389505, "tokens_seen": 226754560 }, { "epoch": 0.07, "learning_rate": 0.0004703410230692076, "loss": 3.18, "theoretical_loss": 4.287929109770217, "tokens_seen": 226820096 }, { "epoch": 0.07, "learning_rate": 0.00047033099297893685, "loss": 3.4999, "theoretical_loss": 4.287777121522501, "tokens_seen": 226885632 }, { "epoch": 0.07, "learning_rate": 0.000470320962888666, "loss": 3.4895, "theoretical_loss": 4.287625189458781, "tokens_seen": 226951168 }, { "epoch": 0.07, "learning_rate": 0.0004703109327983952, "loss": 3.09, "theoretical_loss": 4.287473313542077, "tokens_seen": 227016704 }, { "epoch": 0.07, "learning_rate": 0.0004703009027081244, "loss": 3.3408, "theoretical_loss": 4.287321493735438, "tokens_seen": 227082240 }, { "epoch": 0.07, "learning_rate": 0.00047029087261785357, "loss": 3.2609, "theoretical_loss": 4.287169730001949, "tokens_seen": 227147776 }, { "epoch": 0.07, "learning_rate": 0.00047028084252758275, "loss": 3.3684, "theoretical_loss": 4.287018022304733, "tokens_seen": 227213312 }, { "epoch": 0.07, "learning_rate": 0.00047027081243731193, "loss": 3.1466, "theoretical_loss": 4.286866370606943, "tokens_seen": 227278848 }, { "epoch": 0.07, "learning_rate": 0.0004702607823470411, "loss": 3.2951, "theoretical_loss": 4.286714774871772, "tokens_seen": 227344384 }, { "epoch": 0.07, "learning_rate": 0.00047025075225677035, "loss": 3.1628, "theoretical_loss": 4.286563235062444, "tokens_seen": 227409920 }, { "epoch": 0.07, "learning_rate": 0.0004702407221664995, "loss": 3.428, "theoretical_loss": 4.28641175114222, "tokens_seen": 227475456 }, { "epoch": 0.07, "learning_rate": 0.0004702306920762287, "loss": 3.2809, "theoretical_loss": 4.286260323074394, "tokens_seen": 227540992 }, { "epoch": 0.07, "learning_rate": 0.00047022066198595784, "loss": 3.4657, "theoretical_loss": 4.286108950822296, "tokens_seen": 227606528 }, { "epoch": 0.07, "learning_rate": 0.0004702106318956871, "loss": 3.2204, "theoretical_loss": 4.285957634349289, "tokens_seen": 227672064 }, { "epoch": 0.07, "objective/train/docs_used": 391912, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9988656044006348, "objective/train/theoretical_loss": 4.285806373618774, "objective/train/tokens_used": 248197600, "theoretical_loss": 4.285806373618774, "tokens_seen": 227737600 }, { "epoch": 0.07, "learning_rate": 0.00047020060180541626, "loss": 3.1812, "theoretical_loss": 4.285806373618774, "tokens_seen": 227737600 }, { "epoch": 0.07, "learning_rate": 0.00047019057171514544, "loss": 3.2592, "theoretical_loss": 4.285655168594182, "tokens_seen": 227803136 }, { "epoch": 0.07, "learning_rate": 0.0004701805416248746, "loss": 3.2476, "theoretical_loss": 4.285504019238982, "tokens_seen": 227868672 }, { "epoch": 0.07, "learning_rate": 0.00047017051153460385, "loss": 3.2497, "theoretical_loss": 4.285352925516676, "tokens_seen": 227934208 }, { "epoch": 0.07, "learning_rate": 0.000470160481444333, "loss": 3.2217, "theoretical_loss": 4.2852018873908, "tokens_seen": 227999744 }, { "epoch": 0.07, "learning_rate": 0.0004701504513540622, "loss": 3.3709, "theoretical_loss": 4.285050904824925, "tokens_seen": 228065280 }, { "epoch": 0.07, "learning_rate": 0.00047014042126379134, "loss": 3.5261, "theoretical_loss": 4.284899977782658, "tokens_seen": 228130816 }, { "epoch": 0.07, "learning_rate": 0.0004701303911735206, "loss": 3.4997, "theoretical_loss": 4.284749106227636, "tokens_seen": 228196352 }, { "epoch": 0.07, "learning_rate": 0.00047012036108324976, "loss": 3.3724, "theoretical_loss": 4.284598290123535, "tokens_seen": 228261888 }, { "epoch": 0.07, "learning_rate": 0.00047011033099297894, "loss": 3.3726, "theoretical_loss": 4.284447529434061, "tokens_seen": 228327424 }, { "epoch": 0.07, "learning_rate": 0.0004701003009027081, "loss": 3.2778, "theoretical_loss": 4.284296824122959, "tokens_seen": 228392960 }, { "epoch": 0.07, "learning_rate": 0.0004700902708124373, "loss": 3.3521, "theoretical_loss": 4.284146174154003, "tokens_seen": 228458496 }, { "epoch": 0.07, "learning_rate": 0.0004700802407221665, "loss": 3.1583, "theoretical_loss": 4.283995579491004, "tokens_seen": 228524032 }, { "epoch": 0.07, "learning_rate": 0.0004700702106318957, "loss": 3.4583, "theoretical_loss": 4.283845040097807, "tokens_seen": 228589568 }, { "epoch": 0.07, "learning_rate": 0.00047006018054162484, "loss": 3.374, "theoretical_loss": 4.28369455593829, "tokens_seen": 228655104 }, { "epoch": 0.07, "learning_rate": 0.0004700501504513541, "loss": 3.1911, "theoretical_loss": 4.2835441269763646, "tokens_seen": 228720640 }, { "epoch": 0.07, "learning_rate": 0.00047004012036108326, "loss": 3.2163, "theoretical_loss": 4.283393753175979, "tokens_seen": 228786176 }, { "epoch": 0.07, "learning_rate": 0.00047003009027081244, "loss": 3.4782, "theoretical_loss": 4.283243434501112, "tokens_seen": 228851712 }, { "epoch": 0.07, "learning_rate": 0.0004700200601805416, "loss": 3.4345, "theoretical_loss": 4.283093170915778, "tokens_seen": 228917248 }, { "epoch": 0.07, "learning_rate": 0.0004700100300902708, "loss": 3.0718, "theoretical_loss": 4.282942962384023, "tokens_seen": 228982784 }, { "epoch": 0.07, "learning_rate": 0.00047, "loss": 3.3068, "theoretical_loss": 4.282792808869932, "tokens_seen": 229048320 }, { "epoch": 0.07, "learning_rate": 0.0004699899699097292, "loss": 3.2156, "theoretical_loss": 4.282642710337618, "tokens_seen": 229113856 }, { "epoch": 0.07, "learning_rate": 0.00046997993981945835, "loss": 3.3879, "theoretical_loss": 4.28249266675123, "tokens_seen": 229179392 }, { "epoch": 0.07, "learning_rate": 0.0004699699097291876, "loss": 3.0536, "theoretical_loss": 4.282342678074951, "tokens_seen": 229244928 }, { "epoch": 0.07, "learning_rate": 0.00046995987963891676, "loss": 3.376, "theoretical_loss": 4.2821927442729955, "tokens_seen": 229310464 }, { "epoch": 0.07, "objective/train/docs_used": 394727, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.484297037124634, "objective/train/theoretical_loss": 4.282042865309616, "objective/train/tokens_used": 249836000, "theoretical_loss": 4.282042865309616, "tokens_seen": 229376000 }, { "epoch": 0.07, "learning_rate": 0.00046994984954864595, "loss": 3.3629, "theoretical_loss": 4.282042865309616, "tokens_seen": 229376000 }, { "epoch": 0.07, "learning_rate": 0.0004699398194583752, "loss": 3.4377, "theoretical_loss": 4.281893041149093, "tokens_seen": 229441536 }, { "epoch": 0.07, "learning_rate": 0.0004699297893681043, "loss": 3.4159, "theoretical_loss": 4.2817432717557455, "tokens_seen": 229507072 }, { "epoch": 0.07, "learning_rate": 0.00046991975927783354, "loss": 3.1621, "theoretical_loss": 4.28159355709392, "tokens_seen": 229572608 }, { "epoch": 0.07, "learning_rate": 0.00046990972918756267, "loss": 3.4109, "theoretical_loss": 4.281443897128004, "tokens_seen": 229638144 }, { "epoch": 0.07, "learning_rate": 0.0004698996990972919, "loss": 3.1996, "theoretical_loss": 4.2812942918224115, "tokens_seen": 229703680 }, { "epoch": 0.07, "learning_rate": 0.0004698896690070211, "loss": 3.4308, "theoretical_loss": 4.281144741141593, "tokens_seen": 229769216 }, { "epoch": 0.07, "learning_rate": 0.00046987963891675027, "loss": 3.3347, "theoretical_loss": 4.280995245050032, "tokens_seen": 229834752 }, { "epoch": 0.07, "learning_rate": 0.00046986960882647945, "loss": 3.4308, "theoretical_loss": 4.2808458035122445, "tokens_seen": 229900288 }, { "epoch": 0.07, "learning_rate": 0.0004698595787362087, "loss": 3.385, "theoretical_loss": 4.2806964164927805, "tokens_seen": 229965824 }, { "epoch": 0.07, "learning_rate": 0.0004698495486459378, "loss": 3.454, "theoretical_loss": 4.280547083956224, "tokens_seen": 230031360 }, { "epoch": 0.07, "learning_rate": 0.00046983951855566705, "loss": 3.4179, "theoretical_loss": 4.280397805867188, "tokens_seen": 230096896 }, { "epoch": 0.07, "learning_rate": 0.0004698294884653962, "loss": 3.2358, "theoretical_loss": 4.280248582190324, "tokens_seen": 230162432 }, { "epoch": 0.07, "learning_rate": 0.0004698194583751254, "loss": 3.2373, "theoretical_loss": 4.280099412890312, "tokens_seen": 230227968 }, { "epoch": 0.07, "learning_rate": 0.0004698094282848546, "loss": 3.4291, "theoretical_loss": 4.279950297931869, "tokens_seen": 230293504 }, { "epoch": 0.07, "learning_rate": 0.00046979939819458377, "loss": 3.1142, "theoretical_loss": 4.27980123727974, "tokens_seen": 230359040 }, { "epoch": 0.07, "learning_rate": 0.00046978936810431295, "loss": 3.3477, "theoretical_loss": 4.279652230898709, "tokens_seen": 230424576 }, { "epoch": 0.07, "learning_rate": 0.00046977933801404213, "loss": 3.3803, "theoretical_loss": 4.279503278753586, "tokens_seen": 230490112 }, { "epoch": 0.07, "learning_rate": 0.0004697693079237713, "loss": 3.3006, "theoretical_loss": 4.27935438080922, "tokens_seen": 230555648 }, { "epoch": 0.07, "learning_rate": 0.00046975927783350055, "loss": 3.2407, "theoretical_loss": 4.27920553703049, "tokens_seen": 230621184 }, { "epoch": 0.07, "learning_rate": 0.0004697492477432297, "loss": 3.3677, "theoretical_loss": 4.279056747382306, "tokens_seen": 230686720 }, { "epoch": 0.07, "learning_rate": 0.0004697392176529589, "loss": 3.536, "theoretical_loss": 4.278908011829613, "tokens_seen": 230752256 }, { "epoch": 0.07, "learning_rate": 0.00046972918756268804, "loss": 3.3258, "theoretical_loss": 4.27875933033739, "tokens_seen": 230817792 }, { "epoch": 0.07, "learning_rate": 0.0004697191574724173, "loss": 3.4573, "theoretical_loss": 4.278610702870646, "tokens_seen": 230883328 }, { "epoch": 0.07, "learning_rate": 0.00046970912738214646, "loss": 3.2022, "theoretical_loss": 4.278462129394423, "tokens_seen": 230948864 }, { "epoch": 0.07, "objective/train/docs_used": 397483, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3717691898345947, "objective/train/theoretical_loss": 4.278313609873795, "objective/train/tokens_used": 251474400, "theoretical_loss": 4.278313609873795, "tokens_seen": 231014400 }, { "epoch": 0.07, "learning_rate": 0.00046969909729187564, "loss": 3.401, "theoretical_loss": 4.278313609873795, "tokens_seen": 231014400 }, { "epoch": 0.07, "learning_rate": 0.0004696890672016048, "loss": 3.4853, "theoretical_loss": 4.278165144273871, "tokens_seen": 231079936 }, { "epoch": 0.07, "learning_rate": 0.00046967903711133405, "loss": 3.2646, "theoretical_loss": 4.27801673255979, "tokens_seen": 231145472 }, { "epoch": 0.07, "learning_rate": 0.0004696690070210632, "loss": 3.3867, "theoretical_loss": 4.277868374696725, "tokens_seen": 231211008 }, { "epoch": 0.07, "learning_rate": 0.0004696589769307924, "loss": 3.3767, "theoretical_loss": 4.277720070649879, "tokens_seen": 231276544 }, { "epoch": 0.07, "learning_rate": 0.00046964894684052154, "loss": 3.4429, "theoretical_loss": 4.277571820384491, "tokens_seen": 231342080 }, { "epoch": 0.07, "learning_rate": 0.0004696389167502508, "loss": 3.3961, "theoretical_loss": 4.277423623865829, "tokens_seen": 231407616 }, { "epoch": 0.07, "learning_rate": 0.00046962888665997996, "loss": 3.3803, "theoretical_loss": 4.277275481059195, "tokens_seen": 231473152 }, { "epoch": 0.07, "learning_rate": 0.00046961885656970914, "loss": 3.3082, "theoretical_loss": 4.2771273919299215, "tokens_seen": 231538688 }, { "epoch": 0.07, "learning_rate": 0.0004696088264794383, "loss": 3.2331, "theoretical_loss": 4.276979356443377, "tokens_seen": 231604224 }, { "epoch": 0.07, "learning_rate": 0.0004695987963891675, "loss": 3.2983, "theoretical_loss": 4.276831374564957, "tokens_seen": 231669760 }, { "epoch": 0.07, "learning_rate": 0.0004695887662988967, "loss": 3.3783, "theoretical_loss": 4.276683446260093, "tokens_seen": 231735296 }, { "epoch": 0.07, "learning_rate": 0.0004695787362086259, "loss": 3.2446, "theoretical_loss": 4.276535571494247, "tokens_seen": 231800832 }, { "epoch": 0.07, "learning_rate": 0.00046956870611835505, "loss": 3.2102, "theoretical_loss": 4.276387750232913, "tokens_seen": 231866368 }, { "epoch": 0.07, "learning_rate": 0.0004695586760280843, "loss": 3.3905, "theoretical_loss": 4.276239982441617, "tokens_seen": 231931904 }, { "epoch": 0.07, "learning_rate": 0.00046954864593781346, "loss": 3.2951, "theoretical_loss": 4.276092268085918, "tokens_seen": 231997440 }, { "epoch": 0.07, "learning_rate": 0.00046953861584754264, "loss": 3.2884, "theoretical_loss": 4.275944607131406, "tokens_seen": 232062976 }, { "epoch": 0.07, "learning_rate": 0.0004695285857572718, "loss": 3.3345, "theoretical_loss": 4.275796999543703, "tokens_seen": 232128512 }, { "epoch": 0.07, "learning_rate": 0.000469518555667001, "loss": 3.6235, "theoretical_loss": 4.275649445288461, "tokens_seen": 232194048 }, { "epoch": 0.07, "learning_rate": 0.0004695085255767302, "loss": 3.3525, "theoretical_loss": 4.275501944331367, "tokens_seen": 232259584 }, { "epoch": 0.07, "learning_rate": 0.0004694984954864594, "loss": 3.2411, "theoretical_loss": 4.275354496638139, "tokens_seen": 232325120 }, { "epoch": 0.07, "learning_rate": 0.00046948846539618855, "loss": 3.2349, "theoretical_loss": 4.275207102174525, "tokens_seen": 232390656 }, { "epoch": 0.07, "learning_rate": 0.0004694784353059178, "loss": 3.3859, "theoretical_loss": 4.275059760906305, "tokens_seen": 232456192 }, { "epoch": 0.07, "learning_rate": 0.0004694684052156469, "loss": 3.3359, "theoretical_loss": 4.2749124727992935, "tokens_seen": 232521728 }, { "epoch": 0.07, "learning_rate": 0.00046945837512537615, "loss": 3.3935, "theoretical_loss": 4.274765237819333, "tokens_seen": 232587264 }, { "epoch": 0.07, "objective/train/docs_used": 399784, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9909167289733887, "objective/train/theoretical_loss": 4.274618055932298, "objective/train/tokens_used": 253112800, "theoretical_loss": 4.274618055932298, "tokens_seen": 232652800 }, { "epoch": 0.07, "learning_rate": 0.0004694483450351053, "loss": 3.3553, "theoretical_loss": 4.274618055932298, "tokens_seen": 232652800 }, { "epoch": 0.07, "learning_rate": 0.0004694383149448345, "loss": 3.34, "theoretical_loss": 4.2744709271040975, "tokens_seen": 232718336 }, { "epoch": 0.07, "learning_rate": 0.0004694282848545637, "loss": 3.3583, "theoretical_loss": 4.27432385130067, "tokens_seen": 232783872 }, { "epoch": 0.07, "learning_rate": 0.00046941825476429287, "loss": 3.3789, "theoretical_loss": 4.274176828487984, "tokens_seen": 232849408 }, { "epoch": 0.07, "learning_rate": 0.00046940822467402205, "loss": 3.252, "theoretical_loss": 4.2740298586320415, "tokens_seen": 232914944 }, { "epoch": 0.07, "learning_rate": 0.0004693981945837513, "loss": 3.2434, "theoretical_loss": 4.273882941698876, "tokens_seen": 232980480 }, { "epoch": 0.07, "learning_rate": 0.0004693881644934804, "loss": 3.4055, "theoretical_loss": 4.27373607765455, "tokens_seen": 233046016 }, { "epoch": 0.07, "learning_rate": 0.00046937813440320965, "loss": 3.3583, "theoretical_loss": 4.2735892664651605, "tokens_seen": 233111552 }, { "epoch": 0.07, "learning_rate": 0.00046936810431293883, "loss": 3.3661, "theoretical_loss": 4.273442508096833, "tokens_seen": 233177088 }, { "epoch": 0.07, "learning_rate": 0.000469358074222668, "loss": 3.4617, "theoretical_loss": 4.273295802515726, "tokens_seen": 233242624 }, { "epoch": 0.07, "learning_rate": 0.0004693480441323972, "loss": 3.3981, "theoretical_loss": 4.273149149688028, "tokens_seen": 233308160 }, { "epoch": 0.07, "learning_rate": 0.0004693380140421264, "loss": 3.2862, "theoretical_loss": 4.27300254957996, "tokens_seen": 233373696 }, { "epoch": 0.07, "learning_rate": 0.00046932798395185555, "loss": 3.2325, "theoretical_loss": 4.272856002157772, "tokens_seen": 233439232 }, { "epoch": 0.07, "learning_rate": 0.0004693179538615848, "loss": 3.3749, "theoretical_loss": 4.272709507387748, "tokens_seen": 233504768 }, { "epoch": 0.07, "learning_rate": 0.0004693079237713139, "loss": 3.3843, "theoretical_loss": 4.2725630652362, "tokens_seen": 233570304 }, { "epoch": 0.07, "learning_rate": 0.00046929789368104315, "loss": 3.5378, "theoretical_loss": 4.272416675669473, "tokens_seen": 233635840 }, { "epoch": 0.07, "learning_rate": 0.0004692878635907723, "loss": 3.3754, "theoretical_loss": 4.272270338653942, "tokens_seen": 233701376 }, { "epoch": 0.07, "learning_rate": 0.0004692778335005015, "loss": 3.3761, "theoretical_loss": 4.272124054156014, "tokens_seen": 233766912 }, { "epoch": 0.07, "learning_rate": 0.0004692678034102307, "loss": 3.1768, "theoretical_loss": 4.271977822142125, "tokens_seen": 233832448 }, { "epoch": 0.07, "learning_rate": 0.0004692577733199599, "loss": 3.316, "theoretical_loss": 4.271831642578745, "tokens_seen": 233897984 }, { "epoch": 0.07, "learning_rate": 0.00046924774322968906, "loss": 3.2177, "theoretical_loss": 4.27168551543237, "tokens_seen": 233963520 }, { "epoch": 0.07, "learning_rate": 0.00046923771313941824, "loss": 3.5093, "theoretical_loss": 4.271539440669532, "tokens_seen": 234029056 }, { "epoch": 0.07, "learning_rate": 0.0004692276830491474, "loss": 3.2319, "theoretical_loss": 4.27139341825679, "tokens_seen": 234094592 }, { "epoch": 0.07, "learning_rate": 0.00046921765295887666, "loss": 3.2718, "theoretical_loss": 4.271247448160736, "tokens_seen": 234160128 }, { "epoch": 0.07, "learning_rate": 0.00046920762286860584, "loss": 3.4089, "theoretical_loss": 4.27110153034799, "tokens_seen": 234225664 }, { "epoch": 0.07, "objective/train/docs_used": 402735, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3240511417388916, "objective/train/theoretical_loss": 4.270955664785207, "objective/train/tokens_used": 254751200, "theoretical_loss": 4.270955664785207, "tokens_seen": 234291200 }, { "epoch": 0.07, "learning_rate": 0.000469197592778335, "loss": 3.1956, "theoretical_loss": 4.270955664785207, "tokens_seen": 234291200 }, { "epoch": 0.07, "learning_rate": 0.00046918756268806425, "loss": 3.2661, "theoretical_loss": 4.2708098514390676, "tokens_seen": 234356736 }, { "epoch": 0.07, "learning_rate": 0.0004691775325977934, "loss": 3.3937, "theoretical_loss": 4.270664090276286, "tokens_seen": 234422272 }, { "epoch": 0.07, "learning_rate": 0.0004691675025075226, "loss": 3.3708, "theoretical_loss": 4.2705183812636065, "tokens_seen": 234487808 }, { "epoch": 0.07, "learning_rate": 0.00046915747241725174, "loss": 3.4421, "theoretical_loss": 4.270372724367803, "tokens_seen": 234553344 }, { "epoch": 0.07, "learning_rate": 0.000469147442326981, "loss": 3.1839, "theoretical_loss": 4.270227119555681, "tokens_seen": 234618880 }, { "epoch": 0.07, "learning_rate": 0.00046913741223671016, "loss": 3.3155, "theoretical_loss": 4.270081566794076, "tokens_seen": 234684416 }, { "epoch": 0.07, "learning_rate": 0.00046912738214643934, "loss": 3.3729, "theoretical_loss": 4.269936066049852, "tokens_seen": 234749952 }, { "epoch": 0.07, "learning_rate": 0.0004691173520561685, "loss": 3.5027, "theoretical_loss": 4.269790617289907, "tokens_seen": 234815488 }, { "epoch": 0.07, "learning_rate": 0.0004691073219658977, "loss": 3.3449, "theoretical_loss": 4.269645220481166, "tokens_seen": 234881024 }, { "epoch": 0.07, "learning_rate": 0.0004690972918756269, "loss": 3.3971, "theoretical_loss": 4.269499875590587, "tokens_seen": 234946560 }, { "epoch": 0.07, "learning_rate": 0.0004690872617853561, "loss": 3.3364, "theoretical_loss": 4.269354582585156, "tokens_seen": 235012096 }, { "epoch": 0.07, "learning_rate": 0.00046907723169508525, "loss": 3.1883, "theoretical_loss": 4.269209341431889, "tokens_seen": 235077632 }, { "epoch": 0.07, "learning_rate": 0.0004690672016048145, "loss": 3.2292, "theoretical_loss": 4.269064152097835, "tokens_seen": 235143168 }, { "epoch": 0.07, "learning_rate": 0.00046905717151454366, "loss": 3.2626, "theoretical_loss": 4.26891901455007, "tokens_seen": 235208704 }, { "epoch": 0.07, "learning_rate": 0.00046904714142427284, "loss": 3.3244, "theoretical_loss": 4.268773928755701, "tokens_seen": 235274240 }, { "epoch": 0.07, "learning_rate": 0.000469037111334002, "loss": 3.2768, "theoretical_loss": 4.268628894681868, "tokens_seen": 235339776 }, { "epoch": 0.07, "learning_rate": 0.0004690270812437312, "loss": 3.2611, "theoretical_loss": 4.268483912295735, "tokens_seen": 235405312 }, { "epoch": 0.07, "learning_rate": 0.0004690170511534604, "loss": 3.23, "theoretical_loss": 4.268338981564502, "tokens_seen": 235470848 }, { "epoch": 0.07, "learning_rate": 0.0004690070210631896, "loss": 3.3949, "theoretical_loss": 4.268194102455395, "tokens_seen": 235536384 }, { "epoch": 0.07, "learning_rate": 0.00046899699097291875, "loss": 3.3048, "theoretical_loss": 4.26804927493567, "tokens_seen": 235601920 }, { "epoch": 0.07, "learning_rate": 0.000468986960882648, "loss": 3.3749, "theoretical_loss": 4.267904498972618, "tokens_seen": 235667456 }, { "epoch": 0.07, "learning_rate": 0.0004689769307923771, "loss": 3.3169, "theoretical_loss": 4.267759774533552, "tokens_seen": 235732992 }, { "epoch": 0.07, "learning_rate": 0.00046896690070210635, "loss": 3.3402, "theoretical_loss": 4.267615101585821, "tokens_seen": 235798528 }, { "epoch": 0.07, "learning_rate": 0.00046895687061183553, "loss": 3.4162, "theoretical_loss": 4.267470480096801, "tokens_seen": 235864064 }, { "epoch": 0.07, "objective/train/docs_used": 405251, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.333627462387085, "objective/train/theoretical_loss": 4.267325910033897, "objective/train/tokens_used": 256389600, "theoretical_loss": 4.267325910033897, "tokens_seen": 235929600 }, { "epoch": 0.07, "learning_rate": 0.0004689468405215647, "loss": 3.3826, "theoretical_loss": 4.267325910033897, "tokens_seen": 235929600 }, { "epoch": 0.07, "learning_rate": 0.0004689368104312939, "loss": 3.2289, "theoretical_loss": 4.267181391364547, "tokens_seen": 235995136 }, { "epoch": 0.07, "learning_rate": 0.00046892678034102307, "loss": 3.413, "theoretical_loss": 4.267036924056215, "tokens_seen": 236060672 }, { "epoch": 0.07, "learning_rate": 0.00046891675025075225, "loss": 3.4066, "theoretical_loss": 4.266892508076397, "tokens_seen": 236126208 }, { "epoch": 0.07, "learning_rate": 0.0004689067201604815, "loss": 3.2177, "theoretical_loss": 4.266748143392617, "tokens_seen": 236191744 }, { "epoch": 0.07, "learning_rate": 0.0004688966900702106, "loss": 3.3657, "theoretical_loss": 4.26660382997243, "tokens_seen": 236257280 }, { "epoch": 0.07, "learning_rate": 0.00046888665997993985, "loss": 3.2978, "theoretical_loss": 4.26645956778342, "tokens_seen": 236322816 }, { "epoch": 0.07, "learning_rate": 0.00046887662988966903, "loss": 3.2945, "theoretical_loss": 4.2663153567932, "tokens_seen": 236388352 }, { "epoch": 0.07, "learning_rate": 0.0004688665997993982, "loss": 3.3411, "theoretical_loss": 4.266171196969412, "tokens_seen": 236453888 }, { "epoch": 0.07, "learning_rate": 0.0004688565697091274, "loss": 3.1644, "theoretical_loss": 4.2660270882797295, "tokens_seen": 236519424 }, { "epoch": 0.07, "learning_rate": 0.0004688465396188566, "loss": 3.4834, "theoretical_loss": 4.265883030691853, "tokens_seen": 236584960 }, { "epoch": 0.07, "learning_rate": 0.00046883650952858575, "loss": 3.2361, "theoretical_loss": 4.265739024173515, "tokens_seen": 236650496 }, { "epoch": 0.07, "learning_rate": 0.000468826479438315, "loss": 3.337, "theoretical_loss": 4.265595068692473, "tokens_seen": 236716032 }, { "epoch": 0.07, "learning_rate": 0.0004688164493480441, "loss": 3.3639, "theoretical_loss": 4.26545116421652, "tokens_seen": 236781568 }, { "epoch": 0.07, "learning_rate": 0.00046880641925777335, "loss": 3.197, "theoretical_loss": 4.265307310713471, "tokens_seen": 236847104 }, { "epoch": 0.07, "learning_rate": 0.0004687963891675025, "loss": 3.484, "theoretical_loss": 4.2651635081511765, "tokens_seen": 236912640 }, { "epoch": 0.07, "learning_rate": 0.0004687863590772317, "loss": 3.4098, "theoretical_loss": 4.265019756497512, "tokens_seen": 236978176 }, { "epoch": 0.07, "learning_rate": 0.0004687763289869609, "loss": 3.3701, "theoretical_loss": 4.264876055720386, "tokens_seen": 237043712 }, { "epoch": 0.07, "learning_rate": 0.0004687662988966901, "loss": 3.2919, "theoretical_loss": 4.264732405787731, "tokens_seen": 237109248 }, { "epoch": 0.07, "learning_rate": 0.00046875626880641926, "loss": 3.2318, "theoretical_loss": 4.264588806667513, "tokens_seen": 237174784 }, { "epoch": 0.07, "learning_rate": 0.00046874623871614844, "loss": 3.2824, "theoretical_loss": 4.264445258327724, "tokens_seen": 237240320 }, { "epoch": 0.07, "learning_rate": 0.0004687362086258776, "loss": 3.1902, "theoretical_loss": 4.264301760736389, "tokens_seen": 237305856 }, { "epoch": 0.07, "learning_rate": 0.00046872617853560686, "loss": 3.3301, "theoretical_loss": 4.264158313861557, "tokens_seen": 237371392 }, { "epoch": 0.07, "learning_rate": 0.000468716148445336, "loss": 3.2807, "theoretical_loss": 4.264014917671309, "tokens_seen": 237436928 }, { "epoch": 0.07, "learning_rate": 0.0004687061183550652, "loss": 3.2744, "theoretical_loss": 4.2638715721337554, "tokens_seen": 237502464 }, { "epoch": 0.07, "objective/train/docs_used": 406700, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.374922037124634, "objective/train/theoretical_loss": 4.263728277217032, "objective/train/tokens_used": 258028000, "theoretical_loss": 4.263728277217032, "tokens_seen": 237568000 }, { "epoch": 0.07, "learning_rate": 0.0004686960882647944, "loss": 3.4473, "theoretical_loss": 4.263728277217032, "tokens_seen": 237568000 }, { "epoch": 0.07, "learning_rate": 0.0004686860581745236, "loss": 3.3449, "theoretical_loss": 4.263585032889306, "tokens_seen": 237633536 }, { "epoch": 0.07, "learning_rate": 0.00046867602808425276, "loss": 3.2657, "theoretical_loss": 4.263441839118776, "tokens_seen": 237699072 }, { "epoch": 0.07, "learning_rate": 0.00046866599799398194, "loss": 3.3153, "theoretical_loss": 4.2632986958736625, "tokens_seen": 237764608 }, { "epoch": 0.07, "learning_rate": 0.0004686559679037111, "loss": 3.3014, "theoretical_loss": 4.263155603122221, "tokens_seen": 237830144 }, { "epoch": 0.07, "learning_rate": 0.00046864593781344036, "loss": 3.3123, "theoretical_loss": 4.263012560832733, "tokens_seen": 237895680 }, { "epoch": 0.07, "learning_rate": 0.0004686359077231695, "loss": 3.2736, "theoretical_loss": 4.262869568973508, "tokens_seen": 237961216 }, { "epoch": 0.07, "learning_rate": 0.0004686258776328987, "loss": 3.1498, "theoretical_loss": 4.262726627512886, "tokens_seen": 238026752 }, { "epoch": 0.07, "learning_rate": 0.00046861584754262785, "loss": 3.4393, "theoretical_loss": 4.262583736419234, "tokens_seen": 238092288 }, { "epoch": 0.07, "learning_rate": 0.0004686058174523571, "loss": 3.3895, "theoretical_loss": 4.26244089566095, "tokens_seen": 238157824 }, { "epoch": 0.07, "learning_rate": 0.00046859578736208626, "loss": 3.2705, "theoretical_loss": 4.262298105206456, "tokens_seen": 238223360 }, { "epoch": 0.07, "learning_rate": 0.00046858575727181545, "loss": 3.4159, "theoretical_loss": 4.262155365024207, "tokens_seen": 238288896 }, { "epoch": 0.07, "learning_rate": 0.0004685757271815446, "loss": 3.2544, "theoretical_loss": 4.262012675082685, "tokens_seen": 238354432 }, { "epoch": 0.07, "learning_rate": 0.0004685656970912738, "loss": 3.1794, "theoretical_loss": 4.261870035350399, "tokens_seen": 238419968 }, { "epoch": 0.07, "learning_rate": 0.000468555667001003, "loss": 3.4069, "theoretical_loss": 4.261727445795888, "tokens_seen": 238485504 }, { "epoch": 0.07, "learning_rate": 0.0004685456369107322, "loss": 3.3407, "theoretical_loss": 4.26158490638772, "tokens_seen": 238551040 }, { "epoch": 0.07, "learning_rate": 0.00046853560682046135, "loss": 3.2095, "theoretical_loss": 4.261442417094488, "tokens_seen": 238616576 }, { "epoch": 0.07, "learning_rate": 0.0004685255767301906, "loss": 3.2679, "theoretical_loss": 4.261299977884816, "tokens_seen": 238682112 }, { "epoch": 0.07, "learning_rate": 0.00046851554663991977, "loss": 3.4694, "theoretical_loss": 4.2611575887273565, "tokens_seen": 238747648 }, { "epoch": 0.07, "learning_rate": 0.00046850551654964895, "loss": 3.2774, "theoretical_loss": 4.261015249590789, "tokens_seen": 238813184 }, { "epoch": 0.07, "learning_rate": 0.00046849548645937813, "loss": 3.3187, "theoretical_loss": 4.260872960443822, "tokens_seen": 238878720 }, { "epoch": 0.07, "learning_rate": 0.0004684854563691073, "loss": 3.0518, "theoretical_loss": 4.260730721255191, "tokens_seen": 238944256 }, { "epoch": 0.07, "learning_rate": 0.0004684754262788365, "loss": 3.3046, "theoretical_loss": 4.260588531993662, "tokens_seen": 239009792 }, { "epoch": 0.07, "learning_rate": 0.00046846539618856573, "loss": 3.2447, "theoretical_loss": 4.260446392628026, "tokens_seen": 239075328 }, { "epoch": 0.07, "learning_rate": 0.0004684553660982949, "loss": 3.4083, "theoretical_loss": 4.2603043031271035, "tokens_seen": 239140864 }, { "epoch": 0.07, "objective/train/docs_used": 409530, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.699578285217285, "objective/train/theoretical_loss": 4.260162263459744, "objective/train/tokens_used": 259666400, "theoretical_loss": 4.260162263459744, "tokens_seen": 239206400 }, { "epoch": 0.07, "learning_rate": 0.0004684453360080241, "loss": 3.5747, "theoretical_loss": 4.260162263459744, "tokens_seen": 239206400 }, { "epoch": 0.07, "learning_rate": 0.00046843530591775327, "loss": 3.4179, "theoretical_loss": 4.260020273594824, "tokens_seen": 239271936 }, { "epoch": 0.07, "learning_rate": 0.00046842527582748245, "loss": 3.2266, "theoretical_loss": 4.259878333501247, "tokens_seen": 239337472 }, { "epoch": 0.07, "learning_rate": 0.0004684152457372117, "loss": 3.2316, "theoretical_loss": 4.259736443147946, "tokens_seen": 239403008 }, { "epoch": 0.07, "learning_rate": 0.0004684052156469408, "loss": 3.3424, "theoretical_loss": 4.259594602503881, "tokens_seen": 239468544 }, { "epoch": 0.07, "learning_rate": 0.00046839518555667005, "loss": 3.4762, "theoretical_loss": 4.259452811538041, "tokens_seen": 239534080 }, { "epoch": 0.07, "learning_rate": 0.00046838515546639923, "loss": 3.3692, "theoretical_loss": 4.259311070219441, "tokens_seen": 239599616 }, { "epoch": 0.07, "learning_rate": 0.0004683751253761284, "loss": 3.2211, "theoretical_loss": 4.259169378517125, "tokens_seen": 239665152 }, { "epoch": 0.07, "learning_rate": 0.0004683650952858576, "loss": 3.403, "theoretical_loss": 4.259027736400165, "tokens_seen": 239730688 }, { "epoch": 0.07, "learning_rate": 0.0004683550651955868, "loss": 3.2607, "theoretical_loss": 4.258886143837661, "tokens_seen": 239796224 }, { "epoch": 0.07, "learning_rate": 0.00046834503510531595, "loss": 3.1357, "theoretical_loss": 4.258744600798739, "tokens_seen": 239861760 }, { "epoch": 0.07, "learning_rate": 0.0004683350050150452, "loss": 3.3204, "theoretical_loss": 4.2586031072525525, "tokens_seen": 239927296 }, { "epoch": 0.07, "learning_rate": 0.0004683249749247743, "loss": 3.2966, "theoretical_loss": 4.258461663168285, "tokens_seen": 239992832 }, { "epoch": 0.07, "learning_rate": 0.00046831494483450355, "loss": 3.1753, "theoretical_loss": 4.258320268515147, "tokens_seen": 240058368 }, { "epoch": 0.07, "learning_rate": 0.0004683049147442327, "loss": 3.3035, "theoretical_loss": 4.258178923262376, "tokens_seen": 240123904 }, { "epoch": 0.07, "learning_rate": 0.0004682948846539619, "loss": 3.2777, "theoretical_loss": 4.258037627379235, "tokens_seen": 240189440 }, { "epoch": 0.07, "learning_rate": 0.0004682848545636911, "loss": 3.2328, "theoretical_loss": 4.257896380835018, "tokens_seen": 240254976 }, { "epoch": 0.07, "learning_rate": 0.0004682748244734203, "loss": 3.2883, "theoretical_loss": 4.257755183599045, "tokens_seen": 240320512 }, { "epoch": 0.07, "learning_rate": 0.00046826479438314946, "loss": 3.2153, "theoretical_loss": 4.257614035640662, "tokens_seen": 240386048 }, { "epoch": 0.07, "learning_rate": 0.00046825476429287864, "loss": 3.3098, "theoretical_loss": 4.257472936929246, "tokens_seen": 240451584 }, { "epoch": 0.07, "learning_rate": 0.0004682447342026078, "loss": 3.357, "theoretical_loss": 4.257331887434198, "tokens_seen": 240517120 }, { "epoch": 0.07, "learning_rate": 0.00046823470411233706, "loss": 3.313, "theoretical_loss": 4.257190887124946, "tokens_seen": 240582656 }, { "epoch": 0.07, "learning_rate": 0.0004682246740220662, "loss": 3.3266, "theoretical_loss": 4.25704993597095, "tokens_seen": 240648192 }, { "epoch": 0.07, "learning_rate": 0.0004682146439317954, "loss": 3.3453, "theoretical_loss": 4.256909033941691, "tokens_seen": 240713728 }, { "epoch": 0.07, "learning_rate": 0.0004682046138415246, "loss": 3.0638, "theoretical_loss": 4.256768181006683, "tokens_seen": 240779264 }, { "epoch": 0.07, "objective/train/docs_used": 412261, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.756887435913086, "objective/train/theoretical_loss": 4.2566273771354615, "objective/train/tokens_used": 261304800, "theoretical_loss": 4.2566273771354615, "tokens_seen": 240844800 }, { "epoch": 0.07, "learning_rate": 0.0004681945837512538, "loss": 3.5587, "theoretical_loss": 4.2566273771354615, "tokens_seen": 240844800 }, { "epoch": 0.07, "learning_rate": 0.00046818455366098296, "loss": 3.3804, "theoretical_loss": 4.256486622297595, "tokens_seen": 240910336 }, { "epoch": 0.07, "learning_rate": 0.00046817452357071214, "loss": 3.2502, "theoretical_loss": 4.256345916462674, "tokens_seen": 240975872 }, { "epoch": 0.07, "learning_rate": 0.0004681644934804413, "loss": 3.3262, "theoretical_loss": 4.256205259600321, "tokens_seen": 241041408 }, { "epoch": 0.07, "learning_rate": 0.00046815446339017056, "loss": 3.2398, "theoretical_loss": 4.256064651680182, "tokens_seen": 241106944 }, { "epoch": 0.07, "learning_rate": 0.0004681444332998997, "loss": 3.401, "theoretical_loss": 4.255924092671931, "tokens_seen": 241172480 }, { "epoch": 0.07, "learning_rate": 0.0004681344032096289, "loss": 3.3859, "theoretical_loss": 4.255783582545269, "tokens_seen": 241238016 }, { "epoch": 0.07, "learning_rate": 0.00046812437311935805, "loss": 3.328, "theoretical_loss": 4.255643121269924, "tokens_seen": 241303552 }, { "epoch": 0.07, "learning_rate": 0.0004681143430290873, "loss": 3.3418, "theoretical_loss": 4.255502708815651, "tokens_seen": 241369088 }, { "epoch": 0.07, "learning_rate": 0.00046810431293881646, "loss": 3.3234, "theoretical_loss": 4.255362345152234, "tokens_seen": 241434624 }, { "epoch": 0.07, "learning_rate": 0.00046809428284854565, "loss": 3.2997, "theoretical_loss": 4.255222030249479, "tokens_seen": 241500160 }, { "epoch": 0.07, "learning_rate": 0.0004680842527582748, "loss": 3.1147, "theoretical_loss": 4.255081764077224, "tokens_seen": 241565696 }, { "epoch": 0.07, "learning_rate": 0.000468074222668004, "loss": 3.4998, "theoretical_loss": 4.25494154660533, "tokens_seen": 241631232 }, { "epoch": 0.07, "learning_rate": 0.0004680641925777332, "loss": 3.1595, "theoretical_loss": 4.254801377803689, "tokens_seen": 241696768 }, { "epoch": 0.07, "learning_rate": 0.0004680541624874624, "loss": 3.1677, "theoretical_loss": 4.254661257642215, "tokens_seen": 241762304 }, { "epoch": 0.07, "learning_rate": 0.00046804413239719155, "loss": 3.2664, "theoretical_loss": 4.254521186090852, "tokens_seen": 241827840 }, { "epoch": 0.07, "learning_rate": 0.0004680341023069208, "loss": 3.2116, "theoretical_loss": 4.254381163119568, "tokens_seen": 241893376 }, { "epoch": 0.07, "learning_rate": 0.00046802407221664997, "loss": 3.1101, "theoretical_loss": 4.254241188698361, "tokens_seen": 241958912 }, { "epoch": 0.07, "learning_rate": 0.00046801404212637915, "loss": 3.2875, "theoretical_loss": 4.2541012627972545, "tokens_seen": 242024448 }, { "epoch": 0.07, "learning_rate": 0.00046800401203610833, "loss": 3.5286, "theoretical_loss": 4.2539613853862965, "tokens_seen": 242089984 }, { "epoch": 0.07, "learning_rate": 0.0004679939819458375, "loss": 3.351, "theoretical_loss": 4.253821556435565, "tokens_seen": 242155520 }, { "epoch": 0.07, "learning_rate": 0.0004679839518555667, "loss": 3.3037, "theoretical_loss": 4.253681775915161, "tokens_seen": 242221056 }, { "epoch": 0.07, "learning_rate": 0.00046797392176529593, "loss": 3.15, "theoretical_loss": 4.253542043795215, "tokens_seen": 242286592 }, { "epoch": 0.07, "learning_rate": 0.00046796389167502505, "loss": 3.3313, "theoretical_loss": 4.253402360045882, "tokens_seen": 242352128 }, { "epoch": 0.07, "learning_rate": 0.0004679538615847543, "loss": 3.1149, "theoretical_loss": 4.253262724637346, "tokens_seen": 242417664 }, { "epoch": 0.07, "objective/train/docs_used": 414980, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.980497121810913, "objective/train/theoretical_loss": 4.253123137539814, "objective/train/tokens_used": 262943200, "theoretical_loss": 4.253123137539814, "tokens_seen": 242483200 }, { "epoch": 0.07, "learning_rate": 0.0004679438314944834, "loss": 3.2693, "theoretical_loss": 4.253123137539814, "tokens_seen": 242483200 }, { "epoch": 0.07, "learning_rate": 0.00046793380140421265, "loss": 3.1405, "theoretical_loss": 4.252983598723521, "tokens_seen": 242548736 }, { "epoch": 0.07, "learning_rate": 0.00046792377131394183, "loss": 3.4462, "theoretical_loss": 4.25284410815873, "tokens_seen": 242614272 }, { "epoch": 0.07, "learning_rate": 0.000467913741223671, "loss": 3.0984, "theoretical_loss": 4.2527046658157275, "tokens_seen": 242679808 }, { "epoch": 0.07, "learning_rate": 0.0004679037111334002, "loss": 3.3499, "theoretical_loss": 4.252565271664828, "tokens_seen": 242745344 }, { "epoch": 0.07, "learning_rate": 0.00046789368104312943, "loss": 3.2947, "theoretical_loss": 4.252425925676373, "tokens_seen": 242810880 }, { "epoch": 0.07, "learning_rate": 0.00046788365095285856, "loss": 3.1145, "theoretical_loss": 4.252286627820727, "tokens_seen": 242876416 }, { "epoch": 0.07, "learning_rate": 0.0004678736208625878, "loss": 3.2274, "theoretical_loss": 4.252147378068285, "tokens_seen": 242941952 }, { "epoch": 0.07, "learning_rate": 0.0004678635907723169, "loss": 3.3943, "theoretical_loss": 4.252008176389465, "tokens_seen": 243007488 }, { "epoch": 0.07, "learning_rate": 0.00046785356068204616, "loss": 3.0519, "theoretical_loss": 4.251869022754712, "tokens_seen": 243073024 }, { "epoch": 0.07, "learning_rate": 0.00046784353059177534, "loss": 3.0748, "theoretical_loss": 4.251729917134498, "tokens_seen": 243138560 }, { "epoch": 0.07, "learning_rate": 0.0004678335005015045, "loss": 3.0996, "theoretical_loss": 4.251590859499322, "tokens_seen": 243204096 }, { "epoch": 0.07, "learning_rate": 0.0004678234704112337, "loss": 3.2989, "theoretical_loss": 4.251451849819704, "tokens_seen": 243269632 }, { "epoch": 0.07, "learning_rate": 0.0004678134403209629, "loss": 3.4171, "theoretical_loss": 4.251312888066197, "tokens_seen": 243335168 }, { "epoch": 0.07, "learning_rate": 0.00046780341023069206, "loss": 3.4031, "theoretical_loss": 4.251173974209375, "tokens_seen": 243400704 }, { "epoch": 0.07, "learning_rate": 0.0004677933801404213, "loss": 3.2698, "theoretical_loss": 4.251035108219839, "tokens_seen": 243466240 }, { "epoch": 0.07, "learning_rate": 0.0004677833500501504, "loss": 3.2478, "theoretical_loss": 4.250896290068218, "tokens_seen": 243531776 }, { "epoch": 0.07, "learning_rate": 0.00046777331995987966, "loss": 3.4525, "theoretical_loss": 4.250757519725165, "tokens_seen": 243597312 }, { "epoch": 0.07, "learning_rate": 0.0004677632898696088, "loss": 3.3451, "theoretical_loss": 4.25061879716136, "tokens_seen": 243662848 }, { "epoch": 0.07, "learning_rate": 0.000467753259779338, "loss": 3.4068, "theoretical_loss": 4.250480122347507, "tokens_seen": 243728384 }, { "epoch": 0.07, "learning_rate": 0.0004677432296890672, "loss": 3.3817, "theoretical_loss": 4.250341495254337, "tokens_seen": 243793920 }, { "epoch": 0.07, "learning_rate": 0.0004677331995987964, "loss": 3.3821, "theoretical_loss": 4.250202915852608, "tokens_seen": 243859456 }, { "epoch": 0.07, "learning_rate": 0.00046772316950852556, "loss": 3.3786, "theoretical_loss": 4.250064384113102, "tokens_seen": 243924992 }, { "epoch": 0.07, "learning_rate": 0.0004677131394182548, "loss": 3.4357, "theoretical_loss": 4.249925900006627, "tokens_seen": 243990528 }, { "epoch": 0.07, "learning_rate": 0.000467703109327984, "loss": 3.2809, "theoretical_loss": 4.249787463504019, "tokens_seen": 244056064 }, { "epoch": 0.07, "objective/train/docs_used": 418010, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.690992832183838, "objective/train/theoretical_loss": 4.249649074576134, "objective/train/tokens_used": 264581600, "theoretical_loss": 4.249649074576134, "tokens_seen": 244121600 }, { "epoch": 0.07, "learning_rate": 0.00046769307923771316, "loss": 3.5073, "theoretical_loss": 4.249649074576134, "tokens_seen": 244121600 }, { "epoch": 0.07, "learning_rate": 0.00046768304914744234, "loss": 3.3301, "theoretical_loss": 4.249510733193862, "tokens_seen": 244187136 }, { "epoch": 0.07, "learning_rate": 0.0004676730190571715, "loss": 3.2154, "theoretical_loss": 4.249372439328111, "tokens_seen": 244252672 }, { "epoch": 0.07, "learning_rate": 0.00046766298896690076, "loss": 3.1482, "theoretical_loss": 4.249234192949818, "tokens_seen": 244318208 }, { "epoch": 0.07, "learning_rate": 0.0004676529588766299, "loss": 3.3322, "theoretical_loss": 4.249095994029947, "tokens_seen": 244383744 }, { "epoch": 0.07, "learning_rate": 0.0004676429287863591, "loss": 3.3403, "theoretical_loss": 4.248957842539484, "tokens_seen": 244449280 }, { "epoch": 0.07, "learning_rate": 0.00046763289869608825, "loss": 3.2225, "theoretical_loss": 4.248819738449442, "tokens_seen": 244514816 }, { "epoch": 0.07, "learning_rate": 0.0004676228686058175, "loss": 3.261, "theoretical_loss": 4.2486816817308615, "tokens_seen": 244580352 }, { "epoch": 0.07, "learning_rate": 0.00046761283851554666, "loss": 3.3, "theoretical_loss": 4.248543672354805, "tokens_seen": 244645888 }, { "epoch": 0.07, "learning_rate": 0.00046760280842527585, "loss": 3.1902, "theoretical_loss": 4.248405710292364, "tokens_seen": 244711424 }, { "epoch": 0.07, "learning_rate": 0.000467592778335005, "loss": 3.1469, "theoretical_loss": 4.248267795514652, "tokens_seen": 244776960 }, { "epoch": 0.07, "learning_rate": 0.0004675827482447342, "loss": 3.3347, "theoretical_loss": 4.248129927992808, "tokens_seen": 244842496 }, { "epoch": 0.07, "learning_rate": 0.0004675727181544634, "loss": 3.3585, "theoretical_loss": 4.247992107698002, "tokens_seen": 244908032 }, { "epoch": 0.07, "learning_rate": 0.0004675626880641926, "loss": 3.3737, "theoretical_loss": 4.247854334601421, "tokens_seen": 244973568 }, { "epoch": 0.07, "learning_rate": 0.00046755265797392175, "loss": 3.0628, "theoretical_loss": 4.247716608674283, "tokens_seen": 245039104 }, { "epoch": 0.07, "learning_rate": 0.000467542627883651, "loss": 3.2971, "theoretical_loss": 4.247578929887829, "tokens_seen": 245104640 }, { "epoch": 0.07, "learning_rate": 0.00046753259779338017, "loss": 3.1979, "theoretical_loss": 4.247441298213326, "tokens_seen": 245170176 }, { "epoch": 0.07, "learning_rate": 0.00046752256770310935, "loss": 3.399, "theoretical_loss": 4.247303713622067, "tokens_seen": 245235712 }, { "epoch": 0.07, "learning_rate": 0.00046751253761283853, "loss": 3.4574, "theoretical_loss": 4.247166176085367, "tokens_seen": 245301248 }, { "epoch": 0.07, "learning_rate": 0.0004675025075225677, "loss": 3.1845, "theoretical_loss": 4.247028685574569, "tokens_seen": 245366784 }, { "epoch": 0.07, "learning_rate": 0.0004674924774322969, "loss": 3.2674, "theoretical_loss": 4.246891242061041, "tokens_seen": 245432320 }, { "epoch": 0.07, "learning_rate": 0.00046748244734202613, "loss": 3.4197, "theoretical_loss": 4.246753845516174, "tokens_seen": 245497856 }, { "epoch": 0.07, "learning_rate": 0.00046747241725175525, "loss": 3.1715, "theoretical_loss": 4.246616495911388, "tokens_seen": 245563392 }, { "epoch": 0.07, "learning_rate": 0.0004674623871614845, "loss": 3.1967, "theoretical_loss": 4.246479193218123, "tokens_seen": 245628928 }, { "epoch": 0.07, "learning_rate": 0.0004674523570712136, "loss": 3.3519, "theoretical_loss": 4.246341937407848, "tokens_seen": 245694464 }, { "epoch": 0.07, "objective/train/docs_used": 420862, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.223174810409546, "objective/train/theoretical_loss": 4.246204728452055, "objective/train/tokens_used": 266220000, "theoretical_loss": 4.246204728452055, "tokens_seen": 245760000 }, { "epoch": 0.07, "learning_rate": 0.00046744232698094285, "loss": 3.26, "theoretical_loss": 4.246204728452055, "tokens_seen": 245760000 }, { "epoch": 0.07, "learning_rate": 0.00046743229689067203, "loss": 3.1801, "theoretical_loss": 4.246067566322259, "tokens_seen": 245825536 }, { "epoch": 0.07, "learning_rate": 0.0004674222668004012, "loss": 3.3376, "theoretical_loss": 4.245930450990007, "tokens_seen": 245891072 }, { "epoch": 0.07, "learning_rate": 0.0004674122367101304, "loss": 3.3283, "theoretical_loss": 4.245793382426861, "tokens_seen": 245956608 }, { "epoch": 0.07, "learning_rate": 0.00046740220661985963, "loss": 3.2915, "theoretical_loss": 4.245656360604417, "tokens_seen": 246022144 }, { "epoch": 0.07, "learning_rate": 0.00046739217652958876, "loss": 3.3617, "theoretical_loss": 4.24551938549429, "tokens_seen": 246087680 }, { "epoch": 0.07, "learning_rate": 0.000467382146439318, "loss": 3.2805, "theoretical_loss": 4.2453824570681205, "tokens_seen": 246153216 }, { "epoch": 0.07, "learning_rate": 0.0004673721163490471, "loss": 3.2456, "theoretical_loss": 4.245245575297577, "tokens_seen": 246218752 }, { "epoch": 0.07, "learning_rate": 0.00046736208625877636, "loss": 3.2543, "theoretical_loss": 4.2451087401543495, "tokens_seen": 246284288 }, { "epoch": 0.07, "learning_rate": 0.00046735205616850554, "loss": 3.1477, "theoretical_loss": 4.244971951610154, "tokens_seen": 246349824 }, { "epoch": 0.07, "learning_rate": 0.0004673420260782347, "loss": 3.3396, "theoretical_loss": 4.24483520963673, "tokens_seen": 246415360 }, { "epoch": 0.07, "learning_rate": 0.0004673319959879639, "loss": 3.4204, "theoretical_loss": 4.244698514205844, "tokens_seen": 246480896 }, { "epoch": 0.07, "learning_rate": 0.0004673219658976931, "loss": 3.3405, "theoretical_loss": 4.244561865289285, "tokens_seen": 246546432 }, { "epoch": 0.07, "learning_rate": 0.00046731193580742226, "loss": 3.2926, "theoretical_loss": 4.244425262858867, "tokens_seen": 246611968 }, { "epoch": 0.07, "learning_rate": 0.0004673019057171515, "loss": 3.3235, "theoretical_loss": 4.2442887068864295, "tokens_seen": 246677504 }, { "epoch": 0.07, "learning_rate": 0.0004672918756268806, "loss": 3.0161, "theoretical_loss": 4.244152197343835, "tokens_seen": 246743040 }, { "epoch": 0.07, "learning_rate": 0.00046728184553660986, "loss": 3.2933, "theoretical_loss": 4.244015734202973, "tokens_seen": 246808576 }, { "epoch": 0.07, "learning_rate": 0.000467271815446339, "loss": 3.2406, "theoretical_loss": 4.243879317435755, "tokens_seen": 246874112 }, { "epoch": 0.07, "learning_rate": 0.0004672617853560682, "loss": 3.2291, "theoretical_loss": 4.243742947014117, "tokens_seen": 246939648 }, { "epoch": 0.07, "learning_rate": 0.0004672517552657974, "loss": 3.2153, "theoretical_loss": 4.243606622910021, "tokens_seen": 247005184 }, { "epoch": 0.07, "learning_rate": 0.0004672417251755266, "loss": 3.4062, "theoretical_loss": 4.243470345095453, "tokens_seen": 247070720 }, { "epoch": 0.07, "learning_rate": 0.00046723169508525576, "loss": 3.3487, "theoretical_loss": 4.2433341135424225, "tokens_seen": 247136256 }, { "epoch": 0.07, "learning_rate": 0.000467221664994985, "loss": 3.0654, "theoretical_loss": 4.243197928222964, "tokens_seen": 247201792 }, { "epoch": 0.07, "learning_rate": 0.0004672116349047141, "loss": 3.2543, "theoretical_loss": 4.243061789109136, "tokens_seen": 247267328 }, { "epoch": 0.07, "learning_rate": 0.00046720160481444336, "loss": 3.3521, "theoretical_loss": 4.242925696173021, "tokens_seen": 247332864 }, { "epoch": 0.07, "objective/train/docs_used": 422249, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.514582872390747, "objective/train/theoretical_loss": 4.2427896493867285, "objective/train/tokens_used": 267858400, "theoretical_loss": 4.2427896493867285, "tokens_seen": 247398400 }, { "epoch": 0.07, "learning_rate": 0.0004671915747241725, "loss": 3.3393, "theoretical_loss": 4.2427896493867285, "tokens_seen": 247398400 }, { "epoch": 0.07, "learning_rate": 0.0004671815446339017, "loss": 3.2436, "theoretical_loss": 4.242653648722387, "tokens_seen": 247463936 }, { "epoch": 0.08, "learning_rate": 0.0004671715145436309, "loss": 3.2632, "theoretical_loss": 4.242517694152154, "tokens_seen": 247529472 }, { "epoch": 0.08, "learning_rate": 0.0004671614844533601, "loss": 3.4374, "theoretical_loss": 4.24238178564821, "tokens_seen": 247595008 }, { "epoch": 0.08, "learning_rate": 0.00046715145436308927, "loss": 3.2201, "theoretical_loss": 4.242245923182756, "tokens_seen": 247660544 }, { "epoch": 0.08, "learning_rate": 0.00046714142427281845, "loss": 3.3575, "theoretical_loss": 4.242110106728022, "tokens_seen": 247726080 }, { "epoch": 0.08, "learning_rate": 0.00046713139418254763, "loss": 3.3011, "theoretical_loss": 4.241974336256261, "tokens_seen": 247791616 }, { "epoch": 0.08, "learning_rate": 0.00046712136409227686, "loss": 3.3262, "theoretical_loss": 4.241838611739748, "tokens_seen": 247857152 }, { "epoch": 0.08, "learning_rate": 0.000467111334002006, "loss": 3.3647, "theoretical_loss": 4.241702933150783, "tokens_seen": 247922688 }, { "epoch": 0.08, "learning_rate": 0.0004671013039117352, "loss": 3.3155, "theoretical_loss": 4.241567300461693, "tokens_seen": 247988224 }, { "epoch": 0.08, "learning_rate": 0.00046709127382146435, "loss": 3.269, "theoretical_loss": 4.241431713644823, "tokens_seen": 248053760 }, { "epoch": 0.08, "learning_rate": 0.0004670812437311936, "loss": 3.2366, "theoretical_loss": 4.241296172672547, "tokens_seen": 248119296 }, { "epoch": 0.08, "learning_rate": 0.00046707121364092277, "loss": 3.3717, "theoretical_loss": 4.24116067751726, "tokens_seen": 248184832 }, { "epoch": 0.08, "learning_rate": 0.00046706118355065195, "loss": 3.2012, "theoretical_loss": 4.241025228151383, "tokens_seen": 248250368 }, { "epoch": 0.08, "learning_rate": 0.00046705115346038113, "loss": 3.2961, "theoretical_loss": 4.24088982454736, "tokens_seen": 248315904 }, { "epoch": 0.08, "learning_rate": 0.00046704112337011037, "loss": 3.2397, "theoretical_loss": 4.240754466677659, "tokens_seen": 248381440 }, { "epoch": 0.08, "learning_rate": 0.0004670310932798395, "loss": 3.1473, "theoretical_loss": 4.240619154514771, "tokens_seen": 248446976 }, { "epoch": 0.08, "learning_rate": 0.00046702106318956873, "loss": 3.4704, "theoretical_loss": 4.240483888031212, "tokens_seen": 248512512 }, { "epoch": 0.08, "learning_rate": 0.00046701103309929786, "loss": 3.3179, "theoretical_loss": 4.240348667199521, "tokens_seen": 248578048 }, { "epoch": 0.08, "learning_rate": 0.0004670010030090271, "loss": 3.3658, "theoretical_loss": 4.240213491992261, "tokens_seen": 248643584 }, { "epoch": 0.08, "learning_rate": 0.0004669909729187563, "loss": 3.3335, "theoretical_loss": 4.240078362382019, "tokens_seen": 248709120 }, { "epoch": 0.08, "learning_rate": 0.00046698094282848545, "loss": 3.2722, "theoretical_loss": 4.239943278341404, "tokens_seen": 248774656 }, { "epoch": 0.08, "learning_rate": 0.00046697091273821464, "loss": 3.295, "theoretical_loss": 4.239808239843052, "tokens_seen": 248840192 }, { "epoch": 0.08, "learning_rate": 0.0004669608826479438, "loss": 3.3868, "theoretical_loss": 4.239673246859619, "tokens_seen": 248905728 }, { "epoch": 0.08, "learning_rate": 0.00046695085255767305, "loss": 3.4361, "theoretical_loss": 4.239538299363788, "tokens_seen": 248971264 }, { "epoch": 0.08, "objective/train/docs_used": 425137, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.140821695327759, "objective/train/theoretical_loss": 4.239403397328261, "objective/train/tokens_used": 269496800, "theoretical_loss": 4.239403397328261, "tokens_seen": 249036800 }, { "epoch": 0.08, "learning_rate": 0.00046694082246740223, "loss": 3.1712, "theoretical_loss": 4.239403397328261, "tokens_seen": 249036800 }, { "epoch": 0.08, "learning_rate": 0.0004669307923771314, "loss": 3.328, "theoretical_loss": 4.239268540725769, "tokens_seen": 249102336 }, { "epoch": 0.08, "learning_rate": 0.0004669207622868606, "loss": 3.2599, "theoretical_loss": 4.239133729529064, "tokens_seen": 249167872 }, { "epoch": 0.08, "learning_rate": 0.00046691073219658983, "loss": 3.3869, "theoretical_loss": 4.2389989637109196, "tokens_seen": 249233408 }, { "epoch": 0.08, "learning_rate": 0.00046690070210631896, "loss": 3.2992, "theoretical_loss": 4.2388642432441355, "tokens_seen": 249298944 }, { "epoch": 0.08, "learning_rate": 0.0004668906720160482, "loss": 3.3305, "theoretical_loss": 4.238729568101535, "tokens_seen": 249364480 }, { "epoch": 0.08, "learning_rate": 0.0004668806419257773, "loss": 3.1292, "theoretical_loss": 4.238594938255963, "tokens_seen": 249430016 }, { "epoch": 0.08, "learning_rate": 0.00046687061183550656, "loss": 3.3939, "theoretical_loss": 4.2384603536802885, "tokens_seen": 249495552 }, { "epoch": 0.08, "learning_rate": 0.00046686058174523574, "loss": 3.1715, "theoretical_loss": 4.238325814347404, "tokens_seen": 249561088 }, { "epoch": 0.08, "learning_rate": 0.0004668505516549649, "loss": 3.4711, "theoretical_loss": 4.238191320230227, "tokens_seen": 249626624 }, { "epoch": 0.08, "learning_rate": 0.0004668405215646941, "loss": 3.2718, "theoretical_loss": 4.238056871301695, "tokens_seen": 249692160 }, { "epoch": 0.08, "learning_rate": 0.0004668304914744233, "loss": 3.2332, "theoretical_loss": 4.237922467534771, "tokens_seen": 249757696 }, { "epoch": 0.08, "learning_rate": 0.00046682046138415246, "loss": 3.2562, "theoretical_loss": 4.237788108902441, "tokens_seen": 249823232 }, { "epoch": 0.08, "learning_rate": 0.0004668104312938817, "loss": 3.2025, "theoretical_loss": 4.237653795377714, "tokens_seen": 249888768 }, { "epoch": 0.08, "learning_rate": 0.0004668004012036108, "loss": 3.1757, "theoretical_loss": 4.237519526933622, "tokens_seen": 249954304 }, { "epoch": 0.08, "learning_rate": 0.00046679037111334006, "loss": 3.3724, "theoretical_loss": 4.2373853035432205, "tokens_seen": 250019840 }, { "epoch": 0.08, "learning_rate": 0.0004667803410230692, "loss": 3.4363, "theoretical_loss": 4.237251125179588, "tokens_seen": 250085376 }, { "epoch": 0.08, "learning_rate": 0.0004667703109327984, "loss": 3.3131, "theoretical_loss": 4.237116991815826, "tokens_seen": 250150912 }, { "epoch": 0.08, "learning_rate": 0.0004667602808425276, "loss": 3.3922, "theoretical_loss": 4.23698290342506, "tokens_seen": 250216448 }, { "epoch": 0.08, "learning_rate": 0.0004667502507522568, "loss": 3.394, "theoretical_loss": 4.236848859980437, "tokens_seen": 250281984 }, { "epoch": 0.08, "learning_rate": 0.00046674022066198596, "loss": 3.2623, "theoretical_loss": 4.23671486145513, "tokens_seen": 250347520 }, { "epoch": 0.08, "learning_rate": 0.0004667301905717152, "loss": 3.2071, "theoretical_loss": 4.236580907822331, "tokens_seen": 250413056 }, { "epoch": 0.08, "learning_rate": 0.0004667201604814443, "loss": 3.4061, "theoretical_loss": 4.236446999055257, "tokens_seen": 250478592 }, { "epoch": 0.08, "learning_rate": 0.00046671013039117356, "loss": 3.1977, "theoretical_loss": 4.2363131351271495, "tokens_seen": 250544128 }, { "epoch": 0.08, "learning_rate": 0.0004667001003009027, "loss": 3.2694, "theoretical_loss": 4.2361793160112695, "tokens_seen": 250609664 }, { "epoch": 0.08, "objective/train/docs_used": 428605, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4890992641448975, "objective/train/theoretical_loss": 4.236045541680905, "objective/train/tokens_used": 271135200, "theoretical_loss": 4.236045541680905, "tokens_seen": 250675200 }, { "epoch": 0.08, "learning_rate": 0.0004666900702106319, "loss": 3.2779, "theoretical_loss": 4.236045541680905, "tokens_seen": 250675200 }, { "epoch": 0.08, "learning_rate": 0.0004666800401203611, "loss": 3.4717, "theoretical_loss": 4.235911812109363, "tokens_seen": 250740736 }, { "epoch": 0.08, "learning_rate": 0.0004666700100300903, "loss": 3.1494, "theoretical_loss": 4.235778127269976, "tokens_seen": 250806272 }, { "epoch": 0.08, "learning_rate": 0.00046665997993981947, "loss": 3.3809, "theoretical_loss": 4.235644487136098, "tokens_seen": 250871808 }, { "epoch": 0.08, "learning_rate": 0.00046664994984954865, "loss": 3.1874, "theoretical_loss": 4.235510891681108, "tokens_seen": 250937344 }, { "epoch": 0.08, "learning_rate": 0.00046663991975927783, "loss": 3.0915, "theoretical_loss": 4.235377340878404, "tokens_seen": 251002880 }, { "epoch": 0.08, "learning_rate": 0.00046662988966900706, "loss": 3.3439, "theoretical_loss": 4.23524383470141, "tokens_seen": 251068416 }, { "epoch": 0.08, "learning_rate": 0.0004666198595787362, "loss": 3.2702, "theoretical_loss": 4.235110373123572, "tokens_seen": 251133952 }, { "epoch": 0.08, "learning_rate": 0.00046660982948846543, "loss": 3.272, "theoretical_loss": 4.2349769561183574, "tokens_seen": 251199488 }, { "epoch": 0.08, "learning_rate": 0.00046659979939819455, "loss": 3.4042, "theoretical_loss": 4.2348435836592575, "tokens_seen": 251265024 }, { "epoch": 0.08, "learning_rate": 0.0004665897693079238, "loss": 3.2659, "theoretical_loss": 4.234710255719786, "tokens_seen": 251330560 }, { "epoch": 0.08, "learning_rate": 0.00046657973921765297, "loss": 3.1843, "theoretical_loss": 4.234576972273481, "tokens_seen": 251396096 }, { "epoch": 0.08, "learning_rate": 0.00046656970912738215, "loss": 3.3077, "theoretical_loss": 4.234443733293899, "tokens_seen": 251461632 }, { "epoch": 0.08, "learning_rate": 0.00046655967903711133, "loss": 3.5381, "theoretical_loss": 4.234310538754624, "tokens_seen": 251527168 }, { "epoch": 0.08, "learning_rate": 0.00046654964894684057, "loss": 3.3664, "theoretical_loss": 4.2341773886292575, "tokens_seen": 251592704 }, { "epoch": 0.08, "learning_rate": 0.0004665396188565697, "loss": 3.242, "theoretical_loss": 4.234044282891429, "tokens_seen": 251658240 }, { "epoch": 0.08, "learning_rate": 0.00046652958876629893, "loss": 3.2597, "theoretical_loss": 4.233911221514787, "tokens_seen": 251723776 }, { "epoch": 0.08, "learning_rate": 0.00046651955867602806, "loss": 3.2642, "theoretical_loss": 4.233778204473002, "tokens_seen": 251789312 }, { "epoch": 0.08, "learning_rate": 0.0004665095285857573, "loss": 3.237, "theoretical_loss": 4.23364523173977, "tokens_seen": 251854848 }, { "epoch": 0.08, "learning_rate": 0.0004664994984954865, "loss": 3.4444, "theoretical_loss": 4.233512303288807, "tokens_seen": 251920384 }, { "epoch": 0.08, "learning_rate": 0.00046648946840521565, "loss": 3.1292, "theoretical_loss": 4.233379419093851, "tokens_seen": 251985920 }, { "epoch": 0.08, "learning_rate": 0.00046647943831494484, "loss": 3.323, "theoretical_loss": 4.233246579128666, "tokens_seen": 252051456 }, { "epoch": 0.08, "learning_rate": 0.000466469408224674, "loss": 3.356, "theoretical_loss": 4.233113783367033, "tokens_seen": 252116992 }, { "epoch": 0.08, "learning_rate": 0.0004664593781344032, "loss": 3.2508, "theoretical_loss": 4.232981031782761, "tokens_seen": 252182528 }, { "epoch": 0.08, "learning_rate": 0.00046644934804413243, "loss": 3.3543, "theoretical_loss": 4.232848324349677, "tokens_seen": 252248064 }, { "epoch": 0.08, "objective/train/docs_used": 430057, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.252034902572632, "objective/train/theoretical_loss": 4.232715661041632, "objective/train/tokens_used": 272773600, "theoretical_loss": 4.232715661041632, "tokens_seen": 252313600 }, { "epoch": 0.08, "learning_rate": 0.00046643931795386156, "loss": 3.2405, "theoretical_loss": 4.232715661041632, "tokens_seen": 252313600 }, { "epoch": 0.08, "learning_rate": 0.0004664292878635908, "loss": 3.2212, "theoretical_loss": 4.232583041832499, "tokens_seen": 252379136 }, { "epoch": 0.08, "learning_rate": 0.0004664192577733199, "loss": 3.3505, "theoretical_loss": 4.232450466696174, "tokens_seen": 252444672 }, { "epoch": 0.08, "learning_rate": 0.00046640922768304916, "loss": 3.0926, "theoretical_loss": 4.2323179356065745, "tokens_seen": 252510208 }, { "epoch": 0.08, "learning_rate": 0.00046639919759277834, "loss": 3.4595, "theoretical_loss": 4.23218544853764, "tokens_seen": 252575744 }, { "epoch": 0.08, "learning_rate": 0.0004663891675025075, "loss": 3.3758, "theoretical_loss": 4.232053005463333, "tokens_seen": 252641280 }, { "epoch": 0.08, "learning_rate": 0.0004663791374122367, "loss": 3.3906, "theoretical_loss": 4.231920606357638, "tokens_seen": 252706816 }, { "epoch": 0.08, "learning_rate": 0.00046636910732196594, "loss": 3.2969, "theoretical_loss": 4.231788251194559, "tokens_seen": 252772352 }, { "epoch": 0.08, "learning_rate": 0.00046635907723169506, "loss": 3.3668, "theoretical_loss": 4.231655939948127, "tokens_seen": 252837888 }, { "epoch": 0.08, "learning_rate": 0.0004663490471414243, "loss": 3.4166, "theoretical_loss": 4.231523672592392, "tokens_seen": 252903424 }, { "epoch": 0.08, "learning_rate": 0.0004663390170511534, "loss": 3.1115, "theoretical_loss": 4.231391449101425, "tokens_seen": 252968960 }, { "epoch": 0.08, "learning_rate": 0.00046632898696088266, "loss": 3.3952, "theoretical_loss": 4.231259269449322, "tokens_seen": 253034496 }, { "epoch": 0.08, "learning_rate": 0.00046631895687061184, "loss": 3.2267, "theoretical_loss": 4.231127133610198, "tokens_seen": 253100032 }, { "epoch": 0.08, "learning_rate": 0.000466308926780341, "loss": 3.3859, "theoretical_loss": 4.230995041558194, "tokens_seen": 253165568 }, { "epoch": 0.08, "learning_rate": 0.0004662988966900702, "loss": 3.3175, "theoretical_loss": 4.230862993267468, "tokens_seen": 253231104 }, { "epoch": 0.08, "learning_rate": 0.0004662888665997994, "loss": 3.1681, "theoretical_loss": 4.230730988712205, "tokens_seen": 253296640 }, { "epoch": 0.08, "learning_rate": 0.00046627883650952857, "loss": 3.3424, "theoretical_loss": 4.230599027866606, "tokens_seen": 253362176 }, { "epoch": 0.08, "learning_rate": 0.0004662688064192578, "loss": 3.3971, "theoretical_loss": 4.2304671107048994, "tokens_seen": 253427712 }, { "epoch": 0.08, "learning_rate": 0.00046625877632898693, "loss": 3.2334, "theoretical_loss": 4.2303352372013325, "tokens_seen": 253493248 }, { "epoch": 0.08, "learning_rate": 0.00046624874623871616, "loss": 3.2335, "theoretical_loss": 4.230203407330176, "tokens_seen": 253558784 }, { "epoch": 0.08, "learning_rate": 0.00046623871614844535, "loss": 3.0188, "theoretical_loss": 4.230071621065721, "tokens_seen": 253624320 }, { "epoch": 0.08, "learning_rate": 0.0004662286860581745, "loss": 3.2931, "theoretical_loss": 4.2299398783822815, "tokens_seen": 253689856 }, { "epoch": 0.08, "learning_rate": 0.0004662186559679037, "loss": 3.2702, "theoretical_loss": 4.229808179254192, "tokens_seen": 253755392 }, { "epoch": 0.08, "learning_rate": 0.0004662086258776329, "loss": 3.3758, "theoretical_loss": 4.22967652365581, "tokens_seen": 253820928 }, { "epoch": 0.08, "learning_rate": 0.0004661985957873621, "loss": 3.5492, "theoretical_loss": 4.229544911561513, "tokens_seen": 253886464 }, { "epoch": 0.08, "objective/train/docs_used": 433011, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.282345771789551, "objective/train/theoretical_loss": 4.229413342945703, "objective/train/tokens_used": 274412000, "theoretical_loss": 4.229413342945703, "tokens_seen": 253952000 }, { "epoch": 0.08, "learning_rate": 0.0004661885656970913, "loss": 3.3212, "theoretical_loss": 4.229413342945703, "tokens_seen": 253952000 }, { "epoch": 0.08, "learning_rate": 0.0004661785356068205, "loss": 3.2396, "theoretical_loss": 4.229281817782801, "tokens_seen": 254017536 }, { "epoch": 0.08, "learning_rate": 0.00046616850551654967, "loss": 3.2437, "theoretical_loss": 4.229150336047251, "tokens_seen": 254083072 }, { "epoch": 0.08, "learning_rate": 0.00046615847542627885, "loss": 3.2013, "theoretical_loss": 4.229018897713519, "tokens_seen": 254148608 }, { "epoch": 0.08, "learning_rate": 0.00046614844533600803, "loss": 3.4602, "theoretical_loss": 4.22888750275609, "tokens_seen": 254214144 }, { "epoch": 0.08, "learning_rate": 0.00046613841524573727, "loss": 3.3896, "theoretical_loss": 4.228756151149475, "tokens_seen": 254279680 }, { "epoch": 0.08, "learning_rate": 0.0004661283851554664, "loss": 3.2146, "theoretical_loss": 4.228624842868202, "tokens_seen": 254345216 }, { "epoch": 0.08, "learning_rate": 0.00046611835506519563, "loss": 3.2017, "theoretical_loss": 4.228493577886824, "tokens_seen": 254410752 }, { "epoch": 0.08, "learning_rate": 0.00046610832497492475, "loss": 3.3278, "theoretical_loss": 4.228362356179913, "tokens_seen": 254476288 }, { "epoch": 0.08, "learning_rate": 0.000466098294884654, "loss": 3.3475, "theoretical_loss": 4.228231177722063, "tokens_seen": 254541824 }, { "epoch": 0.08, "learning_rate": 0.00046608826479438317, "loss": 3.311, "theoretical_loss": 4.228100042487892, "tokens_seen": 254607360 }, { "epoch": 0.08, "learning_rate": 0.00046607823470411235, "loss": 3.1945, "theoretical_loss": 4.227968950452035, "tokens_seen": 254672896 }, { "epoch": 0.08, "learning_rate": 0.00046606820461384153, "loss": 3.2699, "theoretical_loss": 4.227837901589153, "tokens_seen": 254738432 }, { "epoch": 0.08, "learning_rate": 0.00046605817452357077, "loss": 3.2744, "theoretical_loss": 4.227706895873924, "tokens_seen": 254803968 }, { "epoch": 0.08, "learning_rate": 0.0004660481444332999, "loss": 3.1938, "theoretical_loss": 4.227575933281051, "tokens_seen": 254869504 }, { "epoch": 0.08, "learning_rate": 0.00046603811434302913, "loss": 3.2606, "theoretical_loss": 4.227445013785257, "tokens_seen": 254935040 }, { "epoch": 0.08, "learning_rate": 0.00046602808425275826, "loss": 3.1886, "theoretical_loss": 4.227314137361285, "tokens_seen": 255000576 }, { "epoch": 0.08, "learning_rate": 0.0004660180541624875, "loss": 3.2736, "theoretical_loss": 4.227183303983901, "tokens_seen": 255066112 }, { "epoch": 0.08, "learning_rate": 0.0004660080240722167, "loss": 3.2978, "theoretical_loss": 4.227052513627893, "tokens_seen": 255131648 }, { "epoch": 0.08, "learning_rate": 0.00046599799398194586, "loss": 3.3963, "theoretical_loss": 4.226921766268067, "tokens_seen": 255197184 }, { "epoch": 0.08, "learning_rate": 0.00046598796389167504, "loss": 3.2094, "theoretical_loss": 4.226791061879253, "tokens_seen": 255262720 }, { "epoch": 0.08, "learning_rate": 0.0004659779338014042, "loss": 3.2633, "theoretical_loss": 4.226660400436302, "tokens_seen": 255328256 }, { "epoch": 0.08, "learning_rate": 0.0004659679037111334, "loss": 3.2943, "theoretical_loss": 4.226529781914084, "tokens_seen": 255393792 }, { "epoch": 0.08, "learning_rate": 0.00046595787362086263, "loss": 3.3987, "theoretical_loss": 4.226399206287493, "tokens_seen": 255459328 }, { "epoch": 0.08, "learning_rate": 0.00046594784353059176, "loss": 3.2479, "theoretical_loss": 4.226268673531442, "tokens_seen": 255524864 }, { "epoch": 0.08, "objective/train/docs_used": 435787, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.125673294067383, "objective/train/theoretical_loss": 4.226138183620867, "objective/train/tokens_used": 276050400, "theoretical_loss": 4.226138183620867, "tokens_seen": 255590400 }, { "epoch": 0.08, "learning_rate": 0.000465937813440321, "loss": 3.1076, "theoretical_loss": 4.226138183620867, "tokens_seen": 255590400 }, { "epoch": 0.08, "learning_rate": 0.0004659277833500501, "loss": 3.3014, "theoretical_loss": 4.226007736530723, "tokens_seen": 255655936 }, { "epoch": 0.08, "learning_rate": 0.00046591775325977936, "loss": 3.2956, "theoretical_loss": 4.225877332235987, "tokens_seen": 255721472 }, { "epoch": 0.08, "learning_rate": 0.00046590772316950854, "loss": 3.1131, "theoretical_loss": 4.225746970711657, "tokens_seen": 255787008 }, { "epoch": 0.08, "learning_rate": 0.0004658976930792377, "loss": 3.4321, "theoretical_loss": 4.225616651932753, "tokens_seen": 255852544 }, { "epoch": 0.08, "learning_rate": 0.0004658876629889669, "loss": 3.3236, "theoretical_loss": 4.225486375874315, "tokens_seen": 255918080 }, { "epoch": 0.08, "learning_rate": 0.00046587763289869614, "loss": 3.2767, "theoretical_loss": 4.225356142511402, "tokens_seen": 255983616 }, { "epoch": 0.08, "learning_rate": 0.00046586760280842526, "loss": 3.3568, "theoretical_loss": 4.225225951819099, "tokens_seen": 256049152 }, { "epoch": 0.08, "learning_rate": 0.0004658575727181545, "loss": 3.3205, "theoretical_loss": 4.225095803772507, "tokens_seen": 256114688 }, { "epoch": 0.08, "learning_rate": 0.0004658475426278836, "loss": 3.0864, "theoretical_loss": 4.22496569834675, "tokens_seen": 256180224 }, { "epoch": 0.08, "learning_rate": 0.00046583751253761286, "loss": 3.1711, "theoretical_loss": 4.224835635516973, "tokens_seen": 256245760 }, { "epoch": 0.08, "learning_rate": 0.00046582748244734204, "loss": 3.1651, "theoretical_loss": 4.224705615258341, "tokens_seen": 256311296 }, { "epoch": 0.08, "learning_rate": 0.0004658174523570712, "loss": 3.2209, "theoretical_loss": 4.224575637546041, "tokens_seen": 256376832 }, { "epoch": 0.08, "learning_rate": 0.0004658074222668004, "loss": 3.2894, "theoretical_loss": 4.224445702355279, "tokens_seen": 256442368 }, { "epoch": 0.08, "learning_rate": 0.0004657973921765296, "loss": 3.2882, "theoretical_loss": 4.2243158096612845, "tokens_seen": 256507904 }, { "epoch": 0.08, "learning_rate": 0.00046578736208625877, "loss": 3.3329, "theoretical_loss": 4.224185959439305, "tokens_seen": 256573440 }, { "epoch": 0.08, "learning_rate": 0.000465777331995988, "loss": 3.3315, "theoretical_loss": 4.22405615166461, "tokens_seen": 256638976 }, { "epoch": 0.08, "learning_rate": 0.00046576730190571713, "loss": 3.2569, "theoretical_loss": 4.22392638631249, "tokens_seen": 256704512 }, { "epoch": 0.08, "learning_rate": 0.00046575727181544636, "loss": 3.2608, "theoretical_loss": 4.223796663358255, "tokens_seen": 256770048 }, { "epoch": 0.08, "learning_rate": 0.00046574724172517555, "loss": 3.406, "theoretical_loss": 4.223666982777237, "tokens_seen": 256835584 }, { "epoch": 0.08, "learning_rate": 0.0004657372116349047, "loss": 3.3216, "theoretical_loss": 4.223537344544788, "tokens_seen": 256901120 }, { "epoch": 0.08, "learning_rate": 0.0004657271815446339, "loss": 3.174, "theoretical_loss": 4.223407748636282, "tokens_seen": 256966656 }, { "epoch": 0.08, "learning_rate": 0.0004657171514543631, "loss": 3.2721, "theoretical_loss": 4.22327819502711, "tokens_seen": 257032192 }, { "epoch": 0.08, "learning_rate": 0.00046570712136409227, "loss": 3.0971, "theoretical_loss": 4.223148683692687, "tokens_seen": 257097728 }, { "epoch": 0.08, "learning_rate": 0.0004656970912738215, "loss": 3.1727, "theoretical_loss": 4.223019214608446, "tokens_seen": 257163264 }, { "epoch": 0.08, "objective/train/docs_used": 438564, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.392951011657715, "objective/train/theoretical_loss": 4.222889787749845, "objective/train/tokens_used": 277688800, "theoretical_loss": 4.222889787749845, "tokens_seen": 257228800 }, { "epoch": 0.08, "learning_rate": 0.00046568706118355063, "loss": 3.3174, "theoretical_loss": 4.222889787749845, "tokens_seen": 257228800 }, { "epoch": 0.08, "learning_rate": 0.00046567703109327987, "loss": 3.2921, "theoretical_loss": 4.222760403092358, "tokens_seen": 257294336 }, { "epoch": 0.08, "learning_rate": 0.000465667001003009, "loss": 3.2803, "theoretical_loss": 4.22263106061148, "tokens_seen": 257359872 }, { "epoch": 0.08, "learning_rate": 0.00046565697091273823, "loss": 3.115, "theoretical_loss": 4.222501760282729, "tokens_seen": 257425408 }, { "epoch": 0.08, "learning_rate": 0.0004656469408224674, "loss": 3.1489, "theoretical_loss": 4.22237250208164, "tokens_seen": 257490944 }, { "epoch": 0.08, "learning_rate": 0.0004656369107321966, "loss": 3.2977, "theoretical_loss": 4.222243285983772, "tokens_seen": 257556480 }, { "epoch": 0.08, "learning_rate": 0.0004656268806419258, "loss": 3.4226, "theoretical_loss": 4.222114111964703, "tokens_seen": 257622016 }, { "epoch": 0.08, "learning_rate": 0.00046561685055165495, "loss": 3.4419, "theoretical_loss": 4.221984980000029, "tokens_seen": 257687552 }, { "epoch": 0.08, "learning_rate": 0.00046560682046138414, "loss": 3.0213, "theoretical_loss": 4.2218558900653695, "tokens_seen": 257753088 }, { "epoch": 0.08, "learning_rate": 0.00046559679037111337, "loss": 3.2168, "theoretical_loss": 4.221726842136364, "tokens_seen": 257818624 }, { "epoch": 0.08, "learning_rate": 0.0004655867602808425, "loss": 3.3047, "theoretical_loss": 4.2215978361886695, "tokens_seen": 257884160 }, { "epoch": 0.08, "learning_rate": 0.00046557673019057173, "loss": 3.2, "theoretical_loss": 4.221468872197967, "tokens_seen": 257949696 }, { "epoch": 0.08, "learning_rate": 0.0004655667001003009, "loss": 3.4823, "theoretical_loss": 4.221339950139956, "tokens_seen": 258015232 }, { "epoch": 0.08, "learning_rate": 0.0004655566700100301, "loss": 3.3525, "theoretical_loss": 4.221211069990357, "tokens_seen": 258080768 }, { "epoch": 0.08, "learning_rate": 0.0004655466399197593, "loss": 3.3542, "theoretical_loss": 4.221082231724908, "tokens_seen": 258146304 }, { "epoch": 0.08, "learning_rate": 0.00046553660982948846, "loss": 3.1905, "theoretical_loss": 4.22095343531937, "tokens_seen": 258211840 }, { "epoch": 0.08, "learning_rate": 0.00046552657973921764, "loss": 3.3374, "theoretical_loss": 4.220824680749525, "tokens_seen": 258277376 }, { "epoch": 0.08, "learning_rate": 0.0004655165496489469, "loss": 3.4091, "theoretical_loss": 4.220695967991171, "tokens_seen": 258342912 }, { "epoch": 0.08, "learning_rate": 0.000465506519558676, "loss": 3.2537, "theoretical_loss": 4.220567297020131, "tokens_seen": 258408448 }, { "epoch": 0.08, "learning_rate": 0.00046549648946840524, "loss": 3.3577, "theoretical_loss": 4.220438667812244, "tokens_seen": 258473984 }, { "epoch": 0.08, "learning_rate": 0.00046548645937813436, "loss": 3.2005, "theoretical_loss": 4.220310080343373, "tokens_seen": 258539520 }, { "epoch": 0.08, "learning_rate": 0.0004654764292878636, "loss": 3.3635, "theoretical_loss": 4.220181534589398, "tokens_seen": 258605056 }, { "epoch": 0.08, "learning_rate": 0.0004654663991975928, "loss": 3.243, "theoretical_loss": 4.22005303052622, "tokens_seen": 258670592 }, { "epoch": 0.08, "learning_rate": 0.00046545636910732196, "loss": 3.2235, "theoretical_loss": 4.219924568129759, "tokens_seen": 258736128 }, { "epoch": 0.08, "learning_rate": 0.0004654463390170512, "loss": 3.2058, "theoretical_loss": 4.219796147375957, "tokens_seen": 258801664 }, { "epoch": 0.08, "objective/train/docs_used": 441230, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.244802236557007, "objective/train/theoretical_loss": 4.219667768240775, "objective/train/tokens_used": 279327200, "theoretical_loss": 4.219667768240775, "tokens_seen": 258867200 }, { "epoch": 0.08, "learning_rate": 0.0004654363089267803, "loss": 3.3126, "theoretical_loss": 4.219667768240775, "tokens_seen": 258867200 }, { "epoch": 0.08, "learning_rate": 0.00046542627883650956, "loss": 3.34, "theoretical_loss": 4.219539430700195, "tokens_seen": 258932736 }, { "epoch": 0.08, "learning_rate": 0.00046541624874623874, "loss": 3.2369, "theoretical_loss": 4.2194111347302155, "tokens_seen": 258998272 }, { "epoch": 0.08, "learning_rate": 0.0004654062186559679, "loss": 3.2105, "theoretical_loss": 4.219282880306859, "tokens_seen": 259063808 }, { "epoch": 0.08, "learning_rate": 0.0004653961885656971, "loss": 3.3438, "theoretical_loss": 4.219154667406166, "tokens_seen": 259129344 }, { "epoch": 0.08, "learning_rate": 0.00046538615847542634, "loss": 3.2501, "theoretical_loss": 4.219026496004198, "tokens_seen": 259194880 }, { "epoch": 0.08, "learning_rate": 0.00046537612838515546, "loss": 3.3831, "theoretical_loss": 4.218898366077035, "tokens_seen": 259260416 }, { "epoch": 0.08, "learning_rate": 0.0004653660982948847, "loss": 3.1244, "theoretical_loss": 4.218770277600775, "tokens_seen": 259325952 }, { "epoch": 0.08, "learning_rate": 0.0004653560682046138, "loss": 3.3302, "theoretical_loss": 4.218642230551541, "tokens_seen": 259391488 }, { "epoch": 0.08, "learning_rate": 0.00046534603811434306, "loss": 3.2709, "theoretical_loss": 4.218514224905472, "tokens_seen": 259457024 }, { "epoch": 0.08, "learning_rate": 0.00046533600802407224, "loss": 3.3894, "theoretical_loss": 4.218386260638727, "tokens_seen": 259522560 }, { "epoch": 0.08, "learning_rate": 0.0004653259779338014, "loss": 3.2494, "theoretical_loss": 4.2182583377274865, "tokens_seen": 259588096 }, { "epoch": 0.08, "learning_rate": 0.0004653159478435306, "loss": 3.3584, "theoretical_loss": 4.218130456147948, "tokens_seen": 259653632 }, { "epoch": 0.08, "learning_rate": 0.0004653059177532598, "loss": 3.3658, "theoretical_loss": 4.218002615876332, "tokens_seen": 259719168 }, { "epoch": 0.08, "learning_rate": 0.00046529588766298897, "loss": 3.0392, "theoretical_loss": 4.217874816888877, "tokens_seen": 259784704 }, { "epoch": 0.08, "learning_rate": 0.0004652858575727182, "loss": 3.3363, "theoretical_loss": 4.217747059161839, "tokens_seen": 259850240 }, { "epoch": 0.08, "learning_rate": 0.00046527582748244733, "loss": 3.2536, "theoretical_loss": 4.217619342671498, "tokens_seen": 259915776 }, { "epoch": 0.08, "learning_rate": 0.00046526579739217656, "loss": 3.244, "theoretical_loss": 4.2174916673941505, "tokens_seen": 259981312 }, { "epoch": 0.08, "learning_rate": 0.00046525576730190575, "loss": 3.3533, "theoretical_loss": 4.217364033306113, "tokens_seen": 260046848 }, { "epoch": 0.08, "learning_rate": 0.0004652457372116349, "loss": 3.2031, "theoretical_loss": 4.217236440383724, "tokens_seen": 260112384 }, { "epoch": 0.08, "learning_rate": 0.0004652357071213641, "loss": 3.4021, "theoretical_loss": 4.217108888603337, "tokens_seen": 260177920 }, { "epoch": 0.08, "learning_rate": 0.0004652256770310933, "loss": 3.4666, "theoretical_loss": 4.21698137794133, "tokens_seen": 260243456 }, { "epoch": 0.08, "learning_rate": 0.00046521564694082247, "loss": 3.3244, "theoretical_loss": 4.216853908374097, "tokens_seen": 260308992 }, { "epoch": 0.08, "learning_rate": 0.0004652056168505517, "loss": 3.3342, "theoretical_loss": 4.216726479878052, "tokens_seen": 260374528 }, { "epoch": 0.08, "learning_rate": 0.00046519558676028083, "loss": 3.2552, "theoretical_loss": 4.216599092429631, "tokens_seen": 260440064 }, { "epoch": 0.08, "objective/train/docs_used": 444102, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.514075517654419, "objective/train/theoretical_loss": 4.216471746005286, "objective/train/tokens_used": 280965600, "theoretical_loss": 4.216471746005286, "tokens_seen": 260505600 }, { "epoch": 0.08, "learning_rate": 0.00046518555667001007, "loss": 3.2456, "theoretical_loss": 4.216471746005286, "tokens_seen": 260505600 }, { "epoch": 0.08, "learning_rate": 0.0004651755265797392, "loss": 3.3511, "theoretical_loss": 4.216344440581491, "tokens_seen": 260571136 }, { "epoch": 0.08, "learning_rate": 0.00046516549648946843, "loss": 3.2918, "theoretical_loss": 4.2162171761347365, "tokens_seen": 260636672 }, { "epoch": 0.08, "learning_rate": 0.0004651554663991976, "loss": 3.2322, "theoretical_loss": 4.2160899526415365, "tokens_seen": 260702208 }, { "epoch": 0.08, "learning_rate": 0.0004651454363089268, "loss": 3.2082, "theoretical_loss": 4.215962770078422, "tokens_seen": 260767744 }, { "epoch": 0.08, "learning_rate": 0.000465135406218656, "loss": 3.2697, "theoretical_loss": 4.215835628421942, "tokens_seen": 260833280 }, { "epoch": 0.08, "learning_rate": 0.00046512537612838515, "loss": 3.2426, "theoretical_loss": 4.215708527648667, "tokens_seen": 260898816 }, { "epoch": 0.08, "learning_rate": 0.00046511534603811434, "loss": 3.4227, "theoretical_loss": 4.215581467735187, "tokens_seen": 260964352 }, { "epoch": 0.08, "learning_rate": 0.00046510531594784357, "loss": 3.1684, "theoretical_loss": 4.215454448658109, "tokens_seen": 261029888 }, { "epoch": 0.08, "learning_rate": 0.0004650952858575727, "loss": 3.2251, "theoretical_loss": 4.215327470394062, "tokens_seen": 261095424 }, { "epoch": 0.08, "learning_rate": 0.00046508525576730193, "loss": 3.1854, "theoretical_loss": 4.215200532919691, "tokens_seen": 261160960 }, { "epoch": 0.08, "learning_rate": 0.0004650752256770311, "loss": 3.4195, "theoretical_loss": 4.215073636211664, "tokens_seen": 261226496 }, { "epoch": 0.08, "learning_rate": 0.0004650651955867603, "loss": 3.2083, "theoretical_loss": 4.214946780246666, "tokens_seen": 261292032 }, { "epoch": 0.08, "learning_rate": 0.0004650551654964895, "loss": 3.2542, "theoretical_loss": 4.214819965001401, "tokens_seen": 261357568 }, { "epoch": 0.08, "learning_rate": 0.00046504513540621866, "loss": 3.3356, "theoretical_loss": 4.214693190452593, "tokens_seen": 261423104 }, { "epoch": 0.08, "learning_rate": 0.00046503510531594784, "loss": 3.2502, "theoretical_loss": 4.214566456576984, "tokens_seen": 261488640 }, { "epoch": 0.08, "learning_rate": 0.0004650250752256771, "loss": 3.3855, "theoretical_loss": 4.214439763351336, "tokens_seen": 261554176 }, { "epoch": 0.08, "learning_rate": 0.0004650150451354062, "loss": 3.232, "theoretical_loss": 4.214313110752431, "tokens_seen": 261619712 }, { "epoch": 0.08, "learning_rate": 0.00046500501504513544, "loss": 3.1544, "theoretical_loss": 4.214186498757069, "tokens_seen": 261685248 }, { "epoch": 0.08, "learning_rate": 0.00046499498495486456, "loss": 3.2188, "theoretical_loss": 4.214059927342068, "tokens_seen": 261750784 }, { "epoch": 0.08, "learning_rate": 0.0004649849548645938, "loss": 3.3571, "theoretical_loss": 4.213933396484267, "tokens_seen": 261816320 }, { "epoch": 0.08, "learning_rate": 0.000464974924774323, "loss": 3.2399, "theoretical_loss": 4.213806906160523, "tokens_seen": 261881856 }, { "epoch": 0.08, "learning_rate": 0.00046496489468405216, "loss": 3.4071, "theoretical_loss": 4.213680456347712, "tokens_seen": 261947392 }, { "epoch": 0.08, "learning_rate": 0.00046495486459378134, "loss": 3.2881, "theoretical_loss": 4.213554047022729, "tokens_seen": 262012928 }, { "epoch": 0.08, "learning_rate": 0.0004649448345035105, "loss": 3.2349, "theoretical_loss": 4.213427678162489, "tokens_seen": 262078464 }, { "epoch": 0.08, "objective/train/docs_used": 446981, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.5261483192443848, "objective/train/theoretical_loss": 4.213301349743924, "objective/train/tokens_used": 282604000, "theoretical_loss": 4.213301349743924, "tokens_seen": 262144000 }, { "epoch": 0.08, "learning_rate": 0.0004649348044132397, "loss": 3.3954, "theoretical_loss": 4.213301349743924, "tokens_seen": 262144000 }, { "epoch": 0.08, "learning_rate": 0.00046492477432296894, "loss": 3.3465, "theoretical_loss": 4.2131750617439865, "tokens_seen": 262209536 }, { "epoch": 0.08, "learning_rate": 0.00046491474423269807, "loss": 3.3877, "theoretical_loss": 4.213048814139647, "tokens_seen": 262275072 }, { "epoch": 0.08, "learning_rate": 0.0004649047141424273, "loss": 3.3026, "theoretical_loss": 4.212922606907895, "tokens_seen": 262340608 }, { "epoch": 0.08, "learning_rate": 0.0004648946840521565, "loss": 3.3124, "theoretical_loss": 4.21279644002574, "tokens_seen": 262406144 }, { "epoch": 0.08, "learning_rate": 0.00046488465396188566, "loss": 3.3956, "theoretical_loss": 4.212670313470209, "tokens_seen": 262471680 }, { "epoch": 0.08, "learning_rate": 0.00046487462387161485, "loss": 3.0945, "theoretical_loss": 4.212544227218347, "tokens_seen": 262537216 }, { "epoch": 0.08, "learning_rate": 0.000464864593781344, "loss": 3.2534, "theoretical_loss": 4.21241818124722, "tokens_seen": 262602752 }, { "epoch": 0.08, "learning_rate": 0.0004648545636910732, "loss": 3.2788, "theoretical_loss": 4.212292175533912, "tokens_seen": 262668288 }, { "epoch": 0.08, "learning_rate": 0.00046484453360080244, "loss": 3.3183, "theoretical_loss": 4.212166210055526, "tokens_seen": 262733824 }, { "epoch": 0.08, "learning_rate": 0.00046483450351053157, "loss": 3.2587, "theoretical_loss": 4.212040284789181, "tokens_seen": 262799360 }, { "epoch": 0.08, "learning_rate": 0.0004648244734202608, "loss": 3.3269, "theoretical_loss": 4.211914399712019, "tokens_seen": 262864896 }, { "epoch": 0.08, "learning_rate": 0.00046481444332998993, "loss": 3.2918, "theoretical_loss": 4.211788554801198, "tokens_seen": 262930432 }, { "epoch": 0.08, "learning_rate": 0.00046480441323971917, "loss": 3.3098, "theoretical_loss": 4.211662750033895, "tokens_seen": 262995968 }, { "epoch": 0.08, "learning_rate": 0.00046479438314944835, "loss": 3.3629, "theoretical_loss": 4.211536985387307, "tokens_seen": 263061504 }, { "epoch": 0.08, "learning_rate": 0.00046478435305917753, "loss": 3.3471, "theoretical_loss": 4.211411260838647, "tokens_seen": 263127040 }, { "epoch": 0.08, "learning_rate": 0.0004647743229689067, "loss": 3.1613, "theoretical_loss": 4.2112855763651496, "tokens_seen": 263192576 }, { "epoch": 0.08, "learning_rate": 0.00046476429287863595, "loss": 3.3521, "theoretical_loss": 4.211159931944065, "tokens_seen": 263258112 }, { "epoch": 0.08, "learning_rate": 0.0004647542627883651, "loss": 3.2033, "theoretical_loss": 4.211034327552666, "tokens_seen": 263323648 }, { "epoch": 0.08, "learning_rate": 0.0004647442326980943, "loss": 3.3703, "theoretical_loss": 4.210908763168239, "tokens_seen": 263389184 }, { "epoch": 0.08, "learning_rate": 0.00046473420260782344, "loss": 3.3041, "theoretical_loss": 4.210783238768093, "tokens_seen": 263454720 }, { "epoch": 0.08, "learning_rate": 0.00046472417251755267, "loss": 3.3742, "theoretical_loss": 4.210657754329553, "tokens_seen": 263520256 }, { "epoch": 0.08, "learning_rate": 0.0004647141424272819, "loss": 3.3566, "theoretical_loss": 4.210532309829965, "tokens_seen": 263585792 }, { "epoch": 0.08, "learning_rate": 0.00046470411233701103, "loss": 3.4248, "theoretical_loss": 4.21040690524669, "tokens_seen": 263651328 }, { "epoch": 0.08, "learning_rate": 0.00046469408224674027, "loss": 3.1519, "theoretical_loss": 4.21028154055711, "tokens_seen": 263716864 }, { "epoch": 0.08, "objective/train/docs_used": 448341, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.300830364227295, "objective/train/theoretical_loss": 4.2101562157386265, "objective/train/tokens_used": 284242400, "theoretical_loss": 4.2101562157386265, "tokens_seen": 263782400 }, { "epoch": 0.08, "learning_rate": 0.0004646840521564694, "loss": 3.2636, "theoretical_loss": 4.2101562157386265, "tokens_seen": 263782400 }, { "epoch": 0.08, "learning_rate": 0.00046467402206619863, "loss": 3.1726, "theoretical_loss": 4.210030930768655, "tokens_seen": 263847936 }, { "epoch": 0.08, "learning_rate": 0.0004646639919759278, "loss": 3.3262, "theoretical_loss": 4.2099056856246335, "tokens_seen": 263913472 }, { "epoch": 0.08, "learning_rate": 0.000464653961885657, "loss": 3.3303, "theoretical_loss": 4.209780480284017, "tokens_seen": 263979008 }, { "epoch": 0.08, "learning_rate": 0.0004646439317953862, "loss": 3.3105, "theoretical_loss": 4.209655314724279, "tokens_seen": 264044544 }, { "epoch": 0.08, "learning_rate": 0.00046463390170511535, "loss": 3.3157, "theoretical_loss": 4.209530188922911, "tokens_seen": 264110080 }, { "epoch": 0.08, "learning_rate": 0.00046462387161484454, "loss": 3.3794, "theoretical_loss": 4.209405102857422, "tokens_seen": 264175616 }, { "epoch": 0.08, "learning_rate": 0.00046461384152457377, "loss": 3.3107, "theoretical_loss": 4.209280056505342, "tokens_seen": 264241152 }, { "epoch": 0.08, "learning_rate": 0.0004646038114343029, "loss": 3.2998, "theoretical_loss": 4.209155049844217, "tokens_seen": 264306688 }, { "epoch": 0.08, "learning_rate": 0.00046459378134403213, "loss": 3.4576, "theoretical_loss": 4.209030082851612, "tokens_seen": 264372224 }, { "epoch": 0.08, "learning_rate": 0.0004645837512537613, "loss": 3.4446, "theoretical_loss": 4.208905155505109, "tokens_seen": 264437760 }, { "epoch": 0.08, "learning_rate": 0.0004645737211634905, "loss": 3.3742, "theoretical_loss": 4.20878026778231, "tokens_seen": 264503296 }, { "epoch": 0.08, "learning_rate": 0.0004645636910732197, "loss": 3.278, "theoretical_loss": 4.208655419660834, "tokens_seen": 264568832 }, { "epoch": 0.08, "learning_rate": 0.00046455366098294886, "loss": 3.3336, "theoretical_loss": 4.208530611118321, "tokens_seen": 264634368 }, { "epoch": 0.08, "learning_rate": 0.00046454363089267804, "loss": 3.2299, "theoretical_loss": 4.208405842132423, "tokens_seen": 264699904 }, { "epoch": 0.08, "learning_rate": 0.0004645336008024073, "loss": 3.2582, "theoretical_loss": 4.208281112680817, "tokens_seen": 264765440 }, { "epoch": 0.08, "learning_rate": 0.0004645235707121364, "loss": 3.3725, "theoretical_loss": 4.208156422741195, "tokens_seen": 264830976 }, { "epoch": 0.08, "learning_rate": 0.00046451354062186564, "loss": 3.3252, "theoretical_loss": 4.208031772291265, "tokens_seen": 264896512 }, { "epoch": 0.08, "learning_rate": 0.00046450351053159476, "loss": 3.1252, "theoretical_loss": 4.207907161308757, "tokens_seen": 264962048 }, { "epoch": 0.08, "learning_rate": 0.000464493480441324, "loss": 3.2497, "theoretical_loss": 4.2077825897714165, "tokens_seen": 265027584 }, { "epoch": 0.08, "learning_rate": 0.0004644834503510532, "loss": 3.2274, "theoretical_loss": 4.207658057657008, "tokens_seen": 265093120 }, { "epoch": 0.08, "learning_rate": 0.00046447342026078236, "loss": 3.3217, "theoretical_loss": 4.207533564943316, "tokens_seen": 265158656 }, { "epoch": 0.08, "learning_rate": 0.00046446339017051154, "loss": 3.1337, "theoretical_loss": 4.207409111608138, "tokens_seen": 265224192 }, { "epoch": 0.08, "learning_rate": 0.0004644533600802407, "loss": 3.3316, "theoretical_loss": 4.2072846976292935, "tokens_seen": 265289728 }, { "epoch": 0.08, "learning_rate": 0.0004644433299899699, "loss": 3.1584, "theoretical_loss": 4.2071603229846195, "tokens_seen": 265355264 }, { "epoch": 0.08, "objective/train/docs_used": 451154, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.274839401245117, "objective/train/theoretical_loss": 4.20703598765197, "objective/train/tokens_used": 285880800, "theoretical_loss": 4.20703598765197, "tokens_seen": 265420800 }, { "epoch": 0.08, "learning_rate": 0.00046443329989969914, "loss": 3.3765, "theoretical_loss": 4.20703598765197, "tokens_seen": 265420800 }, { "epoch": 0.08, "learning_rate": 0.00046442326980942827, "loss": 3.3764, "theoretical_loss": 4.206911691609217, "tokens_seen": 265486336 }, { "epoch": 0.08, "learning_rate": 0.0004644132397191575, "loss": 3.2874, "theoretical_loss": 4.206787434834251, "tokens_seen": 265551872 }, { "epoch": 0.08, "learning_rate": 0.0004644032096288867, "loss": 3.356, "theoretical_loss": 4.20666321730498, "tokens_seen": 265617408 }, { "epoch": 0.08, "learning_rate": 0.00046439317953861586, "loss": 3.3832, "theoretical_loss": 4.206539038999329, "tokens_seen": 265682944 }, { "epoch": 0.08, "learning_rate": 0.00046438314944834505, "loss": 3.386, "theoretical_loss": 4.206414899895244, "tokens_seen": 265748480 }, { "epoch": 0.08, "learning_rate": 0.0004643731193580742, "loss": 3.1004, "theoretical_loss": 4.206290799970685, "tokens_seen": 265814016 }, { "epoch": 0.08, "learning_rate": 0.0004643630892678034, "loss": 3.4484, "theoretical_loss": 4.206166739203632, "tokens_seen": 265879552 }, { "epoch": 0.08, "learning_rate": 0.00046435305917753264, "loss": 3.4152, "theoretical_loss": 4.206042717572082, "tokens_seen": 265945088 }, { "epoch": 0.08, "learning_rate": 0.00046434302908726177, "loss": 3.2312, "theoretical_loss": 4.20591873505405, "tokens_seen": 266010624 }, { "epoch": 0.08, "learning_rate": 0.000464332998996991, "loss": 3.3112, "theoretical_loss": 4.20579479162757, "tokens_seen": 266076160 }, { "epoch": 0.08, "learning_rate": 0.00046432296890672013, "loss": 3.1658, "theoretical_loss": 4.205670887270691, "tokens_seen": 266141696 }, { "epoch": 0.08, "learning_rate": 0.00046431293881644937, "loss": 3.278, "theoretical_loss": 4.205547021961482, "tokens_seen": 266207232 }, { "epoch": 0.08, "learning_rate": 0.00046430290872617855, "loss": 3.3107, "theoretical_loss": 4.205423195678029, "tokens_seen": 266272768 }, { "epoch": 0.08, "learning_rate": 0.00046429287863590773, "loss": 3.2256, "theoretical_loss": 4.205299408398435, "tokens_seen": 266338304 }, { "epoch": 0.08, "learning_rate": 0.0004642828485456369, "loss": 3.2988, "theoretical_loss": 4.2051756601008226, "tokens_seen": 266403840 }, { "epoch": 0.08, "learning_rate": 0.00046427281845536615, "loss": 3.4076, "theoretical_loss": 4.20505195076333, "tokens_seen": 266469376 }, { "epoch": 0.08, "learning_rate": 0.0004642627883650953, "loss": 3.2331, "theoretical_loss": 4.204928280364115, "tokens_seen": 266534912 }, { "epoch": 0.08, "learning_rate": 0.0004642527582748245, "loss": 3.1945, "theoretical_loss": 4.20480464888135, "tokens_seen": 266600448 }, { "epoch": 0.08, "learning_rate": 0.00046424272818455364, "loss": 3.0746, "theoretical_loss": 4.204681056293228, "tokens_seen": 266665984 }, { "epoch": 0.08, "learning_rate": 0.00046423269809428287, "loss": 3.1786, "theoretical_loss": 4.204557502577957, "tokens_seen": 266731520 }, { "epoch": 0.08, "learning_rate": 0.00046422266800401205, "loss": 3.3116, "theoretical_loss": 4.204433987713767, "tokens_seen": 266797056 }, { "epoch": 0.08, "learning_rate": 0.00046421263791374123, "loss": 3.2807, "theoretical_loss": 4.2043105116789, "tokens_seen": 266862592 }, { "epoch": 0.08, "learning_rate": 0.0004642026078234704, "loss": 3.4024, "theoretical_loss": 4.204187074451617, "tokens_seen": 266928128 }, { "epoch": 0.08, "learning_rate": 0.0004641925777331996, "loss": 3.1767, "theoretical_loss": 4.204063676010202, "tokens_seen": 266993664 }, { "epoch": 0.08, "objective/train/docs_used": 453942, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9560606479644775, "objective/train/theoretical_loss": 4.203940316332948, "objective/train/tokens_used": 287519200, "theoretical_loss": 4.203940316332948, "tokens_seen": 267059200 }, { "epoch": 0.08, "learning_rate": 0.0004641825476429288, "loss": 3.2118, "theoretical_loss": 4.203940316332948, "tokens_seen": 267059200 }, { "epoch": 0.08, "learning_rate": 0.000464172517552658, "loss": 3.3802, "theoretical_loss": 4.203816995398171, "tokens_seen": 267124736 }, { "epoch": 0.08, "learning_rate": 0.00046416248746238714, "loss": 3.2811, "theoretical_loss": 4.203693713184203, "tokens_seen": 267190272 }, { "epoch": 0.08, "learning_rate": 0.0004641524573721164, "loss": 3.3269, "theoretical_loss": 4.203570469669392, "tokens_seen": 267255808 }, { "epoch": 0.08, "learning_rate": 0.0004641424272818455, "loss": 3.1225, "theoretical_loss": 4.203447264832107, "tokens_seen": 267321344 }, { "epoch": 0.08, "learning_rate": 0.00046413239719157474, "loss": 3.1925, "theoretical_loss": 4.203324098650731, "tokens_seen": 267386880 }, { "epoch": 0.08, "learning_rate": 0.0004641223671013039, "loss": 3.3467, "theoretical_loss": 4.203200971103666, "tokens_seen": 267452416 }, { "epoch": 0.08, "learning_rate": 0.0004641123370110331, "loss": 3.2897, "theoretical_loss": 4.20307788216933, "tokens_seen": 267517952 }, { "epoch": 0.08, "learning_rate": 0.0004641023069207623, "loss": 3.2988, "theoretical_loss": 4.202954831826159, "tokens_seen": 267583488 }, { "epoch": 0.08, "learning_rate": 0.0004640922768304915, "loss": 3.2125, "theoretical_loss": 4.202831820052609, "tokens_seen": 267649024 }, { "epoch": 0.08, "learning_rate": 0.00046408224674022064, "loss": 3.3799, "theoretical_loss": 4.202708846827148, "tokens_seen": 267714560 }, { "epoch": 0.08, "learning_rate": 0.0004640722166499499, "loss": 3.2917, "theoretical_loss": 4.202585912128266, "tokens_seen": 267780096 }, { "epoch": 0.08, "learning_rate": 0.000464062186559679, "loss": 3.3316, "theoretical_loss": 4.202463015934468, "tokens_seen": 267845632 }, { "epoch": 0.08, "learning_rate": 0.00046405215646940824, "loss": 3.3113, "theoretical_loss": 4.202340158224277, "tokens_seen": 267911168 }, { "epoch": 0.08, "learning_rate": 0.0004640421263791374, "loss": 3.2006, "theoretical_loss": 4.202217338976231, "tokens_seen": 267976704 }, { "epoch": 0.08, "learning_rate": 0.0004640320962888666, "loss": 3.4383, "theoretical_loss": 4.2020945581688895, "tokens_seen": 268042240 }, { "epoch": 0.08, "learning_rate": 0.0004640220661985958, "loss": 3.2692, "theoretical_loss": 4.201971815780826, "tokens_seen": 268107776 }, { "epoch": 0.08, "learning_rate": 0.00046401203610832496, "loss": 3.2111, "theoretical_loss": 4.201849111790631, "tokens_seen": 268173312 }, { "epoch": 0.08, "learning_rate": 0.00046400200601805414, "loss": 3.1873, "theoretical_loss": 4.201726446176915, "tokens_seen": 268238848 }, { "epoch": 0.08, "learning_rate": 0.0004639919759277834, "loss": 3.1376, "theoretical_loss": 4.201603818918302, "tokens_seen": 268304384 }, { "epoch": 0.08, "learning_rate": 0.0004639819458375125, "loss": 3.3114, "theoretical_loss": 4.201481229993435, "tokens_seen": 268369920 }, { "epoch": 0.08, "learning_rate": 0.00046397191574724174, "loss": 3.3118, "theoretical_loss": 4.201358679380976, "tokens_seen": 268435456 }, { "epoch": 0.08, "learning_rate": 0.0004639618856569709, "loss": 3.3384, "theoretical_loss": 4.201236167059601, "tokens_seen": 268500992 }, { "epoch": 0.08, "learning_rate": 0.0004639518555667001, "loss": 3.1463, "theoretical_loss": 4.201113693008002, "tokens_seen": 268566528 }, { "epoch": 0.08, "learning_rate": 0.00046394182547642934, "loss": 3.2639, "theoretical_loss": 4.200991257204894, "tokens_seen": 268632064 }, { "epoch": 0.08, "objective/train/docs_used": 456515, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.279548168182373, "objective/train/theoretical_loss": 4.2008688596290025, "objective/train/tokens_used": 289157600, "theoretical_loss": 4.2008688596290025, "tokens_seen": 268697600 }, { "epoch": 0.08, "learning_rate": 0.00046393179538615847, "loss": 3.2796, "theoretical_loss": 4.2008688596290025, "tokens_seen": 268697600 }, { "epoch": 0.08, "learning_rate": 0.0004639217652958877, "loss": 3.4824, "theoretical_loss": 4.200746500259073, "tokens_seen": 268763136 }, { "epoch": 0.08, "learning_rate": 0.0004639117352056169, "loss": 3.248, "theoretical_loss": 4.200624179073869, "tokens_seen": 268828672 }, { "epoch": 0.08, "learning_rate": 0.00046390170511534606, "loss": 3.3662, "theoretical_loss": 4.2005018960521685, "tokens_seen": 268894208 }, { "epoch": 0.08, "learning_rate": 0.00046389167502507525, "loss": 3.1771, "theoretical_loss": 4.200379651172769, "tokens_seen": 268959744 }, { "epoch": 0.08, "learning_rate": 0.0004638816449348044, "loss": 3.2208, "theoretical_loss": 4.200257444414483, "tokens_seen": 269025280 }, { "epoch": 0.08, "learning_rate": 0.0004638716148445336, "loss": 3.1126, "theoretical_loss": 4.200135275756139, "tokens_seen": 269090816 }, { "epoch": 0.08, "learning_rate": 0.00046386158475426284, "loss": 3.3136, "theoretical_loss": 4.200013145176587, "tokens_seen": 269156352 }, { "epoch": 0.08, "learning_rate": 0.00046385155466399197, "loss": 3.2654, "theoretical_loss": 4.199891052654689, "tokens_seen": 269221888 }, { "epoch": 0.08, "learning_rate": 0.0004638415245737212, "loss": 3.2437, "theoretical_loss": 4.199768998169326, "tokens_seen": 269287424 }, { "epoch": 0.08, "learning_rate": 0.00046383149448345033, "loss": 3.3195, "theoretical_loss": 4.199646981699395, "tokens_seen": 269352960 }, { "epoch": 0.08, "learning_rate": 0.00046382146439317957, "loss": 3.3988, "theoretical_loss": 4.199525003223812, "tokens_seen": 269418496 }, { "epoch": 0.08, "learning_rate": 0.00046381143430290875, "loss": 3.3749, "theoretical_loss": 4.199403062721506, "tokens_seen": 269484032 }, { "epoch": 0.08, "learning_rate": 0.00046380140421263793, "loss": 3.4967, "theoretical_loss": 4.199281160171427, "tokens_seen": 269549568 }, { "epoch": 0.08, "learning_rate": 0.0004637913741223671, "loss": 3.3363, "theoretical_loss": 4.1991592955525405, "tokens_seen": 269615104 }, { "epoch": 0.08, "learning_rate": 0.00046378134403209635, "loss": 3.113, "theoretical_loss": 4.199037468843825, "tokens_seen": 269680640 }, { "epoch": 0.08, "learning_rate": 0.0004637713139418255, "loss": 3.1284, "theoretical_loss": 4.198915680024282, "tokens_seen": 269746176 }, { "epoch": 0.08, "learning_rate": 0.0004637612838515547, "loss": 3.1793, "theoretical_loss": 4.198793929072925, "tokens_seen": 269811712 }, { "epoch": 0.08, "learning_rate": 0.00046375125376128384, "loss": 3.1856, "theoretical_loss": 4.198672215968785, "tokens_seen": 269877248 }, { "epoch": 0.08, "learning_rate": 0.00046374122367101307, "loss": 3.2715, "theoretical_loss": 4.198550540690912, "tokens_seen": 269942784 }, { "epoch": 0.08, "learning_rate": 0.00046373119358074225, "loss": 3.1332, "theoretical_loss": 4.198428903218371, "tokens_seen": 270008320 }, { "epoch": 0.08, "learning_rate": 0.00046372116349047143, "loss": 3.439, "theoretical_loss": 4.198307303530243, "tokens_seen": 270073856 }, { "epoch": 0.08, "learning_rate": 0.0004637111334002006, "loss": 3.213, "theoretical_loss": 4.198185741605628, "tokens_seen": 270139392 }, { "epoch": 0.08, "learning_rate": 0.0004637011033099298, "loss": 3.1675, "theoretical_loss": 4.19806421742364, "tokens_seen": 270204928 }, { "epoch": 0.08, "learning_rate": 0.000463691073219659, "loss": 3.2144, "theoretical_loss": 4.197942730963412, "tokens_seen": 270270464 }, { "epoch": 0.08, "objective/train/docs_used": 459334, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1271705627441406, "objective/train/theoretical_loss": 4.19782128220409, "objective/train/tokens_used": 290796000, "theoretical_loss": 4.19782128220409, "tokens_seen": 270336000 }, { "epoch": 0.08, "learning_rate": 0.0004636810431293882, "loss": 3.1926, "theoretical_loss": 4.19782128220409, "tokens_seen": 270336000 }, { "epoch": 0.08, "learning_rate": 0.00046367101303911734, "loss": 3.2651, "theoretical_loss": 4.19769987112484, "tokens_seen": 270401536 }, { "epoch": 0.08, "learning_rate": 0.0004636609829488466, "loss": 3.2341, "theoretical_loss": 4.1975784977048445, "tokens_seen": 270467072 }, { "epoch": 0.08, "learning_rate": 0.0004636509528585757, "loss": 3.343, "theoretical_loss": 4.1974571619233, "tokens_seen": 270532608 }, { "epoch": 0.08, "learning_rate": 0.00046364092276830494, "loss": 3.1819, "theoretical_loss": 4.197335863759422, "tokens_seen": 270598144 }, { "epoch": 0.08, "learning_rate": 0.0004636308926780341, "loss": 3.4795, "theoretical_loss": 4.1972146031924416, "tokens_seen": 270663680 }, { "epoch": 0.08, "learning_rate": 0.0004636208625877633, "loss": 3.2564, "theoretical_loss": 4.197093380201606, "tokens_seen": 270729216 }, { "epoch": 0.08, "learning_rate": 0.0004636108324974925, "loss": 3.3382, "theoretical_loss": 4.196972194766179, "tokens_seen": 270794752 }, { "epoch": 0.08, "learning_rate": 0.0004636008024072217, "loss": 3.3461, "theoretical_loss": 4.196851046865442, "tokens_seen": 270860288 }, { "epoch": 0.08, "learning_rate": 0.00046359077231695084, "loss": 3.193, "theoretical_loss": 4.1967299364786905, "tokens_seen": 270925824 }, { "epoch": 0.08, "learning_rate": 0.0004635807422266801, "loss": 3.3489, "theoretical_loss": 4.196608863585239, "tokens_seen": 270991360 }, { "epoch": 0.08, "learning_rate": 0.0004635707121364092, "loss": 3.3435, "theoretical_loss": 4.1964878281644165, "tokens_seen": 271056896 }, { "epoch": 0.08, "learning_rate": 0.00046356068204613844, "loss": 3.1443, "theoretical_loss": 4.19636683019557, "tokens_seen": 271122432 }, { "epoch": 0.08, "learning_rate": 0.0004635506519558676, "loss": 3.3267, "theoretical_loss": 4.196245869658061, "tokens_seen": 271187968 }, { "epoch": 0.08, "learning_rate": 0.0004635406218655968, "loss": 3.2782, "theoretical_loss": 4.1961249465312696, "tokens_seen": 271253504 }, { "epoch": 0.08, "learning_rate": 0.000463530591775326, "loss": 3.3289, "theoretical_loss": 4.196004060794589, "tokens_seen": 271319040 }, { "epoch": 0.08, "learning_rate": 0.00046352056168505516, "loss": 3.3252, "theoretical_loss": 4.195883212427433, "tokens_seen": 271384576 }, { "epoch": 0.08, "learning_rate": 0.00046351053159478434, "loss": 3.1418, "theoretical_loss": 4.195762401409229, "tokens_seen": 271450112 }, { "epoch": 0.08, "learning_rate": 0.0004635005015045136, "loss": 3.1862, "theoretical_loss": 4.19564162771942, "tokens_seen": 271515648 }, { "epoch": 0.08, "learning_rate": 0.0004634904714142427, "loss": 3.2812, "theoretical_loss": 4.195520891337466, "tokens_seen": 271581184 }, { "epoch": 0.08, "learning_rate": 0.00046348044132397194, "loss": 3.2589, "theoretical_loss": 4.195400192242845, "tokens_seen": 271646720 }, { "epoch": 0.08, "learning_rate": 0.00046347041123370107, "loss": 3.2839, "theoretical_loss": 4.19527953041505, "tokens_seen": 271712256 }, { "epoch": 0.08, "learning_rate": 0.0004634603811434303, "loss": 3.0442, "theoretical_loss": 4.19515890583359, "tokens_seen": 271777792 }, { "epoch": 0.08, "learning_rate": 0.0004634503510531595, "loss": 3.4327, "theoretical_loss": 4.195038318477989, "tokens_seen": 271843328 }, { "epoch": 0.08, "learning_rate": 0.00046344032096288867, "loss": 3.1959, "theoretical_loss": 4.194917768327789, "tokens_seen": 271908864 }, { "epoch": 0.08, "objective/train/docs_used": 462056, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1762537956237793, "objective/train/theoretical_loss": 4.194797255362549, "objective/train/tokens_used": 292434400, "theoretical_loss": 4.194797255362549, "tokens_seen": 271974400 }, { "epoch": 0.08, "learning_rate": 0.00046343029087261785, "loss": 3.389, "theoretical_loss": 4.194797255362549, "tokens_seen": 271974400 }, { "epoch": 0.08, "learning_rate": 0.0004634202607823471, "loss": 3.1483, "theoretical_loss": 4.194676779561841, "tokens_seen": 272039936 }, { "epoch": 0.08, "learning_rate": 0.0004634102306920762, "loss": 3.342, "theoretical_loss": 4.194556340905256, "tokens_seen": 272105472 }, { "epoch": 0.08, "learning_rate": 0.00046340020060180545, "loss": 3.2503, "theoretical_loss": 4.194435939372401, "tokens_seen": 272171008 }, { "epoch": 0.08, "learning_rate": 0.00046339017051153457, "loss": 3.3276, "theoretical_loss": 4.194315574942896, "tokens_seen": 272236544 }, { "epoch": 0.08, "learning_rate": 0.0004633801404212638, "loss": 3.1187, "theoretical_loss": 4.194195247596381, "tokens_seen": 272302080 }, { "epoch": 0.08, "learning_rate": 0.000463370110330993, "loss": 3.275, "theoretical_loss": 4.19407495731251, "tokens_seen": 272367616 }, { "epoch": 0.08, "learning_rate": 0.00046336008024072217, "loss": 3.253, "theoretical_loss": 4.193954704070952, "tokens_seen": 272433152 }, { "epoch": 0.08, "learning_rate": 0.00046335005015045135, "loss": 3.2703, "theoretical_loss": 4.193834487851396, "tokens_seen": 272498688 }, { "epoch": 0.08, "learning_rate": 0.00046334002006018053, "loss": 3.301, "theoretical_loss": 4.193714308633542, "tokens_seen": 272564224 }, { "epoch": 0.08, "learning_rate": 0.0004633299899699097, "loss": 3.2141, "theoretical_loss": 4.1935941663971095, "tokens_seen": 272629760 }, { "epoch": 0.08, "learning_rate": 0.00046331995987963895, "loss": 3.2496, "theoretical_loss": 4.193474061121833, "tokens_seen": 272695296 }, { "epoch": 0.08, "learning_rate": 0.0004633099297893681, "loss": 3.1671, "theoretical_loss": 4.193353992787463, "tokens_seen": 272760832 }, { "epoch": 0.08, "learning_rate": 0.0004632998996990973, "loss": 3.435, "theoretical_loss": 4.193233961373766, "tokens_seen": 272826368 }, { "epoch": 0.08, "learning_rate": 0.00046328986960882644, "loss": 3.3558, "theoretical_loss": 4.1931139668605235, "tokens_seen": 272891904 }, { "epoch": 0.08, "learning_rate": 0.0004632798395185557, "loss": 3.1241, "theoretical_loss": 4.192994009227535, "tokens_seen": 272957440 }, { "epoch": 0.08, "learning_rate": 0.00046326980942828485, "loss": 3.1812, "theoretical_loss": 4.192874088454613, "tokens_seen": 273022976 }, { "epoch": 0.08, "learning_rate": 0.00046325977933801404, "loss": 3.4481, "theoretical_loss": 4.19275420452159, "tokens_seen": 273088512 }, { "epoch": 0.08, "learning_rate": 0.0004632497492477432, "loss": 3.3442, "theoretical_loss": 4.192634357408309, "tokens_seen": 273154048 }, { "epoch": 0.08, "learning_rate": 0.00046323971915747245, "loss": 3.206, "theoretical_loss": 4.192514547094634, "tokens_seen": 273219584 }, { "epoch": 0.08, "learning_rate": 0.0004632296890672016, "loss": 3.265, "theoretical_loss": 4.192394773560441, "tokens_seen": 273285120 }, { "epoch": 0.08, "learning_rate": 0.0004632196589769308, "loss": 3.2563, "theoretical_loss": 4.192275036785625, "tokens_seen": 273350656 }, { "epoch": 0.08, "learning_rate": 0.00046320962888666, "loss": 3.3436, "theoretical_loss": 4.192155336750094, "tokens_seen": 273416192 }, { "epoch": 0.08, "learning_rate": 0.0004631995987963892, "loss": 3.2168, "theoretical_loss": 4.192035673433773, "tokens_seen": 273481728 }, { "epoch": 0.08, "learning_rate": 0.0004631895687061184, "loss": 3.2294, "theoretical_loss": 4.191916046816605, "tokens_seen": 273547264 }, { "epoch": 0.08, "objective/train/docs_used": 465025, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.077315092086792, "objective/train/theoretical_loss": 4.191796456878544, "objective/train/tokens_used": 294072800, "theoretical_loss": 4.191796456878544, "tokens_seen": 273612800 }, { "epoch": 0.08, "learning_rate": 0.00046317953861584754, "loss": 3.2035, "theoretical_loss": 4.191796456878544, "tokens_seen": 273612800 }, { "epoch": 0.08, "learning_rate": 0.0004631695085255768, "loss": 3.2977, "theoretical_loss": 4.191676903599563, "tokens_seen": 273678336 }, { "epoch": 0.08, "learning_rate": 0.0004631594784353059, "loss": 3.3531, "theoretical_loss": 4.191557386959651, "tokens_seen": 273743872 }, { "epoch": 0.08, "learning_rate": 0.00046314944834503514, "loss": 3.2175, "theoretical_loss": 4.191437906938811, "tokens_seen": 273809408 }, { "epoch": 0.08, "learning_rate": 0.0004631394182547643, "loss": 3.2075, "theoretical_loss": 4.191318463517062, "tokens_seen": 273874944 }, { "epoch": 0.08, "learning_rate": 0.0004631293881644935, "loss": 3.1693, "theoretical_loss": 4.19119905667444, "tokens_seen": 273940480 }, { "epoch": 0.08, "learning_rate": 0.0004631193580742227, "loss": 3.2318, "theoretical_loss": 4.191079686390996, "tokens_seen": 274006016 }, { "epoch": 0.08, "learning_rate": 0.0004631093279839519, "loss": 3.1632, "theoretical_loss": 4.190960352646796, "tokens_seen": 274071552 }, { "epoch": 0.08, "learning_rate": 0.00046309929789368104, "loss": 3.0824, "theoretical_loss": 4.190841055421921, "tokens_seen": 274137088 }, { "epoch": 0.08, "learning_rate": 0.0004630892678034103, "loss": 3.257, "theoretical_loss": 4.19072179469647, "tokens_seen": 274202624 }, { "epoch": 0.08, "learning_rate": 0.0004630792377131394, "loss": 3.2733, "theoretical_loss": 4.190602570450556, "tokens_seen": 274268160 }, { "epoch": 0.08, "learning_rate": 0.00046306920762286864, "loss": 3.1865, "theoretical_loss": 4.190483382664308, "tokens_seen": 274333696 }, { "epoch": 0.08, "learning_rate": 0.0004630591775325978, "loss": 3.1575, "theoretical_loss": 4.19036423131787, "tokens_seen": 274399232 }, { "epoch": 0.08, "learning_rate": 0.000463049147442327, "loss": 3.1047, "theoretical_loss": 4.190245116391403, "tokens_seen": 274464768 }, { "epoch": 0.08, "learning_rate": 0.0004630391173520562, "loss": 3.2294, "theoretical_loss": 4.190126037865082, "tokens_seen": 274530304 }, { "epoch": 0.08, "learning_rate": 0.00046302908726178536, "loss": 3.3144, "theoretical_loss": 4.190006995719098, "tokens_seen": 274595840 }, { "epoch": 0.08, "learning_rate": 0.00046301905717151455, "loss": 3.3027, "theoretical_loss": 4.1898879899336565, "tokens_seen": 274661376 }, { "epoch": 0.08, "learning_rate": 0.0004630090270812438, "loss": 3.3208, "theoretical_loss": 4.189769020488981, "tokens_seen": 274726912 }, { "epoch": 0.08, "learning_rate": 0.0004629989969909729, "loss": 3.347, "theoretical_loss": 4.189650087365309, "tokens_seen": 274792448 }, { "epoch": 0.08, "learning_rate": 0.00046298896690070214, "loss": 3.1753, "theoretical_loss": 4.189531190542893, "tokens_seen": 274857984 }, { "epoch": 0.08, "learning_rate": 0.00046297893681043127, "loss": 3.1984, "theoretical_loss": 4.189412330002001, "tokens_seen": 274923520 }, { "epoch": 0.08, "learning_rate": 0.0004629689067201605, "loss": 3.1638, "theoretical_loss": 4.189293505722918, "tokens_seen": 274989056 }, { "epoch": 0.08, "learning_rate": 0.0004629588766298897, "loss": 3.3582, "theoretical_loss": 4.189174717685942, "tokens_seen": 275054592 }, { "epoch": 0.08, "learning_rate": 0.00046294884653961887, "loss": 3.1823, "theoretical_loss": 4.189055965871389, "tokens_seen": 275120128 }, { "epoch": 0.08, "learning_rate": 0.00046293881644934805, "loss": 3.3112, "theoretical_loss": 4.188937250259587, "tokens_seen": 275185664 }, { "epoch": 0.08, "objective/train/docs_used": 466444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1221392154693604, "objective/train/theoretical_loss": 4.188818570830883, "objective/train/tokens_used": 295711200, "theoretical_loss": 4.188818570830883, "tokens_seen": 275251200 }, { "epoch": 0.08, "learning_rate": 0.0004629287863590773, "loss": 3.3501, "theoretical_loss": 4.188818570830883, "tokens_seen": 275251200 }, { "epoch": 0.08, "learning_rate": 0.0004629187562688064, "loss": 3.2276, "theoretical_loss": 4.188699927565638, "tokens_seen": 275316736 }, { "epoch": 0.08, "learning_rate": 0.00046290872617853565, "loss": 3.2825, "theoretical_loss": 4.188581320444228, "tokens_seen": 275382272 }, { "epoch": 0.08, "learning_rate": 0.00046289869608826477, "loss": 3.335, "theoretical_loss": 4.1884627494470426, "tokens_seen": 275447808 }, { "epoch": 0.08, "learning_rate": 0.000462888665997994, "loss": 3.2498, "theoretical_loss": 4.1883442145544905, "tokens_seen": 275513344 }, { "epoch": 0.08, "learning_rate": 0.0004628786359077232, "loss": 3.2656, "theoretical_loss": 4.188225715746992, "tokens_seen": 275578880 }, { "epoch": 0.08, "learning_rate": 0.00046286860581745237, "loss": 3.2437, "theoretical_loss": 4.188107253004986, "tokens_seen": 275644416 }, { "epoch": 0.08, "learning_rate": 0.00046285857572718155, "loss": 3.1278, "theoretical_loss": 4.187988826308925, "tokens_seen": 275709952 }, { "epoch": 0.08, "learning_rate": 0.00046284854563691073, "loss": 3.0904, "theoretical_loss": 4.187870435639275, "tokens_seen": 275775488 }, { "epoch": 0.08, "learning_rate": 0.0004628385155466399, "loss": 3.3864, "theoretical_loss": 4.18775208097652, "tokens_seen": 275841024 }, { "epoch": 0.08, "learning_rate": 0.00046282848545636915, "loss": 3.3406, "theoretical_loss": 4.187633762301159, "tokens_seen": 275906560 }, { "epoch": 0.08, "learning_rate": 0.0004628184553660983, "loss": 3.249, "theoretical_loss": 4.187515479593704, "tokens_seen": 275972096 }, { "epoch": 0.08, "learning_rate": 0.0004628084252758275, "loss": 3.2675, "theoretical_loss": 4.187397232834683, "tokens_seen": 276037632 }, { "epoch": 0.08, "learning_rate": 0.00046279839518555664, "loss": 3.4415, "theoretical_loss": 4.187279022004642, "tokens_seen": 276103168 }, { "epoch": 0.08, "learning_rate": 0.0004627883650952859, "loss": 3.0633, "theoretical_loss": 4.1871608470841375, "tokens_seen": 276168704 }, { "epoch": 0.08, "learning_rate": 0.00046277833500501505, "loss": 3.2292, "theoretical_loss": 4.1870427080537445, "tokens_seen": 276234240 }, { "epoch": 0.08, "learning_rate": 0.00046276830491474424, "loss": 3.2127, "theoretical_loss": 4.1869246048940525, "tokens_seen": 276299776 }, { "epoch": 0.08, "learning_rate": 0.0004627582748244734, "loss": 3.2751, "theoretical_loss": 4.186806537585666, "tokens_seen": 276365312 }, { "epoch": 0.08, "learning_rate": 0.00046274824473420265, "loss": 3.1427, "theoretical_loss": 4.186688506109202, "tokens_seen": 276430848 }, { "epoch": 0.08, "learning_rate": 0.0004627382146439318, "loss": 3.3796, "theoretical_loss": 4.186570510445296, "tokens_seen": 276496384 }, { "epoch": 0.08, "learning_rate": 0.000462728184553661, "loss": 3.2006, "theoretical_loss": 4.186452550574599, "tokens_seen": 276561920 }, { "epoch": 0.08, "learning_rate": 0.00046271815446339014, "loss": 3.244, "theoretical_loss": 4.186334626477774, "tokens_seen": 276627456 }, { "epoch": 0.08, "learning_rate": 0.0004627081243731194, "loss": 3.3409, "theoretical_loss": 4.186216738135501, "tokens_seen": 276692992 }, { "epoch": 0.08, "learning_rate": 0.00046269809428284856, "loss": 3.2004, "theoretical_loss": 4.186098885528473, "tokens_seen": 276758528 }, { "epoch": 0.08, "learning_rate": 0.00046268806419257774, "loss": 3.3078, "theoretical_loss": 4.185981068637401, "tokens_seen": 276824064 }, { "epoch": 0.08, "objective/train/docs_used": 468697, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9332034587860107, "objective/train/theoretical_loss": 4.185863287443008, "objective/train/tokens_used": 297349600, "theoretical_loss": 4.185863287443008, "tokens_seen": 276889600 }, { "epoch": 0.08, "learning_rate": 0.0004626780341023069, "loss": 3.1846, "theoretical_loss": 4.185863287443008, "tokens_seen": 276889600 }, { "epoch": 0.08, "learning_rate": 0.0004626680040120361, "loss": 3.2775, "theoretical_loss": 4.185745541926035, "tokens_seen": 276955136 }, { "epoch": 0.08, "learning_rate": 0.0004626579739217653, "loss": 3.3436, "theoretical_loss": 4.185627832067237, "tokens_seen": 277020672 }, { "epoch": 0.08, "learning_rate": 0.0004626479438314945, "loss": 3.2775, "theoretical_loss": 4.1855101578473795, "tokens_seen": 277086208 }, { "epoch": 0.08, "learning_rate": 0.00046263791374122364, "loss": 3.0383, "theoretical_loss": 4.18539251924725, "tokens_seen": 277151744 }, { "epoch": 0.08, "learning_rate": 0.0004626278836509529, "loss": 3.2512, "theoretical_loss": 4.185274916247646, "tokens_seen": 277217280 }, { "epoch": 0.08, "learning_rate": 0.000462617853560682, "loss": 3.2569, "theoretical_loss": 4.185157348829383, "tokens_seen": 277282816 }, { "epoch": 0.08, "learning_rate": 0.00046260782347041124, "loss": 3.1305, "theoretical_loss": 4.185039816973289, "tokens_seen": 277348352 }, { "epoch": 0.08, "learning_rate": 0.0004625977933801404, "loss": 3.0246, "theoretical_loss": 4.184922320660207, "tokens_seen": 277413888 }, { "epoch": 0.08, "learning_rate": 0.0004625877632898696, "loss": 3.2348, "theoretical_loss": 4.184804859870997, "tokens_seen": 277479424 }, { "epoch": 0.08, "learning_rate": 0.0004625777331995988, "loss": 3.1614, "theoretical_loss": 4.184687434586531, "tokens_seen": 277544960 }, { "epoch": 0.08, "learning_rate": 0.000462567703109328, "loss": 3.3074, "theoretical_loss": 4.184570044787698, "tokens_seen": 277610496 }, { "epoch": 0.08, "learning_rate": 0.00046255767301905715, "loss": 3.2104, "theoretical_loss": 4.1844526904554, "tokens_seen": 277676032 }, { "epoch": 0.08, "learning_rate": 0.0004625476429287864, "loss": 3.3912, "theoretical_loss": 4.184335371570556, "tokens_seen": 277741568 }, { "epoch": 0.08, "learning_rate": 0.0004625376128385155, "loss": 3.3328, "theoretical_loss": 4.184218088114097, "tokens_seen": 277807104 }, { "epoch": 0.08, "learning_rate": 0.00046252758274824475, "loss": 3.3806, "theoretical_loss": 4.1841008400669715, "tokens_seen": 277872640 }, { "epoch": 0.08, "learning_rate": 0.0004625175526579739, "loss": 3.2857, "theoretical_loss": 4.183983627410142, "tokens_seen": 277938176 }, { "epoch": 0.08, "learning_rate": 0.0004625075225677031, "loss": 3.173, "theoretical_loss": 4.183866450124584, "tokens_seen": 278003712 }, { "epoch": 0.08, "learning_rate": 0.0004624974924774323, "loss": 3.1323, "theoretical_loss": 4.18374930819129, "tokens_seen": 278069248 }, { "epoch": 0.08, "learning_rate": 0.00046248746238716147, "loss": 3.2129, "theoretical_loss": 4.183632201591264, "tokens_seen": 278134784 }, { "epoch": 0.08, "learning_rate": 0.00046247743229689065, "loss": 3.3556, "theoretical_loss": 4.18351513030553, "tokens_seen": 278200320 }, { "epoch": 0.08, "learning_rate": 0.0004624674022066199, "loss": 3.0799, "theoretical_loss": 4.1833980943151206, "tokens_seen": 278265856 }, { "epoch": 0.08, "learning_rate": 0.00046245737211634907, "loss": 3.1652, "theoretical_loss": 4.183281093601087, "tokens_seen": 278331392 }, { "epoch": 0.08, "learning_rate": 0.00046244734202607825, "loss": 3.2491, "theoretical_loss": 4.183164128144495, "tokens_seen": 278396928 }, { "epoch": 0.08, "learning_rate": 0.0004624373119358075, "loss": 3.1962, "theoretical_loss": 4.183047197926422, "tokens_seen": 278462464 }, { "epoch": 0.08, "objective/train/docs_used": 471337, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.191582202911377, "objective/train/theoretical_loss": 4.182930302927963, "objective/train/tokens_used": 298988000, "theoretical_loss": 4.182930302927963, "tokens_seen": 278528000 }, { "epoch": 0.08, "learning_rate": 0.0004624272818455366, "loss": 3.1889, "theoretical_loss": 4.182930302927963, "tokens_seen": 278528000 }, { "epoch": 0.08, "learning_rate": 0.00046241725175526585, "loss": 2.9861, "theoretical_loss": 4.182813443130227, "tokens_seen": 278593536 }, { "epoch": 0.08, "learning_rate": 0.000462407221664995, "loss": 3.3292, "theoretical_loss": 4.182696618514337, "tokens_seen": 278659072 }, { "epoch": 0.08, "learning_rate": 0.0004623971915747242, "loss": 3.3576, "theoretical_loss": 4.18257982906143, "tokens_seen": 278724608 }, { "epoch": 0.08, "learning_rate": 0.0004623871614844534, "loss": 3.3113, "theoretical_loss": 4.1824630747526585, "tokens_seen": 278790144 }, { "epoch": 0.08, "learning_rate": 0.00046237713139418257, "loss": 3.2743, "theoretical_loss": 4.182346355569189, "tokens_seen": 278855680 }, { "epoch": 0.08, "learning_rate": 0.00046236710130391175, "loss": 3.3154, "theoretical_loss": 4.182229671492204, "tokens_seen": 278921216 }, { "epoch": 0.08, "learning_rate": 0.00046235707121364093, "loss": 3.3594, "theoretical_loss": 4.1821130225028975, "tokens_seen": 278986752 }, { "epoch": 0.08, "learning_rate": 0.0004623470411233701, "loss": 3.2334, "theoretical_loss": 4.1819964085824815, "tokens_seen": 279052288 }, { "epoch": 0.08, "learning_rate": 0.00046233701103309935, "loss": 3.1256, "theoretical_loss": 4.181879829712178, "tokens_seen": 279117824 }, { "epoch": 0.08, "learning_rate": 0.0004623269809428285, "loss": 3.2528, "theoretical_loss": 4.181763285873231, "tokens_seen": 279183360 }, { "epoch": 0.08, "learning_rate": 0.0004623169508525577, "loss": 3.0756, "theoretical_loss": 4.181646777046889, "tokens_seen": 279248896 }, { "epoch": 0.08, "learning_rate": 0.00046230692076228684, "loss": 3.2741, "theoretical_loss": 4.181530303214423, "tokens_seen": 279314432 }, { "epoch": 0.08, "learning_rate": 0.0004622968906720161, "loss": 3.3057, "theoretical_loss": 4.181413864357115, "tokens_seen": 279379968 }, { "epoch": 0.08, "learning_rate": 0.00046228686058174525, "loss": 3.2732, "theoretical_loss": 4.181297460456262, "tokens_seen": 279445504 }, { "epoch": 0.08, "learning_rate": 0.00046227683049147444, "loss": 3.3339, "theoretical_loss": 4.181181091493174, "tokens_seen": 279511040 }, { "epoch": 0.08, "learning_rate": 0.0004622668004012036, "loss": 3.2324, "theoretical_loss": 4.181064757449178, "tokens_seen": 279576576 }, { "epoch": 0.08, "learning_rate": 0.00046225677031093285, "loss": 3.2213, "theoretical_loss": 4.180948458305615, "tokens_seen": 279642112 }, { "epoch": 0.08, "learning_rate": 0.000462246740220662, "loss": 3.2978, "theoretical_loss": 4.180832194043836, "tokens_seen": 279707648 }, { "epoch": 0.08, "learning_rate": 0.0004622367101303912, "loss": 3.3172, "theoretical_loss": 4.180715964645213, "tokens_seen": 279773184 }, { "epoch": 0.08, "learning_rate": 0.00046222668004012034, "loss": 3.4041, "theoretical_loss": 4.180599770091126, "tokens_seen": 279838720 }, { "epoch": 0.08, "learning_rate": 0.0004622166499498496, "loss": 3.1813, "theoretical_loss": 4.180483610362975, "tokens_seen": 279904256 }, { "epoch": 0.08, "learning_rate": 0.00046220661985957876, "loss": 3.2584, "theoretical_loss": 4.18036748544217, "tokens_seen": 279969792 }, { "epoch": 0.08, "learning_rate": 0.00046219658976930794, "loss": 3.1623, "theoretical_loss": 4.180251395310137, "tokens_seen": 280035328 }, { "epoch": 0.08, "learning_rate": 0.0004621865596790371, "loss": 3.1623, "theoretical_loss": 4.1801353399483165, "tokens_seen": 280100864 }, { "epoch": 0.08, "objective/train/docs_used": 474317, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.598167657852173, "objective/train/theoretical_loss": 4.180019319338163, "objective/train/tokens_used": 300626400, "theoretical_loss": 4.180019319338163, "tokens_seen": 280166400 }, { "epoch": 0.08, "learning_rate": 0.0004621765295887663, "loss": 3.5254, "theoretical_loss": 4.180019319338163, "tokens_seen": 280166400 }, { "epoch": 0.08, "learning_rate": 0.0004621664994984955, "loss": 3.3242, "theoretical_loss": 4.179903333461144, "tokens_seen": 280231936 }, { "epoch": 0.08, "learning_rate": 0.0004621564694082247, "loss": 3.2748, "theoretical_loss": 4.179787382298744, "tokens_seen": 280297472 }, { "epoch": 0.08, "learning_rate": 0.00046214643931795384, "loss": 3.1744, "theoretical_loss": 4.179671465832458, "tokens_seen": 280363008 }, { "epoch": 0.08, "learning_rate": 0.0004621364092276831, "loss": 3.2055, "theoretical_loss": 4.179555584043799, "tokens_seen": 280428544 }, { "epoch": 0.08, "learning_rate": 0.0004621263791374122, "loss": 3.2004, "theoretical_loss": 4.17943973691429, "tokens_seen": 280494080 }, { "epoch": 0.09, "learning_rate": 0.00046211634904714144, "loss": 3.0362, "theoretical_loss": 4.179323924425472, "tokens_seen": 280559616 }, { "epoch": 0.09, "learning_rate": 0.0004621063189568706, "loss": 3.2289, "theoretical_loss": 4.179208146558899, "tokens_seen": 280625152 }, { "epoch": 0.09, "learning_rate": 0.0004620962888665998, "loss": 3.2552, "theoretical_loss": 4.1790924032961385, "tokens_seen": 280690688 }, { "epoch": 0.09, "learning_rate": 0.000462086258776329, "loss": 3.2155, "theoretical_loss": 4.178976694618772, "tokens_seen": 280756224 }, { "epoch": 0.09, "learning_rate": 0.0004620762286860582, "loss": 3.2246, "theoretical_loss": 4.178861020508395, "tokens_seen": 280821760 }, { "epoch": 0.09, "learning_rate": 0.00046206619859578735, "loss": 3.2969, "theoretical_loss": 4.178745380946619, "tokens_seen": 280887296 }, { "epoch": 0.09, "learning_rate": 0.0004620561685055166, "loss": 3.059, "theoretical_loss": 4.178629775915066, "tokens_seen": 280952832 }, { "epoch": 0.09, "learning_rate": 0.0004620461384152457, "loss": 3.253, "theoretical_loss": 4.178514205395376, "tokens_seen": 281018368 }, { "epoch": 0.09, "learning_rate": 0.00046203610832497495, "loss": 3.2722, "theoretical_loss": 4.178398669369201, "tokens_seen": 281083904 }, { "epoch": 0.09, "learning_rate": 0.0004620260782347041, "loss": 3.2246, "theoretical_loss": 4.178283167818206, "tokens_seen": 281149440 }, { "epoch": 0.09, "learning_rate": 0.0004620160481444333, "loss": 3.1499, "theoretical_loss": 4.178167700724073, "tokens_seen": 281214976 }, { "epoch": 0.09, "learning_rate": 0.0004620060180541625, "loss": 3.3891, "theoretical_loss": 4.178052268068494, "tokens_seen": 281280512 }, { "epoch": 0.09, "learning_rate": 0.00046199598796389167, "loss": 3.2818, "theoretical_loss": 4.177936869833179, "tokens_seen": 281346048 }, { "epoch": 0.09, "learning_rate": 0.00046198595787362085, "loss": 3.3057, "theoretical_loss": 4.17782150599985, "tokens_seen": 281411584 }, { "epoch": 0.09, "learning_rate": 0.0004619759277833501, "loss": 3.1784, "theoretical_loss": 4.1777061765502435, "tokens_seen": 281477120 }, { "epoch": 0.09, "learning_rate": 0.0004619658976930792, "loss": 3.3757, "theoretical_loss": 4.1775908814661085, "tokens_seen": 281542656 }, { "epoch": 0.09, "learning_rate": 0.00046195586760280845, "loss": 3.4467, "theoretical_loss": 4.17747562072921, "tokens_seen": 281608192 }, { "epoch": 0.09, "learning_rate": 0.00046194583751253763, "loss": 3.2143, "theoretical_loss": 4.177360394321325, "tokens_seen": 281673728 }, { "epoch": 0.09, "learning_rate": 0.0004619358074222668, "loss": 3.1454, "theoretical_loss": 4.177245202224246, "tokens_seen": 281739264 }, { "epoch": 0.09, "objective/train/docs_used": 477057, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2165274620056152, "objective/train/theoretical_loss": 4.17713004441978, "objective/train/tokens_used": 302264800, "theoretical_loss": 4.17713004441978, "tokens_seen": 281804800 }, { "epoch": 0.09, "learning_rate": 0.000461925777331996, "loss": 3.3338, "theoretical_loss": 4.17713004441978, "tokens_seen": 281804800 }, { "epoch": 0.09, "learning_rate": 0.0004619157472417252, "loss": 3.2353, "theoretical_loss": 4.177014920889745, "tokens_seen": 281870336 }, { "epoch": 0.09, "learning_rate": 0.00046190571715145435, "loss": 3.1774, "theoretical_loss": 4.176899831615974, "tokens_seen": 281935872 }, { "epoch": 0.09, "learning_rate": 0.0004618956870611836, "loss": 3.1442, "theoretical_loss": 4.176784776580316, "tokens_seen": 282001408 }, { "epoch": 0.09, "learning_rate": 0.0004618856569709127, "loss": 3.2447, "theoretical_loss": 4.176669755764632, "tokens_seen": 282066944 }, { "epoch": 0.09, "learning_rate": 0.00046187562688064195, "loss": 3.1561, "theoretical_loss": 4.176554769150796, "tokens_seen": 282132480 }, { "epoch": 0.09, "learning_rate": 0.0004618655967903711, "loss": 3.3036, "theoretical_loss": 4.176439816720697, "tokens_seen": 282198016 }, { "epoch": 0.09, "learning_rate": 0.0004618555667001003, "loss": 3.191, "theoretical_loss": 4.1763248984562376, "tokens_seen": 282263552 }, { "epoch": 0.09, "learning_rate": 0.0004618455366098295, "loss": 3.3393, "theoretical_loss": 4.176210014339335, "tokens_seen": 282329088 }, { "epoch": 0.09, "learning_rate": 0.0004618355065195587, "loss": 3.2847, "theoretical_loss": 4.17609516435192, "tokens_seen": 282394624 }, { "epoch": 0.09, "learning_rate": 0.00046182547642928786, "loss": 3.1163, "theoretical_loss": 4.1759803484759335, "tokens_seen": 282460160 }, { "epoch": 0.09, "learning_rate": 0.00046181544633901704, "loss": 3.2305, "theoretical_loss": 4.175865566693336, "tokens_seen": 282525696 }, { "epoch": 0.09, "learning_rate": 0.0004618054162487462, "loss": 3.3157, "theoretical_loss": 4.175750818986098, "tokens_seen": 282591232 }, { "epoch": 0.09, "learning_rate": 0.00046179538615847545, "loss": 3.1046, "theoretical_loss": 4.1756361053362046, "tokens_seen": 282656768 }, { "epoch": 0.09, "learning_rate": 0.0004617853560682046, "loss": 3.4108, "theoretical_loss": 4.1755214257256545, "tokens_seen": 282722304 }, { "epoch": 0.09, "learning_rate": 0.0004617753259779338, "loss": 3.1749, "theoretical_loss": 4.17540678013646, "tokens_seen": 282787840 }, { "epoch": 0.09, "learning_rate": 0.000461765295887663, "loss": 3.2225, "theoretical_loss": 4.175292168550648, "tokens_seen": 282853376 }, { "epoch": 0.09, "learning_rate": 0.0004617552657973922, "loss": 3.2885, "theoretical_loss": 4.175177590950257, "tokens_seen": 282918912 }, { "epoch": 0.09, "learning_rate": 0.00046174523570712136, "loss": 3.2633, "theoretical_loss": 4.175063047317342, "tokens_seen": 282984448 }, { "epoch": 0.09, "learning_rate": 0.00046173520561685054, "loss": 3.0939, "theoretical_loss": 4.174948537633968, "tokens_seen": 283049984 }, { "epoch": 0.09, "learning_rate": 0.0004617251755265797, "loss": 3.0253, "theoretical_loss": 4.174834061882218, "tokens_seen": 283115520 }, { "epoch": 0.09, "learning_rate": 0.00046171514543630896, "loss": 3.0714, "theoretical_loss": 4.1747196200441845, "tokens_seen": 283181056 }, { "epoch": 0.09, "learning_rate": 0.00046170511534603814, "loss": 3.1853, "theoretical_loss": 4.174605212101977, "tokens_seen": 283246592 }, { "epoch": 0.09, "learning_rate": 0.0004616950852557673, "loss": 3.2241, "theoretical_loss": 4.174490838037716, "tokens_seen": 283312128 }, { "epoch": 0.09, "learning_rate": 0.0004616850551654965, "loss": 3.1262, "theoretical_loss": 4.174376497833537, "tokens_seen": 283377664 }, { "epoch": 0.09, "objective/train/docs_used": 479763, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.137798547744751, "objective/train/theoretical_loss": 4.174262191471587, "objective/train/tokens_used": 303903200, "theoretical_loss": 4.174262191471587, "tokens_seen": 283443200 }, { "epoch": 0.09, "learning_rate": 0.0004616750250752257, "loss": 3.3048, "theoretical_loss": 4.174262191471587, "tokens_seen": 283443200 }, { "epoch": 0.09, "learning_rate": 0.0004616649949849549, "loss": 3.1802, "theoretical_loss": 4.17414791893403, "tokens_seen": 283508736 }, { "epoch": 0.09, "learning_rate": 0.00046165496489468404, "loss": 2.9734, "theoretical_loss": 4.17403368020304, "tokens_seen": 283574272 }, { "epoch": 0.09, "learning_rate": 0.0004616449348044133, "loss": 3.2819, "theoretical_loss": 4.173919475260808, "tokens_seen": 283639808 }, { "epoch": 0.09, "learning_rate": 0.0004616349047141424, "loss": 3.401, "theoretical_loss": 4.173805304089536, "tokens_seen": 283705344 }, { "epoch": 0.09, "learning_rate": 0.00046162487462387164, "loss": 3.1662, "theoretical_loss": 4.173691166671439, "tokens_seen": 283770880 }, { "epoch": 0.09, "learning_rate": 0.0004616148445336008, "loss": 3.156, "theoretical_loss": 4.173577062988748, "tokens_seen": 283836416 }, { "epoch": 0.09, "learning_rate": 0.00046160481444333, "loss": 3.3674, "theoretical_loss": 4.173462993023706, "tokens_seen": 283901952 }, { "epoch": 0.09, "learning_rate": 0.0004615947843530592, "loss": 3.3143, "theoretical_loss": 4.173348956758568, "tokens_seen": 283967488 }, { "epoch": 0.09, "learning_rate": 0.0004615847542627884, "loss": 3.2259, "theoretical_loss": 4.173234954175605, "tokens_seen": 284033024 }, { "epoch": 0.09, "learning_rate": 0.00046157472417251755, "loss": 3.2386, "theoretical_loss": 4.173120985257102, "tokens_seen": 284098560 }, { "epoch": 0.09, "learning_rate": 0.0004615646940822468, "loss": 3.3295, "theoretical_loss": 4.173007049985352, "tokens_seen": 284164096 }, { "epoch": 0.09, "learning_rate": 0.0004615546639919759, "loss": 3.2969, "theoretical_loss": 4.172893148342667, "tokens_seen": 284229632 }, { "epoch": 0.09, "learning_rate": 0.00046154463390170515, "loss": 3.2034, "theoretical_loss": 4.172779280311372, "tokens_seen": 284295168 }, { "epoch": 0.09, "learning_rate": 0.0004615346038114343, "loss": 3.4163, "theoretical_loss": 4.172665445873801, "tokens_seen": 284360704 }, { "epoch": 0.09, "learning_rate": 0.0004615245737211635, "loss": 3.1923, "theoretical_loss": 4.172551645012307, "tokens_seen": 284426240 }, { "epoch": 0.09, "learning_rate": 0.0004615145436308927, "loss": 3.1244, "theoretical_loss": 4.1724378777092515, "tokens_seen": 284491776 }, { "epoch": 0.09, "learning_rate": 0.00046150451354062187, "loss": 3.3005, "theoretical_loss": 4.172324143947012, "tokens_seen": 284557312 }, { "epoch": 0.09, "learning_rate": 0.00046149448345035105, "loss": 3.1498, "theoretical_loss": 4.172210443707979, "tokens_seen": 284622848 }, { "epoch": 0.09, "learning_rate": 0.0004614844533600803, "loss": 3.3645, "theoretical_loss": 4.1720967769745565, "tokens_seen": 284688384 }, { "epoch": 0.09, "learning_rate": 0.0004614744232698094, "loss": 3.3557, "theoretical_loss": 4.171983143729159, "tokens_seen": 284753920 }, { "epoch": 0.09, "learning_rate": 0.00046146439317953865, "loss": 3.2443, "theoretical_loss": 4.1718695439542195, "tokens_seen": 284819456 }, { "epoch": 0.09, "learning_rate": 0.00046145436308926783, "loss": 3.1982, "theoretical_loss": 4.17175597763218, "tokens_seen": 284884992 }, { "epoch": 0.09, "learning_rate": 0.000461444332998997, "loss": 3.3043, "theoretical_loss": 4.171642444745497, "tokens_seen": 284950528 }, { "epoch": 0.09, "learning_rate": 0.0004614343029087262, "loss": 3.3063, "theoretical_loss": 4.1715289452766395, "tokens_seen": 285016064 }, { "epoch": 0.09, "objective/train/docs_used": 482575, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.465254068374634, "objective/train/theoretical_loss": 4.1714154792080915, "objective/train/tokens_used": 305541600, "theoretical_loss": 4.1714154792080915, "tokens_seen": 285081600 }, { "epoch": 0.09, "learning_rate": 0.0004614242728184554, "loss": 3.1657, "theoretical_loss": 4.1714154792080915, "tokens_seen": 285081600 }, { "epoch": 0.09, "learning_rate": 0.00046141424272818455, "loss": 3.2104, "theoretical_loss": 4.171302046522349, "tokens_seen": 285147136 }, { "epoch": 0.09, "learning_rate": 0.0004614042126379138, "loss": 3.2683, "theoretical_loss": 4.171188647201921, "tokens_seen": 285212672 }, { "epoch": 0.09, "learning_rate": 0.0004613941825476429, "loss": 3.1045, "theoretical_loss": 4.1710752812293315, "tokens_seen": 285278208 }, { "epoch": 0.09, "learning_rate": 0.00046138415245737215, "loss": 3.4022, "theoretical_loss": 4.170961948587115, "tokens_seen": 285343744 }, { "epoch": 0.09, "learning_rate": 0.0004613741223671013, "loss": 3.0594, "theoretical_loss": 4.17084864925782, "tokens_seen": 285409280 }, { "epoch": 0.09, "learning_rate": 0.0004613640922768305, "loss": 3.3534, "theoretical_loss": 4.1707353832240095, "tokens_seen": 285474816 }, { "epoch": 0.09, "learning_rate": 0.0004613540621865597, "loss": 3.12, "theoretical_loss": 4.170622150468258, "tokens_seen": 285540352 }, { "epoch": 0.09, "learning_rate": 0.0004613440320962889, "loss": 3.1561, "theoretical_loss": 4.170508950973154, "tokens_seen": 285605888 }, { "epoch": 0.09, "learning_rate": 0.00046133400200601806, "loss": 3.0696, "theoretical_loss": 4.1703957847213, "tokens_seen": 285671424 }, { "epoch": 0.09, "learning_rate": 0.00046132397191574724, "loss": 3.1837, "theoretical_loss": 4.170282651695308, "tokens_seen": 285736960 }, { "epoch": 0.09, "learning_rate": 0.0004613139418254764, "loss": 3.1827, "theoretical_loss": 4.170169551877808, "tokens_seen": 285802496 }, { "epoch": 0.09, "learning_rate": 0.00046130391173520566, "loss": 3.1015, "theoretical_loss": 4.170056485251439, "tokens_seen": 285868032 }, { "epoch": 0.09, "learning_rate": 0.0004612938816449348, "loss": 3.265, "theoretical_loss": 4.169943451798856, "tokens_seen": 285933568 }, { "epoch": 0.09, "learning_rate": 0.000461283851554664, "loss": 3.1532, "theoretical_loss": 4.169830451502724, "tokens_seen": 285999104 }, { "epoch": 0.09, "learning_rate": 0.0004612738214643932, "loss": 3.5069, "theoretical_loss": 4.169717484345725, "tokens_seen": 286064640 }, { "epoch": 0.09, "learning_rate": 0.0004612637913741224, "loss": 3.2848, "theoretical_loss": 4.1696045503105506, "tokens_seen": 286130176 }, { "epoch": 0.09, "learning_rate": 0.00046125376128385156, "loss": 3.3279, "theoretical_loss": 4.169491649379905, "tokens_seen": 286195712 }, { "epoch": 0.09, "learning_rate": 0.00046124373119358074, "loss": 3.2696, "theoretical_loss": 4.169378781536509, "tokens_seen": 286261248 }, { "epoch": 0.09, "learning_rate": 0.0004612337011033099, "loss": 3.1816, "theoretical_loss": 4.169265946763095, "tokens_seen": 286326784 }, { "epoch": 0.09, "learning_rate": 0.00046122367101303916, "loss": 3.2091, "theoretical_loss": 4.169153145042405, "tokens_seen": 286392320 }, { "epoch": 0.09, "learning_rate": 0.0004612136409227683, "loss": 3.3528, "theoretical_loss": 4.169040376357199, "tokens_seen": 286457856 }, { "epoch": 0.09, "learning_rate": 0.0004612036108324975, "loss": 3.1589, "theoretical_loss": 4.168927640690246, "tokens_seen": 286523392 }, { "epoch": 0.09, "learning_rate": 0.00046119358074222665, "loss": 3.1954, "theoretical_loss": 4.16881493802433, "tokens_seen": 286588928 }, { "epoch": 0.09, "learning_rate": 0.0004611835506519559, "loss": 3.2539, "theoretical_loss": 4.168702268342248, "tokens_seen": 286654464 }, { "epoch": 0.09, "objective/train/docs_used": 483835, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.182565212249756, "objective/train/theoretical_loss": 4.168589631626808, "objective/train/tokens_used": 307180000, "theoretical_loss": 4.168589631626808, "tokens_seen": 286720000 }, { "epoch": 0.09, "learning_rate": 0.00046117352056168506, "loss": 3.3024, "theoretical_loss": 4.168589631626808, "tokens_seen": 286720000 }, { "epoch": 0.09, "learning_rate": 0.00046116349047141425, "loss": 3.0472, "theoretical_loss": 4.168477027860833, "tokens_seen": 286785536 }, { "epoch": 0.09, "learning_rate": 0.0004611534603811434, "loss": 3.2503, "theoretical_loss": 4.168364457027158, "tokens_seen": 286851072 }, { "epoch": 0.09, "learning_rate": 0.0004611434302908726, "loss": 3.0474, "theoretical_loss": 4.168251919108632, "tokens_seen": 286916608 }, { "epoch": 0.09, "learning_rate": 0.0004611334002006018, "loss": 3.2418, "theoretical_loss": 4.168139414088113, "tokens_seen": 286982144 }, { "epoch": 0.09, "learning_rate": 0.000461123370110331, "loss": 3.4142, "theoretical_loss": 4.168026941948478, "tokens_seen": 287047680 }, { "epoch": 0.09, "learning_rate": 0.00046111334002006015, "loss": 3.2788, "theoretical_loss": 4.167914502672611, "tokens_seen": 287113216 }, { "epoch": 0.09, "learning_rate": 0.0004611033099297894, "loss": 3.3619, "theoretical_loss": 4.1678020962434115, "tokens_seen": 287178752 }, { "epoch": 0.09, "learning_rate": 0.00046109327983951857, "loss": 3.2549, "theoretical_loss": 4.167689722643792, "tokens_seen": 287244288 }, { "epoch": 0.09, "learning_rate": 0.00046108324974924775, "loss": 3.2246, "theoretical_loss": 4.1675773818566775, "tokens_seen": 287309824 }, { "epoch": 0.09, "learning_rate": 0.00046107321965897693, "loss": 3.2691, "theoretical_loss": 4.167465073865006, "tokens_seen": 287375360 }, { "epoch": 0.09, "learning_rate": 0.0004610631895687061, "loss": 3.1066, "theoretical_loss": 4.167352798651726, "tokens_seen": 287440896 }, { "epoch": 0.09, "learning_rate": 0.0004610531594784353, "loss": 3.3805, "theoretical_loss": 4.167240556199802, "tokens_seen": 287506432 }, { "epoch": 0.09, "learning_rate": 0.0004610431293881645, "loss": 3.2886, "theoretical_loss": 4.167128346492211, "tokens_seen": 287571968 }, { "epoch": 0.09, "learning_rate": 0.00046103309929789365, "loss": 3.1971, "theoretical_loss": 4.16701616951194, "tokens_seen": 287637504 }, { "epoch": 0.09, "learning_rate": 0.0004610230692076229, "loss": 3.1362, "theoretical_loss": 4.1669040252419896, "tokens_seen": 287703040 }, { "epoch": 0.09, "learning_rate": 0.000461013039117352, "loss": 3.3472, "theoretical_loss": 4.166791913665375, "tokens_seen": 287768576 }, { "epoch": 0.09, "learning_rate": 0.00046100300902708125, "loss": 3.2153, "theoretical_loss": 4.166679834765123, "tokens_seen": 287834112 }, { "epoch": 0.09, "learning_rate": 0.00046099297893681043, "loss": 3.2937, "theoretical_loss": 4.166567788524272, "tokens_seen": 287899648 }, { "epoch": 0.09, "learning_rate": 0.0004609829488465396, "loss": 3.4313, "theoretical_loss": 4.166455774925875, "tokens_seen": 287965184 }, { "epoch": 0.09, "learning_rate": 0.0004609729187562688, "loss": 3.4216, "theoretical_loss": 4.166343793952995, "tokens_seen": 288030720 }, { "epoch": 0.09, "learning_rate": 0.00046096288866599803, "loss": 3.1268, "theoretical_loss": 4.166231845588712, "tokens_seen": 288096256 }, { "epoch": 0.09, "learning_rate": 0.0004609528585757272, "loss": 3.2755, "theoretical_loss": 4.166119929816113, "tokens_seen": 288161792 }, { "epoch": 0.09, "learning_rate": 0.0004609428284854564, "loss": 3.2275, "theoretical_loss": 4.166008046618303, "tokens_seen": 288227328 }, { "epoch": 0.09, "learning_rate": 0.0004609327983951856, "loss": 3.296, "theoretical_loss": 4.1658961959783944, "tokens_seen": 288292864 }, { "epoch": 0.09, "objective/train/docs_used": 486829, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.394059419631958, "objective/train/theoretical_loss": 4.165784377879517, "objective/train/tokens_used": 308818400, "theoretical_loss": 4.165784377879517, "tokens_seen": 288358400 }, { "epoch": 0.09, "learning_rate": 0.00046092276830491475, "loss": 3.2737, "theoretical_loss": 4.165784377879517, "tokens_seen": 288358400 }, { "epoch": 0.09, "learning_rate": 0.000460912738214644, "loss": 3.2824, "theoretical_loss": 4.165672592304811, "tokens_seen": 288423936 }, { "epoch": 0.09, "learning_rate": 0.0004609027081243731, "loss": 3.1104, "theoretical_loss": 4.165560839237429, "tokens_seen": 288489472 }, { "epoch": 0.09, "learning_rate": 0.00046089267803410235, "loss": 3.2521, "theoretical_loss": 4.165449118660536, "tokens_seen": 288555008 }, { "epoch": 0.09, "learning_rate": 0.0004608826479438315, "loss": 3.3605, "theoretical_loss": 4.16533743055731, "tokens_seen": 288620544 }, { "epoch": 0.09, "learning_rate": 0.0004608726178535607, "loss": 3.1611, "theoretical_loss": 4.165225774910941, "tokens_seen": 288686080 }, { "epoch": 0.09, "learning_rate": 0.0004608625877632899, "loss": 3.1899, "theoretical_loss": 4.165114151704634, "tokens_seen": 288751616 }, { "epoch": 0.09, "learning_rate": 0.0004608525576730191, "loss": 3.2753, "theoretical_loss": 4.165002560921601, "tokens_seen": 288817152 }, { "epoch": 0.09, "learning_rate": 0.00046084252758274826, "loss": 3.2351, "theoretical_loss": 4.164891002545073, "tokens_seen": 288882688 }, { "epoch": 0.09, "learning_rate": 0.00046083249749247744, "loss": 3.3585, "theoretical_loss": 4.16477947655829, "tokens_seen": 288948224 }, { "epoch": 0.09, "learning_rate": 0.0004608224674022066, "loss": 3.2951, "theoretical_loss": 4.164667982944504, "tokens_seen": 289013760 }, { "epoch": 0.09, "learning_rate": 0.00046081243731193586, "loss": 3.2623, "theoretical_loss": 4.164556521686981, "tokens_seen": 289079296 }, { "epoch": 0.09, "learning_rate": 0.000460802407221665, "loss": 3.336, "theoretical_loss": 4.1644450927689975, "tokens_seen": 289144832 }, { "epoch": 0.09, "learning_rate": 0.0004607923771313942, "loss": 3.2913, "theoretical_loss": 4.164333696173846, "tokens_seen": 289210368 }, { "epoch": 0.09, "learning_rate": 0.0004607823470411234, "loss": 3.1929, "theoretical_loss": 4.164222331884827, "tokens_seen": 289275904 }, { "epoch": 0.09, "learning_rate": 0.0004607723169508526, "loss": 3.2433, "theoretical_loss": 4.164110999885256, "tokens_seen": 289341440 }, { "epoch": 0.09, "learning_rate": 0.00046076228686058176, "loss": 3.3477, "theoretical_loss": 4.163999700158462, "tokens_seen": 289406976 }, { "epoch": 0.09, "learning_rate": 0.00046075225677031094, "loss": 3.0174, "theoretical_loss": 4.163888432687784, "tokens_seen": 289472512 }, { "epoch": 0.09, "learning_rate": 0.0004607422266800401, "loss": 3.3004, "theoretical_loss": 4.163777197456573, "tokens_seen": 289538048 }, { "epoch": 0.09, "learning_rate": 0.00046073219658976936, "loss": 3.3454, "theoretical_loss": 4.163665994448197, "tokens_seen": 289603584 }, { "epoch": 0.09, "learning_rate": 0.0004607221664994985, "loss": 3.1867, "theoretical_loss": 4.163554823646027, "tokens_seen": 289669120 }, { "epoch": 0.09, "learning_rate": 0.0004607121364092277, "loss": 3.2883, "theoretical_loss": 4.163443685033458, "tokens_seen": 289734656 }, { "epoch": 0.09, "learning_rate": 0.00046070210631895685, "loss": 3.0943, "theoretical_loss": 4.163332578593889, "tokens_seen": 289800192 }, { "epoch": 0.09, "learning_rate": 0.0004606920762286861, "loss": 3.1312, "theoretical_loss": 4.163221504310734, "tokens_seen": 289865728 }, { "epoch": 0.09, "learning_rate": 0.00046068204613841526, "loss": 3.1873, "theoretical_loss": 4.1631104621674195, "tokens_seen": 289931264 }, { "epoch": 0.09, "objective/train/docs_used": 489827, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2355599403381348, "objective/train/theoretical_loss": 4.162999452147384, "objective/train/tokens_used": 310456800, "theoretical_loss": 4.162999452147384, "tokens_seen": 289996800 }, { "epoch": 0.09, "learning_rate": 0.00046067201604814445, "loss": 3.1316, "theoretical_loss": 4.162999452147384, "tokens_seen": 289996800 }, { "epoch": 0.09, "learning_rate": 0.0004606619859578736, "loss": 3.2627, "theoretical_loss": 4.1628884742340775, "tokens_seen": 290062336 }, { "epoch": 0.09, "learning_rate": 0.0004606519558676028, "loss": 3.3126, "theoretical_loss": 4.162777528410963, "tokens_seen": 290127872 }, { "epoch": 0.09, "learning_rate": 0.000460641925777332, "loss": 3.3492, "theoretical_loss": 4.162666614661518, "tokens_seen": 290193408 }, { "epoch": 0.09, "learning_rate": 0.0004606318956870612, "loss": 3.2733, "theoretical_loss": 4.162555732969227, "tokens_seen": 290258944 }, { "epoch": 0.09, "learning_rate": 0.00046062186559679035, "loss": 3.2826, "theoretical_loss": 4.162444883317591, "tokens_seen": 290324480 }, { "epoch": 0.09, "learning_rate": 0.0004606118355065196, "loss": 3.3157, "theoretical_loss": 4.162334065690123, "tokens_seen": 290390016 }, { "epoch": 0.09, "learning_rate": 0.00046060180541624877, "loss": 3.2378, "theoretical_loss": 4.162223280070345, "tokens_seen": 290455552 }, { "epoch": 0.09, "learning_rate": 0.00046059177532597795, "loss": 3.3626, "theoretical_loss": 4.1621125264417955, "tokens_seen": 290521088 }, { "epoch": 0.09, "learning_rate": 0.00046058174523570713, "loss": 3.1765, "theoretical_loss": 4.162001804788021, "tokens_seen": 290586624 }, { "epoch": 0.09, "learning_rate": 0.0004605717151454363, "loss": 3.3224, "theoretical_loss": 4.161891115092583, "tokens_seen": 290652160 }, { "epoch": 0.09, "learning_rate": 0.0004605616850551655, "loss": 3.0949, "theoretical_loss": 4.161780457339055, "tokens_seen": 290717696 }, { "epoch": 0.09, "learning_rate": 0.0004605516549648947, "loss": 3.2395, "theoretical_loss": 4.161669831511022, "tokens_seen": 290783232 }, { "epoch": 0.09, "learning_rate": 0.00046054162487462385, "loss": 3.2675, "theoretical_loss": 4.16155923759208, "tokens_seen": 290848768 }, { "epoch": 0.09, "learning_rate": 0.0004605315947843531, "loss": 3.1771, "theoretical_loss": 4.161448675565838, "tokens_seen": 290914304 }, { "epoch": 0.09, "learning_rate": 0.0004605215646940822, "loss": 3.4574, "theoretical_loss": 4.161338145415918, "tokens_seen": 290979840 }, { "epoch": 0.09, "learning_rate": 0.00046051153460381145, "loss": 3.2209, "theoretical_loss": 4.161227647125955, "tokens_seen": 291045376 }, { "epoch": 0.09, "learning_rate": 0.00046050150451354063, "loss": 3.1718, "theoretical_loss": 4.161117180679591, "tokens_seen": 291110912 }, { "epoch": 0.09, "learning_rate": 0.0004604914744232698, "loss": 3.2698, "theoretical_loss": 4.161006746060488, "tokens_seen": 291176448 }, { "epoch": 0.09, "learning_rate": 0.000460481444332999, "loss": 3.3882, "theoretical_loss": 4.160896343252311, "tokens_seen": 291241984 }, { "epoch": 0.09, "learning_rate": 0.00046047141424272823, "loss": 3.1938, "theoretical_loss": 4.160785972238745, "tokens_seen": 291307520 }, { "epoch": 0.09, "learning_rate": 0.00046046138415245736, "loss": 3.2207, "theoretical_loss": 4.160675633003484, "tokens_seen": 291373056 }, { "epoch": 0.09, "learning_rate": 0.0004604513540621866, "loss": 2.9952, "theoretical_loss": 4.16056532553023, "tokens_seen": 291438592 }, { "epoch": 0.09, "learning_rate": 0.0004604413239719157, "loss": 3.2495, "theoretical_loss": 4.160455049802706, "tokens_seen": 291504128 }, { "epoch": 0.09, "learning_rate": 0.00046043129388164495, "loss": 3.3505, "theoretical_loss": 4.1603448058046375, "tokens_seen": 291569664 }, { "epoch": 0.09, "objective/train/docs_used": 492733, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.588277816772461, "objective/train/theoretical_loss": 4.160234593519768, "objective/train/tokens_used": 312095200, "theoretical_loss": 4.160234593519768, "tokens_seen": 291635200 }, { "epoch": 0.09, "learning_rate": 0.00046042126379137414, "loss": 2.9944, "theoretical_loss": 4.160234593519768, "tokens_seen": 291635200 }, { "epoch": 0.09, "learning_rate": 0.0004604112337011033, "loss": 3.2623, "theoretical_loss": 4.160124412931852, "tokens_seen": 291700736 }, { "epoch": 0.09, "learning_rate": 0.0004604012036108325, "loss": 3.3189, "theoretical_loss": 4.160014264024654, "tokens_seen": 291766272 }, { "epoch": 0.09, "learning_rate": 0.0004603911735205617, "loss": 3.2653, "theoretical_loss": 4.159904146781952, "tokens_seen": 291831808 }, { "epoch": 0.09, "learning_rate": 0.00046038114343029086, "loss": 3.2401, "theoretical_loss": 4.159794061187536, "tokens_seen": 291897344 }, { "epoch": 0.09, "learning_rate": 0.0004603711133400201, "loss": 3.2677, "theoretical_loss": 4.1596840072252075, "tokens_seen": 291962880 }, { "epoch": 0.09, "learning_rate": 0.0004603610832497492, "loss": 3.1872, "theoretical_loss": 4.159573984878779, "tokens_seen": 292028416 }, { "epoch": 0.09, "learning_rate": 0.00046035105315947846, "loss": 3.3372, "theoretical_loss": 4.159463994132079, "tokens_seen": 292093952 }, { "epoch": 0.09, "learning_rate": 0.0004603410230692076, "loss": 3.3757, "theoretical_loss": 4.15935403496894, "tokens_seen": 292159488 }, { "epoch": 0.09, "learning_rate": 0.0004603309929789368, "loss": 3.193, "theoretical_loss": 4.159244107373215, "tokens_seen": 292225024 }, { "epoch": 0.09, "learning_rate": 0.000460320962888666, "loss": 3.0649, "theoretical_loss": 4.159134211328765, "tokens_seen": 292290560 }, { "epoch": 0.09, "learning_rate": 0.0004603109327983952, "loss": 3.2742, "theoretical_loss": 4.159024346819461, "tokens_seen": 292356096 }, { "epoch": 0.09, "learning_rate": 0.00046030090270812436, "loss": 3.3529, "theoretical_loss": 4.158914513829189, "tokens_seen": 292421632 }, { "epoch": 0.09, "learning_rate": 0.0004602908726178536, "loss": 3.2979, "theoretical_loss": 4.158804712341845, "tokens_seen": 292487168 }, { "epoch": 0.09, "learning_rate": 0.0004602808425275827, "loss": 3.1989, "theoretical_loss": 4.158694942341338, "tokens_seen": 292552704 }, { "epoch": 0.09, "learning_rate": 0.00046027081243731196, "loss": 3.2631, "theoretical_loss": 4.1585852038115885, "tokens_seen": 292618240 }, { "epoch": 0.09, "learning_rate": 0.0004602607823470411, "loss": 3.0867, "theoretical_loss": 4.1584754967365285, "tokens_seen": 292683776 }, { "epoch": 0.09, "learning_rate": 0.0004602507522567703, "loss": 3.1268, "theoretical_loss": 4.1583658211001016, "tokens_seen": 292749312 }, { "epoch": 0.09, "learning_rate": 0.0004602407221664995, "loss": 3.2591, "theoretical_loss": 4.158256176886264, "tokens_seen": 292814848 }, { "epoch": 0.09, "learning_rate": 0.0004602306920762287, "loss": 3.1935, "theoretical_loss": 4.158146564078982, "tokens_seen": 292880384 }, { "epoch": 0.09, "learning_rate": 0.00046022066198595787, "loss": 3.1011, "theoretical_loss": 4.158036982662237, "tokens_seen": 292945920 }, { "epoch": 0.09, "learning_rate": 0.00046021063189568705, "loss": 3.1482, "theoretical_loss": 4.157927432620018, "tokens_seen": 293011456 }, { "epoch": 0.09, "learning_rate": 0.0004602006018054163, "loss": 3.1405, "theoretical_loss": 4.157817913936329, "tokens_seen": 293076992 }, { "epoch": 0.09, "learning_rate": 0.00046019057171514546, "loss": 3.1608, "theoretical_loss": 4.157708426595184, "tokens_seen": 293142528 }, { "epoch": 0.09, "learning_rate": 0.00046018054162487465, "loss": 3.3112, "theoretical_loss": 4.157598970580608, "tokens_seen": 293208064 }, { "epoch": 0.09, "objective/train/docs_used": 495626, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.294774055480957, "objective/train/theoretical_loss": 4.157489545876642, "objective/train/tokens_used": 313733600, "theoretical_loss": 4.157489545876642, "tokens_seen": 293273600 }, { "epoch": 0.09, "learning_rate": 0.0004601705115346038, "loss": 3.3017, "theoretical_loss": 4.157489545876642, "tokens_seen": 293273600 }, { "epoch": 0.09, "learning_rate": 0.000460160481444333, "loss": 3.4001, "theoretical_loss": 4.157380152467333, "tokens_seen": 293339136 }, { "epoch": 0.09, "learning_rate": 0.0004601504513540622, "loss": 3.0635, "theoretical_loss": 4.157270790336742, "tokens_seen": 293404672 }, { "epoch": 0.09, "learning_rate": 0.0004601404212637914, "loss": 3.2307, "theoretical_loss": 4.157161459468944, "tokens_seen": 293470208 }, { "epoch": 0.09, "learning_rate": 0.00046013039117352055, "loss": 3.3323, "theoretical_loss": 4.157052159848023, "tokens_seen": 293535744 }, { "epoch": 0.09, "learning_rate": 0.0004601203610832498, "loss": 3.3083, "theoretical_loss": 4.156942891458074, "tokens_seen": 293601280 }, { "epoch": 0.09, "learning_rate": 0.00046011033099297897, "loss": 3.149, "theoretical_loss": 4.156833654283207, "tokens_seen": 293666816 }, { "epoch": 0.09, "learning_rate": 0.00046010030090270815, "loss": 3.2854, "theoretical_loss": 4.15672444830754, "tokens_seen": 293732352 }, { "epoch": 0.09, "learning_rate": 0.00046009027081243733, "loss": 3.2177, "theoretical_loss": 4.156615273515205, "tokens_seen": 293797888 }, { "epoch": 0.09, "learning_rate": 0.0004600802407221665, "loss": 2.9992, "theoretical_loss": 4.156506129890344, "tokens_seen": 293863424 }, { "epoch": 0.09, "learning_rate": 0.0004600702106318957, "loss": 3.1405, "theoretical_loss": 4.156397017417111, "tokens_seen": 293928960 }, { "epoch": 0.09, "learning_rate": 0.00046006018054162493, "loss": 3.33, "theoretical_loss": 4.156287936079675, "tokens_seen": 293994496 }, { "epoch": 0.09, "learning_rate": 0.00046005015045135405, "loss": 3.2998, "theoretical_loss": 4.156178885862209, "tokens_seen": 294060032 }, { "epoch": 0.09, "learning_rate": 0.0004600401203610833, "loss": 3.3073, "theoretical_loss": 4.156069866748906, "tokens_seen": 294125568 }, { "epoch": 0.09, "learning_rate": 0.0004600300902708124, "loss": 3.1938, "theoretical_loss": 4.155960878723965, "tokens_seen": 294191104 }, { "epoch": 0.09, "learning_rate": 0.00046002006018054165, "loss": 3.3049, "theoretical_loss": 4.155851921771598, "tokens_seen": 294256640 }, { "epoch": 0.09, "learning_rate": 0.00046001003009027083, "loss": 3.3043, "theoretical_loss": 4.155742995876029, "tokens_seen": 294322176 }, { "epoch": 0.09, "learning_rate": 0.00046, "loss": 3.3544, "theoretical_loss": 4.155634101021494, "tokens_seen": 294387712 }, { "epoch": 0.09, "learning_rate": 0.0004599899699097292, "loss": 3.359, "theoretical_loss": 4.155525237192238, "tokens_seen": 294453248 }, { "epoch": 0.09, "learning_rate": 0.00045997993981945843, "loss": 3.1417, "theoretical_loss": 4.155416404372522, "tokens_seen": 294518784 }, { "epoch": 0.09, "learning_rate": 0.00045996990972918756, "loss": 3.4017, "theoretical_loss": 4.155307602546614, "tokens_seen": 294584320 }, { "epoch": 0.09, "learning_rate": 0.0004599598796389168, "loss": 3.2136, "theoretical_loss": 4.155198831698795, "tokens_seen": 294649856 }, { "epoch": 0.09, "learning_rate": 0.0004599498495486459, "loss": 3.1701, "theoretical_loss": 4.155090091813358, "tokens_seen": 294715392 }, { "epoch": 0.09, "learning_rate": 0.00045993981945837515, "loss": 3.223, "theoretical_loss": 4.154981382874608, "tokens_seen": 294780928 }, { "epoch": 0.09, "learning_rate": 0.00045992978936810434, "loss": 3.0267, "theoretical_loss": 4.154872704866859, "tokens_seen": 294846464 }, { "epoch": 0.09, "objective/train/docs_used": 497932, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0154013633728027, "objective/train/theoretical_loss": 4.15476405777444, "objective/train/tokens_used": 315372000, "theoretical_loss": 4.15476405777444, "tokens_seen": 294912000 }, { "epoch": 0.09, "learning_rate": 0.0004599197592778335, "loss": 3.099, "theoretical_loss": 4.15476405777444, "tokens_seen": 294912000 }, { "epoch": 0.09, "learning_rate": 0.0004599097291875627, "loss": 3.3932, "theoretical_loss": 4.154655441581687, "tokens_seen": 294977536 }, { "epoch": 0.09, "learning_rate": 0.0004598996990972919, "loss": 3.2249, "theoretical_loss": 4.154546856272952, "tokens_seen": 295043072 }, { "epoch": 0.09, "learning_rate": 0.00045988966900702106, "loss": 3.2794, "theoretical_loss": 4.154438301832596, "tokens_seen": 295108608 }, { "epoch": 0.09, "learning_rate": 0.0004598796389167503, "loss": 3.1688, "theoretical_loss": 4.154329778244991, "tokens_seen": 295174144 }, { "epoch": 0.09, "learning_rate": 0.0004598696088264794, "loss": 3.2446, "theoretical_loss": 4.154221285494521, "tokens_seen": 295239680 }, { "epoch": 0.09, "learning_rate": 0.00045985957873620866, "loss": 3.0866, "theoretical_loss": 4.154112823565582, "tokens_seen": 295305216 }, { "epoch": 0.09, "learning_rate": 0.0004598495486459378, "loss": 3.3635, "theoretical_loss": 4.15400439244258, "tokens_seen": 295370752 }, { "epoch": 0.09, "learning_rate": 0.000459839518555667, "loss": 3.3119, "theoretical_loss": 4.153895992109935, "tokens_seen": 295436288 }, { "epoch": 0.09, "learning_rate": 0.0004598294884653962, "loss": 3.132, "theoretical_loss": 4.153787622552073, "tokens_seen": 295501824 }, { "epoch": 0.09, "learning_rate": 0.0004598194583751254, "loss": 3.3125, "theoretical_loss": 4.153679283753439, "tokens_seen": 295567360 }, { "epoch": 0.09, "learning_rate": 0.00045980942828485456, "loss": 3.2501, "theoretical_loss": 4.15357097569848, "tokens_seen": 295632896 }, { "epoch": 0.09, "learning_rate": 0.0004597993981945838, "loss": 3.2929, "theoretical_loss": 4.153462698371665, "tokens_seen": 295698432 }, { "epoch": 0.09, "learning_rate": 0.0004597893681043129, "loss": 3.325, "theoretical_loss": 4.1533544517574645, "tokens_seen": 295763968 }, { "epoch": 0.09, "learning_rate": 0.00045977933801404216, "loss": 3.1183, "theoretical_loss": 4.153246235840367, "tokens_seen": 295829504 }, { "epoch": 0.09, "learning_rate": 0.0004597693079237713, "loss": 3.2075, "theoretical_loss": 4.153138050604868, "tokens_seen": 295895040 }, { "epoch": 0.09, "learning_rate": 0.0004597592778335005, "loss": 3.4036, "theoretical_loss": 4.153029896035476, "tokens_seen": 295960576 }, { "epoch": 0.09, "learning_rate": 0.0004597492477432297, "loss": 3.0248, "theoretical_loss": 4.152921772116712, "tokens_seen": 296026112 }, { "epoch": 0.09, "learning_rate": 0.0004597392176529589, "loss": 3.31, "theoretical_loss": 4.152813678833106, "tokens_seen": 296091648 }, { "epoch": 0.09, "learning_rate": 0.00045972918756268807, "loss": 3.1246, "theoretical_loss": 4.152705616169202, "tokens_seen": 296157184 }, { "epoch": 0.09, "learning_rate": 0.00045971915747241725, "loss": 3.2657, "theoretical_loss": 4.15259758410955, "tokens_seen": 296222720 }, { "epoch": 0.09, "learning_rate": 0.00045970912738214643, "loss": 3.3296, "theoretical_loss": 4.152489582638719, "tokens_seen": 296288256 }, { "epoch": 0.09, "learning_rate": 0.00045969909729187566, "loss": 3.2423, "theoretical_loss": 4.152381611741281, "tokens_seen": 296353792 }, { "epoch": 0.09, "learning_rate": 0.0004596890672016048, "loss": 3.1881, "theoretical_loss": 4.152273671401824, "tokens_seen": 296419328 }, { "epoch": 0.09, "learning_rate": 0.000459679037111334, "loss": 3.38, "theoretical_loss": 4.152165761604948, "tokens_seen": 296484864 }, { "epoch": 0.09, "objective/train/docs_used": 500857, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.233283758163452, "objective/train/theoretical_loss": 4.152057882335261, "objective/train/tokens_used": 317010400, "theoretical_loss": 4.152057882335261, "tokens_seen": 296550400 }, { "epoch": 0.09, "learning_rate": 0.00045966900702106315, "loss": 3.0556, "theoretical_loss": 4.152057882335261, "tokens_seen": 296550400 }, { "epoch": 0.09, "learning_rate": 0.0004596589769307924, "loss": 3.2906, "theoretical_loss": 4.151950033577383, "tokens_seen": 296615936 }, { "epoch": 0.09, "learning_rate": 0.00045964894684052157, "loss": 3.3137, "theoretical_loss": 4.151842215315947, "tokens_seen": 296681472 }, { "epoch": 0.09, "learning_rate": 0.00045963891675025075, "loss": 3.14, "theoretical_loss": 4.151734427535594, "tokens_seen": 296747008 }, { "epoch": 0.09, "learning_rate": 0.00045962888665997993, "loss": 3.2252, "theoretical_loss": 4.151626670220979, "tokens_seen": 296812544 }, { "epoch": 0.09, "learning_rate": 0.00045961885656970917, "loss": 3.1722, "theoretical_loss": 4.151518943356768, "tokens_seen": 296878080 }, { "epoch": 0.09, "learning_rate": 0.0004596088264794383, "loss": 3.2755, "theoretical_loss": 4.151411246927636, "tokens_seen": 296943616 }, { "epoch": 0.09, "learning_rate": 0.00045959879638916753, "loss": 3.4103, "theoretical_loss": 4.15130358091827, "tokens_seen": 297009152 }, { "epoch": 0.09, "learning_rate": 0.00045958876629889666, "loss": 3.1889, "theoretical_loss": 4.151195945313369, "tokens_seen": 297074688 }, { "epoch": 0.09, "learning_rate": 0.0004595787362086259, "loss": 3.208, "theoretical_loss": 4.151088340097642, "tokens_seen": 297140224 }, { "epoch": 0.09, "learning_rate": 0.0004595687061183551, "loss": 3.2761, "theoretical_loss": 4.15098076525581, "tokens_seen": 297205760 }, { "epoch": 0.09, "learning_rate": 0.00045955867602808425, "loss": 3.2783, "theoretical_loss": 4.150873220772604, "tokens_seen": 297271296 }, { "epoch": 0.09, "learning_rate": 0.00045954864593781344, "loss": 3.1698, "theoretical_loss": 4.150765706632766, "tokens_seen": 297336832 }, { "epoch": 0.09, "learning_rate": 0.0004595386158475426, "loss": 3.1568, "theoretical_loss": 4.1506582228210505, "tokens_seen": 297402368 }, { "epoch": 0.09, "learning_rate": 0.0004595285857572718, "loss": 3.433, "theoretical_loss": 4.150550769322221, "tokens_seen": 297467904 }, { "epoch": 0.09, "learning_rate": 0.00045951855566700103, "loss": 3.1174, "theoretical_loss": 4.150443346121054, "tokens_seen": 297533440 }, { "epoch": 0.09, "learning_rate": 0.00045950852557673016, "loss": 3.1484, "theoretical_loss": 4.150335953202336, "tokens_seen": 297598976 }, { "epoch": 0.09, "learning_rate": 0.0004594984954864594, "loss": 3.1526, "theoretical_loss": 4.150228590550864, "tokens_seen": 297664512 }, { "epoch": 0.09, "learning_rate": 0.0004594884653961885, "loss": 3.2203, "theoretical_loss": 4.150121258151447, "tokens_seen": 297730048 }, { "epoch": 0.09, "learning_rate": 0.00045947843530591776, "loss": 3.1676, "theoretical_loss": 4.150013955988905, "tokens_seen": 297795584 }, { "epoch": 0.09, "learning_rate": 0.00045946840521564694, "loss": 3.2787, "theoretical_loss": 4.149906684048068, "tokens_seen": 297861120 }, { "epoch": 0.09, "learning_rate": 0.0004594583751253761, "loss": 3.2806, "theoretical_loss": 4.1497994423137765, "tokens_seen": 297926656 }, { "epoch": 0.09, "learning_rate": 0.00045944834503510536, "loss": 3.2088, "theoretical_loss": 4.149692230770884, "tokens_seen": 297992192 }, { "epoch": 0.09, "learning_rate": 0.00045943831494483454, "loss": 3.1608, "theoretical_loss": 4.149585049404253, "tokens_seen": 298057728 }, { "epoch": 0.09, "learning_rate": 0.0004594282848545637, "loss": 3.2604, "theoretical_loss": 4.149477898198759, "tokens_seen": 298123264 }, { "epoch": 0.09, "objective/train/docs_used": 503633, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.947439432144165, "objective/train/theoretical_loss": 4.149370777139286, "objective/train/tokens_used": 318648800, "theoretical_loss": 4.149370777139286, "tokens_seen": 298188800 }, { "epoch": 0.09, "learning_rate": 0.0004594182547642929, "loss": 2.9953, "theoretical_loss": 4.149370777139286, "tokens_seen": 298188800 }, { "epoch": 0.09, "learning_rate": 0.0004594082246740221, "loss": 3.4288, "theoretical_loss": 4.14926368621073, "tokens_seen": 298254336 }, { "epoch": 0.09, "learning_rate": 0.00045939819458375126, "loss": 3.098, "theoretical_loss": 4.149156625397998, "tokens_seen": 298319872 }, { "epoch": 0.09, "learning_rate": 0.0004593881644934805, "loss": 3.2559, "theoretical_loss": 4.149049594686008, "tokens_seen": 298385408 }, { "epoch": 0.09, "learning_rate": 0.0004593781344032096, "loss": 3.2918, "theoretical_loss": 4.1489425940596885, "tokens_seen": 298450944 }, { "epoch": 0.09, "learning_rate": 0.00045936810431293886, "loss": 3.2375, "theoretical_loss": 4.148835623503978, "tokens_seen": 298516480 }, { "epoch": 0.09, "learning_rate": 0.000459358074222668, "loss": 3.253, "theoretical_loss": 4.148728683003829, "tokens_seen": 298582016 }, { "epoch": 0.09, "learning_rate": 0.0004593480441323972, "loss": 3.1826, "theoretical_loss": 4.1486217725442005, "tokens_seen": 298647552 }, { "epoch": 0.09, "learning_rate": 0.0004593380140421264, "loss": 3.3279, "theoretical_loss": 4.148514892110065, "tokens_seen": 298713088 }, { "epoch": 0.09, "learning_rate": 0.0004593279839518556, "loss": 3.2502, "theoretical_loss": 4.148408041686406, "tokens_seen": 298778624 }, { "epoch": 0.09, "learning_rate": 0.00045931795386158476, "loss": 3.094, "theoretical_loss": 4.148301221258217, "tokens_seen": 298844160 }, { "epoch": 0.09, "learning_rate": 0.000459307923771314, "loss": 3.2892, "theoretical_loss": 4.148194430810502, "tokens_seen": 298909696 }, { "epoch": 0.09, "learning_rate": 0.0004592978936810431, "loss": 3.0558, "theoretical_loss": 4.148087670328276, "tokens_seen": 298975232 }, { "epoch": 0.09, "learning_rate": 0.00045928786359077236, "loss": 3.2125, "theoretical_loss": 4.147980939796565, "tokens_seen": 299040768 }, { "epoch": 0.09, "learning_rate": 0.0004592778335005015, "loss": 3.2623, "theoretical_loss": 4.147874239200405, "tokens_seen": 299106304 }, { "epoch": 0.09, "learning_rate": 0.0004592678034102307, "loss": 3.3146, "theoretical_loss": 4.147767568524845, "tokens_seen": 299171840 }, { "epoch": 0.09, "learning_rate": 0.0004592577733199599, "loss": 3.2508, "theoretical_loss": 4.147660927754942, "tokens_seen": 299237376 }, { "epoch": 0.09, "learning_rate": 0.0004592477432296891, "loss": 3.3004, "theoretical_loss": 4.147554316875766, "tokens_seen": 299302912 }, { "epoch": 0.09, "learning_rate": 0.00045923771313941827, "loss": 3.2344, "theoretical_loss": 4.147447735872396, "tokens_seen": 299368448 }, { "epoch": 0.09, "learning_rate": 0.00045922768304914745, "loss": 3.1564, "theoretical_loss": 4.147341184729921, "tokens_seen": 299433984 }, { "epoch": 0.09, "learning_rate": 0.00045921765295887663, "loss": 3.3043, "theoretical_loss": 4.147234663433444, "tokens_seen": 299499520 }, { "epoch": 0.09, "learning_rate": 0.00045920762286860586, "loss": 3.0867, "theoretical_loss": 4.147128171968077, "tokens_seen": 299565056 }, { "epoch": 0.09, "learning_rate": 0.000459197592778335, "loss": 3.223, "theoretical_loss": 4.14702171031894, "tokens_seen": 299630592 }, { "epoch": 0.09, "learning_rate": 0.0004591875626880642, "loss": 3.3077, "theoretical_loss": 4.146915278471169, "tokens_seen": 299696128 }, { "epoch": 0.09, "learning_rate": 0.00045917753259779335, "loss": 3.2055, "theoretical_loss": 4.146808876409906, "tokens_seen": 299761664 }, { "epoch": 0.09, "objective/train/docs_used": 504999, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4209396839141846, "objective/train/theoretical_loss": 4.146702504120305, "objective/train/tokens_used": 320287200, "theoretical_loss": 4.146702504120305, "tokens_seen": 299827200 }, { "epoch": 0.09, "learning_rate": 0.0004591675025075226, "loss": 3.2897, "theoretical_loss": 4.146702504120305, "tokens_seen": 299827200 }, { "epoch": 0.09, "learning_rate": 0.00045915747241725177, "loss": 3.3171, "theoretical_loss": 4.146596161587532, "tokens_seen": 299892736 }, { "epoch": 0.09, "learning_rate": 0.00045914744232698095, "loss": 3.1839, "theoretical_loss": 4.146489848796763, "tokens_seen": 299958272 }, { "epoch": 0.09, "learning_rate": 0.00045913741223671013, "loss": 3.1291, "theoretical_loss": 4.146383565733184, "tokens_seen": 300023808 }, { "epoch": 0.09, "learning_rate": 0.00045912738214643937, "loss": 3.1787, "theoretical_loss": 4.146277312381991, "tokens_seen": 300089344 }, { "epoch": 0.09, "learning_rate": 0.0004591173520561685, "loss": 3.0302, "theoretical_loss": 4.1461710887283925, "tokens_seen": 300154880 }, { "epoch": 0.09, "learning_rate": 0.00045910732196589773, "loss": 3.2685, "theoretical_loss": 4.146064894757606, "tokens_seen": 300220416 }, { "epoch": 0.09, "learning_rate": 0.00045909729187562686, "loss": 3.4017, "theoretical_loss": 4.145958730454861, "tokens_seen": 300285952 }, { "epoch": 0.09, "learning_rate": 0.0004590872617853561, "loss": 3.3776, "theoretical_loss": 4.145852595805396, "tokens_seen": 300351488 }, { "epoch": 0.09, "learning_rate": 0.0004590772316950853, "loss": 3.2661, "theoretical_loss": 4.145746490794461, "tokens_seen": 300417024 }, { "epoch": 0.09, "learning_rate": 0.00045906720160481445, "loss": 3.41, "theoretical_loss": 4.145640415407317, "tokens_seen": 300482560 }, { "epoch": 0.09, "learning_rate": 0.00045905717151454364, "loss": 3.2862, "theoretical_loss": 4.145534369629234, "tokens_seen": 300548096 }, { "epoch": 0.09, "learning_rate": 0.0004590471414242728, "loss": 3.3035, "theoretical_loss": 4.145428353445494, "tokens_seen": 300613632 }, { "epoch": 0.09, "learning_rate": 0.000459037111334002, "loss": 3.1744, "theoretical_loss": 4.145322366841389, "tokens_seen": 300679168 }, { "epoch": 0.09, "learning_rate": 0.00045902708124373123, "loss": 3.3145, "theoretical_loss": 4.145216409802221, "tokens_seen": 300744704 }, { "epoch": 0.09, "learning_rate": 0.00045901705115346036, "loss": 3.1173, "theoretical_loss": 4.145110482313304, "tokens_seen": 300810240 }, { "epoch": 0.09, "learning_rate": 0.0004590070210631896, "loss": 3.136, "theoretical_loss": 4.1450045843599606, "tokens_seen": 300875776 }, { "epoch": 0.09, "learning_rate": 0.0004589969909729187, "loss": 3.1216, "theoretical_loss": 4.144898715927525, "tokens_seen": 300941312 }, { "epoch": 0.09, "learning_rate": 0.00045898696088264796, "loss": 3.3742, "theoretical_loss": 4.144792877001342, "tokens_seen": 301006848 }, { "epoch": 0.09, "learning_rate": 0.00045897693079237714, "loss": 3.0933, "theoretical_loss": 4.144687067566765, "tokens_seen": 301072384 }, { "epoch": 0.09, "learning_rate": 0.0004589669007021063, "loss": 2.9715, "theoretical_loss": 4.144581287609161, "tokens_seen": 301137920 }, { "epoch": 0.09, "learning_rate": 0.0004589568706118355, "loss": 3.228, "theoretical_loss": 4.144475537113905, "tokens_seen": 301203456 }, { "epoch": 0.09, "learning_rate": 0.00045894684052156474, "loss": 3.1339, "theoretical_loss": 4.144369816066385, "tokens_seen": 301268992 }, { "epoch": 0.09, "learning_rate": 0.00045893681043129386, "loss": 2.9604, "theoretical_loss": 4.144264124451995, "tokens_seen": 301334528 }, { "epoch": 0.09, "learning_rate": 0.0004589267803410231, "loss": 3.2139, "theoretical_loss": 4.1441584622561445, "tokens_seen": 301400064 }, { "epoch": 0.09, "objective/train/docs_used": 507643, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.383744716644287, "objective/train/theoretical_loss": 4.144052829464249, "objective/train/tokens_used": 321925600, "theoretical_loss": 4.144052829464249, "tokens_seen": 301465600 }, { "epoch": 0.09, "learning_rate": 0.0004589167502507522, "loss": 3.1607, "theoretical_loss": 4.144052829464249, "tokens_seen": 301465600 }, { "epoch": 0.09, "learning_rate": 0.00045890672016048146, "loss": 3.3867, "theoretical_loss": 4.143947226061737, "tokens_seen": 301531136 }, { "epoch": 0.09, "learning_rate": 0.00045889669007021064, "loss": 3.2096, "theoretical_loss": 4.143841652034048, "tokens_seen": 301596672 }, { "epoch": 0.09, "learning_rate": 0.0004588866599799398, "loss": 3.445, "theoretical_loss": 4.143736107366629, "tokens_seen": 301662208 }, { "epoch": 0.09, "learning_rate": 0.000458876629889669, "loss": 3.0436, "theoretical_loss": 4.14363059204494, "tokens_seen": 301727744 }, { "epoch": 0.09, "learning_rate": 0.0004588665997993982, "loss": 3.2378, "theoretical_loss": 4.14352510605445, "tokens_seen": 301793280 }, { "epoch": 0.09, "learning_rate": 0.00045885656970912737, "loss": 3.2694, "theoretical_loss": 4.143419649380639, "tokens_seen": 301858816 }, { "epoch": 0.09, "learning_rate": 0.0004588465396188566, "loss": 3.0601, "theoretical_loss": 4.143314222008997, "tokens_seen": 301924352 }, { "epoch": 0.09, "learning_rate": 0.00045883650952858573, "loss": 3.2005, "theoretical_loss": 4.143208823925024, "tokens_seen": 301989888 }, { "epoch": 0.09, "learning_rate": 0.00045882647943831496, "loss": 3.0848, "theoretical_loss": 4.143103455114231, "tokens_seen": 302055424 }, { "epoch": 0.09, "learning_rate": 0.00045881644934804415, "loss": 3.3955, "theoretical_loss": 4.142998115562139, "tokens_seen": 302120960 }, { "epoch": 0.09, "learning_rate": 0.0004588064192577733, "loss": 3.2144, "theoretical_loss": 4.14289280525428, "tokens_seen": 302186496 }, { "epoch": 0.09, "learning_rate": 0.0004587963891675025, "loss": 3.2044, "theoretical_loss": 4.142787524176194, "tokens_seen": 302252032 }, { "epoch": 0.09, "learning_rate": 0.0004587863590772317, "loss": 3.2506, "theoretical_loss": 4.142682272313435, "tokens_seen": 302317568 }, { "epoch": 0.09, "learning_rate": 0.00045877632898696087, "loss": 3.2733, "theoretical_loss": 4.142577049651563, "tokens_seen": 302383104 }, { "epoch": 0.09, "learning_rate": 0.0004587662988966901, "loss": 3.3405, "theoretical_loss": 4.142471856176152, "tokens_seen": 302448640 }, { "epoch": 0.09, "learning_rate": 0.00045875626880641923, "loss": 3.2675, "theoretical_loss": 4.142366691872784, "tokens_seen": 302514176 }, { "epoch": 0.09, "learning_rate": 0.00045874623871614847, "loss": 3.3983, "theoretical_loss": 4.142261556727052, "tokens_seen": 302579712 }, { "epoch": 0.09, "learning_rate": 0.0004587362086258776, "loss": 3.2268, "theoretical_loss": 4.14215645072456, "tokens_seen": 302645248 }, { "epoch": 0.09, "learning_rate": 0.00045872617853560683, "loss": 3.1717, "theoretical_loss": 4.14205137385092, "tokens_seen": 302710784 }, { "epoch": 0.09, "learning_rate": 0.000458716148445336, "loss": 3.266, "theoretical_loss": 4.141946326091756, "tokens_seen": 302776320 }, { "epoch": 0.09, "learning_rate": 0.0004587061183550652, "loss": 3.3315, "theoretical_loss": 4.141841307432703, "tokens_seen": 302841856 }, { "epoch": 0.09, "learning_rate": 0.0004586960882647944, "loss": 3.0815, "theoretical_loss": 4.1417363178594035, "tokens_seen": 302907392 }, { "epoch": 0.09, "learning_rate": 0.00045868605817452355, "loss": 3.3015, "theoretical_loss": 4.141631357357513, "tokens_seen": 302972928 }, { "epoch": 0.09, "learning_rate": 0.0004586760280842528, "loss": 3.3002, "theoretical_loss": 4.141526425912694, "tokens_seen": 303038464 }, { "epoch": 0.09, "objective/train/docs_used": 510192, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0655465126037598, "objective/train/theoretical_loss": 4.141421523510623, "objective/train/tokens_used": 323564000, "theoretical_loss": 4.141421523510623, "tokens_seen": 303104000 }, { "epoch": 0.09, "learning_rate": 0.00045866599799398197, "loss": 3.042, "theoretical_loss": 4.141421523510623, "tokens_seen": 303104000 }, { "epoch": 0.09, "learning_rate": 0.00045865596790371115, "loss": 3.3065, "theoretical_loss": 4.141316650136983, "tokens_seen": 303169536 }, { "epoch": 0.09, "learning_rate": 0.00045864593781344033, "loss": 3.1785, "theoretical_loss": 4.14121180577747, "tokens_seen": 303235072 }, { "epoch": 0.09, "learning_rate": 0.00045863590772316957, "loss": 3.1089, "theoretical_loss": 4.141106990417789, "tokens_seen": 303300608 }, { "epoch": 0.09, "learning_rate": 0.0004586258776328987, "loss": 3.2191, "theoretical_loss": 4.141002204043654, "tokens_seen": 303366144 }, { "epoch": 0.09, "learning_rate": 0.00045861584754262793, "loss": 3.3225, "theoretical_loss": 4.140897446640793, "tokens_seen": 303431680 }, { "epoch": 0.09, "learning_rate": 0.00045860581745235706, "loss": 3.332, "theoretical_loss": 4.1407927181949375, "tokens_seen": 303497216 }, { "epoch": 0.09, "learning_rate": 0.0004585957873620863, "loss": 3.1486, "theoretical_loss": 4.140688018691835, "tokens_seen": 303562752 }, { "epoch": 0.09, "learning_rate": 0.0004585857572718155, "loss": 3.2577, "theoretical_loss": 4.140583348117241, "tokens_seen": 303628288 }, { "epoch": 0.09, "learning_rate": 0.00045857572718154465, "loss": 3.4202, "theoretical_loss": 4.140478706456921, "tokens_seen": 303693824 }, { "epoch": 0.09, "learning_rate": 0.00045856569709127384, "loss": 3.0975, "theoretical_loss": 4.140374093696651, "tokens_seen": 303759360 }, { "epoch": 0.09, "learning_rate": 0.000458555667001003, "loss": 3.3441, "theoretical_loss": 4.1402695098222155, "tokens_seen": 303824896 }, { "epoch": 0.09, "learning_rate": 0.0004585456369107322, "loss": 3.0407, "theoretical_loss": 4.140164954819412, "tokens_seen": 303890432 }, { "epoch": 0.09, "learning_rate": 0.00045853560682046143, "loss": 3.275, "theoretical_loss": 4.140060428674046, "tokens_seen": 303955968 }, { "epoch": 0.09, "learning_rate": 0.00045852557673019056, "loss": 3.1185, "theoretical_loss": 4.139955931371932, "tokens_seen": 304021504 }, { "epoch": 0.09, "learning_rate": 0.0004585155466399198, "loss": 3.1361, "theoretical_loss": 4.139851462898897, "tokens_seen": 304087040 }, { "epoch": 0.09, "learning_rate": 0.0004585055165496489, "loss": 3.0179, "theoretical_loss": 4.139747023240777, "tokens_seen": 304152576 }, { "epoch": 0.09, "learning_rate": 0.00045849548645937816, "loss": 3.0797, "theoretical_loss": 4.139642612383418, "tokens_seen": 304218112 }, { "epoch": 0.09, "learning_rate": 0.00045848545636910734, "loss": 3.2315, "theoretical_loss": 4.1395382303126755, "tokens_seen": 304283648 }, { "epoch": 0.09, "learning_rate": 0.0004584754262788365, "loss": 3.1269, "theoretical_loss": 4.139433877014415, "tokens_seen": 304349184 }, { "epoch": 0.09, "learning_rate": 0.0004584653961885657, "loss": 3.1125, "theoretical_loss": 4.139329552474514, "tokens_seen": 304414720 }, { "epoch": 0.09, "learning_rate": 0.00045845536609829494, "loss": 3.2181, "theoretical_loss": 4.139225256678857, "tokens_seen": 304480256 }, { "epoch": 0.09, "learning_rate": 0.00045844533600802406, "loss": 3.2027, "theoretical_loss": 4.139120989613341, "tokens_seen": 304545792 }, { "epoch": 0.09, "learning_rate": 0.0004584353059177533, "loss": 3.3048, "theoretical_loss": 4.1390167512638705, "tokens_seen": 304611328 }, { "epoch": 0.09, "learning_rate": 0.0004584252758274824, "loss": 3.2043, "theoretical_loss": 4.138912541616363, "tokens_seen": 304676864 }, { "epoch": 0.09, "objective/train/docs_used": 513080, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3043508529663086, "objective/train/theoretical_loss": 4.138808360656742, "objective/train/tokens_used": 325202400, "theoretical_loss": 4.138808360656742, "tokens_seen": 304742400 }, { "epoch": 0.09, "learning_rate": 0.00045841524573721166, "loss": 3.1223, "theoretical_loss": 4.138808360656742, "tokens_seen": 304742400 }, { "epoch": 0.09, "learning_rate": 0.00045840521564694084, "loss": 3.2953, "theoretical_loss": 4.138704208370944, "tokens_seen": 304807936 }, { "epoch": 0.09, "learning_rate": 0.00045839518555667, "loss": 3.2321, "theoretical_loss": 4.138600084744915, "tokens_seen": 304873472 }, { "epoch": 0.09, "learning_rate": 0.0004583851554663992, "loss": 3.0783, "theoretical_loss": 4.1384959897646105, "tokens_seen": 304939008 }, { "epoch": 0.09, "learning_rate": 0.0004583751253761284, "loss": 3.2743, "theoretical_loss": 4.138391923415996, "tokens_seen": 305004544 }, { "epoch": 0.09, "learning_rate": 0.00045836509528585757, "loss": 3.3234, "theoretical_loss": 4.138287885685045, "tokens_seen": 305070080 }, { "epoch": 0.09, "learning_rate": 0.0004583550651955868, "loss": 3.0822, "theoretical_loss": 4.138183876557745, "tokens_seen": 305135616 }, { "epoch": 0.09, "learning_rate": 0.00045834503510531593, "loss": 3.0229, "theoretical_loss": 4.1380798960200895, "tokens_seen": 305201152 }, { "epoch": 0.09, "learning_rate": 0.00045833500501504516, "loss": 3.3236, "theoretical_loss": 4.137975944058083, "tokens_seen": 305266688 }, { "epoch": 0.09, "learning_rate": 0.00045832497492477435, "loss": 3.1816, "theoretical_loss": 4.137872020657742, "tokens_seen": 305332224 }, { "epoch": 0.09, "learning_rate": 0.0004583149448345035, "loss": 3.2232, "theoretical_loss": 4.1377681258050885, "tokens_seen": 305397760 }, { "epoch": 0.09, "learning_rate": 0.0004583049147442327, "loss": 3.1866, "theoretical_loss": 4.13766425948616, "tokens_seen": 305463296 }, { "epoch": 0.09, "learning_rate": 0.0004582948846539619, "loss": 3.3865, "theoretical_loss": 4.137560421686998, "tokens_seen": 305528832 }, { "epoch": 0.09, "learning_rate": 0.00045828485456369107, "loss": 3.2516, "theoretical_loss": 4.137456612393658, "tokens_seen": 305594368 }, { "epoch": 0.09, "learning_rate": 0.0004582748244734203, "loss": 3.3122, "theoretical_loss": 4.137352831592203, "tokens_seen": 305659904 }, { "epoch": 0.09, "learning_rate": 0.00045826479438314943, "loss": 3.2393, "theoretical_loss": 4.137249079268707, "tokens_seen": 305725440 }, { "epoch": 0.09, "learning_rate": 0.00045825476429287867, "loss": 3.2103, "theoretical_loss": 4.137145355409253, "tokens_seen": 305790976 }, { "epoch": 0.09, "learning_rate": 0.0004582447342026078, "loss": 3.1519, "theoretical_loss": 4.137041659999936, "tokens_seen": 305856512 }, { "epoch": 0.09, "learning_rate": 0.00045823470411233703, "loss": 3.215, "theoretical_loss": 4.136937993026857, "tokens_seen": 305922048 }, { "epoch": 0.09, "learning_rate": 0.0004582246740220662, "loss": 3.3799, "theoretical_loss": 4.136834354476129, "tokens_seen": 305987584 }, { "epoch": 0.09, "learning_rate": 0.0004582146439317954, "loss": 3.2927, "theoretical_loss": 4.1367307443338746, "tokens_seen": 306053120 }, { "epoch": 0.09, "learning_rate": 0.00045820461384152457, "loss": 3.2461, "theoretical_loss": 4.136627162586226, "tokens_seen": 306118656 }, { "epoch": 0.09, "learning_rate": 0.00045819458375125375, "loss": 3.1635, "theoretical_loss": 4.136523609219327, "tokens_seen": 306184192 }, { "epoch": 0.09, "learning_rate": 0.00045818455366098294, "loss": 3.0601, "theoretical_loss": 4.136420084219327, "tokens_seen": 306249728 }, { "epoch": 0.09, "learning_rate": 0.00045817452357071217, "loss": 3.1694, "theoretical_loss": 4.136316587572388, "tokens_seen": 306315264 }, { "epoch": 0.09, "objective/train/docs_used": 514436, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1695950031280518, "objective/train/theoretical_loss": 4.136213119264681, "objective/train/tokens_used": 326840800, "theoretical_loss": 4.136213119264681, "tokens_seen": 306380800 }, { "epoch": 0.09, "learning_rate": 0.0004581644934804413, "loss": 3.398, "theoretical_loss": 4.136213119264681, "tokens_seen": 306380800 }, { "epoch": 0.09, "learning_rate": 0.00045815446339017053, "loss": 3.2225, "theoretical_loss": 4.136109679282388, "tokens_seen": 306446336 }, { "epoch": 0.09, "learning_rate": 0.0004581444332998997, "loss": 3.127, "theoretical_loss": 4.136006267611697, "tokens_seen": 306511872 }, { "epoch": 0.09, "learning_rate": 0.0004581344032096289, "loss": 3.0622, "theoretical_loss": 4.135902884238812, "tokens_seen": 306577408 }, { "epoch": 0.09, "learning_rate": 0.0004581243731193581, "loss": 3.3023, "theoretical_loss": 4.135799529149939, "tokens_seen": 306642944 }, { "epoch": 0.09, "learning_rate": 0.00045811434302908726, "loss": 3.2068, "theoretical_loss": 4.1356962023313, "tokens_seen": 306708480 }, { "epoch": 0.09, "learning_rate": 0.00045810431293881644, "loss": 3.2373, "theoretical_loss": 4.135592903769124, "tokens_seen": 306774016 }, { "epoch": 0.09, "learning_rate": 0.0004580942828485457, "loss": 3.0171, "theoretical_loss": 4.135489633449649, "tokens_seen": 306839552 }, { "epoch": 0.09, "learning_rate": 0.0004580842527582748, "loss": 3.3278, "theoretical_loss": 4.135386391359123, "tokens_seen": 306905088 }, { "epoch": 0.09, "learning_rate": 0.00045807422266800404, "loss": 3.3211, "theoretical_loss": 4.135283177483807, "tokens_seen": 306970624 }, { "epoch": 0.09, "learning_rate": 0.00045806419257773316, "loss": 3.2296, "theoretical_loss": 4.135179991809965, "tokens_seen": 307036160 }, { "epoch": 0.09, "learning_rate": 0.0004580541624874624, "loss": 3.2851, "theoretical_loss": 4.135076834323876, "tokens_seen": 307101696 }, { "epoch": 0.09, "learning_rate": 0.0004580441323971916, "loss": 3.2806, "theoretical_loss": 4.134973705011828, "tokens_seen": 307167232 }, { "epoch": 0.09, "learning_rate": 0.00045803410230692076, "loss": 3.2595, "theoretical_loss": 4.134870603860117, "tokens_seen": 307232768 }, { "epoch": 0.09, "learning_rate": 0.00045802407221664994, "loss": 3.3341, "theoretical_loss": 4.134767530855047, "tokens_seen": 307298304 }, { "epoch": 0.09, "learning_rate": 0.0004580140421263791, "loss": 3.0851, "theoretical_loss": 4.1346644859829365, "tokens_seen": 307363840 }, { "epoch": 0.09, "learning_rate": 0.0004580040120361083, "loss": 3.2706, "theoretical_loss": 4.1345614692301105, "tokens_seen": 307429376 }, { "epoch": 0.09, "learning_rate": 0.00045799398194583754, "loss": 3.046, "theoretical_loss": 4.134458480582902, "tokens_seen": 307494912 }, { "epoch": 0.09, "learning_rate": 0.00045798395185556667, "loss": 3.3765, "theoretical_loss": 4.134355520027657, "tokens_seen": 307560448 }, { "epoch": 0.09, "learning_rate": 0.0004579739217652959, "loss": 3.0397, "theoretical_loss": 4.134252587550728, "tokens_seen": 307625984 }, { "epoch": 0.09, "learning_rate": 0.0004579638916750251, "loss": 3.1387, "theoretical_loss": 4.134149683138481, "tokens_seen": 307691520 }, { "epoch": 0.09, "learning_rate": 0.00045795386158475426, "loss": 3.2037, "theoretical_loss": 4.134046806777286, "tokens_seen": 307757056 }, { "epoch": 0.09, "learning_rate": 0.0004579438314944835, "loss": 3.1492, "theoretical_loss": 4.133943958453528, "tokens_seen": 307822592 }, { "epoch": 0.09, "learning_rate": 0.0004579338014042126, "loss": 3.2635, "theoretical_loss": 4.133841138153597, "tokens_seen": 307888128 }, { "epoch": 0.09, "learning_rate": 0.00045792377131394186, "loss": 3.2805, "theoretical_loss": 4.133738345863896, "tokens_seen": 307953664 }, { "epoch": 0.09, "objective/train/docs_used": 517196, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2269608974456787, "objective/train/theoretical_loss": 4.133635581570836, "objective/train/tokens_used": 328479200, "theoretical_loss": 4.133635581570836, "tokens_seen": 308019200 }, { "epoch": 0.09, "learning_rate": 0.00045791374122367104, "loss": 3.0012, "theoretical_loss": 4.133635581570836, "tokens_seen": 308019200 }, { "epoch": 0.09, "learning_rate": 0.0004579037111334002, "loss": 3.429, "theoretical_loss": 4.133532845260836, "tokens_seen": 308084736 }, { "epoch": 0.09, "learning_rate": 0.0004578936810431294, "loss": 3.2353, "theoretical_loss": 4.133430136920327, "tokens_seen": 308150272 }, { "epoch": 0.09, "learning_rate": 0.0004578836509528586, "loss": 3.3207, "theoretical_loss": 4.133327456535749, "tokens_seen": 308215808 }, { "epoch": 0.09, "learning_rate": 0.00045787362086258777, "loss": 3.1211, "theoretical_loss": 4.13322480409355, "tokens_seen": 308281344 }, { "epoch": 0.09, "learning_rate": 0.000457863590772317, "loss": 3.1853, "theoretical_loss": 4.133122179580189, "tokens_seen": 308346880 }, { "epoch": 0.09, "learning_rate": 0.00045785356068204613, "loss": 3.2402, "theoretical_loss": 4.133019582982134, "tokens_seen": 308412416 }, { "epoch": 0.09, "learning_rate": 0.00045784353059177536, "loss": 3.2264, "theoretical_loss": 4.1329170142858604, "tokens_seen": 308477952 }, { "epoch": 0.09, "learning_rate": 0.00045783350050150455, "loss": 3.2148, "theoretical_loss": 4.132814473477857, "tokens_seen": 308543488 }, { "epoch": 0.09, "learning_rate": 0.0004578234704112337, "loss": 3.1507, "theoretical_loss": 4.1327119605446185, "tokens_seen": 308609024 }, { "epoch": 0.09, "learning_rate": 0.0004578134403209629, "loss": 3.2388, "theoretical_loss": 4.132609475472651, "tokens_seen": 308674560 }, { "epoch": 0.09, "learning_rate": 0.0004578034102306921, "loss": 3.202, "theoretical_loss": 4.132507018248469, "tokens_seen": 308740096 }, { "epoch": 0.09, "learning_rate": 0.00045779338014042127, "loss": 3.1537, "theoretical_loss": 4.132404588858597, "tokens_seen": 308805632 }, { "epoch": 0.09, "learning_rate": 0.0004577833500501505, "loss": 3.375, "theoretical_loss": 4.132302187289568, "tokens_seen": 308871168 }, { "epoch": 0.09, "learning_rate": 0.00045777331995987963, "loss": 3.1816, "theoretical_loss": 4.132199813527926, "tokens_seen": 308936704 }, { "epoch": 0.09, "learning_rate": 0.00045776328986960887, "loss": 3.4156, "theoretical_loss": 4.132097467560223, "tokens_seen": 309002240 }, { "epoch": 0.09, "learning_rate": 0.000457753259779338, "loss": 3.2363, "theoretical_loss": 4.1319951493730205, "tokens_seen": 309067776 }, { "epoch": 0.09, "learning_rate": 0.00045774322968906723, "loss": 3.2419, "theoretical_loss": 4.131892858952889, "tokens_seen": 309133312 }, { "epoch": 0.09, "learning_rate": 0.0004577331995987964, "loss": 3.231, "theoretical_loss": 4.131790596286409, "tokens_seen": 309198848 }, { "epoch": 0.09, "learning_rate": 0.0004577231695085256, "loss": 3.1746, "theoretical_loss": 4.1316883613601725, "tokens_seen": 309264384 }, { "epoch": 0.09, "learning_rate": 0.0004577131394182548, "loss": 3.2182, "theoretical_loss": 4.131586154160775, "tokens_seen": 309329920 }, { "epoch": 0.09, "learning_rate": 0.00045770310932798395, "loss": 3.2919, "theoretical_loss": 4.131483974674827, "tokens_seen": 309395456 }, { "epoch": 0.09, "learning_rate": 0.00045769307923771314, "loss": 3.1456, "theoretical_loss": 4.131381822888946, "tokens_seen": 309460992 }, { "epoch": 0.09, "learning_rate": 0.00045768304914744237, "loss": 3.2052, "theoretical_loss": 4.131279698789759, "tokens_seen": 309526528 }, { "epoch": 0.09, "learning_rate": 0.0004576730190571715, "loss": 3.3316, "theoretical_loss": 4.1311776023639, "tokens_seen": 309592064 }, { "epoch": 0.09, "objective/train/docs_used": 520038, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1472020149230957, "objective/train/theoretical_loss": 4.131075533598018, "objective/train/tokens_used": 330117600, "theoretical_loss": 4.131075533598018, "tokens_seen": 309657600 }, { "epoch": 0.09, "learning_rate": 0.00045766298896690073, "loss": 3.1756, "theoretical_loss": 4.131075533598018, "tokens_seen": 309657600 }, { "epoch": 0.09, "learning_rate": 0.0004576529588766299, "loss": 3.3296, "theoretical_loss": 4.130973492478766, "tokens_seen": 309723136 }, { "epoch": 0.09, "learning_rate": 0.0004576429287863591, "loss": 3.1775, "theoretical_loss": 4.130871478992807, "tokens_seen": 309788672 }, { "epoch": 0.09, "learning_rate": 0.0004576328986960883, "loss": 3.3458, "theoretical_loss": 4.130769493126817, "tokens_seen": 309854208 }, { "epoch": 0.09, "learning_rate": 0.00045762286860581746, "loss": 3.2383, "theoretical_loss": 4.130667534867476, "tokens_seen": 309919744 }, { "epoch": 0.09, "learning_rate": 0.00045761283851554664, "loss": 3.297, "theoretical_loss": 4.130565604201477, "tokens_seen": 309985280 }, { "epoch": 0.09, "learning_rate": 0.0004576028084252759, "loss": 3.4946, "theoretical_loss": 4.130463701115521, "tokens_seen": 310050816 }, { "epoch": 0.09, "learning_rate": 0.000457592778335005, "loss": 3.2462, "theoretical_loss": 4.130361825596317, "tokens_seen": 310116352 }, { "epoch": 0.09, "learning_rate": 0.00045758274824473424, "loss": 3.1861, "theoretical_loss": 4.130259977630586, "tokens_seen": 310181888 }, { "epoch": 0.09, "learning_rate": 0.00045757271815446336, "loss": 3.1194, "theoretical_loss": 4.130158157205056, "tokens_seen": 310247424 }, { "epoch": 0.09, "learning_rate": 0.0004575626880641926, "loss": 3.3221, "theoretical_loss": 4.130056364306465, "tokens_seen": 310312960 }, { "epoch": 0.09, "learning_rate": 0.0004575526579739218, "loss": 3.0623, "theoretical_loss": 4.129954598921559, "tokens_seen": 310378496 }, { "epoch": 0.09, "learning_rate": 0.00045754262788365096, "loss": 3.2096, "theoretical_loss": 4.1298528610370955, "tokens_seen": 310444032 }, { "epoch": 0.09, "learning_rate": 0.00045753259779338014, "loss": 3.205, "theoretical_loss": 4.12975115063984, "tokens_seen": 310509568 }, { "epoch": 0.09, "learning_rate": 0.0004575225677031093, "loss": 3.0046, "theoretical_loss": 4.129649467716565, "tokens_seen": 310575104 }, { "epoch": 0.09, "learning_rate": 0.0004575125376128385, "loss": 3.2069, "theoretical_loss": 4.1295478122540565, "tokens_seen": 310640640 }, { "epoch": 0.09, "learning_rate": 0.00045750250752256774, "loss": 3.3812, "theoretical_loss": 4.1294461842391055, "tokens_seen": 310706176 }, { "epoch": 0.09, "learning_rate": 0.00045749247743229687, "loss": 3.1928, "theoretical_loss": 4.129344583658516, "tokens_seen": 310771712 }, { "epoch": 0.09, "learning_rate": 0.0004574824473420261, "loss": 3.3079, "theoretical_loss": 4.1292430104990965, "tokens_seen": 310837248 }, { "epoch": 0.09, "learning_rate": 0.0004574724172517553, "loss": 3.1705, "theoretical_loss": 4.1291414647476685, "tokens_seen": 310902784 }, { "epoch": 0.09, "learning_rate": 0.00045746238716148446, "loss": 3.2354, "theoretical_loss": 4.129039946391062, "tokens_seen": 310968320 }, { "epoch": 0.09, "learning_rate": 0.00045745235707121364, "loss": 3.3647, "theoretical_loss": 4.128938455416115, "tokens_seen": 311033856 }, { "epoch": 0.09, "learning_rate": 0.0004574423269809428, "loss": 3.1146, "theoretical_loss": 4.128836991809674, "tokens_seen": 311099392 }, { "epoch": 0.09, "learning_rate": 0.000457432296890672, "loss": 3.1976, "theoretical_loss": 4.128735555558597, "tokens_seen": 311164928 }, { "epoch": 0.09, "learning_rate": 0.00045742226680040124, "loss": 3.3585, "theoretical_loss": 4.128634146649748, "tokens_seen": 311230464 }, { "epoch": 0.09, "objective/train/docs_used": 522843, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.890200614929199, "objective/train/theoretical_loss": 4.128532765070004, "objective/train/tokens_used": 331756000, "theoretical_loss": 4.128532765070004, "tokens_seen": 311296000 }, { "epoch": 0.09, "learning_rate": 0.00045741223671013037, "loss": 3.1197, "theoretical_loss": 4.128532765070004, "tokens_seen": 311296000 }, { "epoch": 0.09, "learning_rate": 0.0004574022066198596, "loss": 3.1863, "theoretical_loss": 4.128431410806247, "tokens_seen": 311361536 }, { "epoch": 0.09, "learning_rate": 0.00045739217652958873, "loss": 3.2217, "theoretical_loss": 4.12833008384537, "tokens_seen": 311427072 }, { "epoch": 0.09, "learning_rate": 0.00045738214643931797, "loss": 3.2409, "theoretical_loss": 4.128228784174275, "tokens_seen": 311492608 }, { "epoch": 0.09, "learning_rate": 0.00045737211634904715, "loss": 3.1926, "theoretical_loss": 4.128127511779873, "tokens_seen": 311558144 }, { "epoch": 0.09, "learning_rate": 0.00045736208625877633, "loss": 3.2913, "theoretical_loss": 4.128026266649085, "tokens_seen": 311623680 }, { "epoch": 0.09, "learning_rate": 0.0004573520561685055, "loss": 3.2174, "theoretical_loss": 4.127925048768839, "tokens_seen": 311689216 }, { "epoch": 0.09, "learning_rate": 0.00045734202607823475, "loss": 3.1133, "theoretical_loss": 4.127823858126073, "tokens_seen": 311754752 }, { "epoch": 0.09, "learning_rate": 0.00045733199598796387, "loss": 3.1458, "theoretical_loss": 4.1277226947077335, "tokens_seen": 311820288 }, { "epoch": 0.09, "learning_rate": 0.0004573219658976931, "loss": 3.3411, "theoretical_loss": 4.127621558500778, "tokens_seen": 311885824 }, { "epoch": 0.09, "learning_rate": 0.00045731193580742223, "loss": 3.218, "theoretical_loss": 4.12752044949217, "tokens_seen": 311951360 }, { "epoch": 0.09, "learning_rate": 0.00045730190571715147, "loss": 3.1269, "theoretical_loss": 4.127419367668884, "tokens_seen": 312016896 }, { "epoch": 0.09, "learning_rate": 0.00045729187562688065, "loss": 3.2843, "theoretical_loss": 4.127318313017904, "tokens_seen": 312082432 }, { "epoch": 0.09, "learning_rate": 0.00045728184553660983, "loss": 3.1959, "theoretical_loss": 4.12721728552622, "tokens_seen": 312147968 }, { "epoch": 0.09, "learning_rate": 0.000457271815446339, "loss": 3.2116, "theoretical_loss": 4.1271162851808345, "tokens_seen": 312213504 }, { "epoch": 0.09, "learning_rate": 0.0004572617853560682, "loss": 3.0417, "theoretical_loss": 4.127015311968757, "tokens_seen": 312279040 }, { "epoch": 0.09, "learning_rate": 0.0004572517552657974, "loss": 3.3829, "theoretical_loss": 4.126914365877004, "tokens_seen": 312344576 }, { "epoch": 0.09, "learning_rate": 0.0004572417251755266, "loss": 3.3112, "theoretical_loss": 4.126813446892607, "tokens_seen": 312410112 }, { "epoch": 0.09, "learning_rate": 0.00045723169508525574, "loss": 3.4246, "theoretical_loss": 4.1267125550026, "tokens_seen": 312475648 }, { "epoch": 0.09, "learning_rate": 0.000457221664994985, "loss": 3.1065, "theoretical_loss": 4.1266116901940295, "tokens_seen": 312541184 }, { "epoch": 0.09, "learning_rate": 0.0004572116349047141, "loss": 3.2717, "theoretical_loss": 4.126510852453949, "tokens_seen": 312606720 }, { "epoch": 0.09, "learning_rate": 0.00045720160481444334, "loss": 3.1704, "theoretical_loss": 4.126410041769423, "tokens_seen": 312672256 }, { "epoch": 0.09, "learning_rate": 0.00045719157472417257, "loss": 3.2341, "theoretical_loss": 4.126309258127524, "tokens_seen": 312737792 }, { "epoch": 0.09, "learning_rate": 0.0004571815446339017, "loss": 3.1891, "theoretical_loss": 4.126208501515331, "tokens_seen": 312803328 }, { "epoch": 0.09, "learning_rate": 0.00045717151454363093, "loss": 3.2786, "theoretical_loss": 4.126107771919935, "tokens_seen": 312868864 }, { "epoch": 0.09, "objective/train/docs_used": 525512, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7899441719055176, "objective/train/theoretical_loss": 4.126007069328436, "objective/train/tokens_used": 333394400, "theoretical_loss": 4.126007069328436, "tokens_seen": 312934400 }, { "epoch": 0.09, "learning_rate": 0.0004571614844533601, "loss": 3.112, "theoretical_loss": 4.126007069328436, "tokens_seen": 312934400 }, { "epoch": 0.09, "learning_rate": 0.0004571514543630893, "loss": 3.2468, "theoretical_loss": 4.125906393727941, "tokens_seen": 312999936 }, { "epoch": 0.09, "learning_rate": 0.0004571414242728185, "loss": 3.2216, "theoretical_loss": 4.125805745105566, "tokens_seen": 313065472 }, { "epoch": 0.09, "learning_rate": 0.00045713139418254766, "loss": 3.2755, "theoretical_loss": 4.125705123448437, "tokens_seen": 313131008 }, { "epoch": 0.09, "learning_rate": 0.00045712136409227684, "loss": 3.0603, "theoretical_loss": 4.125604528743689, "tokens_seen": 313196544 }, { "epoch": 0.09, "learning_rate": 0.0004571113340020061, "loss": 3.1714, "theoretical_loss": 4.125503960978464, "tokens_seen": 313262080 }, { "epoch": 0.09, "learning_rate": 0.0004571013039117352, "loss": 3.156, "theoretical_loss": 4.1254034201399135, "tokens_seen": 313327616 }, { "epoch": 0.09, "learning_rate": 0.00045709127382146444, "loss": 3.2036, "theoretical_loss": 4.125302906215199, "tokens_seen": 313393152 }, { "epoch": 0.09, "learning_rate": 0.00045708124373119356, "loss": 3.3736, "theoretical_loss": 4.12520241919149, "tokens_seen": 313458688 }, { "epoch": 0.1, "learning_rate": 0.0004570712136409228, "loss": 3.1249, "theoretical_loss": 4.125101959055965, "tokens_seen": 313524224 }, { "epoch": 0.1, "learning_rate": 0.000457061183550652, "loss": 3.2548, "theoretical_loss": 4.125001525795811, "tokens_seen": 313589760 }, { "epoch": 0.1, "learning_rate": 0.00045705115346038116, "loss": 3.2806, "theoretical_loss": 4.124901119398222, "tokens_seen": 313655296 }, { "epoch": 0.1, "learning_rate": 0.00045704112337011034, "loss": 3.2891, "theoretical_loss": 4.124800739850406, "tokens_seen": 313720832 }, { "epoch": 0.1, "learning_rate": 0.0004570310932798395, "loss": 3.2495, "theoretical_loss": 4.124700387139574, "tokens_seen": 313786368 }, { "epoch": 0.1, "learning_rate": 0.0004570210631895687, "loss": 3.2174, "theoretical_loss": 4.12460006125295, "tokens_seen": 313851904 }, { "epoch": 0.1, "learning_rate": 0.00045701103309929794, "loss": 3.3141, "theoretical_loss": 4.124499762177764, "tokens_seen": 313917440 }, { "epoch": 0.1, "learning_rate": 0.00045700100300902707, "loss": 3.3301, "theoretical_loss": 4.124399489901254, "tokens_seen": 313982976 }, { "epoch": 0.1, "learning_rate": 0.0004569909729187563, "loss": 3.1711, "theoretical_loss": 4.124299244410672, "tokens_seen": 314048512 }, { "epoch": 0.1, "learning_rate": 0.0004569809428284855, "loss": 3.2888, "theoretical_loss": 4.124199025693272, "tokens_seen": 314114048 }, { "epoch": 0.1, "learning_rate": 0.00045697091273821466, "loss": 3.1129, "theoretical_loss": 4.124098833736321, "tokens_seen": 314179584 }, { "epoch": 0.1, "learning_rate": 0.00045696088264794384, "loss": 3.1492, "theoretical_loss": 4.123998668527094, "tokens_seen": 314245120 }, { "epoch": 0.1, "learning_rate": 0.000456950852557673, "loss": 3.2535, "theoretical_loss": 4.123898530052874, "tokens_seen": 314310656 }, { "epoch": 0.1, "learning_rate": 0.0004569408224674022, "loss": 3.1926, "theoretical_loss": 4.123798418300953, "tokens_seen": 314376192 }, { "epoch": 0.1, "learning_rate": 0.00045693079237713144, "loss": 3.4509, "theoretical_loss": 4.123698333258631, "tokens_seen": 314441728 }, { "epoch": 0.1, "learning_rate": 0.00045692076228686057, "loss": 3.1406, "theoretical_loss": 4.123598274913219, "tokens_seen": 314507264 }, { "epoch": 0.1, "objective/train/docs_used": 528422, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.285691738128662, "objective/train/theoretical_loss": 4.123498243252032, "objective/train/tokens_used": 335032800, "theoretical_loss": 4.123498243252032, "tokens_seen": 314572800 }, { "epoch": 0.1, "learning_rate": 0.0004569107321965898, "loss": 3.2409, "theoretical_loss": 4.123498243252032, "tokens_seen": 314572800 }, { "epoch": 0.1, "learning_rate": 0.00045690070210631893, "loss": 3.1292, "theoretical_loss": 4.1233982382624, "tokens_seen": 314638336 }, { "epoch": 0.1, "learning_rate": 0.00045689067201604817, "loss": 3.2329, "theoretical_loss": 4.123298259931657, "tokens_seen": 314703872 }, { "epoch": 0.1, "learning_rate": 0.00045688064192577735, "loss": 3.1771, "theoretical_loss": 4.123198308247146, "tokens_seen": 314769408 }, { "epoch": 0.1, "learning_rate": 0.00045687061183550653, "loss": 3.026, "theoretical_loss": 4.123098383196222, "tokens_seen": 314834944 }, { "epoch": 0.1, "learning_rate": 0.0004568605817452357, "loss": 3.1037, "theoretical_loss": 4.122998484766244, "tokens_seen": 314900480 }, { "epoch": 0.1, "learning_rate": 0.00045685055165496495, "loss": 3.2043, "theoretical_loss": 4.122898612944582, "tokens_seen": 314966016 }, { "epoch": 0.1, "learning_rate": 0.00045684052156469407, "loss": 3.2504, "theoretical_loss": 4.122798767718616, "tokens_seen": 315031552 }, { "epoch": 0.1, "learning_rate": 0.0004568304914744233, "loss": 3.2685, "theoretical_loss": 4.122698949075732, "tokens_seen": 315097088 }, { "epoch": 0.1, "learning_rate": 0.00045682046138415243, "loss": 3.2461, "theoretical_loss": 4.122599157003327, "tokens_seen": 315162624 }, { "epoch": 0.1, "learning_rate": 0.00045681043129388167, "loss": 3.1085, "theoretical_loss": 4.1224993914888035, "tokens_seen": 315228160 }, { "epoch": 0.1, "learning_rate": 0.00045680040120361085, "loss": 3.2201, "theoretical_loss": 4.122399652519576, "tokens_seen": 315293696 }, { "epoch": 0.1, "learning_rate": 0.00045679037111334003, "loss": 3.2188, "theoretical_loss": 4.122299940083065, "tokens_seen": 315359232 }, { "epoch": 0.1, "learning_rate": 0.0004567803410230692, "loss": 3.1001, "theoretical_loss": 4.1222002541667, "tokens_seen": 315424768 }, { "epoch": 0.1, "learning_rate": 0.0004567703109327984, "loss": 3.0772, "theoretical_loss": 4.122100594757921, "tokens_seen": 315490304 }, { "epoch": 0.1, "learning_rate": 0.0004567602808425276, "loss": 2.9421, "theoretical_loss": 4.122000961844175, "tokens_seen": 315555840 }, { "epoch": 0.1, "learning_rate": 0.0004567502507522568, "loss": 3.4481, "theoretical_loss": 4.121901355412917, "tokens_seen": 315621376 }, { "epoch": 0.1, "learning_rate": 0.00045674022066198594, "loss": 3.3729, "theoretical_loss": 4.121801775451612, "tokens_seen": 315686912 }, { "epoch": 0.1, "learning_rate": 0.0004567301905717152, "loss": 3.1688, "theoretical_loss": 4.121702221947732, "tokens_seen": 315752448 }, { "epoch": 0.1, "learning_rate": 0.0004567201604814443, "loss": 3.2723, "theoretical_loss": 4.121602694888759, "tokens_seen": 315817984 }, { "epoch": 0.1, "learning_rate": 0.00045671013039117354, "loss": 3.0197, "theoretical_loss": 4.121503194262183, "tokens_seen": 315883520 }, { "epoch": 0.1, "learning_rate": 0.0004567001003009027, "loss": 3.1711, "theoretical_loss": 4.121403720055502, "tokens_seen": 315949056 }, { "epoch": 0.1, "learning_rate": 0.0004566900702106319, "loss": 3.1247, "theoretical_loss": 4.121304272256222, "tokens_seen": 316014592 }, { "epoch": 0.1, "learning_rate": 0.0004566800401203611, "loss": 3.0879, "theoretical_loss": 4.121204850851861, "tokens_seen": 316080128 }, { "epoch": 0.1, "learning_rate": 0.0004566700100300903, "loss": 3.0891, "theoretical_loss": 4.121105455829939, "tokens_seen": 316145664 }, { "epoch": 0.1, "objective/train/docs_used": 531254, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9841384887695312, "objective/train/theoretical_loss": 4.121006087177992, "objective/train/tokens_used": 336671200, "theoretical_loss": 4.121006087177992, "tokens_seen": 316211200 }, { "epoch": 0.1, "learning_rate": 0.00045665997993981944, "loss": 3.2472, "theoretical_loss": 4.121006087177992, "tokens_seen": 316211200 }, { "epoch": 0.1, "learning_rate": 0.0004566499498495487, "loss": 3.3255, "theoretical_loss": 4.120906744883559, "tokens_seen": 316276736 }, { "epoch": 0.1, "learning_rate": 0.0004566399197592778, "loss": 3.3574, "theoretical_loss": 4.120807428934189, "tokens_seen": 316342272 }, { "epoch": 0.1, "learning_rate": 0.00045662988966900704, "loss": 3.0968, "theoretical_loss": 4.120708139317441, "tokens_seen": 316407808 }, { "epoch": 0.1, "learning_rate": 0.0004566198595787362, "loss": 3.1047, "theoretical_loss": 4.12060887602088, "tokens_seen": 316473344 }, { "epoch": 0.1, "learning_rate": 0.0004566098294884654, "loss": 3.128, "theoretical_loss": 4.120509639032081, "tokens_seen": 316538880 }, { "epoch": 0.1, "learning_rate": 0.0004565997993981946, "loss": 3.2681, "theoretical_loss": 4.120410428338628, "tokens_seen": 316604416 }, { "epoch": 0.1, "learning_rate": 0.00045658976930792376, "loss": 3.2012, "theoretical_loss": 4.120311243928111, "tokens_seen": 316669952 }, { "epoch": 0.1, "learning_rate": 0.00045657973921765294, "loss": 3.2909, "theoretical_loss": 4.120212085788131, "tokens_seen": 316735488 }, { "epoch": 0.1, "learning_rate": 0.0004565697091273822, "loss": 3.1581, "theoretical_loss": 4.120112953906296, "tokens_seen": 316801024 }, { "epoch": 0.1, "learning_rate": 0.0004565596790371113, "loss": 3.1796, "theoretical_loss": 4.120013848270222, "tokens_seen": 316866560 }, { "epoch": 0.1, "learning_rate": 0.00045654964894684054, "loss": 3.1319, "theoretical_loss": 4.119914768867536, "tokens_seen": 316932096 }, { "epoch": 0.1, "learning_rate": 0.00045653961885656967, "loss": 3.1199, "theoretical_loss": 4.11981571568587, "tokens_seen": 316997632 }, { "epoch": 0.1, "learning_rate": 0.0004565295887662989, "loss": 3.316, "theoretical_loss": 4.119716688712866, "tokens_seen": 317063168 }, { "epoch": 0.1, "learning_rate": 0.0004565195586760281, "loss": 3.1142, "theoretical_loss": 4.119617687936175, "tokens_seen": 317128704 }, { "epoch": 0.1, "learning_rate": 0.00045650952858575727, "loss": 3.4159, "theoretical_loss": 4.119518713343455, "tokens_seen": 317194240 }, { "epoch": 0.1, "learning_rate": 0.00045649949849548645, "loss": 3.2088, "theoretical_loss": 4.119419764922374, "tokens_seen": 317259776 }, { "epoch": 0.1, "learning_rate": 0.0004564894684052157, "loss": 3.229, "theoretical_loss": 4.119320842660606, "tokens_seen": 317325312 }, { "epoch": 0.1, "learning_rate": 0.0004564794383149448, "loss": 3.0561, "theoretical_loss": 4.119221946545836, "tokens_seen": 317390848 }, { "epoch": 0.1, "learning_rate": 0.00045646940822467405, "loss": 3.1754, "theoretical_loss": 4.119123076565755, "tokens_seen": 317456384 }, { "epoch": 0.1, "learning_rate": 0.00045645937813440317, "loss": 3.2346, "theoretical_loss": 4.119024232708064, "tokens_seen": 317521920 }, { "epoch": 0.1, "learning_rate": 0.0004564493480441324, "loss": 2.9431, "theoretical_loss": 4.118925414960472, "tokens_seen": 317587456 }, { "epoch": 0.1, "learning_rate": 0.00045643931795386164, "loss": 3.2826, "theoretical_loss": 4.118826623310696, "tokens_seen": 317652992 }, { "epoch": 0.1, "learning_rate": 0.00045642928786359077, "loss": 3.2618, "theoretical_loss": 4.11872785774646, "tokens_seen": 317718528 }, { "epoch": 0.1, "learning_rate": 0.00045641925777332, "loss": 3.2267, "theoretical_loss": 4.1186291182555, "tokens_seen": 317784064 }, { "epoch": 0.1, "objective/train/docs_used": 532742, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2568554878234863, "objective/train/theoretical_loss": 4.118530404825556, "objective/train/tokens_used": 338309600, "theoretical_loss": 4.118530404825556, "tokens_seen": 317849600 }, { "epoch": 0.1, "learning_rate": 0.00045640922768304913, "loss": 3.1076, "theoretical_loss": 4.118530404825556, "tokens_seen": 317849600 }, { "epoch": 0.1, "learning_rate": 0.00045639919759277837, "loss": 3.3517, "theoretical_loss": 4.11843171744438, "tokens_seen": 317915136 }, { "epoch": 0.1, "learning_rate": 0.00045638916750250755, "loss": 3.2559, "theoretical_loss": 4.118333056099728, "tokens_seen": 317980672 }, { "epoch": 0.1, "learning_rate": 0.00045637913741223673, "loss": 2.9811, "theoretical_loss": 4.11823442077937, "tokens_seen": 318046208 }, { "epoch": 0.1, "learning_rate": 0.0004563691073219659, "loss": 3.225, "theoretical_loss": 4.1181358114710775, "tokens_seen": 318111744 }, { "epoch": 0.1, "learning_rate": 0.00045635907723169515, "loss": 3.1115, "theoretical_loss": 4.1180372281626365, "tokens_seen": 318177280 }, { "epoch": 0.1, "learning_rate": 0.00045634904714142427, "loss": 3.2618, "theoretical_loss": 4.117938670841838, "tokens_seen": 318242816 }, { "epoch": 0.1, "learning_rate": 0.0004563390170511535, "loss": 3.3287, "theoretical_loss": 4.117840139496482, "tokens_seen": 318308352 }, { "epoch": 0.1, "learning_rate": 0.00045632898696088263, "loss": 3.3829, "theoretical_loss": 4.117741634114376, "tokens_seen": 318373888 }, { "epoch": 0.1, "learning_rate": 0.00045631895687061187, "loss": 3.2315, "theoretical_loss": 4.1176431546833365, "tokens_seen": 318439424 }, { "epoch": 0.1, "learning_rate": 0.00045630892678034105, "loss": 3.1198, "theoretical_loss": 4.117544701191187, "tokens_seen": 318504960 }, { "epoch": 0.1, "learning_rate": 0.00045629889669007023, "loss": 3.4329, "theoretical_loss": 4.117446273625763, "tokens_seen": 318570496 }, { "epoch": 0.1, "learning_rate": 0.0004562888665997994, "loss": 3.2922, "theoretical_loss": 4.117347871974903, "tokens_seen": 318636032 }, { "epoch": 0.1, "learning_rate": 0.0004562788365095286, "loss": 3.2713, "theoretical_loss": 4.1172494962264565, "tokens_seen": 318701568 }, { "epoch": 0.1, "learning_rate": 0.0004562688064192578, "loss": 3.1906, "theoretical_loss": 4.117151146368282, "tokens_seen": 318767104 }, { "epoch": 0.1, "learning_rate": 0.000456258776328987, "loss": 3.2119, "theoretical_loss": 4.117052822388243, "tokens_seen": 318832640 }, { "epoch": 0.1, "learning_rate": 0.00045624874623871614, "loss": 3.314, "theoretical_loss": 4.116954524274216, "tokens_seen": 318898176 }, { "epoch": 0.1, "learning_rate": 0.0004562387161484454, "loss": 3.2145, "theoretical_loss": 4.11685625201408, "tokens_seen": 318963712 }, { "epoch": 0.1, "learning_rate": 0.0004562286860581745, "loss": 3.3096, "theoretical_loss": 4.116758005595727, "tokens_seen": 319029248 }, { "epoch": 0.1, "learning_rate": 0.00045621865596790374, "loss": 3.2469, "theoretical_loss": 4.116659785007055, "tokens_seen": 319094784 }, { "epoch": 0.1, "learning_rate": 0.0004562086258776329, "loss": 3.161, "theoretical_loss": 4.116561590235969, "tokens_seen": 319160320 }, { "epoch": 0.1, "learning_rate": 0.0004561985957873621, "loss": 3.2807, "theoretical_loss": 4.116463421270385, "tokens_seen": 319225856 }, { "epoch": 0.1, "learning_rate": 0.0004561885656970913, "loss": 3.1843, "theoretical_loss": 4.116365278098225, "tokens_seen": 319291392 }, { "epoch": 0.1, "learning_rate": 0.0004561785356068205, "loss": 2.9271, "theoretical_loss": 4.116267160707421, "tokens_seen": 319356928 }, { "epoch": 0.1, "learning_rate": 0.00045616850551654964, "loss": 3.2313, "theoretical_loss": 4.11616906908591, "tokens_seen": 319422464 }, { "epoch": 0.1, "objective/train/docs_used": 535531, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4184489250183105, "objective/train/theoretical_loss": 4.11607100322164, "objective/train/tokens_used": 339948000, "theoretical_loss": 4.11607100322164, "tokens_seen": 319488000 }, { "epoch": 0.1, "learning_rate": 0.0004561584754262789, "loss": 3.3129, "theoretical_loss": 4.11607100322164, "tokens_seen": 319488000 }, { "epoch": 0.1, "learning_rate": 0.000456148445336008, "loss": 3.0982, "theoretical_loss": 4.115972963102565, "tokens_seen": 319553536 }, { "epoch": 0.1, "learning_rate": 0.00045613841524573724, "loss": 3.3148, "theoretical_loss": 4.11587494871665, "tokens_seen": 319619072 }, { "epoch": 0.1, "learning_rate": 0.0004561283851554664, "loss": 3.0804, "theoretical_loss": 4.115776960051864, "tokens_seen": 319684608 }, { "epoch": 0.1, "learning_rate": 0.0004561183550651956, "loss": 3.1768, "theoretical_loss": 4.11567899709619, "tokens_seen": 319750144 }, { "epoch": 0.1, "learning_rate": 0.0004561083249749248, "loss": 3.0706, "theoretical_loss": 4.115581059837612, "tokens_seen": 319815680 }, { "epoch": 0.1, "learning_rate": 0.00045609829488465396, "loss": 3.2068, "theoretical_loss": 4.115483148264127, "tokens_seen": 319881216 }, { "epoch": 0.1, "learning_rate": 0.00045608826479438314, "loss": 3.2277, "theoretical_loss": 4.115385262363739, "tokens_seen": 319946752 }, { "epoch": 0.1, "learning_rate": 0.0004560782347041124, "loss": 3.0477, "theoretical_loss": 4.1152874021244585, "tokens_seen": 320012288 }, { "epoch": 0.1, "learning_rate": 0.0004560682046138415, "loss": 3.3166, "theoretical_loss": 4.115189567534307, "tokens_seen": 320077824 }, { "epoch": 0.1, "learning_rate": 0.00045605817452357074, "loss": 3.1297, "theoretical_loss": 4.115091758581309, "tokens_seen": 320143360 }, { "epoch": 0.1, "learning_rate": 0.00045604814443329987, "loss": 3.0976, "theoretical_loss": 4.114993975253505, "tokens_seen": 320208896 }, { "epoch": 0.1, "learning_rate": 0.0004560381143430291, "loss": 3.2308, "theoretical_loss": 4.114896217538935, "tokens_seen": 320274432 }, { "epoch": 0.1, "learning_rate": 0.0004560280842527583, "loss": 3.2474, "theoretical_loss": 4.114798485425652, "tokens_seen": 320339968 }, { "epoch": 0.1, "learning_rate": 0.00045601805416248747, "loss": 3.107, "theoretical_loss": 4.114700778901717, "tokens_seen": 320405504 }, { "epoch": 0.1, "learning_rate": 0.00045600802407221665, "loss": 3.244, "theoretical_loss": 4.114603097955197, "tokens_seen": 320471040 }, { "epoch": 0.1, "learning_rate": 0.0004559979939819459, "loss": 3.3001, "theoretical_loss": 4.114505442574167, "tokens_seen": 320536576 }, { "epoch": 0.1, "learning_rate": 0.000455987963891675, "loss": 3.1806, "theoretical_loss": 4.1144078127467125, "tokens_seen": 320602112 }, { "epoch": 0.1, "learning_rate": 0.00045597793380140425, "loss": 3.2482, "theoretical_loss": 4.114310208460924, "tokens_seen": 320667648 }, { "epoch": 0.1, "learning_rate": 0.00045596790371113337, "loss": 3.3422, "theoretical_loss": 4.114212629704902, "tokens_seen": 320733184 }, { "epoch": 0.1, "learning_rate": 0.0004559578736208626, "loss": 3.1584, "theoretical_loss": 4.114115076466755, "tokens_seen": 320798720 }, { "epoch": 0.1, "learning_rate": 0.0004559478435305918, "loss": 3.1404, "theoretical_loss": 4.114017548734598, "tokens_seen": 320864256 }, { "epoch": 0.1, "learning_rate": 0.00045593781344032097, "loss": 3.1664, "theoretical_loss": 4.113920046496554, "tokens_seen": 320929792 }, { "epoch": 0.1, "learning_rate": 0.00045592778335005015, "loss": 3.147, "theoretical_loss": 4.113822569740757, "tokens_seen": 320995328 }, { "epoch": 0.1, "learning_rate": 0.00045591775325977933, "loss": 3.167, "theoretical_loss": 4.113725118455344, "tokens_seen": 321060864 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.195796012878418, "objective/train/theoretical_loss": 4.113627692628464, "objective/train/tokens_used": 341586400, "theoretical_loss": 4.113627692628464, "tokens_seen": 321126400 }, { "epoch": 0.1, "learning_rate": 0.0004559077231695085, "loss": 3.1639, "theoretical_loss": 4.113627692628464, "tokens_seen": 321126400 }, { "epoch": 0.1, "learning_rate": 0.00045589769307923775, "loss": 3.201, "theoretical_loss": 4.113530292248273, "tokens_seen": 321191936 }, { "epoch": 0.1, "learning_rate": 0.0004558876629889669, "loss": 3.2654, "theoretical_loss": 4.113432917302934, "tokens_seen": 321257472 }, { "epoch": 0.1, "learning_rate": 0.0004558776328986961, "loss": 3.2342, "theoretical_loss": 4.113335567780618, "tokens_seen": 321323008 }, { "epoch": 0.1, "learning_rate": 0.00045586760280842524, "loss": 3.1149, "theoretical_loss": 4.113238243669504, "tokens_seen": 321388544 }, { "epoch": 0.1, "learning_rate": 0.0004558575727181545, "loss": 3.2965, "theoretical_loss": 4.113140944957781, "tokens_seen": 321454080 }, { "epoch": 0.1, "learning_rate": 0.00045584754262788365, "loss": 3.186, "theoretical_loss": 4.113043671633641, "tokens_seen": 321519616 }, { "epoch": 0.1, "learning_rate": 0.00045583751253761284, "loss": 3.1891, "theoretical_loss": 4.11294642368529, "tokens_seen": 321585152 }, { "epoch": 0.1, "learning_rate": 0.000455827482447342, "loss": 3.21, "theoretical_loss": 4.112849201100938, "tokens_seen": 321650688 }, { "epoch": 0.1, "learning_rate": 0.00045581745235707125, "loss": 3.2222, "theoretical_loss": 4.1127520038688035, "tokens_seen": 321716224 }, { "epoch": 0.1, "learning_rate": 0.0004558074222668004, "loss": 3.2934, "theoretical_loss": 4.112654831977112, "tokens_seen": 321781760 }, { "epoch": 0.1, "learning_rate": 0.0004557973921765296, "loss": 3.3153, "theoretical_loss": 4.1125576854141, "tokens_seen": 321847296 }, { "epoch": 0.1, "learning_rate": 0.00045578736208625874, "loss": 3.2302, "theoretical_loss": 4.112460564168009, "tokens_seen": 321912832 }, { "epoch": 0.1, "learning_rate": 0.000455777331995988, "loss": 3.1176, "theoretical_loss": 4.112363468227088, "tokens_seen": 321978368 }, { "epoch": 0.1, "learning_rate": 0.00045576730190571716, "loss": 3.1273, "theoretical_loss": 4.112266397579598, "tokens_seen": 322043904 }, { "epoch": 0.1, "learning_rate": 0.00045575727181544634, "loss": 3.1973, "theoretical_loss": 4.112169352213801, "tokens_seen": 322109440 }, { "epoch": 0.1, "learning_rate": 0.0004557472417251755, "loss": 3.029, "theoretical_loss": 4.1120723321179735, "tokens_seen": 322174976 }, { "epoch": 0.1, "learning_rate": 0.0004557372116349047, "loss": 3.0369, "theoretical_loss": 4.111975337280397, "tokens_seen": 322240512 }, { "epoch": 0.1, "learning_rate": 0.0004557271815446339, "loss": 3.2276, "theoretical_loss": 4.111878367689359, "tokens_seen": 322306048 }, { "epoch": 0.1, "learning_rate": 0.0004557171514543631, "loss": 3.276, "theoretical_loss": 4.1117814233331575, "tokens_seen": 322371584 }, { "epoch": 0.1, "learning_rate": 0.00045570712136409224, "loss": 3.1202, "theoretical_loss": 4.111684504200099, "tokens_seen": 322437120 }, { "epoch": 0.1, "learning_rate": 0.0004556970912738215, "loss": 3.1932, "theoretical_loss": 4.111587610278494, "tokens_seen": 322502656 }, { "epoch": 0.1, "learning_rate": 0.0004556870611835507, "loss": 3.0851, "theoretical_loss": 4.111490741556663, "tokens_seen": 322568192 }, { "epoch": 0.1, "learning_rate": 0.00045567703109327984, "loss": 3.2677, "theoretical_loss": 4.1113938980229365, "tokens_seen": 322633728 }, { "epoch": 0.1, "learning_rate": 0.0004556670010030091, "loss": 3.2846, "theoretical_loss": 4.11129707966565, "tokens_seen": 322699264 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1852855682373047, "objective/train/theoretical_loss": 4.111200286473145, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.111200286473145, "tokens_seen": 322764800 }, { "epoch": 0.1, "learning_rate": 0.0004556569709127382, "loss": 3.2443, "theoretical_loss": 4.111200286473145, "tokens_seen": 322764800 }, { "epoch": 0.1, "learning_rate": 0.00045564694082246744, "loss": 3.1786, "theoretical_loss": 4.111103518433776, "tokens_seen": 322830336 }, { "epoch": 0.1, "learning_rate": 0.0004556369107321966, "loss": 3.3595, "theoretical_loss": 4.111006775535901, "tokens_seen": 322895872 }, { "epoch": 0.1, "learning_rate": 0.0004556268806419258, "loss": 3.1586, "theoretical_loss": 4.110910057767887, "tokens_seen": 322961408 }, { "epoch": 0.1, "learning_rate": 0.000455616850551655, "loss": 3.2594, "theoretical_loss": 4.110813365118109, "tokens_seen": 323026944 }, { "epoch": 0.1, "learning_rate": 0.00045560682046138416, "loss": 3.0063, "theoretical_loss": 4.110716697574951, "tokens_seen": 323092480 }, { "epoch": 0.1, "learning_rate": 0.00045559679037111334, "loss": 3.2926, "theoretical_loss": 4.110620055126802, "tokens_seen": 323158016 }, { "epoch": 0.1, "learning_rate": 0.0004555867602808426, "loss": 3.2099, "theoretical_loss": 4.110523437762059, "tokens_seen": 323223552 }, { "epoch": 0.1, "learning_rate": 0.0004555767301905717, "loss": 3.0989, "theoretical_loss": 4.11042684546913, "tokens_seen": 323289088 }, { "epoch": 0.1, "learning_rate": 0.00045556670010030094, "loss": 3.1241, "theoretical_loss": 4.110330278236427, "tokens_seen": 323354624 }, { "epoch": 0.1, "learning_rate": 0.00045555667001003007, "loss": 3.2177, "theoretical_loss": 4.110233736052372, "tokens_seen": 323420160 }, { "epoch": 0.1, "learning_rate": 0.0004555466399197593, "loss": 3.1854, "theoretical_loss": 4.110137218905393, "tokens_seen": 323485696 }, { "epoch": 0.1, "learning_rate": 0.0004555366098294885, "loss": 3.1621, "theoretical_loss": 4.110040726783927, "tokens_seen": 323551232 }, { "epoch": 0.1, "learning_rate": 0.00045552657973921767, "loss": 3.244, "theoretical_loss": 4.109944259676419, "tokens_seen": 323616768 }, { "epoch": 0.1, "learning_rate": 0.00045551654964894685, "loss": 3.1855, "theoretical_loss": 4.109847817571319, "tokens_seen": 323682304 }, { "epoch": 0.1, "learning_rate": 0.0004555065195586761, "loss": 3.1083, "theoretical_loss": 4.109751400457089, "tokens_seen": 323747840 }, { "epoch": 0.1, "learning_rate": 0.0004554964894684052, "loss": 3.1194, "theoretical_loss": 4.109655008322195, "tokens_seen": 323813376 }, { "epoch": 0.1, "learning_rate": 0.00045548645937813445, "loss": 3.2791, "theoretical_loss": 4.109558641155112, "tokens_seen": 323878912 }, { "epoch": 0.1, "learning_rate": 0.00045547642928786357, "loss": 3.2032, "theoretical_loss": 4.109462298944322, "tokens_seen": 323944448 }, { "epoch": 0.1, "learning_rate": 0.0004554663991975928, "loss": 3.2983, "theoretical_loss": 4.109365981678316, "tokens_seen": 324009984 }, { "epoch": 0.1, "learning_rate": 0.000455456369107322, "loss": 3.3162, "theoretical_loss": 4.109269689345592, "tokens_seen": 324075520 }, { "epoch": 0.1, "learning_rate": 0.00045544633901705117, "loss": 3.1925, "theoretical_loss": 4.109173421934654, "tokens_seen": 324141056 }, { "epoch": 0.1, "learning_rate": 0.00045543630892678035, "loss": 3.2595, "theoretical_loss": 4.109077179434016, "tokens_seen": 324206592 }, { "epoch": 0.1, "learning_rate": 0.00045542627883650953, "loss": 3.2657, "theoretical_loss": 4.1089809618321995, "tokens_seen": 324272128 }, { "epoch": 0.1, "learning_rate": 0.0004554162487462387, "loss": 3.2268, "theoretical_loss": 4.108884769117731, "tokens_seen": 324337664 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.911879062652588, "objective/train/theoretical_loss": 4.108788601279149, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.108788601279149, "tokens_seen": 324403200 }, { "epoch": 0.1, "learning_rate": 0.00045540621865596795, "loss": 3.1794, "theoretical_loss": 4.108788601279149, "tokens_seen": 324403200 }, { "epoch": 0.1, "learning_rate": 0.0004553961885656971, "loss": 3.3123, "theoretical_loss": 4.108692458304994, "tokens_seen": 324468736 }, { "epoch": 0.1, "learning_rate": 0.0004553861584754263, "loss": 3.0715, "theoretical_loss": 4.108596340183819, "tokens_seen": 324534272 }, { "epoch": 0.1, "learning_rate": 0.00045537612838515544, "loss": 3.1401, "theoretical_loss": 4.108500246904184, "tokens_seen": 324599808 }, { "epoch": 0.1, "learning_rate": 0.0004553660982948847, "loss": 3.2479, "theoretical_loss": 4.108404178454651, "tokens_seen": 324665344 }, { "epoch": 0.1, "learning_rate": 0.00045535606820461385, "loss": 3.1411, "theoretical_loss": 4.1083081348237975, "tokens_seen": 324730880 }, { "epoch": 0.1, "learning_rate": 0.00045534603811434304, "loss": 3.1812, "theoretical_loss": 4.108212116000203, "tokens_seen": 324796416 }, { "epoch": 0.1, "learning_rate": 0.0004553360080240722, "loss": 3.2134, "theoretical_loss": 4.108116121972457, "tokens_seen": 324861952 }, { "epoch": 0.1, "learning_rate": 0.00045532597793380145, "loss": 2.9785, "theoretical_loss": 4.108020152729157, "tokens_seen": 324927488 }, { "epoch": 0.1, "learning_rate": 0.0004553159478435306, "loss": 3.2389, "theoretical_loss": 4.107924208258905, "tokens_seen": 324993024 }, { "epoch": 0.1, "learning_rate": 0.0004553059177532598, "loss": 3.2947, "theoretical_loss": 4.107828288550314, "tokens_seen": 325058560 }, { "epoch": 0.1, "learning_rate": 0.00045529588766298894, "loss": 3.1149, "theoretical_loss": 4.107732393592003, "tokens_seen": 325124096 }, { "epoch": 0.1, "learning_rate": 0.0004552858575727182, "loss": 3.1932, "theoretical_loss": 4.107636523372598, "tokens_seen": 325189632 }, { "epoch": 0.1, "learning_rate": 0.00045527582748244736, "loss": 3.4352, "theoretical_loss": 4.107540677880734, "tokens_seen": 325255168 }, { "epoch": 0.1, "learning_rate": 0.00045526579739217654, "loss": 3.2343, "theoretical_loss": 4.107444857105052, "tokens_seen": 325320704 }, { "epoch": 0.1, "learning_rate": 0.0004552557673019057, "loss": 3.1598, "theoretical_loss": 4.107349061034201, "tokens_seen": 325386240 }, { "epoch": 0.1, "learning_rate": 0.0004552457372116349, "loss": 3.1045, "theoretical_loss": 4.107253289656838, "tokens_seen": 325451776 }, { "epoch": 0.1, "learning_rate": 0.0004552357071213641, "loss": 3.2477, "theoretical_loss": 4.107157542961628, "tokens_seen": 325517312 }, { "epoch": 0.1, "learning_rate": 0.0004552256770310933, "loss": 3.1935, "theoretical_loss": 4.10706182093724, "tokens_seen": 325582848 }, { "epoch": 0.1, "learning_rate": 0.00045521564694082244, "loss": 3.2254, "theoretical_loss": 4.106966123572356, "tokens_seen": 325648384 }, { "epoch": 0.1, "learning_rate": 0.0004552056168505517, "loss": 3.1115, "theoretical_loss": 4.106870450855661, "tokens_seen": 325713920 }, { "epoch": 0.1, "learning_rate": 0.0004551955867602808, "loss": 3.2819, "theoretical_loss": 4.106774802775849, "tokens_seen": 325779456 }, { "epoch": 0.1, "learning_rate": 0.00045518555667001004, "loss": 3.2821, "theoretical_loss": 4.106679179321622, "tokens_seen": 325844992 }, { "epoch": 0.1, "learning_rate": 0.0004551755265797392, "loss": 3.0513, "theoretical_loss": 4.106583580481689, "tokens_seen": 325910528 }, { "epoch": 0.1, "learning_rate": 0.0004551654964894684, "loss": 3.2592, "theoretical_loss": 4.106488006244767, "tokens_seen": 325976064 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3236801624298096, "objective/train/theoretical_loss": 4.106392456599577, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.106392456599577, "tokens_seen": 326041600 }, { "epoch": 0.1, "learning_rate": 0.0004551554663991976, "loss": 3.3316, "theoretical_loss": 4.106392456599577, "tokens_seen": 326041600 }, { "epoch": 0.1, "learning_rate": 0.0004551454363089268, "loss": 3.2682, "theoretical_loss": 4.106296931534854, "tokens_seen": 326107136 }, { "epoch": 0.1, "learning_rate": 0.00045513540621865595, "loss": 3.13, "theoretical_loss": 4.106201431039335, "tokens_seen": 326172672 }, { "epoch": 0.1, "learning_rate": 0.0004551253761283852, "loss": 3.1365, "theoretical_loss": 4.106105955101766, "tokens_seen": 326238208 }, { "epoch": 0.1, "learning_rate": 0.0004551153460381143, "loss": 3.1357, "theoretical_loss": 4.1060105037109, "tokens_seen": 326303744 }, { "epoch": 0.1, "learning_rate": 0.00045510531594784354, "loss": 3.137, "theoretical_loss": 4.105915076855499, "tokens_seen": 326369280 }, { "epoch": 0.1, "learning_rate": 0.0004550952858575727, "loss": 3.2704, "theoretical_loss": 4.105819674524332, "tokens_seen": 326434816 }, { "epoch": 0.1, "learning_rate": 0.0004550852557673019, "loss": 3.1549, "theoretical_loss": 4.105724296706172, "tokens_seen": 326500352 }, { "epoch": 0.1, "learning_rate": 0.0004550752256770311, "loss": 3.0773, "theoretical_loss": 4.105628943389805, "tokens_seen": 326565888 }, { "epoch": 0.1, "learning_rate": 0.00045506519558676027, "loss": 3.1909, "theoretical_loss": 4.1055336145640196, "tokens_seen": 326631424 }, { "epoch": 0.1, "learning_rate": 0.00045505516549648945, "loss": 3.2664, "theoretical_loss": 4.105438310217615, "tokens_seen": 326696960 }, { "epoch": 0.1, "learning_rate": 0.0004550451354062187, "loss": 3.0395, "theoretical_loss": 4.105343030339395, "tokens_seen": 326762496 }, { "epoch": 0.1, "learning_rate": 0.0004550351053159478, "loss": 3.3703, "theoretical_loss": 4.1052477749181735, "tokens_seen": 326828032 }, { "epoch": 0.1, "learning_rate": 0.00045502507522567705, "loss": 3.1328, "theoretical_loss": 4.10515254394277, "tokens_seen": 326893568 }, { "epoch": 0.1, "learning_rate": 0.00045501504513540623, "loss": 3.1195, "theoretical_loss": 4.1050573374020125, "tokens_seen": 326959104 }, { "epoch": 0.1, "learning_rate": 0.0004550050150451354, "loss": 3.1166, "theoretical_loss": 4.104962155284734, "tokens_seen": 327024640 }, { "epoch": 0.1, "learning_rate": 0.0004549949849548646, "loss": 3.3278, "theoretical_loss": 4.104866997579778, "tokens_seen": 327090176 }, { "epoch": 0.1, "learning_rate": 0.00045498495486459377, "loss": 3.2933, "theoretical_loss": 4.104771864275993, "tokens_seen": 327155712 }, { "epoch": 0.1, "learning_rate": 0.00045497492477432295, "loss": 3.2312, "theoretical_loss": 4.104676755362237, "tokens_seen": 327221248 }, { "epoch": 0.1, "learning_rate": 0.0004549648946840522, "loss": 3.249, "theoretical_loss": 4.104581670827372, "tokens_seen": 327286784 }, { "epoch": 0.1, "learning_rate": 0.00045495486459378137, "loss": 3.0753, "theoretical_loss": 4.10448661066027, "tokens_seen": 327352320 }, { "epoch": 0.1, "learning_rate": 0.00045494483450351055, "loss": 3.0058, "theoretical_loss": 4.104391574849812, "tokens_seen": 327417856 }, { "epoch": 0.1, "learning_rate": 0.00045493480441323973, "loss": 3.2397, "theoretical_loss": 4.10429656338488, "tokens_seen": 327483392 }, { "epoch": 0.1, "learning_rate": 0.0004549247743229689, "loss": 3.2215, "theoretical_loss": 4.104201576254369, "tokens_seen": 327548928 }, { "epoch": 0.1, "learning_rate": 0.00045491474423269815, "loss": 3.1914, "theoretical_loss": 4.10410661344718, "tokens_seen": 327614464 }, { "debugging/Self-BLEU-5": 0.6109173245000884, "debugging/distinct-1-grams": 0.7555186937138209, "debugging/distinct-2-grams": 0.9513936175936398, "debugging/entropy-1-grams": 6.336503301099761, "debugging/entropy-2-grams": 7.505344554454511, "debugging/length": 533.9230769230769, "debugging/num_segments": 26, "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.394697904586792, "objective/train/theoretical_loss": 4.10401167495222, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.10401167495222, "tokens_seen": 327680000 }, { "epoch": 0.1, "learning_rate": 0.0004549047141424273, "loss": 3.4042, "theoretical_loss": 4.10401167495222, "tokens_seen": 327680000 }, { "epoch": 0.1, "learning_rate": 0.0004548946840521565, "loss": 3.2337, "theoretical_loss": 4.103916760758405, "tokens_seen": 327745536 }, { "epoch": 0.1, "learning_rate": 0.00045488465396188564, "loss": 3.3296, "theoretical_loss": 4.103821870854656, "tokens_seen": 327811072 }, { "epoch": 0.1, "learning_rate": 0.0004548746238716149, "loss": 3.1476, "theoretical_loss": 4.103727005229903, "tokens_seen": 327876608 }, { "epoch": 0.1, "learning_rate": 0.00045486459378134405, "loss": 3.3875, "theoretical_loss": 4.103632163873083, "tokens_seen": 327942144 }, { "epoch": 0.1, "learning_rate": 0.00045485456369107324, "loss": 3.3515, "theoretical_loss": 4.10353734677314, "tokens_seen": 328007680 }, { "epoch": 0.1, "learning_rate": 0.0004548445336008024, "loss": 3.1212, "theoretical_loss": 4.103442553919026, "tokens_seen": 328073216 }, { "epoch": 0.1, "learning_rate": 0.00045483450351053165, "loss": 3.1866, "theoretical_loss": 4.1033477852996985, "tokens_seen": 328138752 }, { "epoch": 0.1, "learning_rate": 0.0004548244734202608, "loss": 3.133, "theoretical_loss": 4.103253040904124, "tokens_seen": 328204288 }, { "epoch": 0.1, "learning_rate": 0.00045481444332999, "loss": 3.2185, "theoretical_loss": 4.103158320721276, "tokens_seen": 328269824 }, { "epoch": 0.1, "learning_rate": 0.00045480441323971914, "loss": 3.3677, "theoretical_loss": 4.103063624740133, "tokens_seen": 328335360 }, { "epoch": 0.1, "learning_rate": 0.0004547943831494484, "loss": 3.113, "theoretical_loss": 4.102968952949684, "tokens_seen": 328400896 }, { "epoch": 0.1, "learning_rate": 0.00045478435305917756, "loss": 3.1994, "theoretical_loss": 4.102874305338923, "tokens_seen": 328466432 }, { "epoch": 0.1, "learning_rate": 0.00045477432296890674, "loss": 3.2009, "theoretical_loss": 4.102779681896852, "tokens_seen": 328531968 }, { "epoch": 0.1, "learning_rate": 0.0004547642928786359, "loss": 3.2624, "theoretical_loss": 4.10268508261248, "tokens_seen": 328597504 }, { "epoch": 0.1, "learning_rate": 0.0004547542627883651, "loss": 3.3358, "theoretical_loss": 4.102590507474824, "tokens_seen": 328663040 }, { "epoch": 0.1, "learning_rate": 0.0004547442326980943, "loss": 3.2532, "theoretical_loss": 4.1024959564729055, "tokens_seen": 328728576 }, { "epoch": 0.1, "learning_rate": 0.0004547342026078235, "loss": 3.1168, "theoretical_loss": 4.102401429595758, "tokens_seen": 328794112 }, { "epoch": 0.1, "learning_rate": 0.00045472417251755264, "loss": 3.3488, "theoretical_loss": 4.102306926832417, "tokens_seen": 328859648 }, { "epoch": 0.1, "learning_rate": 0.0004547141424272819, "loss": 3.2029, "theoretical_loss": 4.102212448171928, "tokens_seen": 328925184 }, { "epoch": 0.1, "learning_rate": 0.000454704112337011, "loss": 3.3278, "theoretical_loss": 4.1021179936033425, "tokens_seen": 328990720 }, { "epoch": 0.1, "learning_rate": 0.00045469408224674024, "loss": 3.1798, "theoretical_loss": 4.102023563115721, "tokens_seen": 329056256 }, { "epoch": 0.1, "learning_rate": 0.0004546840521564694, "loss": 3.1763, "theoretical_loss": 4.10192915669813, "tokens_seen": 329121792 }, { "epoch": 0.1, "learning_rate": 0.0004546740220661986, "loss": 3.3345, "theoretical_loss": 4.1018347743396415, "tokens_seen": 329187328 }, { "epoch": 0.1, "learning_rate": 0.0004546639919759278, "loss": 3.1987, "theoretical_loss": 4.101740416029338, "tokens_seen": 329252864 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3072829246520996, "objective/train/theoretical_loss": 4.101646081756305, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.101646081756305, "tokens_seen": 329318400 }, { "epoch": 0.1, "learning_rate": 0.000454653961885657, "loss": 3.2388, "theoretical_loss": 4.101646081756305, "tokens_seen": 329318400 }, { "epoch": 0.1, "learning_rate": 0.00045464393179538615, "loss": 3.1388, "theoretical_loss": 4.101551771509641, "tokens_seen": 329383936 }, { "epoch": 0.1, "learning_rate": 0.0004546339017051154, "loss": 3.0205, "theoretical_loss": 4.101457485278444, "tokens_seen": 329449472 }, { "epoch": 0.1, "learning_rate": 0.0004546238716148445, "loss": 3.195, "theoretical_loss": 4.101363223051826, "tokens_seen": 329515008 }, { "epoch": 0.1, "learning_rate": 0.00045461384152457374, "loss": 3.2031, "theoretical_loss": 4.101268984818901, "tokens_seen": 329580544 }, { "epoch": 0.1, "learning_rate": 0.0004546038114343029, "loss": 3.1475, "theoretical_loss": 4.101174770568795, "tokens_seen": 329646080 }, { "epoch": 0.1, "learning_rate": 0.0004545937813440321, "loss": 3.2005, "theoretical_loss": 4.1010805802906365, "tokens_seen": 329711616 }, { "epoch": 0.1, "learning_rate": 0.0004545837512537613, "loss": 3.3596, "theoretical_loss": 4.100986413973564, "tokens_seen": 329777152 }, { "epoch": 0.1, "learning_rate": 0.00045457372116349047, "loss": 3.4381, "theoretical_loss": 4.100892271606721, "tokens_seen": 329842688 }, { "epoch": 0.1, "learning_rate": 0.00045456369107321965, "loss": 3.0425, "theoretical_loss": 4.1007981531792606, "tokens_seen": 329908224 }, { "epoch": 0.1, "learning_rate": 0.0004545536609829489, "loss": 3.2377, "theoretical_loss": 4.100704058680341, "tokens_seen": 329973760 }, { "epoch": 0.1, "learning_rate": 0.000454543630892678, "loss": 3.3225, "theoretical_loss": 4.1006099880991265, "tokens_seen": 330039296 }, { "epoch": 0.1, "learning_rate": 0.00045453360080240725, "loss": 3.2118, "theoretical_loss": 4.100515941424792, "tokens_seen": 330104832 }, { "epoch": 0.1, "learning_rate": 0.00045452357071213643, "loss": 3.0717, "theoretical_loss": 4.100421918646517, "tokens_seen": 330170368 }, { "epoch": 0.1, "learning_rate": 0.0004545135406218656, "loss": 3.2327, "theoretical_loss": 4.1003279197534885, "tokens_seen": 330235904 }, { "epoch": 0.1, "learning_rate": 0.0004545035105315948, "loss": 3.234, "theoretical_loss": 4.100233944734899, "tokens_seen": 330301440 }, { "epoch": 0.1, "learning_rate": 0.00045449348044132397, "loss": 3.1198, "theoretical_loss": 4.100139993579952, "tokens_seen": 330366976 }, { "epoch": 0.1, "learning_rate": 0.00045448345035105315, "loss": 2.9839, "theoretical_loss": 4.100046066277853, "tokens_seen": 330432512 }, { "epoch": 0.1, "learning_rate": 0.0004544734202607824, "loss": 3.0359, "theoretical_loss": 4.09995216281782, "tokens_seen": 330498048 }, { "epoch": 0.1, "learning_rate": 0.0004544633901705115, "loss": 3.3078, "theoretical_loss": 4.0998582831890715, "tokens_seen": 330563584 }, { "epoch": 0.1, "learning_rate": 0.00045445336008024075, "loss": 3.4174, "theoretical_loss": 4.0997644273808405, "tokens_seen": 330629120 }, { "epoch": 0.1, "learning_rate": 0.0004544433299899699, "loss": 3.2214, "theoretical_loss": 4.09967059538236, "tokens_seen": 330694656 }, { "epoch": 0.1, "learning_rate": 0.0004544332998996991, "loss": 3.1227, "theoretical_loss": 4.099576787182874, "tokens_seen": 330760192 }, { "epoch": 0.1, "learning_rate": 0.0004544232698094283, "loss": 3.2188, "theoretical_loss": 4.099483002771633, "tokens_seen": 330825728 }, { "epoch": 0.1, "learning_rate": 0.0004544132397191575, "loss": 3.0372, "theoretical_loss": 4.099389242137894, "tokens_seen": 330891264 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2316787242889404, "objective/train/theoretical_loss": 4.099295505270921, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.099295505270921, "tokens_seen": 330956800 }, { "epoch": 0.1, "learning_rate": 0.00045440320962888666, "loss": 3.2257, "theoretical_loss": 4.099295505270921, "tokens_seen": 330956800 }, { "epoch": 0.1, "learning_rate": 0.00045439317953861584, "loss": 3.0433, "theoretical_loss": 4.099201792159985, "tokens_seen": 331022336 }, { "epoch": 0.1, "learning_rate": 0.000454383149448345, "loss": 3.132, "theoretical_loss": 4.099108102794363, "tokens_seen": 331087872 }, { "epoch": 0.1, "learning_rate": 0.00045437311935807425, "loss": 2.894, "theoretical_loss": 4.099014437163342, "tokens_seen": 331153408 }, { "epoch": 0.1, "learning_rate": 0.0004543630892678034, "loss": 3.1362, "theoretical_loss": 4.098920795256213, "tokens_seen": 331218944 }, { "epoch": 0.1, "learning_rate": 0.0004543530591775326, "loss": 3.2817, "theoretical_loss": 4.098827177062273, "tokens_seen": 331284480 }, { "epoch": 0.1, "learning_rate": 0.0004543430290872618, "loss": 3.0162, "theoretical_loss": 4.098733582570831, "tokens_seen": 331350016 }, { "epoch": 0.1, "learning_rate": 0.000454332998996991, "loss": 3.2701, "theoretical_loss": 4.098640011771198, "tokens_seen": 331415552 }, { "epoch": 0.1, "learning_rate": 0.00045432296890672016, "loss": 3.1275, "theoretical_loss": 4.098546464652693, "tokens_seen": 331481088 }, { "epoch": 0.1, "learning_rate": 0.00045431293881644934, "loss": 3.1321, "theoretical_loss": 4.098452941204643, "tokens_seen": 331546624 }, { "epoch": 0.1, "learning_rate": 0.0004543029087261785, "loss": 3.27, "theoretical_loss": 4.098359441416383, "tokens_seen": 331612160 }, { "epoch": 0.1, "learning_rate": 0.00045429287863590776, "loss": 3.3527, "theoretical_loss": 4.0982659652772515, "tokens_seen": 331677696 }, { "epoch": 0.1, "learning_rate": 0.0004542828485456369, "loss": 3.2535, "theoretical_loss": 4.098172512776597, "tokens_seen": 331743232 }, { "epoch": 0.1, "learning_rate": 0.0004542728184553661, "loss": 3.1076, "theoretical_loss": 4.098079083903773, "tokens_seen": 331808768 }, { "epoch": 0.1, "learning_rate": 0.00045426278836509525, "loss": 3.3355, "theoretical_loss": 4.097985678648142, "tokens_seen": 331874304 }, { "epoch": 0.1, "learning_rate": 0.0004542527582748245, "loss": 3.1838, "theoretical_loss": 4.09789229699907, "tokens_seen": 331939840 }, { "epoch": 0.1, "learning_rate": 0.00045424272818455366, "loss": 3.2742, "theoretical_loss": 4.097798938945933, "tokens_seen": 332005376 }, { "epoch": 0.1, "learning_rate": 0.00045423269809428284, "loss": 3.2354, "theoretical_loss": 4.097705604478112, "tokens_seen": 332070912 }, { "epoch": 0.1, "learning_rate": 0.000454222668004012, "loss": 3.2249, "theoretical_loss": 4.097612293584998, "tokens_seen": 332136448 }, { "epoch": 0.1, "learning_rate": 0.0004542126379137412, "loss": 3.1763, "theoretical_loss": 4.0975190062559825, "tokens_seen": 332201984 }, { "epoch": 0.1, "learning_rate": 0.00045420260782347044, "loss": 3.1216, "theoretical_loss": 4.097425742480472, "tokens_seen": 332267520 }, { "epoch": 0.1, "learning_rate": 0.0004541925777331996, "loss": 3.2051, "theoretical_loss": 4.097332502247873, "tokens_seen": 332333056 }, { "epoch": 0.1, "learning_rate": 0.0004541825476429288, "loss": 3.1746, "theoretical_loss": 4.0972392855476025, "tokens_seen": 332398592 }, { "epoch": 0.1, "learning_rate": 0.000454172517552658, "loss": 3.1263, "theoretical_loss": 4.097146092369084, "tokens_seen": 332464128 }, { "epoch": 0.1, "learning_rate": 0.0004541624874623872, "loss": 3.1643, "theoretical_loss": 4.097052922701746, "tokens_seen": 332529664 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.062695026397705, "objective/train/theoretical_loss": 4.0969597765350265, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.0969597765350265, "tokens_seen": 332595200 }, { "epoch": 0.1, "learning_rate": 0.00045415245737211635, "loss": 3.0996, "theoretical_loss": 4.0969597765350265, "tokens_seen": 332595200 }, { "epoch": 0.1, "learning_rate": 0.0004541424272818456, "loss": 3.2237, "theoretical_loss": 4.096866653858368, "tokens_seen": 332660736 }, { "epoch": 0.1, "learning_rate": 0.0004541323971915747, "loss": 3.0112, "theoretical_loss": 4.09677355466122, "tokens_seen": 332726272 }, { "epoch": 0.1, "learning_rate": 0.00045412236710130395, "loss": 3.1103, "theoretical_loss": 4.096680478933042, "tokens_seen": 332791808 }, { "epoch": 0.1, "learning_rate": 0.0004541123370110331, "loss": 3.1912, "theoretical_loss": 4.0965874266632945, "tokens_seen": 332857344 }, { "epoch": 0.1, "learning_rate": 0.0004541023069207623, "loss": 3.322, "theoretical_loss": 4.09649439784145, "tokens_seen": 332922880 }, { "epoch": 0.1, "learning_rate": 0.0004540922768304915, "loss": 3.2562, "theoretical_loss": 4.096401392456988, "tokens_seen": 332988416 }, { "epoch": 0.1, "learning_rate": 0.00045408224674022067, "loss": 3.4071, "theoretical_loss": 4.09630841049939, "tokens_seen": 333053952 }, { "epoch": 0.1, "learning_rate": 0.00045407221664994985, "loss": 3.3237, "theoretical_loss": 4.096215451958146, "tokens_seen": 333119488 }, { "epoch": 0.1, "learning_rate": 0.0004540621865596791, "loss": 2.9536, "theoretical_loss": 4.096122516822757, "tokens_seen": 333185024 }, { "epoch": 0.1, "learning_rate": 0.0004540521564694082, "loss": 3.1359, "theoretical_loss": 4.096029605082726, "tokens_seen": 333250560 }, { "epoch": 0.1, "learning_rate": 0.00045404212637913745, "loss": 3.0979, "theoretical_loss": 4.095936716727564, "tokens_seen": 333316096 }, { "epoch": 0.1, "learning_rate": 0.00045403209628886663, "loss": 3.1871, "theoretical_loss": 4.095843851746791, "tokens_seen": 333381632 }, { "epoch": 0.1, "learning_rate": 0.0004540220661985958, "loss": 3.0999, "theoretical_loss": 4.095751010129929, "tokens_seen": 333447168 }, { "epoch": 0.1, "learning_rate": 0.000454012036108325, "loss": 3.3804, "theoretical_loss": 4.095658191866512, "tokens_seen": 333512704 }, { "epoch": 0.1, "learning_rate": 0.00045400200601805417, "loss": 3.2307, "theoretical_loss": 4.0955653969460775, "tokens_seen": 333578240 }, { "epoch": 0.1, "learning_rate": 0.00045399197592778335, "loss": 3.2057, "theoretical_loss": 4.095472625358171, "tokens_seen": 333643776 }, { "epoch": 0.1, "learning_rate": 0.0004539819458375126, "loss": 2.9676, "theoretical_loss": 4.095379877092343, "tokens_seen": 333709312 }, { "epoch": 0.1, "learning_rate": 0.0004539719157472417, "loss": 2.9891, "theoretical_loss": 4.095287152138154, "tokens_seen": 333774848 }, { "epoch": 0.1, "learning_rate": 0.00045396188565697095, "loss": 3.0122, "theoretical_loss": 4.0951944504851685, "tokens_seen": 333840384 }, { "epoch": 0.1, "learning_rate": 0.0004539518555667001, "loss": 3.0585, "theoretical_loss": 4.095101772122959, "tokens_seen": 333905920 }, { "epoch": 0.1, "learning_rate": 0.0004539418254764293, "loss": 3.2767, "theoretical_loss": 4.095009117041102, "tokens_seen": 333971456 }, { "epoch": 0.1, "learning_rate": 0.0004539317953861585, "loss": 3.175, "theoretical_loss": 4.094916485229186, "tokens_seen": 334036992 }, { "epoch": 0.1, "learning_rate": 0.0004539217652958877, "loss": 3.1844, "theoretical_loss": 4.094823876676802, "tokens_seen": 334102528 }, { "epoch": 0.1, "learning_rate": 0.00045391173520561686, "loss": 3.2077, "theoretical_loss": 4.094731291373548, "tokens_seen": 334168064 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2197489738464355, "objective/train/theoretical_loss": 4.094638729309031, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.094638729309031, "tokens_seen": 334233600 }, { "epoch": 0.1, "learning_rate": 0.00045390170511534604, "loss": 3.3325, "theoretical_loss": 4.094638729309031, "tokens_seen": 334233600 }, { "epoch": 0.1, "learning_rate": 0.0004538916750250752, "loss": 3.2287, "theoretical_loss": 4.094546190472862, "tokens_seen": 334299136 }, { "epoch": 0.1, "learning_rate": 0.00045388164493480445, "loss": 3.3077, "theoretical_loss": 4.09445367485466, "tokens_seen": 334364672 }, { "epoch": 0.1, "learning_rate": 0.0004538716148445336, "loss": 3.2817, "theoretical_loss": 4.09436118244405, "tokens_seen": 334430208 }, { "epoch": 0.1, "learning_rate": 0.0004538615847542628, "loss": 3.0679, "theoretical_loss": 4.094268713230667, "tokens_seen": 334495744 }, { "epoch": 0.1, "learning_rate": 0.000453851554663992, "loss": 2.9914, "theoretical_loss": 4.094176267204148, "tokens_seen": 334561280 }, { "epoch": 0.1, "learning_rate": 0.0004538415245737212, "loss": 3.1458, "theoretical_loss": 4.094083844354137, "tokens_seen": 334626816 }, { "epoch": 0.1, "learning_rate": 0.00045383149448345036, "loss": 3.1583, "theoretical_loss": 4.093991444670289, "tokens_seen": 334692352 }, { "epoch": 0.1, "learning_rate": 0.00045382146439317954, "loss": 3.1593, "theoretical_loss": 4.093899068142262, "tokens_seen": 334757888 }, { "epoch": 0.1, "learning_rate": 0.0004538114343029087, "loss": 3.1953, "theoretical_loss": 4.093806714759721, "tokens_seen": 334823424 }, { "epoch": 0.1, "learning_rate": 0.00045380140421263796, "loss": 3.2307, "theoretical_loss": 4.093714384512337, "tokens_seen": 334888960 }, { "epoch": 0.1, "learning_rate": 0.0004537913741223671, "loss": 3.2933, "theoretical_loss": 4.093622077389791, "tokens_seen": 334954496 }, { "epoch": 0.1, "learning_rate": 0.0004537813440320963, "loss": 3.1672, "theoretical_loss": 4.093529793381768, "tokens_seen": 335020032 }, { "epoch": 0.1, "learning_rate": 0.00045377131394182545, "loss": 3.1222, "theoretical_loss": 4.093437532477958, "tokens_seen": 335085568 }, { "epoch": 0.1, "learning_rate": 0.0004537612838515547, "loss": 3.328, "theoretical_loss": 4.093345294668063, "tokens_seen": 335151104 }, { "epoch": 0.1, "learning_rate": 0.00045375125376128386, "loss": 3.344, "theoretical_loss": 4.0932530799417846, "tokens_seen": 335216640 }, { "epoch": 0.1, "learning_rate": 0.00045374122367101304, "loss": 3.3177, "theoretical_loss": 4.093160888288837, "tokens_seen": 335282176 }, { "epoch": 0.1, "learning_rate": 0.0004537311935807422, "loss": 3.0782, "theoretical_loss": 4.0930687196989375, "tokens_seen": 335347712 }, { "epoch": 0.1, "learning_rate": 0.0004537211634904714, "loss": 3.304, "theoretical_loss": 4.0929765741618125, "tokens_seen": 335413248 }, { "epoch": 0.1, "learning_rate": 0.0004537111334002006, "loss": 3.2133, "theoretical_loss": 4.092884451667191, "tokens_seen": 335478784 }, { "epoch": 0.1, "learning_rate": 0.0004537011033099298, "loss": 3.1184, "theoretical_loss": 4.092792352204814, "tokens_seen": 335544320 }, { "epoch": 0.1, "learning_rate": 0.00045369107321965895, "loss": 3.1783, "theoretical_loss": 4.092700275764424, "tokens_seen": 335609856 }, { "epoch": 0.1, "learning_rate": 0.0004536810431293882, "loss": 3.2746, "theoretical_loss": 4.092608222335774, "tokens_seen": 335675392 }, { "epoch": 0.1, "learning_rate": 0.00045367101303911737, "loss": 3.1755, "theoretical_loss": 4.092516191908621, "tokens_seen": 335740928 }, { "epoch": 0.1, "learning_rate": 0.00045366098294884655, "loss": 3.06, "theoretical_loss": 4.09242418447273, "tokens_seen": 335806464 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1044251918792725, "objective/train/theoretical_loss": 4.092332200017871, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.092332200017871, "tokens_seen": 335872000 }, { "epoch": 0.1, "learning_rate": 0.00045365095285857573, "loss": 3.1891, "theoretical_loss": 4.092332200017871, "tokens_seen": 335872000 }, { "epoch": 0.1, "learning_rate": 0.0004536409227683049, "loss": 3.0908, "theoretical_loss": 4.092240238533822, "tokens_seen": 335937536 }, { "epoch": 0.1, "learning_rate": 0.0004536308926780341, "loss": 3.1558, "theoretical_loss": 4.092148300010367, "tokens_seen": 336003072 }, { "epoch": 0.1, "learning_rate": 0.0004536208625877633, "loss": 3.1654, "theoretical_loss": 4.092056384437297, "tokens_seen": 336068608 }, { "epoch": 0.1, "learning_rate": 0.00045361083249749245, "loss": 3.361, "theoretical_loss": 4.091964491804409, "tokens_seen": 336134144 }, { "epoch": 0.1, "learning_rate": 0.0004536008024072217, "loss": 3.233, "theoretical_loss": 4.091872622101506, "tokens_seen": 336199680 }, { "epoch": 0.1, "learning_rate": 0.0004535907723169508, "loss": 3.2792, "theoretical_loss": 4.091780775318399, "tokens_seen": 336265216 }, { "epoch": 0.1, "learning_rate": 0.00045358074222668005, "loss": 2.995, "theoretical_loss": 4.091688951444904, "tokens_seen": 336330752 }, { "epoch": 0.1, "learning_rate": 0.00045357071213640923, "loss": 3.2985, "theoretical_loss": 4.091597150470845, "tokens_seen": 336396288 }, { "epoch": 0.1, "learning_rate": 0.0004535606820461384, "loss": 3.1297, "theoretical_loss": 4.091505372386051, "tokens_seen": 336461824 }, { "epoch": 0.1, "learning_rate": 0.0004535506519558676, "loss": 3.286, "theoretical_loss": 4.091413617180358, "tokens_seen": 336527360 }, { "epoch": 0.1, "learning_rate": 0.00045354062186559683, "loss": 3.2711, "theoretical_loss": 4.091321884843609, "tokens_seen": 336592896 }, { "epoch": 0.1, "learning_rate": 0.00045353059177532596, "loss": 3.2421, "theoretical_loss": 4.091230175365653, "tokens_seen": 336658432 }, { "epoch": 0.1, "learning_rate": 0.0004535205616850552, "loss": 3.1896, "theoretical_loss": 4.0911384887363464, "tokens_seen": 336723968 }, { "epoch": 0.1, "learning_rate": 0.0004535105315947843, "loss": 3.0811, "theoretical_loss": 4.091046824945551, "tokens_seen": 336789504 }, { "epoch": 0.1, "learning_rate": 0.00045350050150451355, "loss": 3.2057, "theoretical_loss": 4.0909551839831355, "tokens_seen": 336855040 }, { "epoch": 0.1, "learning_rate": 0.00045349047141424274, "loss": 3.2564, "theoretical_loss": 4.090863565838974, "tokens_seen": 336920576 }, { "epoch": 0.1, "learning_rate": 0.0004534804413239719, "loss": 3.1334, "theoretical_loss": 4.090771970502948, "tokens_seen": 336986112 }, { "epoch": 0.1, "learning_rate": 0.0004534704112337011, "loss": 3.2489, "theoretical_loss": 4.090680397964947, "tokens_seen": 337051648 }, { "epoch": 0.1, "learning_rate": 0.0004534603811434303, "loss": 3.1919, "theoretical_loss": 4.090588848214865, "tokens_seen": 337117184 }, { "epoch": 0.1, "learning_rate": 0.0004534503510531595, "loss": 3.2477, "theoretical_loss": 4.0904973212426015, "tokens_seen": 337182720 }, { "epoch": 0.1, "learning_rate": 0.0004534403209628887, "loss": 3.3738, "theoretical_loss": 4.090405817038065, "tokens_seen": 337248256 }, { "epoch": 0.1, "learning_rate": 0.0004534302908726179, "loss": 3.0617, "theoretical_loss": 4.090314335591169, "tokens_seen": 337313792 }, { "epoch": 0.1, "learning_rate": 0.00045342026078234706, "loss": 3.2938, "theoretical_loss": 4.090222876891834, "tokens_seen": 337379328 }, { "epoch": 0.1, "learning_rate": 0.00045341023069207624, "loss": 3.2215, "theoretical_loss": 4.090131440929985, "tokens_seen": 337444864 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2328193187713623, "objective/train/theoretical_loss": 4.090040027695556, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.090040027695556, "tokens_seen": 337510400 }, { "epoch": 0.1, "learning_rate": 0.0004534002006018054, "loss": 3.2488, "theoretical_loss": 4.090040027695556, "tokens_seen": 337510400 }, { "epoch": 0.1, "learning_rate": 0.00045339017051153465, "loss": 3.4379, "theoretical_loss": 4.0899486371784874, "tokens_seen": 337575936 }, { "epoch": 0.1, "learning_rate": 0.0004533801404212638, "loss": 3.1702, "theoretical_loss": 4.089857269368725, "tokens_seen": 337641472 }, { "epoch": 0.1, "learning_rate": 0.000453370110330993, "loss": 3.2591, "theoretical_loss": 4.089765924256218, "tokens_seen": 337707008 }, { "epoch": 0.1, "learning_rate": 0.0004533600802407222, "loss": 3.1913, "theoretical_loss": 4.089674601830929, "tokens_seen": 337772544 }, { "epoch": 0.1, "learning_rate": 0.0004533500501504514, "loss": 3.063, "theoretical_loss": 4.08958330208282, "tokens_seen": 337838080 }, { "epoch": 0.1, "learning_rate": 0.00045334002006018056, "loss": 3.2687, "theoretical_loss": 4.089492025001864, "tokens_seen": 337903616 }, { "epoch": 0.1, "learning_rate": 0.00045332998996990974, "loss": 3.247, "theoretical_loss": 4.089400770578038, "tokens_seen": 337969152 }, { "epoch": 0.1, "learning_rate": 0.0004533199598796389, "loss": 3.1951, "theoretical_loss": 4.089309538801327, "tokens_seen": 338034688 }, { "epoch": 0.1, "learning_rate": 0.00045330992978936816, "loss": 3.2446, "theoretical_loss": 4.08921832966172, "tokens_seen": 338100224 }, { "epoch": 0.1, "learning_rate": 0.0004532998996990973, "loss": 3.2945, "theoretical_loss": 4.0891271431492155, "tokens_seen": 338165760 }, { "epoch": 0.1, "learning_rate": 0.0004532898696088265, "loss": 3.0949, "theoretical_loss": 4.089035979253816, "tokens_seen": 338231296 }, { "epoch": 0.1, "learning_rate": 0.00045327983951855565, "loss": 3.2295, "theoretical_loss": 4.08894483796553, "tokens_seen": 338296832 }, { "epoch": 0.1, "learning_rate": 0.0004532698094282849, "loss": 3.1665, "theoretical_loss": 4.0888537192743755, "tokens_seen": 338362368 }, { "epoch": 0.1, "learning_rate": 0.00045325977933801406, "loss": 3.2827, "theoretical_loss": 4.088762623170373, "tokens_seen": 338427904 }, { "epoch": 0.1, "learning_rate": 0.00045324974924774324, "loss": 3.201, "theoretical_loss": 4.088671549643553, "tokens_seen": 338493440 }, { "epoch": 0.1, "learning_rate": 0.0004532397191574724, "loss": 3.2557, "theoretical_loss": 4.088580498683948, "tokens_seen": 338558976 }, { "epoch": 0.1, "learning_rate": 0.0004532296890672016, "loss": 3.2474, "theoretical_loss": 4.088489470281601, "tokens_seen": 338624512 }, { "epoch": 0.1, "learning_rate": 0.0004532196589769308, "loss": 3.2571, "theoretical_loss": 4.088398464426559, "tokens_seen": 338690048 }, { "epoch": 0.1, "learning_rate": 0.00045320962888666, "loss": 2.8868, "theoretical_loss": 4.088307481108876, "tokens_seen": 338755584 }, { "epoch": 0.1, "learning_rate": 0.00045319959879638915, "loss": 3.4097, "theoretical_loss": 4.088216520318612, "tokens_seen": 338821120 }, { "epoch": 0.1, "learning_rate": 0.0004531895687061184, "loss": 3.2186, "theoretical_loss": 4.0881255820458335, "tokens_seen": 338886656 }, { "epoch": 0.1, "learning_rate": 0.00045317953861584757, "loss": 3.2695, "theoretical_loss": 4.088034666280614, "tokens_seen": 338952192 }, { "epoch": 0.1, "learning_rate": 0.00045316950852557675, "loss": 3.0784, "theoretical_loss": 4.087943773013032, "tokens_seen": 339017728 }, { "epoch": 0.1, "learning_rate": 0.00045315947843530593, "loss": 3.2212, "theoretical_loss": 4.087852902233173, "tokens_seen": 339083264 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2537693977355957, "objective/train/theoretical_loss": 4.087762053931129, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.087762053931129, "tokens_seen": 339148800 }, { "epoch": 0.1, "learning_rate": 0.0004531494483450351, "loss": 3.2183, "theoretical_loss": 4.087762053931129, "tokens_seen": 339148800 }, { "epoch": 0.1, "learning_rate": 0.0004531394182547643, "loss": 3.1293, "theoretical_loss": 4.087671228096997, "tokens_seen": 339214336 }, { "epoch": 0.1, "learning_rate": 0.0004531293881644935, "loss": 3.1242, "theoretical_loss": 4.087580424720882, "tokens_seen": 339279872 }, { "epoch": 0.1, "learning_rate": 0.00045311935807422265, "loss": 3.2049, "theoretical_loss": 4.087489643792894, "tokens_seen": 339345408 }, { "epoch": 0.1, "learning_rate": 0.0004531093279839519, "loss": 3.048, "theoretical_loss": 4.08739888530315, "tokens_seen": 339410944 }, { "epoch": 0.1, "learning_rate": 0.000453099297893681, "loss": 2.8678, "theoretical_loss": 4.087308149241774, "tokens_seen": 339476480 }, { "epoch": 0.1, "learning_rate": 0.00045308926780341025, "loss": 3.2692, "theoretical_loss": 4.087217435598894, "tokens_seen": 339542016 }, { "epoch": 0.1, "learning_rate": 0.00045307923771313943, "loss": 3.0756, "theoretical_loss": 4.087126744364646, "tokens_seen": 339607552 }, { "epoch": 0.1, "learning_rate": 0.0004530692076228686, "loss": 3.0982, "theoretical_loss": 4.087036075529172, "tokens_seen": 339673088 }, { "epoch": 0.1, "learning_rate": 0.0004530591775325978, "loss": 3.165, "theoretical_loss": 4.086945429082618, "tokens_seen": 339738624 }, { "epoch": 0.1, "learning_rate": 0.00045304914744232703, "loss": 3.281, "theoretical_loss": 4.086854805015141, "tokens_seen": 339804160 }, { "epoch": 0.1, "learning_rate": 0.00045303911735205616, "loss": 2.9637, "theoretical_loss": 4.086764203316902, "tokens_seen": 339869696 }, { "epoch": 0.1, "learning_rate": 0.0004530290872617854, "loss": 3.2282, "theoretical_loss": 4.086673623978064, "tokens_seen": 339935232 }, { "epoch": 0.1, "learning_rate": 0.0004530190571715145, "loss": 3.2232, "theoretical_loss": 4.086583066988802, "tokens_seen": 340000768 }, { "epoch": 0.1, "learning_rate": 0.00045300902708124375, "loss": 3.116, "theoretical_loss": 4.086492532339296, "tokens_seen": 340066304 }, { "epoch": 0.1, "learning_rate": 0.00045299899699097294, "loss": 3.1969, "theoretical_loss": 4.0864020200197295, "tokens_seen": 340131840 }, { "epoch": 0.1, "learning_rate": 0.0004529889669007021, "loss": 3.1219, "theoretical_loss": 4.086311530020296, "tokens_seen": 340197376 }, { "epoch": 0.1, "learning_rate": 0.0004529789368104313, "loss": 3.1892, "theoretical_loss": 4.086221062331192, "tokens_seen": 340262912 }, { "epoch": 0.1, "learning_rate": 0.0004529689067201605, "loss": 3.1306, "theoretical_loss": 4.086130616942621, "tokens_seen": 340328448 }, { "epoch": 0.1, "learning_rate": 0.00045295887662988966, "loss": 3.1579, "theoretical_loss": 4.086040193844794, "tokens_seen": 340393984 }, { "epoch": 0.1, "learning_rate": 0.0004529488465396189, "loss": 3.0668, "theoretical_loss": 4.085949793027927, "tokens_seen": 340459520 }, { "epoch": 0.1, "learning_rate": 0.000452938816449348, "loss": 3.019, "theoretical_loss": 4.0858594144822415, "tokens_seen": 340525056 }, { "epoch": 0.1, "learning_rate": 0.00045292878635907726, "loss": 3.3224, "theoretical_loss": 4.085769058197968, "tokens_seen": 340590592 }, { "epoch": 0.1, "learning_rate": 0.0004529187562688064, "loss": 3.2369, "theoretical_loss": 4.085678724165341, "tokens_seen": 340656128 }, { "epoch": 0.1, "learning_rate": 0.0004529087261785356, "loss": 3.0316, "theoretical_loss": 4.0855884123746, "tokens_seen": 340721664 }, { "epoch": 0.1, "objective/train/docs_used": 537861, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1083457469940186, "objective/train/theoretical_loss": 4.085498122815992, "objective/train/tokens_used": 341687776, "theoretical_loss": 4.085498122815992, "tokens_seen": 340787200 }, { "epoch": 0.1, "learning_rate": 0.0004528986960882648, "loss": 3.0403, "theoretical_loss": 4.085498122815992, "tokens_seen": 340787200 }, { "epoch": 0.1, "learning_rate": 0.000452888665997994, "loss": 3.2351, "theoretical_loss": 4.085407855479772, "tokens_seen": 340852736 }, { "epoch": 0.1, "learning_rate": 0.00045287863590772316, "loss": 3.2555, "theoretical_loss": 4.085317610356199, "tokens_seen": 340918272 }, { "epoch": 0.1, "learning_rate": 0.0004528686058174524, "loss": 3.1907, "theoretical_loss": 4.085227387435538, "tokens_seen": 340983808 }, { "epoch": 0.1, "learning_rate": 0.0004528585757271815, "loss": 3.1335, "theoretical_loss": 4.08513718670806, "tokens_seen": 341049344 }, { "epoch": 0.1, "learning_rate": 0.00045284854563691076, "loss": 3.1541, "theoretical_loss": 4.085047008164044, "tokens_seen": 341114880 }, { "epoch": 0.1, "learning_rate": 0.0004528385155466399, "loss": 3.2333, "theoretical_loss": 4.084956851793773, "tokens_seen": 341180416 }, { "epoch": 0.1, "learning_rate": 0.0004528284854563691, "loss": 3.0779, "theoretical_loss": 4.0848667175875395, "tokens_seen": 341245952 }, { "epoch": 0.1, "learning_rate": 0.0004528184553660983, "loss": 3.0413, "theoretical_loss": 4.0847766055356365, "tokens_seen": 341311488 }, { "epoch": 0.1, "learning_rate": 0.0004528084252758275, "loss": 3.0675, "theoretical_loss": 4.084686515628368, "tokens_seen": 341377024 }, { "epoch": 0.1, "learning_rate": 0.00045279839518555667, "loss": 3.2337, "theoretical_loss": 4.0845964478560415, "tokens_seen": 341442560 }, { "epoch": 0.1, "learning_rate": 0.00045278836509528585, "loss": 2.9671, "theoretical_loss": 4.084506402208972, "tokens_seen": 341508096 }, { "epoch": 0.1, "learning_rate": 0.00045277833500501503, "loss": 3.1882, "theoretical_loss": 4.0844163786774805, "tokens_seen": 341573632 }, { "epoch": 0.1, "learning_rate": 0.00045276830491474426, "loss": 3.1617, "theoretical_loss": 4.084326377251894, "tokens_seen": 341639168 }, { "epoch": 0.1, "learning_rate": 0.0004527582748244734, "loss": 3.1589, "theoretical_loss": 4.084236397922544, "tokens_seen": 341704704 }, { "epoch": 1.0, "learning_rate": 0.0004527482447342026, "loss": 4.0079, "theoretical_loss": 4.084142224475771, "tokens_seen": 341773312 }, { "epoch": 1.0, "learning_rate": 0.00045273821464393175, "loss": 3.1511, "theoretical_loss": 4.084052290344537, "tokens_seen": 341838848 }, { "epoch": 1.0, "learning_rate": 0.000452728184553661, "loss": 3.2864, "theoretical_loss": 4.0839623782801215, "tokens_seen": 341904384 }, { "epoch": 1.0, "learning_rate": 0.00045271815446339017, "loss": 2.9609, "theoretical_loss": 4.083872488272884, "tokens_seen": 341969920 }, { "epoch": 1.0, "learning_rate": 0.00045270812437311935, "loss": 3.2278, "theoretical_loss": 4.083782620313186, "tokens_seen": 342035456 }, { "epoch": 1.0, "learning_rate": 0.0004526980942828486, "loss": 3.3228, "theoretical_loss": 4.083692774391398, "tokens_seen": 342100992 }, { "epoch": 1.0, "learning_rate": 0.00045268806419257777, "loss": 3.1782, "theoretical_loss": 4.083602950497896, "tokens_seen": 342166528 }, { "epoch": 1.0, "learning_rate": 0.00045267803410230695, "loss": 3.2537, "theoretical_loss": 4.0835131486230605, "tokens_seen": 342232064 }, { "epoch": 1.0, "learning_rate": 0.00045266800401203613, "loss": 3.1816, "theoretical_loss": 4.0834233687572805, "tokens_seen": 342297600 }, { "epoch": 1.0, "learning_rate": 0.0004526579739217653, "loss": 3.3479, "theoretical_loss": 4.083333610890947, "tokens_seen": 342363136 }, { "epoch": 1.0, "objective/train/docs_used": 572633, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.151498794555664, "objective/train/theoretical_loss": 4.08326630692256, "objective/train/tokens_used": 362872288, "theoretical_loss": 4.08326630692256, "tokens_seen": 342412288 }, { "epoch": 1.0, "learning_rate": 0.0004526479438314945, "loss": 3.1636, "theoretical_loss": 4.083243875014463, "tokens_seen": 342428672 }, { "epoch": 1.0, "learning_rate": 0.0004526379137412237, "loss": 3.1817, "theoretical_loss": 4.083154161118232, "tokens_seen": 342494208 }, { "epoch": 1.0, "learning_rate": 0.00045262788365095285, "loss": 3.1402, "theoretical_loss": 4.083064469192665, "tokens_seen": 342559744 }, { "epoch": 1.0, "learning_rate": 0.0004526178535606821, "loss": 3.2679, "theoretical_loss": 4.082974799228182, "tokens_seen": 342625280 }, { "epoch": 1.0, "learning_rate": 0.0004526078234704112, "loss": 3.2866, "theoretical_loss": 4.082885151215207, "tokens_seen": 342690816 }, { "epoch": 1.0, "learning_rate": 0.00045259779338014045, "loss": 3.2124, "theoretical_loss": 4.082795525144167, "tokens_seen": 342756352 }, { "epoch": 1.0, "learning_rate": 0.00045258776328986963, "loss": 3.0954, "theoretical_loss": 4.082705921005499, "tokens_seen": 342821888 }, { "epoch": 1.0, "learning_rate": 0.0004525777331995988, "loss": 3.0787, "theoretical_loss": 4.082616338789646, "tokens_seen": 342887424 }, { "epoch": 1.0, "learning_rate": 0.000452567703109328, "loss": 3.0992, "theoretical_loss": 4.082526778487054, "tokens_seen": 342952960 }, { "epoch": 1.0, "learning_rate": 0.00045255767301905723, "loss": 3.323, "theoretical_loss": 4.082437240088177, "tokens_seen": 343018496 }, { "epoch": 1.0, "learning_rate": 0.00045254764292878636, "loss": 3.2146, "theoretical_loss": 4.082347723583476, "tokens_seen": 343084032 }, { "epoch": 1.0, "learning_rate": 0.0004525376128385156, "loss": 3.3184, "theoretical_loss": 4.082258228963416, "tokens_seen": 343149568 }, { "epoch": 1.0, "learning_rate": 0.0004525275827482447, "loss": 3.2744, "theoretical_loss": 4.082168756218468, "tokens_seen": 343215104 }, { "epoch": 1.0, "learning_rate": 0.00045251755265797395, "loss": 3.2044, "theoretical_loss": 4.08207930533911, "tokens_seen": 343280640 }, { "epoch": 1.0, "learning_rate": 0.00045250752256770314, "loss": 3.2921, "theoretical_loss": 4.081989876315825, "tokens_seen": 343346176 }, { "epoch": 1.0, "learning_rate": 0.0004524974924774323, "loss": 3.316, "theoretical_loss": 4.0819004691391045, "tokens_seen": 343411712 }, { "epoch": 1.0, "learning_rate": 0.0004524874623871615, "loss": 3.1405, "theoretical_loss": 4.081811083799442, "tokens_seen": 343477248 }, { "epoch": 1.0, "learning_rate": 0.0004524774322968907, "loss": 3.4024, "theoretical_loss": 4.081721720287339, "tokens_seen": 343542784 }, { "epoch": 1.0, "learning_rate": 0.00045246740220661986, "loss": 3.3114, "theoretical_loss": 4.081632378593305, "tokens_seen": 343608320 }, { "epoch": 1.0, "learning_rate": 0.0004524573721163491, "loss": 3.2612, "theoretical_loss": 4.081543058707851, "tokens_seen": 343673856 }, { "epoch": 1.0, "learning_rate": 0.0004524473420260782, "loss": 3.1847, "theoretical_loss": 4.081453760621496, "tokens_seen": 343739392 }, { "epoch": 1.0, "learning_rate": 0.00045243731193580746, "loss": 3.2959, "theoretical_loss": 4.081364484324768, "tokens_seen": 343804928 }, { "epoch": 1.0, "learning_rate": 0.0004524272818455366, "loss": 3.2117, "theoretical_loss": 4.081275229808195, "tokens_seen": 343870464 }, { "epoch": 1.0, "learning_rate": 0.0004524172517552658, "loss": 3.2561, "theoretical_loss": 4.081185997062316, "tokens_seen": 343936000 }, { "epoch": 1.0, "learning_rate": 0.000452407221664995, "loss": 3.2921, "theoretical_loss": 4.081096786077674, "tokens_seen": 344001536 }, { "epoch": 1.0, "objective/train/docs_used": 574101, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8993351459503174, "objective/train/theoretical_loss": 4.0810298921143175, "objective/train/tokens_used": 364510688, "theoretical_loss": 4.0810298921143175, "tokens_seen": 344050688 }, { "epoch": 1.0, "learning_rate": 0.0004523971915747242, "loss": 3.3233, "theoretical_loss": 4.081007596844816, "tokens_seen": 344067072 }, { "epoch": 1.0, "learning_rate": 0.00045238716148445336, "loss": 3.2095, "theoretical_loss": 4.080918429354298, "tokens_seen": 344132608 }, { "epoch": 1.0, "learning_rate": 0.0004523771313941826, "loss": 3.1904, "theoretical_loss": 4.080829283596681, "tokens_seen": 344198144 }, { "epoch": 1.0, "learning_rate": 0.0004523671013039117, "loss": 3.1623, "theoretical_loss": 4.080740159562531, "tokens_seen": 344263680 }, { "epoch": 1.0, "learning_rate": 0.00045235707121364096, "loss": 2.9389, "theoretical_loss": 4.08065105724242, "tokens_seen": 344329216 }, { "epoch": 1.0, "learning_rate": 0.0004523470411233701, "loss": 3.2678, "theoretical_loss": 4.080561976626927, "tokens_seen": 344394752 }, { "epoch": 1.0, "learning_rate": 0.0004523370110330993, "loss": 3.4072, "theoretical_loss": 4.080472917706636, "tokens_seen": 344460288 }, { "epoch": 1.0, "learning_rate": 0.0004523269809428285, "loss": 3.2977, "theoretical_loss": 4.080383880472137, "tokens_seen": 344525824 }, { "epoch": 1.0, "learning_rate": 0.0004523169508525577, "loss": 3.2358, "theoretical_loss": 4.080294864914026, "tokens_seen": 344591360 }, { "epoch": 1.0, "learning_rate": 0.00045230692076228687, "loss": 3.1965, "theoretical_loss": 4.080205871022905, "tokens_seen": 344656896 }, { "epoch": 1.0, "learning_rate": 0.00045229689067201605, "loss": 3.3523, "theoretical_loss": 4.080116898789382, "tokens_seen": 344722432 }, { "epoch": 1.0, "learning_rate": 0.00045228686058174523, "loss": 3.2526, "theoretical_loss": 4.080027948204069, "tokens_seen": 344787968 }, { "epoch": 1.0, "learning_rate": 0.00045227683049147446, "loss": 3.2564, "theoretical_loss": 4.079939019257587, "tokens_seen": 344853504 }, { "epoch": 1.0, "learning_rate": 0.0004522668004012036, "loss": 3.4165, "theoretical_loss": 4.07985011194056, "tokens_seen": 344919040 }, { "epoch": 1.0, "learning_rate": 0.0004522567703109328, "loss": 3.2892, "theoretical_loss": 4.079761226243621, "tokens_seen": 344984576 }, { "epoch": 1.0, "learning_rate": 0.00045224674022066195, "loss": 3.405, "theoretical_loss": 4.079672362157404, "tokens_seen": 345050112 }, { "epoch": 1.0, "learning_rate": 0.0004522367101303912, "loss": 3.0565, "theoretical_loss": 4.079583519672554, "tokens_seen": 345115648 }, { "epoch": 1.0, "learning_rate": 0.00045222668004012037, "loss": 3.1374, "theoretical_loss": 4.079494698779719, "tokens_seen": 345181184 }, { "epoch": 1.0, "learning_rate": 0.00045221664994984955, "loss": 3.3058, "theoretical_loss": 4.079405899469553, "tokens_seen": 345246720 }, { "epoch": 1.0, "learning_rate": 0.00045220661985957873, "loss": 3.123, "theoretical_loss": 4.079317121732716, "tokens_seen": 345312256 }, { "epoch": 1.0, "learning_rate": 0.00045219658976930797, "loss": 3.3065, "theoretical_loss": 4.0792283655598744, "tokens_seen": 345377792 }, { "epoch": 1.0, "learning_rate": 0.0004521865596790371, "loss": 3.0633, "theoretical_loss": 4.079139630941701, "tokens_seen": 345443328 }, { "epoch": 1.0, "learning_rate": 0.00045217652958876633, "loss": 3.2757, "theoretical_loss": 4.079050917868872, "tokens_seen": 345508864 }, { "epoch": 1.0, "learning_rate": 0.00045216649949849546, "loss": 3.1942, "theoretical_loss": 4.078962226332071, "tokens_seen": 345574400 }, { "epoch": 1.0, "learning_rate": 0.0004521564694082247, "loss": 3.3299, "theoretical_loss": 4.078873556321988, "tokens_seen": 345639936 }, { "epoch": 1.0, "objective/train/docs_used": 577024, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3595542907714844, "objective/train/theoretical_loss": 4.078807067935736, "objective/train/tokens_used": 366149088, "theoretical_loss": 4.078807067935736, "tokens_seen": 345689088 }, { "epoch": 1.0, "learning_rate": 0.00045214643931795387, "loss": 3.305, "theoretical_loss": 4.078784907829317, "tokens_seen": 345705472 }, { "epoch": 1.0, "learning_rate": 0.00045213640922768305, "loss": 3.2556, "theoretical_loss": 4.07869628084476, "tokens_seen": 345771008 }, { "epoch": 1.0, "learning_rate": 0.00045212637913741223, "loss": 3.2669, "theoretical_loss": 4.078607675359023, "tokens_seen": 345836544 }, { "epoch": 1.0, "learning_rate": 0.0004521163490471414, "loss": 3.2201, "theoretical_loss": 4.078519091362818, "tokens_seen": 345902080 }, { "epoch": 1.0, "learning_rate": 0.0004521063189568706, "loss": 3.2917, "theoretical_loss": 4.078430528846862, "tokens_seen": 345967616 }, { "epoch": 1.0, "learning_rate": 0.00045209628886659983, "loss": 3.2856, "theoretical_loss": 4.078341987801882, "tokens_seen": 346033152 }, { "epoch": 1.0, "learning_rate": 0.00045208625877632896, "loss": 3.0754, "theoretical_loss": 4.078253468218605, "tokens_seen": 346098688 }, { "epoch": 1.0, "learning_rate": 0.0004520762286860582, "loss": 3.1566, "theoretical_loss": 4.078164970087768, "tokens_seen": 346164224 }, { "epoch": 1.0, "learning_rate": 0.0004520661985957873, "loss": 3.2839, "theoretical_loss": 4.078076493400111, "tokens_seen": 346229760 }, { "epoch": 1.0, "learning_rate": 0.00045205616850551656, "loss": 3.2522, "theoretical_loss": 4.0779880381463816, "tokens_seen": 346295296 }, { "epoch": 1.0, "learning_rate": 0.00045204613841524574, "loss": 3.1443, "theoretical_loss": 4.077899604317332, "tokens_seen": 346360832 }, { "epoch": 1.0, "learning_rate": 0.0004520361083249749, "loss": 3.2144, "theoretical_loss": 4.077811191903721, "tokens_seen": 346426368 }, { "epoch": 1.0, "learning_rate": 0.0004520260782347041, "loss": 3.33, "theoretical_loss": 4.0777228008963124, "tokens_seen": 346491904 }, { "epoch": 1.0, "learning_rate": 0.00045201604814443334, "loss": 3.1137, "theoretical_loss": 4.077634431285876, "tokens_seen": 346557440 }, { "epoch": 1.0, "learning_rate": 0.00045200601805416246, "loss": 3.352, "theoretical_loss": 4.077546083063188, "tokens_seen": 346622976 }, { "epoch": 1.0, "learning_rate": 0.0004519959879638917, "loss": 3.0892, "theoretical_loss": 4.077457756219029, "tokens_seen": 346688512 }, { "epoch": 1.0, "learning_rate": 0.0004519859578736208, "loss": 3.0769, "theoretical_loss": 4.077369450744186, "tokens_seen": 346754048 }, { "epoch": 1.0, "learning_rate": 0.00045197592778335006, "loss": 3.1657, "theoretical_loss": 4.077281166629453, "tokens_seen": 346819584 }, { "epoch": 1.0, "learning_rate": 0.00045196589769307924, "loss": 3.2769, "theoretical_loss": 4.077192903865626, "tokens_seen": 346885120 }, { "epoch": 1.0, "learning_rate": 0.0004519558676028084, "loss": 3.3533, "theoretical_loss": 4.077104662443512, "tokens_seen": 346950656 }, { "epoch": 1.0, "learning_rate": 0.00045194583751253766, "loss": 3.1981, "theoretical_loss": 4.0770164423539175, "tokens_seen": 347016192 }, { "epoch": 1.0, "learning_rate": 0.0004519358074222668, "loss": 3.176, "theoretical_loss": 4.076928243587662, "tokens_seen": 347081728 }, { "epoch": 1.0, "learning_rate": 0.000451925777331996, "loss": 3.2111, "theoretical_loss": 4.0768400661355635, "tokens_seen": 347147264 }, { "epoch": 1.0, "learning_rate": 0.0004519157472417252, "loss": 3.1456, "theoretical_loss": 4.07675190998845, "tokens_seen": 347212800 }, { "epoch": 1.0, "learning_rate": 0.0004519057171514544, "loss": 3.1999, "theoretical_loss": 4.0766637751371535, "tokens_seen": 347278336 }, { "epoch": 1.0, "objective/train/docs_used": 580707, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.278878927230835, "objective/train/theoretical_loss": 4.07659768796855, "objective/train/tokens_used": 367787488, "theoretical_loss": 4.07659768796855, "tokens_seen": 347327488 }, { "epoch": 1.0, "learning_rate": 0.00045189568706118356, "loss": 3.2594, "theoretical_loss": 4.076575661572513, "tokens_seen": 347343872 }, { "epoch": 1.0, "learning_rate": 0.0004518856569709128, "loss": 3.1739, "theoretical_loss": 4.076487569285373, "tokens_seen": 347409408 }, { "epoch": 1.0, "learning_rate": 0.0004518756268806419, "loss": 3.1588, "theoretical_loss": 4.076399498266582, "tokens_seen": 347474944 }, { "epoch": 1.0, "learning_rate": 0.00045186559679037116, "loss": 3.1848, "theoretical_loss": 4.076311448506995, "tokens_seen": 347540480 }, { "epoch": 1.0, "learning_rate": 0.0004518555667001003, "loss": 3.1744, "theoretical_loss": 4.076223419997474, "tokens_seen": 347606016 }, { "epoch": 1.0, "learning_rate": 0.0004518455366098295, "loss": 3.2924, "theoretical_loss": 4.076135412728885, "tokens_seen": 347671552 }, { "epoch": 1.0, "learning_rate": 0.0004518355065195587, "loss": 3.0729, "theoretical_loss": 4.0760474266920985, "tokens_seen": 347737088 }, { "epoch": 1.0, "learning_rate": 0.0004518254764292879, "loss": 3.1451, "theoretical_loss": 4.0759594618779955, "tokens_seen": 347802624 }, { "epoch": 1.0, "learning_rate": 0.00045181544633901707, "loss": 3.1382, "theoretical_loss": 4.075871518277458, "tokens_seen": 347868160 }, { "epoch": 1.0, "learning_rate": 0.00045180541624874625, "loss": 2.9616, "theoretical_loss": 4.075783595881374, "tokens_seen": 347933696 }, { "epoch": 1.0, "learning_rate": 0.00045179538615847543, "loss": 3.3424, "theoretical_loss": 4.07569569468064, "tokens_seen": 347999232 }, { "epoch": 1.0, "learning_rate": 0.00045178535606820466, "loss": 3.3062, "theoretical_loss": 4.075607814666155, "tokens_seen": 348064768 }, { "epoch": 1.0, "learning_rate": 0.0004517753259779338, "loss": 3.1515, "theoretical_loss": 4.075519955828825, "tokens_seen": 348130304 }, { "epoch": 1.0, "learning_rate": 0.000451765295887663, "loss": 3.0296, "theoretical_loss": 4.075432118159563, "tokens_seen": 348195840 }, { "epoch": 1.0, "learning_rate": 0.00045175526579739215, "loss": 3.1444, "theoretical_loss": 4.075344301649285, "tokens_seen": 348261376 }, { "epoch": 1.0, "learning_rate": 0.0004517452357071214, "loss": 3.0994, "theoretical_loss": 4.075256506288914, "tokens_seen": 348326912 }, { "epoch": 1.0, "learning_rate": 0.00045173520561685057, "loss": 3.2109, "theoretical_loss": 4.075168732069379, "tokens_seen": 348392448 }, { "epoch": 1.0, "learning_rate": 0.00045172517552657975, "loss": 3.1103, "theoretical_loss": 4.075080978981614, "tokens_seen": 348457984 }, { "epoch": 1.0, "learning_rate": 0.00045171514543630893, "loss": 3.1479, "theoretical_loss": 4.0749932470165575, "tokens_seen": 348523520 }, { "epoch": 1.0, "learning_rate": 0.00045170511534603817, "loss": 3.3004, "theoretical_loss": 4.074905536165154, "tokens_seen": 348589056 }, { "epoch": 1.0, "learning_rate": 0.0004516950852557673, "loss": 3.1292, "theoretical_loss": 4.074817846418357, "tokens_seen": 348654592 }, { "epoch": 1.0, "learning_rate": 0.00045168505516549653, "loss": 3.1447, "theoretical_loss": 4.074730177767121, "tokens_seen": 348720128 }, { "epoch": 1.0, "learning_rate": 0.00045167502507522566, "loss": 3.2509, "theoretical_loss": 4.074642530202409, "tokens_seen": 348785664 }, { "epoch": 1.0, "learning_rate": 0.0004516649949849549, "loss": 3.1719, "theoretical_loss": 4.074554903715186, "tokens_seen": 348851200 }, { "epoch": 1.0, "learning_rate": 0.00045165496489468407, "loss": 3.1474, "theoretical_loss": 4.074467298296429, "tokens_seen": 348916736 }, { "epoch": 1.0, "objective/train/docs_used": 582101, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3295605182647705, "objective/train/theoretical_loss": 4.074401608053112, "objective/train/tokens_used": 369425888, "theoretical_loss": 4.074401608053112, "tokens_seen": 348965888 }, { "epoch": 1.0, "learning_rate": 0.00045164493480441325, "loss": 3.2704, "theoretical_loss": 4.0743797139371125, "tokens_seen": 348982272 }, { "epoch": 1.0, "learning_rate": 0.00045163490471414244, "loss": 3.4079, "theoretical_loss": 4.0742921506282235, "tokens_seen": 349047808 }, { "epoch": 1.0, "learning_rate": 0.0004516248746238716, "loss": 3.2846, "theoretical_loss": 4.07420460836075, "tokens_seen": 349113344 }, { "epoch": 1.0, "learning_rate": 0.0004516148445336008, "loss": 3.2992, "theoretical_loss": 4.074117087125689, "tokens_seen": 349178880 }, { "epoch": 1.0, "learning_rate": 0.00045160481444333003, "loss": 3.1511, "theoretical_loss": 4.07402958691404, "tokens_seen": 349244416 }, { "epoch": 1.0, "learning_rate": 0.00045159478435305916, "loss": 3.3442, "theoretical_loss": 4.073942107716809, "tokens_seen": 349309952 }, { "epoch": 1.0, "learning_rate": 0.0004515847542627884, "loss": 3.2204, "theoretical_loss": 4.07385464952501, "tokens_seen": 349375488 }, { "epoch": 1.0, "learning_rate": 0.0004515747241725175, "loss": 3.321, "theoretical_loss": 4.073767212329658, "tokens_seen": 349441024 }, { "epoch": 1.0, "learning_rate": 0.00045156469408224676, "loss": 3.1654, "theoretical_loss": 4.073679796121777, "tokens_seen": 349506560 }, { "epoch": 1.0, "learning_rate": 0.00045155466399197594, "loss": 2.9132, "theoretical_loss": 4.073592400892395, "tokens_seen": 349572096 }, { "epoch": 1.0, "learning_rate": 0.0004515446339017051, "loss": 3.1579, "theoretical_loss": 4.073505026632548, "tokens_seen": 349637632 }, { "epoch": 1.0, "learning_rate": 0.0004515346038114343, "loss": 3.1462, "theoretical_loss": 4.073417673333272, "tokens_seen": 349703168 }, { "epoch": 1.0, "learning_rate": 0.00045152457372116354, "loss": 3.0663, "theoretical_loss": 4.073330340985614, "tokens_seen": 349768704 }, { "epoch": 1.0, "learning_rate": 0.00045151454363089266, "loss": 3.3219, "theoretical_loss": 4.073243029580625, "tokens_seen": 349834240 }, { "epoch": 1.0, "learning_rate": 0.0004515045135406219, "loss": 3.1026, "theoretical_loss": 4.073155739109359, "tokens_seen": 349899776 }, { "epoch": 1.0, "learning_rate": 0.000451494483450351, "loss": 3.1257, "theoretical_loss": 4.07306846956288, "tokens_seen": 349965312 }, { "epoch": 1.0, "learning_rate": 0.00045148445336008026, "loss": 3.1401, "theoretical_loss": 4.072981220932253, "tokens_seen": 350030848 }, { "epoch": 1.0, "learning_rate": 0.00045147442326980944, "loss": 3.1261, "theoretical_loss": 4.0728939932085515, "tokens_seen": 350096384 }, { "epoch": 1.0, "learning_rate": 0.0004514643931795386, "loss": 3.2255, "theoretical_loss": 4.072806786382853, "tokens_seen": 350161920 }, { "epoch": 1.0, "learning_rate": 0.0004514543630892678, "loss": 3.3577, "theoretical_loss": 4.072719600446241, "tokens_seen": 350227456 }, { "epoch": 1.0, "learning_rate": 0.000451444332998997, "loss": 3.2905, "theoretical_loss": 4.072632435389805, "tokens_seen": 350292992 }, { "epoch": 1.0, "learning_rate": 0.00045143430290872617, "loss": 3.149, "theoretical_loss": 4.072545291204638, "tokens_seen": 350358528 }, { "epoch": 1.0, "learning_rate": 0.0004514242728184554, "loss": 3.1258, "theoretical_loss": 4.072458167881841, "tokens_seen": 350424064 }, { "epoch": 1.0, "learning_rate": 0.00045141424272818453, "loss": 3.2519, "theoretical_loss": 4.0723710654125185, "tokens_seen": 350489600 }, { "epoch": 1.0, "learning_rate": 0.00045140421263791376, "loss": 3.0796, "theoretical_loss": 4.072283983787782, "tokens_seen": 350555136 }, { "epoch": 1.0, "objective/train/docs_used": 585026, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3945939540863037, "objective/train/theoretical_loss": 4.0722186862431435, "objective/train/tokens_used": 371064288, "theoretical_loss": 4.0722186862431435, "tokens_seen": 350604288 }, { "epoch": 1.0, "learning_rate": 0.00045139418254764294, "loss": 3.3716, "theoretical_loss": 4.072196922998746, "tokens_seen": 350620672 }, { "epoch": 1.0, "learning_rate": 0.0004513841524573721, "loss": 3.1792, "theoretical_loss": 4.072109883036535, "tokens_seen": 350686208 }, { "epoch": 1.0, "learning_rate": 0.0004513741223671013, "loss": 3.1439, "theoretical_loss": 4.072022863892274, "tokens_seen": 350751744 }, { "epoch": 1.0, "learning_rate": 0.0004513640922768305, "loss": 3.2777, "theoretical_loss": 4.071935865557095, "tokens_seen": 350817280 }, { "epoch": 1.0, "learning_rate": 0.00045135406218655967, "loss": 3.2674, "theoretical_loss": 4.071848888022138, "tokens_seen": 350882816 }, { "epoch": 1.0, "learning_rate": 0.0004513440320962889, "loss": 3.3655, "theoretical_loss": 4.0717619312785445, "tokens_seen": 350948352 }, { "epoch": 1.0, "learning_rate": 0.00045133400200601803, "loss": 3.1513, "theoretical_loss": 4.071674995317464, "tokens_seen": 351013888 }, { "epoch": 1.0, "learning_rate": 0.00045132397191574727, "loss": 3.3297, "theoretical_loss": 4.071588080130051, "tokens_seen": 351079424 }, { "epoch": 1.0, "learning_rate": 0.0004513139418254764, "loss": 3.1543, "theoretical_loss": 4.071501185707465, "tokens_seen": 351144960 }, { "epoch": 1.0, "learning_rate": 0.00045130391173520563, "loss": 3.0431, "theoretical_loss": 4.071414312040871, "tokens_seen": 351210496 }, { "epoch": 1.0, "learning_rate": 0.0004512938816449348, "loss": 3.207, "theoretical_loss": 4.07132745912144, "tokens_seen": 351276032 }, { "epoch": 1.0, "learning_rate": 0.000451283851554664, "loss": 3.2479, "theoretical_loss": 4.071240626940346, "tokens_seen": 351341568 }, { "epoch": 1.0, "learning_rate": 0.00045127382146439317, "loss": 3.346, "theoretical_loss": 4.071153815488772, "tokens_seen": 351407104 }, { "epoch": 1.0, "learning_rate": 0.00045126379137412235, "loss": 3.164, "theoretical_loss": 4.0710670247579035, "tokens_seen": 351472640 }, { "epoch": 1.0, "learning_rate": 0.00045125376128385153, "loss": 3.188, "theoretical_loss": 4.070980254738934, "tokens_seen": 351538176 }, { "epoch": 1.0, "learning_rate": 0.00045124373119358077, "loss": 3.2165, "theoretical_loss": 4.07089350542306, "tokens_seen": 351603712 }, { "epoch": 1.0, "learning_rate": 0.0004512337011033099, "loss": 3.214, "theoretical_loss": 4.070806776801484, "tokens_seen": 351669248 }, { "epoch": 1.0, "learning_rate": 0.00045122367101303913, "loss": 3.2569, "theoretical_loss": 4.070720068865414, "tokens_seen": 351734784 }, { "epoch": 1.0, "learning_rate": 0.0004512136409227683, "loss": 3.2842, "theoretical_loss": 4.070633381606065, "tokens_seen": 351800320 }, { "epoch": 1.0, "learning_rate": 0.0004512036108324975, "loss": 3.2326, "theoretical_loss": 4.070546715014654, "tokens_seen": 351865856 }, { "epoch": 1.0, "learning_rate": 0.00045119358074222673, "loss": 3.3039, "theoretical_loss": 4.070460069082406, "tokens_seen": 351931392 }, { "epoch": 1.0, "learning_rate": 0.00045118355065195586, "loss": 3.2128, "theoretical_loss": 4.070373443800552, "tokens_seen": 351996928 }, { "epoch": 1.0, "learning_rate": 0.0004511735205616851, "loss": 3.2679, "theoretical_loss": 4.070286839160325, "tokens_seen": 352062464 }, { "epoch": 1.0, "learning_rate": 0.0004511634904714143, "loss": 3.2887, "theoretical_loss": 4.070200255152967, "tokens_seen": 352128000 }, { "epoch": 1.0, "learning_rate": 0.00045115346038114345, "loss": 3.1375, "theoretical_loss": 4.070113691769722, "tokens_seen": 352193536 }, { "epoch": 1.0, "objective/train/docs_used": 588030, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.242804527282715, "objective/train/theoretical_loss": 4.070048782761599, "objective/train/tokens_used": 372702688, "theoretical_loss": 4.070048782761599, "tokens_seen": 352242688 }, { "epoch": 1.0, "learning_rate": 0.00045114343029087264, "loss": 3.1028, "theoretical_loss": 4.070027149001842, "tokens_seen": 352259072 }, { "epoch": 1.0, "learning_rate": 0.0004511334002006018, "loss": 3.1552, "theoretical_loss": 4.069940626840584, "tokens_seen": 352324608 }, { "epoch": 1.0, "learning_rate": 0.000451123370110331, "loss": 3.2479, "theoretical_loss": 4.069854125277208, "tokens_seen": 352390144 }, { "epoch": 1.0, "learning_rate": 0.00045111334002006023, "loss": 3.0966, "theoretical_loss": 4.069767644302982, "tokens_seen": 352455680 }, { "epoch": 1.0, "learning_rate": 0.00045110330992978936, "loss": 3.338, "theoretical_loss": 4.069681183909179, "tokens_seen": 352521216 }, { "epoch": 1.0, "learning_rate": 0.0004510932798395186, "loss": 3.0606, "theoretical_loss": 4.069594744087076, "tokens_seen": 352586752 }, { "epoch": 1.0, "learning_rate": 0.0004510832497492477, "loss": 3.142, "theoretical_loss": 4.0695083248279555, "tokens_seen": 352652288 }, { "epoch": 1.0, "learning_rate": 0.00045107321965897696, "loss": 3.2738, "theoretical_loss": 4.069421926123106, "tokens_seen": 352717824 }, { "epoch": 1.0, "learning_rate": 0.00045106318956870614, "loss": 3.1903, "theoretical_loss": 4.0693355479638225, "tokens_seen": 352783360 }, { "epoch": 1.0, "learning_rate": 0.0004510531594784353, "loss": 3.2074, "theoretical_loss": 4.069249190341402, "tokens_seen": 352848896 }, { "epoch": 1.0, "learning_rate": 0.0004510431293881645, "loss": 3.1929, "theoretical_loss": 4.06916285324715, "tokens_seen": 352914432 }, { "epoch": 1.0, "learning_rate": 0.00045103309929789374, "loss": 3.2126, "theoretical_loss": 4.069076536672376, "tokens_seen": 352979968 }, { "epoch": 1.0, "learning_rate": 0.00045102306920762286, "loss": 3.1821, "theoretical_loss": 4.068990240608394, "tokens_seen": 353045504 }, { "epoch": 1.0, "learning_rate": 0.0004510130391173521, "loss": 3.2325, "theoretical_loss": 4.068903965046524, "tokens_seen": 353111040 }, { "epoch": 1.0, "learning_rate": 0.0004510030090270812, "loss": 3.333, "theoretical_loss": 4.0688177099780924, "tokens_seen": 353176576 }, { "epoch": 1.0, "learning_rate": 0.00045099297893681046, "loss": 3.1897, "theoretical_loss": 4.068731475394429, "tokens_seen": 353242112 }, { "epoch": 1.0, "learning_rate": 0.00045098294884653964, "loss": 3.202, "theoretical_loss": 4.06864526128687, "tokens_seen": 353307648 }, { "epoch": 1.0, "learning_rate": 0.0004509729187562688, "loss": 3.3567, "theoretical_loss": 4.068559067646758, "tokens_seen": 353373184 }, { "epoch": 1.0, "learning_rate": 0.000450962888665998, "loss": 3.105, "theoretical_loss": 4.068472894465437, "tokens_seen": 353438720 }, { "epoch": 1.0, "learning_rate": 0.0004509528585757272, "loss": 3.2402, "theoretical_loss": 4.06838674173426, "tokens_seen": 353504256 }, { "epoch": 1.0, "learning_rate": 0.00045094282848545637, "loss": 3.3477, "theoretical_loss": 4.0683006094445835, "tokens_seen": 353569792 }, { "epoch": 1.0, "learning_rate": 0.0004509327983951856, "loss": 3.0439, "theoretical_loss": 4.068214497587771, "tokens_seen": 353635328 }, { "epoch": 1.0, "learning_rate": 0.00045092276830491473, "loss": 3.1762, "theoretical_loss": 4.068128406155188, "tokens_seen": 353700864 }, { "epoch": 1.0, "learning_rate": 0.00045091273821464396, "loss": 3.1794, "theoretical_loss": 4.068042335138209, "tokens_seen": 353766400 }, { "epoch": 1.0, "learning_rate": 0.00045090270812437314, "loss": 3.2916, "theoretical_loss": 4.067956284528211, "tokens_seen": 353831936 }, { "epoch": 1.0, "objective/train/docs_used": 591016, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0825870037078857, "objective/train/theoretical_loss": 4.067891759957611, "objective/train/tokens_used": 374341088, "theoretical_loss": 4.067891759957611, "tokens_seen": 353881088 }, { "epoch": 1.0, "learning_rate": 0.0004508926780341023, "loss": 3.1251, "theoretical_loss": 4.067870254316579, "tokens_seen": 353897472 }, { "epoch": 1.0, "learning_rate": 0.0004508826479438315, "loss": 3.2339, "theoretical_loss": 4.067784244494698, "tokens_seen": 353963008 }, { "epoch": 1.0, "learning_rate": 0.0004508726178535607, "loss": 3.1963, "theoretical_loss": 4.067698255053965, "tokens_seen": 354028544 }, { "epoch": 1.0, "learning_rate": 0.00045086258776328987, "loss": 3.1805, "theoretical_loss": 4.067612285985777, "tokens_seen": 354094080 }, { "epoch": 1.0, "learning_rate": 0.0004508525576730191, "loss": 3.4301, "theoretical_loss": 4.067526337281539, "tokens_seen": 354159616 }, { "epoch": 1.0, "learning_rate": 0.00045084252758274823, "loss": 3.2, "theoretical_loss": 4.06744040893266, "tokens_seen": 354225152 }, { "epoch": 1.0, "learning_rate": 0.00045083249749247747, "loss": 3.2281, "theoretical_loss": 4.067354500930554, "tokens_seen": 354290688 }, { "epoch": 1.0, "learning_rate": 0.0004508224674022066, "loss": 3.2222, "theoretical_loss": 4.067268613266641, "tokens_seen": 354356224 }, { "epoch": 1.0, "learning_rate": 0.00045081243731193583, "loss": 3.2451, "theoretical_loss": 4.067182745932348, "tokens_seen": 354421760 }, { "epoch": 1.0, "learning_rate": 0.000450802407221665, "loss": 3.1946, "theoretical_loss": 4.067096898919102, "tokens_seen": 354487296 }, { "epoch": 1.0, "learning_rate": 0.0004507923771313942, "loss": 3.2736, "theoretical_loss": 4.0670110722183415, "tokens_seen": 354552832 }, { "epoch": 1.0, "learning_rate": 0.00045078234704112337, "loss": 3.1553, "theoretical_loss": 4.066925265821504, "tokens_seen": 354618368 }, { "epoch": 1.0, "learning_rate": 0.00045077231695085255, "loss": 3.2572, "theoretical_loss": 4.066839479720038, "tokens_seen": 354683904 }, { "epoch": 1.0, "learning_rate": 0.00045076228686058173, "loss": 3.2032, "theoretical_loss": 4.066753713905392, "tokens_seen": 354749440 }, { "epoch": 1.0, "learning_rate": 0.00045075225677031097, "loss": 3.1612, "theoretical_loss": 4.066667968369023, "tokens_seen": 354814976 }, { "epoch": 1.0, "learning_rate": 0.0004507422266800401, "loss": 3.2938, "theoretical_loss": 4.066582243102394, "tokens_seen": 354880512 }, { "epoch": 1.0, "learning_rate": 0.00045073219658976933, "loss": 3.1224, "theoretical_loss": 4.066496538096969, "tokens_seen": 354946048 }, { "epoch": 1.0, "learning_rate": 0.0004507221664994985, "loss": 3.2743, "theoretical_loss": 4.06641085334422, "tokens_seen": 355011584 }, { "epoch": 1.0, "learning_rate": 0.0004507121364092277, "loss": 3.0825, "theoretical_loss": 4.066325188835625, "tokens_seen": 355077120 }, { "epoch": 1.0, "learning_rate": 0.0004507021063189569, "loss": 3.1583, "theoretical_loss": 4.066239544562666, "tokens_seen": 355142656 }, { "epoch": 1.0, "learning_rate": 0.00045069207622868606, "loss": 3.2246, "theoretical_loss": 4.066153920516828, "tokens_seen": 355208192 }, { "epoch": 1.0, "learning_rate": 0.00045068204613841524, "loss": 3.2781, "theoretical_loss": 4.066068316689606, "tokens_seen": 355273728 }, { "epoch": 1.0, "learning_rate": 0.0004506720160481445, "loss": 3.2817, "theoretical_loss": 4.065982733072495, "tokens_seen": 355339264 }, { "epoch": 1.0, "learning_rate": 0.0004506619859578736, "loss": 3.1662, "theoretical_loss": 4.065897169656999, "tokens_seen": 355404800 }, { "epoch": 1.0, "learning_rate": 0.00045065195586760284, "loss": 3.094, "theoretical_loss": 4.065811626434625, "tokens_seen": 355470336 }, { "epoch": 1.0, "objective/train/docs_used": 592871, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.293246030807495, "objective/train/theoretical_loss": 4.065747482264476, "objective/train/tokens_used": 375979488, "theoretical_loss": 4.065747482264476, "tokens_seen": 355519488 }, { "epoch": 1.0, "learning_rate": 0.00045064192577733196, "loss": 3.2318, "theoretical_loss": 4.065726103396887, "tokens_seen": 355535872 }, { "epoch": 1.0, "learning_rate": 0.0004506318956870612, "loss": 3.2819, "theoretical_loss": 4.065640600535302, "tokens_seen": 355601408 }, { "epoch": 1.0, "learning_rate": 0.0004506218655967904, "loss": 3.2374, "theoretical_loss": 4.0655551178413925, "tokens_seen": 355666944 }, { "epoch": 1.0, "learning_rate": 0.00045061183550651956, "loss": 3.3307, "theoretical_loss": 4.065469655306687, "tokens_seen": 355732480 }, { "epoch": 1.0, "learning_rate": 0.00045060180541624874, "loss": 3.3814, "theoretical_loss": 4.065384212922719, "tokens_seen": 355798016 }, { "epoch": 1.0, "learning_rate": 0.0004505917753259779, "loss": 3.2952, "theoretical_loss": 4.0652987906810285, "tokens_seen": 355863552 }, { "epoch": 1.0, "learning_rate": 0.0004505817452357071, "loss": 3.3166, "theoretical_loss": 4.065213388573157, "tokens_seen": 355929088 }, { "epoch": 1.0, "learning_rate": 0.00045057171514543634, "loss": 3.073, "theoretical_loss": 4.065128006590653, "tokens_seen": 355994624 }, { "epoch": 1.0, "learning_rate": 0.00045056168505516547, "loss": 3.2121, "theoretical_loss": 4.065042644725072, "tokens_seen": 356060160 }, { "epoch": 1.0, "learning_rate": 0.0004505516549648947, "loss": 3.1726, "theoretical_loss": 4.064957302967971, "tokens_seen": 356125696 }, { "epoch": 1.0, "learning_rate": 0.0004505416248746239, "loss": 3.1558, "theoretical_loss": 4.064871981310915, "tokens_seen": 356191232 }, { "epoch": 1.0, "learning_rate": 0.00045053159478435306, "loss": 3.298, "theoretical_loss": 4.064786679745474, "tokens_seen": 356256768 }, { "epoch": 1.0, "learning_rate": 0.00045052156469408224, "loss": 3.2767, "theoretical_loss": 4.064701398263219, "tokens_seen": 356322304 }, { "epoch": 1.0, "learning_rate": 0.0004505115346038114, "loss": 3.269, "theoretical_loss": 4.064616136855733, "tokens_seen": 356387840 }, { "epoch": 1.0, "learning_rate": 0.0004505015045135406, "loss": 3.3209, "theoretical_loss": 4.064530895514597, "tokens_seen": 356453376 }, { "epoch": 1.0, "learning_rate": 0.00045049147442326984, "loss": 3.279, "theoretical_loss": 4.064445674231402, "tokens_seen": 356518912 }, { "epoch": 1.0, "learning_rate": 0.00045048144433299897, "loss": 3.1061, "theoretical_loss": 4.064360472997743, "tokens_seen": 356584448 }, { "epoch": 1.0, "learning_rate": 0.0004504714142427282, "loss": 3.2923, "theoretical_loss": 4.0642752918052185, "tokens_seen": 356649984 }, { "epoch": 1.0, "learning_rate": 0.00045046138415245733, "loss": 3.2879, "theoretical_loss": 4.064190130645432, "tokens_seen": 356715520 }, { "epoch": 1.0, "learning_rate": 0.00045045135406218657, "loss": 3.0167, "theoretical_loss": 4.064104989509996, "tokens_seen": 356781056 }, { "epoch": 1.0, "learning_rate": 0.0004504413239719158, "loss": 3.0849, "theoretical_loss": 4.064019868390522, "tokens_seen": 356846592 }, { "epoch": 1.0, "learning_rate": 0.00045043129388164493, "loss": 3.1723, "theoretical_loss": 4.063934767278632, "tokens_seen": 356912128 }, { "epoch": 1.0, "learning_rate": 0.00045042126379137416, "loss": 3.2252, "theoretical_loss": 4.06384968616595, "tokens_seen": 356977664 }, { "epoch": 1.0, "learning_rate": 0.00045041123370110334, "loss": 3.3523, "theoretical_loss": 4.063764625044106, "tokens_seen": 357043200 }, { "epoch": 1.0, "learning_rate": 0.0004504012036108325, "loss": 3.1049, "theoretical_loss": 4.063679583904735, "tokens_seen": 357108736 }, { "epoch": 1.0, "objective/train/docs_used": 595649, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9357666969299316, "objective/train/theoretical_loss": 4.063615816158674, "objective/train/tokens_used": 377617888, "theoretical_loss": 4.063615816158674, "tokens_seen": 357157888 }, { "epoch": 1.0, "learning_rate": 0.0004503911735205617, "loss": 3.0062, "theoretical_loss": 4.063594562739476, "tokens_seen": 357174272 }, { "epoch": 1.0, "learning_rate": 0.0004503811434302909, "loss": 3.3135, "theoretical_loss": 4.063509561539974, "tokens_seen": 357239808 }, { "epoch": 1.0, "learning_rate": 0.00045037111334002007, "loss": 3.0913, "theoretical_loss": 4.06342458029788, "tokens_seen": 357305344 }, { "epoch": 1.0, "learning_rate": 0.0004503610832497493, "loss": 3.0512, "theoretical_loss": 4.063339619004848, "tokens_seen": 357370880 }, { "epoch": 1.0, "learning_rate": 0.00045035105315947843, "loss": 3.2051, "theoretical_loss": 4.063254677652539, "tokens_seen": 357436416 }, { "epoch": 1.0, "learning_rate": 0.00045034102306920767, "loss": 3.2541, "theoretical_loss": 4.063169756232616, "tokens_seen": 357501952 }, { "epoch": 1.0, "learning_rate": 0.0004503309929789368, "loss": 3.1725, "theoretical_loss": 4.0630848547367515, "tokens_seen": 357567488 }, { "epoch": 1.0, "learning_rate": 0.00045032096288866603, "loss": 3.1111, "theoretical_loss": 4.062999973156619, "tokens_seen": 357633024 }, { "epoch": 1.0, "learning_rate": 0.0004503109327983952, "loss": 3.2036, "theoretical_loss": 4.062915111483899, "tokens_seen": 357698560 }, { "epoch": 1.0, "learning_rate": 0.0004503009027081244, "loss": 2.9417, "theoretical_loss": 4.062830269710275, "tokens_seen": 357764096 }, { "epoch": 1.0, "learning_rate": 0.00045029087261785357, "loss": 3.2017, "theoretical_loss": 4.0627454478274405, "tokens_seen": 357829632 }, { "epoch": 1.0, "learning_rate": 0.00045028084252758275, "loss": 3.1482, "theoretical_loss": 4.062660645827087, "tokens_seen": 357895168 }, { "epoch": 1.0, "learning_rate": 0.00045027081243731193, "loss": 3.2725, "theoretical_loss": 4.062575863700916, "tokens_seen": 357960704 }, { "epoch": 1.0, "learning_rate": 0.00045026078234704117, "loss": 3.2601, "theoretical_loss": 4.062491101440633, "tokens_seen": 358026240 }, { "epoch": 1.0, "learning_rate": 0.0004502507522567703, "loss": 3.1436, "theoretical_loss": 4.062406359037947, "tokens_seen": 358091776 }, { "epoch": 1.0, "learning_rate": 0.00045024072216649953, "loss": 3.0157, "theoretical_loss": 4.0623216364845725, "tokens_seen": 358157312 }, { "epoch": 1.01, "learning_rate": 0.0004502306920762287, "loss": 3.1848, "theoretical_loss": 4.0622369337722315, "tokens_seen": 358222848 }, { "epoch": 1.01, "learning_rate": 0.0004502206619859579, "loss": 3.1944, "theoretical_loss": 4.062152250892646, "tokens_seen": 358288384 }, { "epoch": 1.01, "learning_rate": 0.0004502106318956871, "loss": 3.1226, "theoretical_loss": 4.062067587837548, "tokens_seen": 358353920 }, { "epoch": 1.01, "learning_rate": 0.00045020060180541626, "loss": 3.1318, "theoretical_loss": 4.061982944598672, "tokens_seen": 358419456 }, { "epoch": 1.01, "learning_rate": 0.00045019057171514544, "loss": 3.1546, "theoretical_loss": 4.061898321167757, "tokens_seen": 358484992 }, { "epoch": 1.01, "learning_rate": 0.0004501805416248747, "loss": 3.2143, "theoretical_loss": 4.061813717536548, "tokens_seen": 358550528 }, { "epoch": 1.01, "learning_rate": 0.0004501705115346038, "loss": 3.3559, "theoretical_loss": 4.061729133696795, "tokens_seen": 358616064 }, { "epoch": 1.01, "learning_rate": 0.00045016048144433304, "loss": 3.0883, "theoretical_loss": 4.061644569640252, "tokens_seen": 358681600 }, { "epoch": 1.01, "learning_rate": 0.00045015045135406216, "loss": 3.2549, "theoretical_loss": 4.061560025358679, "tokens_seen": 358747136 }, { "epoch": 1.01, "objective/train/docs_used": 598409, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8931403160095215, "objective/train/theoretical_loss": 4.061496630119868, "objective/train/tokens_used": 379256288, "theoretical_loss": 4.061496630119868, "tokens_seen": 358796288 }, { "epoch": 1.01, "learning_rate": 0.0004501404212637914, "loss": 3.1222, "theoretical_loss": 4.06147550084384, "tokens_seen": 358812672 }, { "epoch": 1.01, "learning_rate": 0.0004501303911735206, "loss": 3.0583, "theoretical_loss": 4.061390996087504, "tokens_seen": 358878208 }, { "epoch": 1.01, "learning_rate": 0.00045012036108324976, "loss": 3.1973, "theoretical_loss": 4.0613065110814475, "tokens_seen": 358943744 }, { "epoch": 1.01, "learning_rate": 0.00045011033099297894, "loss": 3.2175, "theoretical_loss": 4.0612220458174475, "tokens_seen": 359009280 }, { "epoch": 1.01, "learning_rate": 0.0004501003009027081, "loss": 3.2787, "theoretical_loss": 4.06113760028729, "tokens_seen": 359074816 }, { "epoch": 1.01, "learning_rate": 0.0004500902708124373, "loss": 3.2334, "theoretical_loss": 4.061053174482762, "tokens_seen": 359140352 }, { "epoch": 1.01, "learning_rate": 0.00045008024072216654, "loss": 3.2251, "theoretical_loss": 4.060968768395659, "tokens_seen": 359205888 }, { "epoch": 1.01, "learning_rate": 0.00045007021063189567, "loss": 3.1124, "theoretical_loss": 4.060884382017779, "tokens_seen": 359271424 }, { "epoch": 1.01, "learning_rate": 0.0004500601805416249, "loss": 3.3182, "theoretical_loss": 4.060800015340927, "tokens_seen": 359336960 }, { "epoch": 1.01, "learning_rate": 0.0004500501504513541, "loss": 3.0824, "theoretical_loss": 4.060715668356911, "tokens_seen": 359402496 }, { "epoch": 1.01, "learning_rate": 0.00045004012036108326, "loss": 3.2525, "theoretical_loss": 4.060631341057545, "tokens_seen": 359468032 }, { "epoch": 1.01, "learning_rate": 0.00045003009027081244, "loss": 3.2417, "theoretical_loss": 4.060547033434647, "tokens_seen": 359533568 }, { "epoch": 1.01, "learning_rate": 0.0004500200601805416, "loss": 3.2165, "theoretical_loss": 4.060462745480041, "tokens_seen": 359599104 }, { "epoch": 1.01, "learning_rate": 0.0004500100300902708, "loss": 3.1683, "theoretical_loss": 4.060378477185554, "tokens_seen": 359664640 }, { "epoch": 1.01, "learning_rate": 0.00045000000000000004, "loss": 3.0579, "theoretical_loss": 4.060294228543021, "tokens_seen": 359730176 }, { "epoch": 1.01, "learning_rate": 0.00044998996990972917, "loss": 3.2498, "theoretical_loss": 4.060209999544279, "tokens_seen": 359795712 }, { "epoch": 1.01, "learning_rate": 0.0004499799398194584, "loss": 3.2437, "theoretical_loss": 4.060125790181171, "tokens_seen": 359861248 }, { "epoch": 1.01, "learning_rate": 0.00044996990972918753, "loss": 3.1828, "theoretical_loss": 4.060041600445546, "tokens_seen": 359926784 }, { "epoch": 1.01, "learning_rate": 0.00044995987963891677, "loss": 3.0898, "theoretical_loss": 4.059957430329254, "tokens_seen": 359992320 }, { "epoch": 1.01, "learning_rate": 0.00044994984954864595, "loss": 3.2311, "theoretical_loss": 4.0598732798241555, "tokens_seen": 360057856 }, { "epoch": 1.01, "learning_rate": 0.00044993981945837513, "loss": 3.1693, "theoretical_loss": 4.059789148922111, "tokens_seen": 360123392 }, { "epoch": 1.01, "learning_rate": 0.0004499297893681043, "loss": 3.2824, "theoretical_loss": 4.059705037614989, "tokens_seen": 360188928 }, { "epoch": 1.01, "learning_rate": 0.00044991975927783355, "loss": 3.1122, "theoretical_loss": 4.059620945894661, "tokens_seen": 360254464 }, { "epoch": 1.01, "learning_rate": 0.00044990972918756267, "loss": 3.2076, "theoretical_loss": 4.059536873753004, "tokens_seen": 360320000 }, { "epoch": 1.01, "learning_rate": 0.0004498996990972919, "loss": 3.3451, "theoretical_loss": 4.059452821181899, "tokens_seen": 360385536 }, { "epoch": 1.01, "objective/train/docs_used": 601352, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.097663402557373, "objective/train/theoretical_loss": 4.059389794591865, "objective/train/tokens_used": 380894688, "theoretical_loss": 4.059389794591865, "tokens_seen": 360434688 }, { "epoch": 1.01, "learning_rate": 0.00044988966900702103, "loss": 3.1391, "theoretical_loss": 4.059368788173233, "tokens_seen": 360451072 }, { "epoch": 1.01, "learning_rate": 0.00044987963891675027, "loss": 3.1609, "theoretical_loss": 4.0592847747189, "tokens_seen": 360516608 }, { "epoch": 1.01, "learning_rate": 0.00044986960882647945, "loss": 3.1536, "theoretical_loss": 4.059200780810793, "tokens_seen": 360582144 }, { "epoch": 1.01, "learning_rate": 0.00044985957873620863, "loss": 3.2882, "theoretical_loss": 4.059116806440814, "tokens_seen": 360647680 }, { "epoch": 1.01, "learning_rate": 0.0004498495486459378, "loss": 3.2414, "theoretical_loss": 4.05903285160087, "tokens_seen": 360713216 }, { "epoch": 1.01, "learning_rate": 0.000449839518555667, "loss": 3.2019, "theoretical_loss": 4.058948916282871, "tokens_seen": 360778752 }, { "epoch": 1.01, "learning_rate": 0.0004498294884653962, "loss": 3.1268, "theoretical_loss": 4.058865000478733, "tokens_seen": 360844288 }, { "epoch": 1.01, "learning_rate": 0.0004498194583751254, "loss": 3.1961, "theoretical_loss": 4.058781104180377, "tokens_seen": 360909824 }, { "epoch": 1.01, "learning_rate": 0.00044980942828485454, "loss": 3.1296, "theoretical_loss": 4.058697227379726, "tokens_seen": 360975360 }, { "epoch": 1.01, "learning_rate": 0.00044979939819458377, "loss": 3.1161, "theoretical_loss": 4.058613370068713, "tokens_seen": 361040896 }, { "epoch": 1.01, "learning_rate": 0.0004497893681043129, "loss": 3.2019, "theoretical_loss": 4.058529532239271, "tokens_seen": 361106432 }, { "epoch": 1.01, "learning_rate": 0.00044977933801404213, "loss": 3.1822, "theoretical_loss": 4.05844571388334, "tokens_seen": 361171968 }, { "epoch": 1.01, "learning_rate": 0.0004497693079237713, "loss": 3.199, "theoretical_loss": 4.058361914992865, "tokens_seen": 361237504 }, { "epoch": 1.01, "learning_rate": 0.0004497592778335005, "loss": 3.2235, "theoretical_loss": 4.058278135559794, "tokens_seen": 361303040 }, { "epoch": 1.01, "learning_rate": 0.0004497492477432297, "loss": 3.1332, "theoretical_loss": 4.0581943755760825, "tokens_seen": 361368576 }, { "epoch": 1.01, "learning_rate": 0.0004497392176529589, "loss": 3.2183, "theoretical_loss": 4.058110635033689, "tokens_seen": 361434112 }, { "epoch": 1.01, "learning_rate": 0.00044972918756268804, "loss": 3.3398, "theoretical_loss": 4.058026913924576, "tokens_seen": 361499648 }, { "epoch": 1.01, "learning_rate": 0.0004497191574724173, "loss": 3.065, "theoretical_loss": 4.057943212240713, "tokens_seen": 361565184 }, { "epoch": 1.01, "learning_rate": 0.0004497091273821464, "loss": 3.2105, "theoretical_loss": 4.057859529974073, "tokens_seen": 361630720 }, { "epoch": 1.01, "learning_rate": 0.00044969909729187564, "loss": 3.2554, "theoretical_loss": 4.057775867116634, "tokens_seen": 361696256 }, { "epoch": 1.01, "learning_rate": 0.0004496890672016049, "loss": 3.0404, "theoretical_loss": 4.0576922236603785, "tokens_seen": 361761792 }, { "epoch": 1.01, "learning_rate": 0.000449679037111334, "loss": 3.157, "theoretical_loss": 4.057608599597294, "tokens_seen": 361827328 }, { "epoch": 1.01, "learning_rate": 0.00044966900702106324, "loss": 3.2655, "theoretical_loss": 4.057524994919372, "tokens_seen": 361892864 }, { "epoch": 1.01, "learning_rate": 0.00044965897693079236, "loss": 3.2229, "theoretical_loss": 4.05744140961861, "tokens_seen": 361958400 }, { "epoch": 1.01, "learning_rate": 0.0004496489468405216, "loss": 3.1545, "theoretical_loss": 4.0573578436870115, "tokens_seen": 362023936 }, { "epoch": 1.01, "objective/train/docs_used": 604190, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3383069038391113, "objective/train/theoretical_loss": 4.057295181944515, "objective/train/tokens_used": 382533088, "theoretical_loss": 4.057295181944515, "tokens_seen": 362073088 }, { "epoch": 1.01, "learning_rate": 0.0004496389167502508, "loss": 3.24, "theoretical_loss": 4.05727429711658, "tokens_seen": 362089472 }, { "epoch": 1.01, "learning_rate": 0.00044962888665997996, "loss": 3.0683, "theoretical_loss": 4.057190769899329, "tokens_seen": 362155008 }, { "epoch": 1.01, "learning_rate": 0.00044961885656970914, "loss": 3.2231, "theoretical_loss": 4.057107262027273, "tokens_seen": 362220544 }, { "epoch": 1.01, "learning_rate": 0.0004496088264794383, "loss": 3.197, "theoretical_loss": 4.057023773492434, "tokens_seen": 362286080 }, { "epoch": 1.01, "learning_rate": 0.0004495987963891675, "loss": 3.054, "theoretical_loss": 4.056940304286836, "tokens_seen": 362351616 }, { "epoch": 1.01, "learning_rate": 0.00044958876629889674, "loss": 3.2506, "theoretical_loss": 4.056856854402509, "tokens_seen": 362417152 }, { "epoch": 1.01, "learning_rate": 0.00044957873620862587, "loss": 3.3116, "theoretical_loss": 4.0567734238314905, "tokens_seen": 362482688 }, { "epoch": 1.01, "learning_rate": 0.0004495687061183551, "loss": 3.1057, "theoretical_loss": 4.056690012565818, "tokens_seen": 362548224 }, { "epoch": 1.01, "learning_rate": 0.0004495586760280843, "loss": 3.1159, "theoretical_loss": 4.056606620597536, "tokens_seen": 362613760 }, { "epoch": 1.01, "learning_rate": 0.00044954864593781346, "loss": 3.3334, "theoretical_loss": 4.056523247918694, "tokens_seen": 362679296 }, { "epoch": 1.01, "learning_rate": 0.00044953861584754264, "loss": 3.0236, "theoretical_loss": 4.056439894521345, "tokens_seen": 362744832 }, { "epoch": 1.01, "learning_rate": 0.0004495285857572718, "loss": 3.0826, "theoretical_loss": 4.056356560397549, "tokens_seen": 362810368 }, { "epoch": 1.01, "learning_rate": 0.000449518555667001, "loss": 3.183, "theoretical_loss": 4.0562732455393675, "tokens_seen": 362875904 }, { "epoch": 1.01, "learning_rate": 0.00044950852557673024, "loss": 3.0782, "theoretical_loss": 4.05618994993887, "tokens_seen": 362941440 }, { "epoch": 1.01, "learning_rate": 0.00044949849548645937, "loss": 3.1031, "theoretical_loss": 4.056106673588127, "tokens_seen": 363006976 }, { "epoch": 1.01, "learning_rate": 0.0004494884653961886, "loss": 3.252, "theoretical_loss": 4.056023416479217, "tokens_seen": 363072512 }, { "epoch": 1.01, "learning_rate": 0.00044947843530591773, "loss": 3.2155, "theoretical_loss": 4.055940178604223, "tokens_seen": 363138048 }, { "epoch": 1.01, "learning_rate": 0.00044946840521564697, "loss": 3.2258, "theoretical_loss": 4.05585695995523, "tokens_seen": 363203584 }, { "epoch": 1.01, "learning_rate": 0.00044945837512537615, "loss": 3.0571, "theoretical_loss": 4.05577376052433, "tokens_seen": 363269120 }, { "epoch": 1.01, "learning_rate": 0.00044944834503510533, "loss": 3.224, "theoretical_loss": 4.055690580303619, "tokens_seen": 363334656 }, { "epoch": 1.01, "learning_rate": 0.0004494383149448345, "loss": 3.103, "theoretical_loss": 4.055607419285197, "tokens_seen": 363400192 }, { "epoch": 1.01, "learning_rate": 0.00044942828485456375, "loss": 3.2521, "theoretical_loss": 4.05552427746117, "tokens_seen": 363465728 }, { "epoch": 1.01, "learning_rate": 0.00044941825476429287, "loss": 3.158, "theoretical_loss": 4.055441154823648, "tokens_seen": 363531264 }, { "epoch": 1.01, "learning_rate": 0.0004494082246740221, "loss": 3.2071, "theoretical_loss": 4.055358051364745, "tokens_seen": 363596800 }, { "epoch": 1.01, "learning_rate": 0.00044939819458375123, "loss": 3.2064, "theoretical_loss": 4.055274967076583, "tokens_seen": 363662336 }, { "epoch": 1.01, "objective/train/docs_used": 606755, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2970306873321533, "objective/train/theoretical_loss": 4.055212666436519, "objective/train/tokens_used": 384171488, "theoretical_loss": 4.055212666436519, "tokens_seen": 363711488 }, { "epoch": 1.01, "learning_rate": 0.00044938816449348047, "loss": 3.2782, "theoretical_loss": 4.055191901951282, "tokens_seen": 363727872 }, { "epoch": 1.01, "learning_rate": 0.00044937813440320965, "loss": 3.0839, "theoretical_loss": 4.055108855980974, "tokens_seen": 363793408 }, { "epoch": 1.01, "learning_rate": 0.00044936810431293883, "loss": 3.181, "theoretical_loss": 4.05502582915779, "tokens_seen": 363858944 }, { "epoch": 1.01, "learning_rate": 0.000449358074222668, "loss": 3.1188, "theoretical_loss": 4.05494282147387, "tokens_seen": 363924480 }, { "epoch": 1.01, "learning_rate": 0.0004493480441323972, "loss": 3.1471, "theoretical_loss": 4.0548598329213545, "tokens_seen": 363990016 }, { "epoch": 1.01, "learning_rate": 0.0004493380140421264, "loss": 3.4, "theoretical_loss": 4.054776863492393, "tokens_seen": 364055552 }, { "epoch": 1.01, "learning_rate": 0.0004493279839518556, "loss": 3.3033, "theoretical_loss": 4.054693913179135, "tokens_seen": 364121088 }, { "epoch": 1.01, "learning_rate": 0.00044931795386158474, "loss": 3.2636, "theoretical_loss": 4.054610981973738, "tokens_seen": 364186624 }, { "epoch": 1.01, "learning_rate": 0.00044930792377131397, "loss": 3.1519, "theoretical_loss": 4.054528069868365, "tokens_seen": 364252160 }, { "epoch": 1.01, "learning_rate": 0.0004492978936810431, "loss": 3.1962, "theoretical_loss": 4.054445176855179, "tokens_seen": 364317696 }, { "epoch": 1.01, "learning_rate": 0.00044928786359077234, "loss": 3.1557, "theoretical_loss": 4.054362302926351, "tokens_seen": 364383232 }, { "epoch": 1.01, "learning_rate": 0.0004492778335005015, "loss": 3.284, "theoretical_loss": 4.054279448074057, "tokens_seen": 364448768 }, { "epoch": 1.01, "learning_rate": 0.0004492678034102307, "loss": 3.0854, "theoretical_loss": 4.054196612290476, "tokens_seen": 364514304 }, { "epoch": 1.01, "learning_rate": 0.0004492577733199599, "loss": 3.3635, "theoretical_loss": 4.054113795567792, "tokens_seen": 364579840 }, { "epoch": 1.01, "learning_rate": 0.0004492477432296891, "loss": 3.3261, "theoretical_loss": 4.054030997898195, "tokens_seen": 364645376 }, { "epoch": 1.01, "learning_rate": 0.00044923771313941824, "loss": 3.1881, "theoretical_loss": 4.053948219273877, "tokens_seen": 364710912 }, { "epoch": 1.01, "learning_rate": 0.0004492276830491475, "loss": 3.1726, "theoretical_loss": 4.053865459687037, "tokens_seen": 364776448 }, { "epoch": 1.01, "learning_rate": 0.0004492176529588766, "loss": 3.083, "theoretical_loss": 4.053782719129877, "tokens_seen": 364841984 }, { "epoch": 1.01, "learning_rate": 0.00044920762286860584, "loss": 3.16, "theoretical_loss": 4.053699997594605, "tokens_seen": 364907520 }, { "epoch": 1.01, "learning_rate": 0.000449197592778335, "loss": 3.1599, "theoretical_loss": 4.053617295073432, "tokens_seen": 364973056 }, { "epoch": 1.01, "learning_rate": 0.0004491875626880642, "loss": 3.143, "theoretical_loss": 4.053534611558575, "tokens_seen": 365038592 }, { "epoch": 1.01, "learning_rate": 0.0004491775325977934, "loss": 3.2688, "theoretical_loss": 4.053451947042255, "tokens_seen": 365104128 }, { "epoch": 1.01, "learning_rate": 0.00044916750250752256, "loss": 3.0754, "theoretical_loss": 4.053369301516697, "tokens_seen": 365169664 }, { "epoch": 1.01, "learning_rate": 0.00044915747241725174, "loss": 3.1217, "theoretical_loss": 4.053286674974132, "tokens_seen": 365235200 }, { "epoch": 1.01, "learning_rate": 0.000449147442326981, "loss": 3.1994, "theoretical_loss": 4.053204067406793, "tokens_seen": 365300736 }, { "epoch": 1.01, "objective/train/docs_used": 608178, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.397564172744751, "objective/train/theoretical_loss": 4.053142124179114, "objective/train/tokens_used": 385809888, "theoretical_loss": 4.053142124179114, "tokens_seen": 365349888 }, { "epoch": 1.01, "learning_rate": 0.0004491374122367101, "loss": 3.3091, "theoretical_loss": 4.053121478806922, "tokens_seen": 365366272 }, { "epoch": 1.01, "learning_rate": 0.00044912738214643934, "loss": 3.2184, "theoretical_loss": 4.0530389091667605, "tokens_seen": 365431808 }, { "epoch": 1.01, "learning_rate": 0.00044911735205616847, "loss": 3.0497, "theoretical_loss": 4.052956358478558, "tokens_seen": 365497344 }, { "epoch": 1.01, "learning_rate": 0.0004491073219658977, "loss": 3.3217, "theoretical_loss": 4.052873826734567, "tokens_seen": 365562880 }, { "epoch": 1.01, "learning_rate": 0.0004490972918756269, "loss": 3.0866, "theoretical_loss": 4.052791313927045, "tokens_seen": 365628416 }, { "epoch": 1.01, "learning_rate": 0.00044908726178535607, "loss": 3.2387, "theoretical_loss": 4.052708820048256, "tokens_seen": 365693952 }, { "epoch": 1.01, "learning_rate": 0.00044907723169508525, "loss": 3.2211, "theoretical_loss": 4.052626345090464, "tokens_seen": 365759488 }, { "epoch": 1.01, "learning_rate": 0.0004490672016048145, "loss": 3.0352, "theoretical_loss": 4.052543889045941, "tokens_seen": 365825024 }, { "epoch": 1.01, "learning_rate": 0.0004490571715145436, "loss": 3.042, "theoretical_loss": 4.052461451906963, "tokens_seen": 365890560 }, { "epoch": 1.01, "learning_rate": 0.00044904714142427284, "loss": 3.1447, "theoretical_loss": 4.05237903366581, "tokens_seen": 365956096 }, { "epoch": 1.01, "learning_rate": 0.00044903711133400197, "loss": 3.1584, "theoretical_loss": 4.052296634314767, "tokens_seen": 366021632 }, { "epoch": 1.01, "learning_rate": 0.0004490270812437312, "loss": 3.1719, "theoretical_loss": 4.052214253846124, "tokens_seen": 366087168 }, { "epoch": 1.01, "learning_rate": 0.0004490170511534604, "loss": 3.2374, "theoretical_loss": 4.052131892252174, "tokens_seen": 366152704 }, { "epoch": 1.01, "learning_rate": 0.00044900702106318957, "loss": 3.2818, "theoretical_loss": 4.052049549525214, "tokens_seen": 366218240 }, { "epoch": 1.01, "learning_rate": 0.00044899699097291875, "loss": 3.3597, "theoretical_loss": 4.05196722565755, "tokens_seen": 366283776 }, { "epoch": 1.01, "learning_rate": 0.00044898696088264793, "loss": 3.1442, "theoretical_loss": 4.051884920641487, "tokens_seen": 366349312 }, { "epoch": 1.01, "learning_rate": 0.0004489769307923771, "loss": 3.107, "theoretical_loss": 4.051802634469338, "tokens_seen": 366414848 }, { "epoch": 1.01, "learning_rate": 0.00044896690070210635, "loss": 3.1836, "theoretical_loss": 4.051720367133419, "tokens_seen": 366480384 }, { "epoch": 1.01, "learning_rate": 0.0004489568706118355, "loss": 3.1885, "theoretical_loss": 4.051638118626052, "tokens_seen": 366545920 }, { "epoch": 1.01, "learning_rate": 0.0004489468405215647, "loss": 3.1525, "theoretical_loss": 4.05155588893956, "tokens_seen": 366611456 }, { "epoch": 1.01, "learning_rate": 0.00044893681043129395, "loss": 3.0665, "theoretical_loss": 4.051473678066275, "tokens_seen": 366676992 }, { "epoch": 1.01, "learning_rate": 0.00044892678034102307, "loss": 3.4077, "theoretical_loss": 4.051391485998531, "tokens_seen": 366742528 }, { "epoch": 1.01, "learning_rate": 0.0004489167502507523, "loss": 3.1495, "theoretical_loss": 4.051309312728667, "tokens_seen": 366808064 }, { "epoch": 1.01, "learning_rate": 0.00044890672016048143, "loss": 3.133, "theoretical_loss": 4.051227158249025, "tokens_seen": 366873600 }, { "epoch": 1.01, "learning_rate": 0.00044889669007021067, "loss": 3.2124, "theoretical_loss": 4.051145022551956, "tokens_seen": 366939136 }, { "epoch": 1.01, "objective/train/docs_used": 610875, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.822375535964966, "objective/train/theoretical_loss": 4.051083433100615, "objective/train/tokens_used": 387448288, "theoretical_loss": 4.051083433100615, "tokens_seen": 366988288 }, { "epoch": 1.01, "learning_rate": 0.00044888665997993985, "loss": 3.0073, "theoretical_loss": 4.051062905629809, "tokens_seen": 367004672 }, { "epoch": 1.01, "learning_rate": 0.00044887662988966903, "loss": 3.1853, "theoretical_loss": 4.050980807474944, "tokens_seen": 367070208 }, { "epoch": 1.01, "learning_rate": 0.0004488665997993982, "loss": 3.0363, "theoretical_loss": 4.050898728079719, "tokens_seen": 367135744 }, { "epoch": 1.01, "learning_rate": 0.0004488565697091274, "loss": 3.1736, "theoretical_loss": 4.050816667436502, "tokens_seen": 367201280 }, { "epoch": 1.01, "learning_rate": 0.0004488465396188566, "loss": 3.0399, "theoretical_loss": 4.050734625537663, "tokens_seen": 367266816 }, { "epoch": 1.01, "learning_rate": 0.0004488365095285858, "loss": 3.107, "theoretical_loss": 4.0506526023755764, "tokens_seen": 367332352 }, { "epoch": 1.01, "learning_rate": 0.00044882647943831494, "loss": 3.2594, "theoretical_loss": 4.050570597942622, "tokens_seen": 367397888 }, { "epoch": 1.01, "learning_rate": 0.0004488164493480442, "loss": 3.1195, "theoretical_loss": 4.050488612231183, "tokens_seen": 367463424 }, { "epoch": 1.01, "learning_rate": 0.0004488064192577733, "loss": 3.2085, "theoretical_loss": 4.050406645233647, "tokens_seen": 367528960 }, { "epoch": 1.01, "learning_rate": 0.00044879638916750254, "loss": 3.0812, "theoretical_loss": 4.050324696942407, "tokens_seen": 367594496 }, { "epoch": 1.01, "learning_rate": 0.0004487863590772317, "loss": 3.2829, "theoretical_loss": 4.05024276734986, "tokens_seen": 367660032 }, { "epoch": 1.01, "learning_rate": 0.0004487763289869609, "loss": 2.9995, "theoretical_loss": 4.050160856448408, "tokens_seen": 367725568 }, { "epoch": 1.01, "learning_rate": 0.0004487662988966901, "loss": 3.2281, "theoretical_loss": 4.050078964230456, "tokens_seen": 367791104 }, { "epoch": 1.01, "learning_rate": 0.0004487562688064193, "loss": 3.1954, "theoretical_loss": 4.049997090688415, "tokens_seen": 367856640 }, { "epoch": 1.01, "learning_rate": 0.00044874623871614844, "loss": 3.1034, "theoretical_loss": 4.049915235814701, "tokens_seen": 367922176 }, { "epoch": 1.01, "learning_rate": 0.0004487362086258777, "loss": 3.1053, "theoretical_loss": 4.04983339960173, "tokens_seen": 367987712 }, { "epoch": 1.01, "learning_rate": 0.0004487261785356068, "loss": 3.2847, "theoretical_loss": 4.049751582041928, "tokens_seen": 368053248 }, { "epoch": 1.01, "learning_rate": 0.00044871614844533604, "loss": 3.1994, "theoretical_loss": 4.049669783127722, "tokens_seen": 368118784 }, { "epoch": 1.01, "learning_rate": 0.0004487061183550652, "loss": 3.0861, "theoretical_loss": 4.049588002851546, "tokens_seen": 368184320 }, { "epoch": 1.01, "learning_rate": 0.0004486960882647944, "loss": 3.204, "theoretical_loss": 4.049506241205835, "tokens_seen": 368249856 }, { "epoch": 1.01, "learning_rate": 0.0004486860581745236, "loss": 2.9666, "theoretical_loss": 4.049424498183031, "tokens_seen": 368315392 }, { "epoch": 1.01, "learning_rate": 0.00044867602808425276, "loss": 3.1713, "theoretical_loss": 4.04934277377558, "tokens_seen": 368380928 }, { "epoch": 1.01, "learning_rate": 0.00044866599799398194, "loss": 3.156, "theoretical_loss": 4.049261067975932, "tokens_seen": 368446464 }, { "epoch": 1.01, "learning_rate": 0.0004486559679037112, "loss": 3.3038, "theoretical_loss": 4.049179380776542, "tokens_seen": 368512000 }, { "epoch": 1.01, "learning_rate": 0.0004486459378134403, "loss": 3.1131, "theoretical_loss": 4.049097712169869, "tokens_seen": 368577536 }, { "epoch": 1.01, "objective/train/docs_used": 613523, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2236547470092773, "objective/train/theoretical_loss": 4.0490364729118, "objective/train/tokens_used": 389086688, "theoretical_loss": 4.0490364729118, "tokens_seen": 368626688 }, { "epoch": 1.01, "learning_rate": 0.00044863590772316954, "loss": 3.2957, "theoretical_loss": 4.049016062148374, "tokens_seen": 368643072 }, { "epoch": 1.01, "learning_rate": 0.00044862587763289867, "loss": 3.2323, "theoretical_loss": 4.048934430704529, "tokens_seen": 368708608 }, { "epoch": 1.01, "learning_rate": 0.0004486158475426279, "loss": 3.1679, "theoretical_loss": 4.048852817830801, "tokens_seen": 368774144 }, { "epoch": 1.01, "learning_rate": 0.0004486058174523571, "loss": 3.1209, "theoretical_loss": 4.048771223519671, "tokens_seen": 368839680 }, { "epoch": 1.01, "learning_rate": 0.00044859578736208627, "loss": 3.2621, "theoretical_loss": 4.048689647763618, "tokens_seen": 368905216 }, { "epoch": 1.01, "learning_rate": 0.00044858575727181545, "loss": 3.2142, "theoretical_loss": 4.048608090555127, "tokens_seen": 368970752 }, { "epoch": 1.01, "learning_rate": 0.0004485757271815447, "loss": 3.2225, "theoretical_loss": 4.048526551886687, "tokens_seen": 369036288 }, { "epoch": 1.01, "learning_rate": 0.0004485656970912738, "loss": 3.2707, "theoretical_loss": 4.048445031750795, "tokens_seen": 369101824 }, { "epoch": 1.01, "learning_rate": 0.00044855566700100304, "loss": 3.2761, "theoretical_loss": 4.048363530139945, "tokens_seen": 369167360 }, { "epoch": 1.01, "learning_rate": 0.00044854563691073217, "loss": 2.9868, "theoretical_loss": 4.048282047046644, "tokens_seen": 369232896 }, { "epoch": 1.01, "learning_rate": 0.0004485356068204614, "loss": 3.0526, "theoretical_loss": 4.048200582463396, "tokens_seen": 369298432 }, { "epoch": 1.01, "learning_rate": 0.0004485255767301906, "loss": 3.2251, "theoretical_loss": 4.048119136382715, "tokens_seen": 369363968 }, { "epoch": 1.01, "learning_rate": 0.00044851554663991977, "loss": 3.2525, "theoretical_loss": 4.048037708797115, "tokens_seen": 369429504 }, { "epoch": 1.01, "learning_rate": 0.00044850551654964895, "loss": 3.1153, "theoretical_loss": 4.047956299699117, "tokens_seen": 369495040 }, { "epoch": 1.01, "learning_rate": 0.00044849548645937813, "loss": 3.1193, "theoretical_loss": 4.047874909081245, "tokens_seen": 369560576 }, { "epoch": 1.01, "learning_rate": 0.0004484854563691073, "loss": 3.2303, "theoretical_loss": 4.0477935369360285, "tokens_seen": 369626112 }, { "epoch": 1.01, "learning_rate": 0.00044847542627883655, "loss": 3.192, "theoretical_loss": 4.047712183256, "tokens_seen": 369691648 }, { "epoch": 1.01, "learning_rate": 0.0004484653961885657, "loss": 3.0395, "theoretical_loss": 4.047630848033698, "tokens_seen": 369757184 }, { "epoch": 1.01, "learning_rate": 0.0004484553660982949, "loss": 3.2857, "theoretical_loss": 4.047549531261664, "tokens_seen": 369822720 }, { "epoch": 1.01, "learning_rate": 0.00044844533600802404, "loss": 3.2698, "theoretical_loss": 4.047468232932444, "tokens_seen": 369888256 }, { "epoch": 1.01, "learning_rate": 0.00044843530591775327, "loss": 3.2721, "theoretical_loss": 4.04738695303859, "tokens_seen": 369953792 }, { "epoch": 1.01, "learning_rate": 0.00044842527582748245, "loss": 3.0699, "theoretical_loss": 4.047305691572654, "tokens_seen": 370019328 }, { "epoch": 1.01, "learning_rate": 0.00044841524573721163, "loss": 3.175, "theoretical_loss": 4.0472244485271975, "tokens_seen": 370084864 }, { "epoch": 1.01, "learning_rate": 0.0004484052156469408, "loss": 3.0309, "theoretical_loss": 4.047143223894784, "tokens_seen": 370150400 }, { "epoch": 1.01, "learning_rate": 0.00044839518555667005, "loss": 3.3289, "theoretical_loss": 4.047062017667981, "tokens_seen": 370215936 }, { "epoch": 1.01, "objective/train/docs_used": 616213, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.360872268676758, "objective/train/theoretical_loss": 4.047001125072091, "objective/train/tokens_used": 390725088, "theoretical_loss": 4.047001125072091, "tokens_seen": 370265088 }, { "epoch": 1.01, "learning_rate": 0.0004483851554663992, "loss": 3.2002, "theoretical_loss": 4.04698082983936, "tokens_seen": 370281472 }, { "epoch": 1.01, "learning_rate": 0.0004483751253761284, "loss": 3.126, "theoretical_loss": 4.046899660401499, "tokens_seen": 370347008 }, { "epoch": 1.01, "learning_rate": 0.00044836509528585754, "loss": 3.0514, "theoretical_loss": 4.046818509346977, "tokens_seen": 370412544 }, { "epoch": 1.01, "learning_rate": 0.0004483550651955868, "loss": 3.1121, "theoretical_loss": 4.04673737666838, "tokens_seen": 370478080 }, { "epoch": 1.01, "learning_rate": 0.00044834503510531596, "loss": 3.0854, "theoretical_loss": 4.046656262358297, "tokens_seen": 370543616 }, { "epoch": 1.01, "learning_rate": 0.00044833500501504514, "loss": 3.1725, "theoretical_loss": 4.0465751664093235, "tokens_seen": 370609152 }, { "epoch": 1.01, "learning_rate": 0.0004483249749247743, "loss": 3.2363, "theoretical_loss": 4.046494088814056, "tokens_seen": 370674688 }, { "epoch": 1.01, "learning_rate": 0.0004483149448345035, "loss": 2.9785, "theoretical_loss": 4.046413029565096, "tokens_seen": 370740224 }, { "epoch": 1.01, "learning_rate": 0.0004483049147442327, "loss": 3.1793, "theoretical_loss": 4.0463319886550515, "tokens_seen": 370805760 }, { "epoch": 1.01, "learning_rate": 0.0004482948846539619, "loss": 3.0955, "theoretical_loss": 4.046250966076533, "tokens_seen": 370871296 }, { "epoch": 1.01, "learning_rate": 0.00044828485456369104, "loss": 3.2491, "theoretical_loss": 4.046169961822156, "tokens_seen": 370936832 }, { "epoch": 1.01, "learning_rate": 0.0004482748244734203, "loss": 3.2696, "theoretical_loss": 4.0460889758845395, "tokens_seen": 371002368 }, { "epoch": 1.01, "learning_rate": 0.0004482647943831494, "loss": 3.2697, "theoretical_loss": 4.046008008256307, "tokens_seen": 371067904 }, { "epoch": 1.01, "learning_rate": 0.00044825476429287864, "loss": 3.1358, "theoretical_loss": 4.045927058930086, "tokens_seen": 371133440 }, { "epoch": 1.01, "learning_rate": 0.0004482447342026078, "loss": 3.1154, "theoretical_loss": 4.045846127898511, "tokens_seen": 371198976 }, { "epoch": 1.01, "learning_rate": 0.000448234704112337, "loss": 3.1296, "theoretical_loss": 4.0457652151542165, "tokens_seen": 371264512 }, { "epoch": 1.01, "learning_rate": 0.0004482246740220662, "loss": 3.2922, "theoretical_loss": 4.045684320689844, "tokens_seen": 371330048 }, { "epoch": 1.01, "learning_rate": 0.0004482146439317954, "loss": 3.2668, "theoretical_loss": 4.045603444498037, "tokens_seen": 371395584 }, { "epoch": 1.01, "learning_rate": 0.00044820461384152455, "loss": 3.1198, "theoretical_loss": 4.0455225865714475, "tokens_seen": 371461120 }, { "epoch": 1.01, "learning_rate": 0.0004481945837512538, "loss": 3.1919, "theoretical_loss": 4.0454417469027275, "tokens_seen": 371526656 }, { "epoch": 1.01, "learning_rate": 0.00044818455366098296, "loss": 3.2613, "theoretical_loss": 4.045360925484535, "tokens_seen": 371592192 }, { "epoch": 1.01, "learning_rate": 0.00044817452357071214, "loss": 3.1746, "theoretical_loss": 4.045280122309532, "tokens_seen": 371657728 }, { "epoch": 1.01, "learning_rate": 0.0004481644934804414, "loss": 3.1085, "theoretical_loss": 4.045199337370385, "tokens_seen": 371723264 }, { "epoch": 1.01, "learning_rate": 0.0004481544633901705, "loss": 3.3149, "theoretical_loss": 4.045118570659764, "tokens_seen": 371788800 }, { "epoch": 1.01, "learning_rate": 0.00044814443329989974, "loss": 3.0554, "theoretical_loss": 4.045037822170345, "tokens_seen": 371854336 }, { "epoch": 1.01, "objective/train/docs_used": 619201, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0458619594573975, "objective/train/theoretical_loss": 4.044977272756539, "objective/train/tokens_used": 392363488, "theoretical_loss": 4.044977272756539, "tokens_seen": 371903488 }, { "epoch": 1.01, "learning_rate": 0.00044813440320962887, "loss": 3.0907, "theoretical_loss": 4.044957091894806, "tokens_seen": 371919872 }, { "epoch": 1.01, "learning_rate": 0.0004481243731193581, "loss": 3.1066, "theoretical_loss": 4.04487637982583, "tokens_seen": 371985408 }, { "epoch": 1.01, "learning_rate": 0.0004481143430290873, "loss": 3.2859, "theoretical_loss": 4.044795685956105, "tokens_seen": 372050944 }, { "epoch": 1.01, "learning_rate": 0.00044810431293881647, "loss": 3.1516, "theoretical_loss": 4.044715010278322, "tokens_seen": 372116480 }, { "epoch": 1.01, "learning_rate": 0.00044809428284854565, "loss": 3.2058, "theoretical_loss": 4.044634352785179, "tokens_seen": 372182016 }, { "epoch": 1.01, "learning_rate": 0.0004480842527582749, "loss": 3.2822, "theoretical_loss": 4.044553713469373, "tokens_seen": 372247552 }, { "epoch": 1.01, "learning_rate": 0.000448074222668004, "loss": 2.9836, "theoretical_loss": 4.044473092323611, "tokens_seen": 372313088 }, { "epoch": 1.01, "learning_rate": 0.00044806419257773324, "loss": 3.3087, "theoretical_loss": 4.0443924893406, "tokens_seen": 372378624 }, { "epoch": 1.01, "learning_rate": 0.00044805416248746237, "loss": 2.9899, "theoretical_loss": 4.044311904513054, "tokens_seen": 372444160 }, { "epoch": 1.01, "learning_rate": 0.0004480441323971916, "loss": 3.145, "theoretical_loss": 4.044231337833689, "tokens_seen": 372509696 }, { "epoch": 1.01, "learning_rate": 0.0004480341023069208, "loss": 3.1667, "theoretical_loss": 4.044150789295227, "tokens_seen": 372575232 }, { "epoch": 1.01, "learning_rate": 0.00044802407221664997, "loss": 3.0723, "theoretical_loss": 4.044070258890391, "tokens_seen": 372640768 }, { "epoch": 1.01, "learning_rate": 0.00044801404212637915, "loss": 3.1035, "theoretical_loss": 4.0439897466119135, "tokens_seen": 372706304 }, { "epoch": 1.01, "learning_rate": 0.00044800401203610833, "loss": 3.0959, "theoretical_loss": 4.0439092524525275, "tokens_seen": 372771840 }, { "epoch": 1.01, "learning_rate": 0.0004479939819458375, "loss": 3.0935, "theoretical_loss": 4.04382877640497, "tokens_seen": 372837376 }, { "epoch": 1.01, "learning_rate": 0.00044798395185556675, "loss": 3.1488, "theoretical_loss": 4.043748318461985, "tokens_seen": 372902912 }, { "epoch": 1.01, "learning_rate": 0.0004479739217652959, "loss": 3.2154, "theoretical_loss": 4.043667878616316, "tokens_seen": 372968448 }, { "epoch": 1.01, "learning_rate": 0.0004479638916750251, "loss": 3.2798, "theoretical_loss": 4.043587456860715, "tokens_seen": 373033984 }, { "epoch": 1.01, "learning_rate": 0.00044795386158475424, "loss": 3.2472, "theoretical_loss": 4.043507053187938, "tokens_seen": 373099520 }, { "epoch": 1.01, "learning_rate": 0.00044794383149448347, "loss": 3.0683, "theoretical_loss": 4.043426667590741, "tokens_seen": 373165056 }, { "epoch": 1.01, "learning_rate": 0.00044793380140421265, "loss": 3.216, "theoretical_loss": 4.04334630006189, "tokens_seen": 373230592 }, { "epoch": 1.01, "learning_rate": 0.00044792377131394183, "loss": 2.8441, "theoretical_loss": 4.04326595059415, "tokens_seen": 373296128 }, { "epoch": 1.01, "learning_rate": 0.000447913741223671, "loss": 3.1454, "theoretical_loss": 4.043185619180294, "tokens_seen": 373361664 }, { "epoch": 1.01, "learning_rate": 0.00044790371113340025, "loss": 3.2306, "theoretical_loss": 4.0431053058130955, "tokens_seen": 373427200 }, { "epoch": 1.01, "learning_rate": 0.0004478936810431294, "loss": 3.2945, "theoretical_loss": 4.043025010485336, "tokens_seen": 373492736 }, { "epoch": 1.01, "objective/train/docs_used": 620463, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9614388942718506, "objective/train/theoretical_loss": 4.042964800823556, "objective/train/tokens_used": 394001888, "theoretical_loss": 4.042964800823556, "tokens_seen": 373541888 }, { "epoch": 1.01, "learning_rate": 0.0004478836509528586, "loss": 3.0247, "theoretical_loss": 4.042944733189799, "tokens_seen": 373558272 }, { "epoch": 1.01, "learning_rate": 0.00044787362086258774, "loss": 3.0954, "theoretical_loss": 4.042864473919272, "tokens_seen": 373623808 }, { "epoch": 1.01, "learning_rate": 0.000447863590772317, "loss": 3.3021, "theoretical_loss": 4.042784232666547, "tokens_seen": 373689344 }, { "epoch": 1.01, "learning_rate": 0.00044785356068204616, "loss": 3.2639, "theoretical_loss": 4.04270400942442, "tokens_seen": 373754880 }, { "epoch": 1.01, "learning_rate": 0.00044784353059177534, "loss": 3.2402, "theoretical_loss": 4.042623804185692, "tokens_seen": 373820416 }, { "epoch": 1.01, "learning_rate": 0.0004478335005015045, "loss": 3.1285, "theoretical_loss": 4.042543616943168, "tokens_seen": 373885952 }, { "epoch": 1.01, "learning_rate": 0.0004478234704112337, "loss": 3.1432, "theoretical_loss": 4.042463447689657, "tokens_seen": 373951488 }, { "epoch": 1.01, "learning_rate": 0.0004478134403209629, "loss": 3.1601, "theoretical_loss": 4.042383296417969, "tokens_seen": 374017024 }, { "epoch": 1.01, "learning_rate": 0.0004478034102306921, "loss": 3.1847, "theoretical_loss": 4.042303163120925, "tokens_seen": 374082560 }, { "epoch": 1.01, "learning_rate": 0.00044779338014042124, "loss": 3.0807, "theoretical_loss": 4.042223047791343, "tokens_seen": 374148096 }, { "epoch": 1.01, "learning_rate": 0.0004477833500501505, "loss": 3.2432, "theoretical_loss": 4.04214295042205, "tokens_seen": 374213632 }, { "epoch": 1.01, "learning_rate": 0.0004477733199598796, "loss": 3.1634, "theoretical_loss": 4.042062871005874, "tokens_seen": 374279168 }, { "epoch": 1.01, "learning_rate": 0.00044776328986960884, "loss": 3.1356, "theoretical_loss": 4.041982809535649, "tokens_seen": 374344704 }, { "epoch": 1.01, "learning_rate": 0.000447753259779338, "loss": 3.1292, "theoretical_loss": 4.041902766004213, "tokens_seen": 374410240 }, { "epoch": 1.01, "learning_rate": 0.0004477432296890672, "loss": 3.383, "theoretical_loss": 4.041822740404407, "tokens_seen": 374475776 }, { "epoch": 1.01, "learning_rate": 0.0004477331995987964, "loss": 3.2434, "theoretical_loss": 4.041742732729078, "tokens_seen": 374541312 }, { "epoch": 1.01, "learning_rate": 0.0004477231695085256, "loss": 3.0211, "theoretical_loss": 4.041662742971074, "tokens_seen": 374606848 }, { "epoch": 1.01, "learning_rate": 0.00044771313941825475, "loss": 3.1897, "theoretical_loss": 4.04158277112325, "tokens_seen": 374672384 }, { "epoch": 1.01, "learning_rate": 0.000447703109327984, "loss": 3.2015, "theoretical_loss": 4.041502817178464, "tokens_seen": 374737920 }, { "epoch": 1.01, "learning_rate": 0.0004476930792377131, "loss": 3.0941, "theoretical_loss": 4.041422881129579, "tokens_seen": 374803456 }, { "epoch": 1.01, "learning_rate": 0.00044768304914744234, "loss": 3.2044, "theoretical_loss": 4.041342962969459, "tokens_seen": 374868992 }, { "epoch": 1.01, "learning_rate": 0.0004476730190571715, "loss": 3.1702, "theoretical_loss": 4.041263062690978, "tokens_seen": 374934528 }, { "epoch": 1.01, "learning_rate": 0.0004476629889669007, "loss": 3.0858, "theoretical_loss": 4.041183180287007, "tokens_seen": 375000064 }, { "epoch": 1.01, "learning_rate": 0.0004476529588766299, "loss": 3.1945, "theoretical_loss": 4.0411033157504255, "tokens_seen": 375065600 }, { "epoch": 1.01, "learning_rate": 0.00044764292878635907, "loss": 3.0141, "theoretical_loss": 4.041023469074117, "tokens_seen": 375131136 }, { "epoch": 1.01, "objective/train/docs_used": 623485, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8966448307037354, "objective/train/theoretical_loss": 4.04096359578341, "objective/train/tokens_used": 395640288, "theoretical_loss": 4.04096359578341, "tokens_seen": 375180288 }, { "epoch": 1.01, "learning_rate": 0.00044763289869608825, "loss": 2.9995, "theoretical_loss": 4.040943640250967, "tokens_seen": 375196672 }, { "epoch": 1.01, "learning_rate": 0.0004476228686058175, "loss": 3.2838, "theoretical_loss": 4.040863829273868, "tokens_seen": 375262208 }, { "epoch": 1.01, "learning_rate": 0.0004476128385155466, "loss": 3.3544, "theoretical_loss": 4.0407840361357135, "tokens_seen": 375327744 }, { "epoch": 1.01, "learning_rate": 0.00044760280842527585, "loss": 3.2535, "theoretical_loss": 4.040704260829403, "tokens_seen": 375393280 }, { "epoch": 1.01, "learning_rate": 0.00044759277833500503, "loss": 3.014, "theoretical_loss": 4.040624503347839, "tokens_seen": 375458816 }, { "epoch": 1.01, "learning_rate": 0.0004475827482447342, "loss": 3.147, "theoretical_loss": 4.040544763683929, "tokens_seen": 375524352 }, { "epoch": 1.01, "learning_rate": 0.0004475727181544634, "loss": 3.0356, "theoretical_loss": 4.040465041830583, "tokens_seen": 375589888 }, { "epoch": 1.01, "learning_rate": 0.00044756268806419257, "loss": 3.1305, "theoretical_loss": 4.040385337780718, "tokens_seen": 375655424 }, { "epoch": 1.01, "learning_rate": 0.00044755265797392175, "loss": 3.0213, "theoretical_loss": 4.040305651527252, "tokens_seen": 375720960 }, { "epoch": 1.01, "learning_rate": 0.000447542627883651, "loss": 3.0275, "theoretical_loss": 4.040225983063108, "tokens_seen": 375786496 }, { "epoch": 1.01, "learning_rate": 0.0004475325977933801, "loss": 3.0726, "theoretical_loss": 4.040146332381214, "tokens_seen": 375852032 }, { "epoch": 1.01, "learning_rate": 0.00044752256770310935, "loss": 3.109, "theoretical_loss": 4.040066699474501, "tokens_seen": 375917568 }, { "epoch": 1.01, "learning_rate": 0.0004475125376128385, "loss": 3.266, "theoretical_loss": 4.0399870843359045, "tokens_seen": 375983104 }, { "epoch": 1.01, "learning_rate": 0.0004475025075225677, "loss": 3.0701, "theoretical_loss": 4.039907486958365, "tokens_seen": 376048640 }, { "epoch": 1.01, "learning_rate": 0.0004474924774322969, "loss": 3.1213, "theoretical_loss": 4.039827907334824, "tokens_seen": 376114176 }, { "epoch": 1.01, "learning_rate": 0.0004474824473420261, "loss": 3.2474, "theoretical_loss": 4.0397483454582295, "tokens_seen": 376179712 }, { "epoch": 1.01, "learning_rate": 0.00044747241725175526, "loss": 3.0919, "theoretical_loss": 4.039668801321534, "tokens_seen": 376245248 }, { "epoch": 1.01, "learning_rate": 0.00044746238716148444, "loss": 3.1075, "theoretical_loss": 4.039589274917693, "tokens_seen": 376310784 }, { "epoch": 1.01, "learning_rate": 0.0004474523570712136, "loss": 3.3231, "theoretical_loss": 4.039509766239665, "tokens_seen": 376376320 }, { "epoch": 1.01, "learning_rate": 0.00044744232698094285, "loss": 2.9824, "theoretical_loss": 4.039430275280415, "tokens_seen": 376441856 }, { "epoch": 1.01, "learning_rate": 0.00044743229689067204, "loss": 3.1099, "theoretical_loss": 4.03935080203291, "tokens_seen": 376507392 }, { "epoch": 1.01, "learning_rate": 0.0004474222668004012, "loss": 3.1963, "theoretical_loss": 4.03927134649012, "tokens_seen": 376572928 }, { "epoch": 1.01, "learning_rate": 0.00044741223671013045, "loss": 2.8893, "theoretical_loss": 4.039191908645024, "tokens_seen": 376638464 }, { "epoch": 1.01, "learning_rate": 0.0004474022066198596, "loss": 3.3031, "theoretical_loss": 4.0391124884905985, "tokens_seen": 376704000 }, { "epoch": 1.01, "learning_rate": 0.0004473921765295888, "loss": 3.0218, "theoretical_loss": 4.039033086019829, "tokens_seen": 376769536 }, { "epoch": 1.01, "objective/train/docs_used": 626181, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2160143852233887, "objective/train/theoretical_loss": 4.0389735457674325, "objective/train/tokens_used": 397278688, "theoretical_loss": 4.0389735457674325, "tokens_seen": 376818688 }, { "epoch": 1.01, "learning_rate": 0.00044738214643931794, "loss": 3.1576, "theoretical_loss": 4.038953701225703, "tokens_seen": 376835072 }, { "epoch": 1.01, "learning_rate": 0.0004473721163490472, "loss": 3.0948, "theoretical_loss": 4.0388743341012106, "tokens_seen": 376900608 }, { "epoch": 1.01, "learning_rate": 0.00044736208625877636, "loss": 3.3536, "theoretical_loss": 4.03879498463935, "tokens_seen": 376966144 }, { "epoch": 1.01, "learning_rate": 0.00044735205616850554, "loss": 3.1389, "theoretical_loss": 4.038715652833118, "tokens_seen": 377031680 }, { "epoch": 1.01, "learning_rate": 0.0004473420260782347, "loss": 3.1471, "theoretical_loss": 4.038636338675521, "tokens_seen": 377097216 }, { "epoch": 1.01, "learning_rate": 0.0004473319959879639, "loss": 3.099, "theoretical_loss": 4.038557042159566, "tokens_seen": 377162752 }, { "epoch": 1.01, "learning_rate": 0.0004473219658976931, "loss": 3.2467, "theoretical_loss": 4.038477763278262, "tokens_seen": 377228288 }, { "epoch": 1.01, "learning_rate": 0.0004473119358074223, "loss": 3.2151, "theoretical_loss": 4.038398502024628, "tokens_seen": 377293824 }, { "epoch": 1.01, "learning_rate": 0.00044730190571715144, "loss": 3.1703, "theoretical_loss": 4.038319258391682, "tokens_seen": 377359360 }, { "epoch": 1.01, "learning_rate": 0.0004472918756268807, "loss": 3.0667, "theoretical_loss": 4.038240032372447, "tokens_seen": 377424896 }, { "epoch": 1.01, "learning_rate": 0.0004472818455366098, "loss": 3.015, "theoretical_loss": 4.038160823959952, "tokens_seen": 377490432 }, { "epoch": 1.01, "learning_rate": 0.00044727181544633904, "loss": 3.1027, "theoretical_loss": 4.038081633147227, "tokens_seen": 377555968 }, { "epoch": 1.01, "learning_rate": 0.0004472617853560682, "loss": 3.1426, "theoretical_loss": 4.038002459927309, "tokens_seen": 377621504 }, { "epoch": 1.01, "learning_rate": 0.0004472517552657974, "loss": 3.2085, "theoretical_loss": 4.037923304293237, "tokens_seen": 377687040 }, { "epoch": 1.01, "learning_rate": 0.0004472417251755266, "loss": 3.0602, "theoretical_loss": 4.037844166238053, "tokens_seen": 377752576 }, { "epoch": 1.01, "learning_rate": 0.0004472316950852558, "loss": 3.0465, "theoretical_loss": 4.037765045754806, "tokens_seen": 377818112 }, { "epoch": 1.01, "learning_rate": 0.00044722166499498495, "loss": 3.1957, "theoretical_loss": 4.037685942836546, "tokens_seen": 377883648 }, { "epoch": 1.01, "learning_rate": 0.0004472116349047142, "loss": 3.1196, "theoretical_loss": 4.03760685747633, "tokens_seen": 377949184 }, { "epoch": 1.01, "learning_rate": 0.0004472016048144433, "loss": 3.0835, "theoretical_loss": 4.037527789667216, "tokens_seen": 378014720 }, { "epoch": 1.01, "learning_rate": 0.00044719157472417254, "loss": 3.2045, "theoretical_loss": 4.037448739402267, "tokens_seen": 378080256 }, { "epoch": 1.01, "learning_rate": 0.0004471815446339017, "loss": 3.2134, "theoretical_loss": 4.03736970667455, "tokens_seen": 378145792 }, { "epoch": 1.01, "learning_rate": 0.0004471715145436309, "loss": 3.0529, "theoretical_loss": 4.0372906914771365, "tokens_seen": 378211328 }, { "epoch": 1.01, "learning_rate": 0.0004471614844533601, "loss": 3.1643, "theoretical_loss": 4.037211693803101, "tokens_seen": 378276864 }, { "epoch": 1.01, "learning_rate": 0.00044715145436308927, "loss": 3.1266, "theoretical_loss": 4.037132713645525, "tokens_seen": 378342400 }, { "epoch": 1.01, "learning_rate": 0.00044714142427281845, "loss": 3.3439, "theoretical_loss": 4.037053750997487, "tokens_seen": 378407936 }, { "epoch": 1.01, "objective/train/docs_used": 629062, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2237536907196045, "objective/train/theoretical_loss": 4.036994540497936, "objective/train/tokens_used": 398917088, "theoretical_loss": 4.036994540497936, "tokens_seen": 378457088 }, { "epoch": 1.01, "learning_rate": 0.0004471313941825477, "loss": 3.1322, "theoretical_loss": 4.0369748058520765, "tokens_seen": 378473472 }, { "epoch": 1.01, "learning_rate": 0.0004471213640922768, "loss": 3.1309, "theoretical_loss": 4.036895878202383, "tokens_seen": 378539008 }, { "epoch": 1.01, "learning_rate": 0.00044711133400200605, "loss": 3.0153, "theoretical_loss": 4.036816968041503, "tokens_seen": 378604544 }, { "epoch": 1.01, "learning_rate": 0.00044710130391173523, "loss": 3.1281, "theoretical_loss": 4.036738075362533, "tokens_seen": 378670080 }, { "epoch": 1.01, "learning_rate": 0.0004470912738214644, "loss": 3.1428, "theoretical_loss": 4.036659200158576, "tokens_seen": 378735616 }, { "epoch": 1.01, "learning_rate": 0.0004470812437311936, "loss": 3.1493, "theoretical_loss": 4.036580342422739, "tokens_seen": 378801152 }, { "epoch": 1.01, "learning_rate": 0.00044707121364092277, "loss": 3.2065, "theoretical_loss": 4.036501502148132, "tokens_seen": 378866688 }, { "epoch": 1.01, "learning_rate": 0.00044706118355065195, "loss": 3.0335, "theoretical_loss": 4.036422679327869, "tokens_seen": 378932224 }, { "epoch": 1.01, "learning_rate": 0.0004470511534603812, "loss": 3.1442, "theoretical_loss": 4.036343873955068, "tokens_seen": 378997760 }, { "epoch": 1.01, "learning_rate": 0.0004470411233701103, "loss": 3.0504, "theoretical_loss": 4.036265086022851, "tokens_seen": 379063296 }, { "epoch": 1.01, "learning_rate": 0.00044703109327983955, "loss": 3.2498, "theoretical_loss": 4.036186315524344, "tokens_seen": 379128832 }, { "epoch": 1.01, "learning_rate": 0.0004470210631895687, "loss": 3.1186, "theoretical_loss": 4.036107562452677, "tokens_seen": 379194368 }, { "epoch": 1.01, "learning_rate": 0.0004470110330992979, "loss": 2.8179, "theoretical_loss": 4.036028826800983, "tokens_seen": 379259904 }, { "epoch": 1.01, "learning_rate": 0.0004470010030090271, "loss": 3.1984, "theoretical_loss": 4.035950108562401, "tokens_seen": 379325440 }, { "epoch": 1.01, "learning_rate": 0.0004469909729187563, "loss": 3.2193, "theoretical_loss": 4.035871407730071, "tokens_seen": 379390976 }, { "epoch": 1.01, "learning_rate": 0.00044698094282848546, "loss": 3.364, "theoretical_loss": 4.035792724297139, "tokens_seen": 379456512 }, { "epoch": 1.01, "learning_rate": 0.00044697091273821464, "loss": 3.2956, "theoretical_loss": 4.0357140582567546, "tokens_seen": 379522048 }, { "epoch": 1.01, "learning_rate": 0.0004469608826479438, "loss": 3.1532, "theoretical_loss": 4.03563540960207, "tokens_seen": 379587584 }, { "epoch": 1.01, "learning_rate": 0.00044695085255767305, "loss": 3.318, "theoretical_loss": 4.035556778326242, "tokens_seen": 379653120 }, { "epoch": 1.01, "learning_rate": 0.0004469408224674022, "loss": 3.2539, "theoretical_loss": 4.035478164422434, "tokens_seen": 379718656 }, { "epoch": 1.01, "learning_rate": 0.0004469307923771314, "loss": 3.2745, "theoretical_loss": 4.0353995678838075, "tokens_seen": 379784192 }, { "epoch": 1.01, "learning_rate": 0.0004469207622868606, "loss": 3.2102, "theoretical_loss": 4.035320988703533, "tokens_seen": 379849728 }, { "epoch": 1.01, "learning_rate": 0.0004469107321965898, "loss": 3.1481, "theoretical_loss": 4.035242426874782, "tokens_seen": 379915264 }, { "epoch": 1.01, "learning_rate": 0.00044690070210631896, "loss": 3.1289, "theoretical_loss": 4.035163882390732, "tokens_seen": 379980800 }, { "epoch": 1.01, "learning_rate": 0.00044689067201604814, "loss": 3.2767, "theoretical_loss": 4.035085355244561, "tokens_seen": 380046336 }, { "epoch": 1.01, "objective/train/docs_used": 631903, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1707515716552734, "objective/train/theoretical_loss": 4.035026471258818, "objective/train/tokens_used": 400555488, "theoretical_loss": 4.035026471258818, "tokens_seen": 380095488 }, { "epoch": 1.01, "learning_rate": 0.0004468806419257773, "loss": 3.189, "theoretical_loss": 4.035006845429456, "tokens_seen": 380111872 }, { "epoch": 1.01, "learning_rate": 0.00044687061183550656, "loss": 3.3398, "theoretical_loss": 4.0349283529386035, "tokens_seen": 380177408 }, { "epoch": 1.01, "learning_rate": 0.0004468605817452357, "loss": 3.1075, "theoretical_loss": 4.034849877765194, "tokens_seen": 380242944 }, { "epoch": 1.01, "learning_rate": 0.0004468505516549649, "loss": 3.1495, "theoretical_loss": 4.034771419902425, "tokens_seen": 380308480 }, { "epoch": 1.01, "learning_rate": 0.00044684052156469405, "loss": 3.0909, "theoretical_loss": 4.034692979343495, "tokens_seen": 380374016 }, { "epoch": 1.01, "learning_rate": 0.0004468304914744233, "loss": 3.1544, "theoretical_loss": 4.034614556081609, "tokens_seen": 380439552 }, { "epoch": 1.01, "learning_rate": 0.00044682046138415246, "loss": 3.1525, "theoretical_loss": 4.034536150109971, "tokens_seen": 380505088 }, { "epoch": 1.01, "learning_rate": 0.00044681043129388164, "loss": 3.2216, "theoretical_loss": 4.034457761421794, "tokens_seen": 380570624 }, { "epoch": 1.01, "learning_rate": 0.0004468004012036108, "loss": 2.9789, "theoretical_loss": 4.034379390010292, "tokens_seen": 380636160 }, { "epoch": 1.01, "learning_rate": 0.00044679037111334, "loss": 3.174, "theoretical_loss": 4.034301035868685, "tokens_seen": 380701696 }, { "epoch": 1.01, "learning_rate": 0.0004467803410230692, "loss": 3.2022, "theoretical_loss": 4.034222698990194, "tokens_seen": 380767232 }, { "epoch": 1.01, "learning_rate": 0.0004467703109327984, "loss": 3.1708, "theoretical_loss": 4.034144379368046, "tokens_seen": 380832768 }, { "epoch": 1.01, "learning_rate": 0.00044676028084252755, "loss": 3.2458, "theoretical_loss": 4.0340660769954715, "tokens_seen": 380898304 }, { "epoch": 1.01, "learning_rate": 0.0004467502507522568, "loss": 2.9546, "theoretical_loss": 4.033987791865703, "tokens_seen": 380963840 }, { "epoch": 1.01, "learning_rate": 0.00044674022066198597, "loss": 3.1723, "theoretical_loss": 4.03390952397198, "tokens_seen": 381029376 }, { "epoch": 1.01, "learning_rate": 0.00044673019057171515, "loss": 3.2023, "theoretical_loss": 4.033831273307542, "tokens_seen": 381094912 }, { "epoch": 1.01, "learning_rate": 0.00044672016048144433, "loss": 3.2225, "theoretical_loss": 4.033753039865637, "tokens_seen": 381160448 }, { "epoch": 1.01, "learning_rate": 0.0004467101303911735, "loss": 2.9344, "theoretical_loss": 4.033674823639512, "tokens_seen": 381225984 }, { "epoch": 1.01, "learning_rate": 0.0004467001003009027, "loss": 3.3626, "theoretical_loss": 4.033596624622421, "tokens_seen": 381291520 }, { "epoch": 1.01, "learning_rate": 0.0004466900702106319, "loss": 3.1289, "theoretical_loss": 4.03351844280762, "tokens_seen": 381357056 }, { "epoch": 1.01, "learning_rate": 0.0004466800401203611, "loss": 3.3191, "theoretical_loss": 4.033440278188371, "tokens_seen": 381422592 }, { "epoch": 1.01, "learning_rate": 0.0004466700100300903, "loss": 2.9545, "theoretical_loss": 4.033362130757936, "tokens_seen": 381488128 }, { "epoch": 1.01, "learning_rate": 0.00044665997993981947, "loss": 3.1746, "theoretical_loss": 4.033284000509586, "tokens_seen": 381553664 }, { "epoch": 1.01, "learning_rate": 0.00044664994984954865, "loss": 3.1733, "theoretical_loss": 4.033205887436592, "tokens_seen": 381619200 }, { "epoch": 1.01, "learning_rate": 0.0004466399197592779, "loss": 3.2657, "theoretical_loss": 4.033127791532229, "tokens_seen": 381684736 }, { "epoch": 1.01, "objective/train/docs_used": 633212, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.311382532119751, "objective/train/theoretical_loss": 4.033069230866828, "objective/train/tokens_used": 402193888, "theoretical_loss": 4.033069230866828, "tokens_seen": 381733888 }, { "epoch": 1.01, "learning_rate": 0.000446629889669007, "loss": 3.0603, "theoretical_loss": 4.0330497127897775, "tokens_seen": 381750272 }, { "epoch": 1.01, "learning_rate": 0.00044661985957873625, "loss": 3.2634, "theoretical_loss": 4.032971651202519, "tokens_seen": 381815808 }, { "epoch": 1.01, "learning_rate": 0.00044660982948846543, "loss": 3.17, "theoretical_loss": 4.032893606763744, "tokens_seen": 381881344 }, { "epoch": 1.01, "learning_rate": 0.0004465997993981946, "loss": 3.3028, "theoretical_loss": 4.03281557946674, "tokens_seen": 381946880 }, { "epoch": 1.01, "learning_rate": 0.0004465897693079238, "loss": 3.1473, "theoretical_loss": 4.032737569304803, "tokens_seen": 382012416 }, { "epoch": 1.01, "learning_rate": 0.00044657973921765297, "loss": 3.1514, "theoretical_loss": 4.032659576271232, "tokens_seen": 382077952 }, { "epoch": 1.01, "learning_rate": 0.00044656970912738215, "loss": 3.2032, "theoretical_loss": 4.032581600359329, "tokens_seen": 382143488 }, { "epoch": 1.01, "learning_rate": 0.0004465596790371114, "loss": 3.1734, "theoretical_loss": 4.0325036415624, "tokens_seen": 382209024 }, { "epoch": 1.01, "learning_rate": 0.0004465496489468405, "loss": 3.3463, "theoretical_loss": 4.0324256998737535, "tokens_seen": 382274560 }, { "epoch": 1.01, "learning_rate": 0.00044653961885656975, "loss": 3.0387, "theoretical_loss": 4.032347775286704, "tokens_seen": 382340096 }, { "epoch": 1.01, "learning_rate": 0.0004465295887662989, "loss": 3.0406, "theoretical_loss": 4.03226986779457, "tokens_seen": 382405632 }, { "epoch": 1.01, "learning_rate": 0.0004465195586760281, "loss": 3.1234, "theoretical_loss": 4.03219197739067, "tokens_seen": 382471168 }, { "epoch": 1.01, "learning_rate": 0.0004465095285857573, "loss": 3.1421, "theoretical_loss": 4.032114104068331, "tokens_seen": 382536704 }, { "epoch": 1.01, "learning_rate": 0.0004464994984954865, "loss": 3.2172, "theoretical_loss": 4.032036247820879, "tokens_seen": 382602240 }, { "epoch": 1.01, "learning_rate": 0.00044648946840521566, "loss": 3.2211, "theoretical_loss": 4.03195840864165, "tokens_seen": 382667776 }, { "epoch": 1.01, "learning_rate": 0.00044647943831494484, "loss": 3.224, "theoretical_loss": 4.031880586523976, "tokens_seen": 382733312 }, { "epoch": 1.01, "learning_rate": 0.000446469408224674, "loss": 3.2465, "theoretical_loss": 4.0318027814612, "tokens_seen": 382798848 }, { "epoch": 1.01, "learning_rate": 0.00044645937813440325, "loss": 3.1651, "theoretical_loss": 4.031724993446663, "tokens_seen": 382864384 }, { "epoch": 1.01, "learning_rate": 0.0004464493480441324, "loss": 3.2821, "theoretical_loss": 4.031647222473714, "tokens_seen": 382929920 }, { "epoch": 1.01, "learning_rate": 0.0004464393179538616, "loss": 3.048, "theoretical_loss": 4.031569468535704, "tokens_seen": 382995456 }, { "epoch": 1.01, "learning_rate": 0.0004464292878635908, "loss": 3.1204, "theoretical_loss": 4.031491731625986, "tokens_seen": 383060992 }, { "epoch": 1.01, "learning_rate": 0.00044641925777332, "loss": 3.0957, "theoretical_loss": 4.03141401173792, "tokens_seen": 383126528 }, { "epoch": 1.01, "learning_rate": 0.00044640922768304916, "loss": 3.0913, "theoretical_loss": 4.0313363088648675, "tokens_seen": 383192064 }, { "epoch": 1.01, "learning_rate": 0.00044639919759277834, "loss": 3.0233, "theoretical_loss": 4.031258623000195, "tokens_seen": 383257600 }, { "epoch": 1.01, "learning_rate": 0.0004463891675025075, "loss": 3.0461, "theoretical_loss": 4.031180954137271, "tokens_seen": 383323136 }, { "epoch": 1.01, "objective/train/docs_used": 637151, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.117532730102539, "objective/train/theoretical_loss": 4.03112271364349, "objective/train/tokens_used": 403832288, "theoretical_loss": 4.03112271364349, "tokens_seen": 383372288 }, { "epoch": 1.01, "learning_rate": 0.00044637913741223676, "loss": 3.1983, "theoretical_loss": 4.03110330226947, "tokens_seen": 383388672 }, { "epoch": 1.01, "learning_rate": 0.0004463691073219659, "loss": 3.1156, "theoretical_loss": 4.0310256673901685, "tokens_seen": 383454208 }, { "epoch": 1.01, "learning_rate": 0.0004463590772316951, "loss": 3.0981, "theoretical_loss": 4.030948049492747, "tokens_seen": 383519744 }, { "epoch": 1.01, "learning_rate": 0.00044634904714142425, "loss": 3.0808, "theoretical_loss": 4.030870448570591, "tokens_seen": 383585280 }, { "epoch": 1.01, "learning_rate": 0.0004463390170511535, "loss": 3.1125, "theoretical_loss": 4.030792864617087, "tokens_seen": 383650816 }, { "epoch": 1.01, "learning_rate": 0.00044632898696088266, "loss": 3.0892, "theoretical_loss": 4.030715297625628, "tokens_seen": 383716352 }, { "epoch": 1.01, "learning_rate": 0.00044631895687061184, "loss": 3.2634, "theoretical_loss": 4.030637747589609, "tokens_seen": 383781888 }, { "epoch": 1.01, "learning_rate": 0.000446308926780341, "loss": 3.1031, "theoretical_loss": 4.03056021450243, "tokens_seen": 383847424 }, { "epoch": 1.01, "learning_rate": 0.0004462988966900702, "loss": 3.2231, "theoretical_loss": 4.030482698357494, "tokens_seen": 383912960 }, { "epoch": 1.01, "learning_rate": 0.0004462888665997994, "loss": 3.1271, "theoretical_loss": 4.030405199148206, "tokens_seen": 383978496 }, { "epoch": 1.01, "learning_rate": 0.0004462788365095286, "loss": 3.1311, "theoretical_loss": 4.030327716867979, "tokens_seen": 384044032 }, { "epoch": 1.01, "learning_rate": 0.00044626880641925775, "loss": 3.3051, "theoretical_loss": 4.030250251510225, "tokens_seen": 384109568 }, { "epoch": 1.01, "learning_rate": 0.000446258776328987, "loss": 3.1579, "theoretical_loss": 4.030172803068362, "tokens_seen": 384175104 }, { "epoch": 1.01, "learning_rate": 0.00044624874623871617, "loss": 3.0105, "theoretical_loss": 4.030095371535813, "tokens_seen": 384240640 }, { "epoch": 1.01, "learning_rate": 0.00044623871614844535, "loss": 3.3145, "theoretical_loss": 4.030017956906001, "tokens_seen": 384306176 }, { "epoch": 1.01, "learning_rate": 0.00044622868605817453, "loss": 3.0775, "theoretical_loss": 4.029940559172355, "tokens_seen": 384371712 }, { "epoch": 1.01, "learning_rate": 0.0004462186559679037, "loss": 3.2684, "theoretical_loss": 4.029863178328309, "tokens_seen": 384437248 }, { "epoch": 1.01, "learning_rate": 0.0004462086258776329, "loss": 3.2485, "theoretical_loss": 4.0297858143672975, "tokens_seen": 384502784 }, { "epoch": 1.01, "learning_rate": 0.0004461985957873621, "loss": 3.1589, "theoretical_loss": 4.029708467282761, "tokens_seen": 384568320 }, { "epoch": 1.01, "learning_rate": 0.00044618856569709125, "loss": 3.081, "theoretical_loss": 4.029631137068144, "tokens_seen": 384633856 }, { "epoch": 1.01, "learning_rate": 0.0004461785356068205, "loss": 3.2683, "theoretical_loss": 4.029553823716891, "tokens_seen": 384699392 }, { "epoch": 1.01, "learning_rate": 0.0004461685055165496, "loss": 3.1109, "theoretical_loss": 4.029476527222455, "tokens_seen": 384764928 }, { "epoch": 1.01, "learning_rate": 0.00044615847542627885, "loss": 3.1706, "theoretical_loss": 4.029399247578289, "tokens_seen": 384830464 }, { "epoch": 1.01, "learning_rate": 0.00044614844533600803, "loss": 3.1271, "theoretical_loss": 4.029321984777853, "tokens_seen": 384896000 }, { "epoch": 1.01, "learning_rate": 0.0004461384152457372, "loss": 3.2686, "theoretical_loss": 4.029244738814607, "tokens_seen": 384961536 }, { "epoch": 1.01, "objective/train/docs_used": 638614, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4022700786590576, "objective/train/theoretical_loss": 4.029186815387647, "objective/train/tokens_used": 405470688, "theoretical_loss": 4.029186815387647, "tokens_seen": 385010688 }, { "epoch": 1.01, "learning_rate": 0.0004461283851554664, "loss": 3.2138, "theoretical_loss": 4.029167509682017, "tokens_seen": 385027072 }, { "epoch": 1.01, "learning_rate": 0.00044611835506519563, "loss": 3.0774, "theoretical_loss": 4.029090297373552, "tokens_seen": 385092608 }, { "epoch": 1.01, "learning_rate": 0.00044610832497492476, "loss": 3.1986, "theoretical_loss": 4.029013101882684, "tokens_seen": 385158144 }, { "epoch": 1.01, "learning_rate": 0.000446098294884654, "loss": 3.2223, "theoretical_loss": 4.02893592320289, "tokens_seen": 385223680 }, { "epoch": 1.01, "learning_rate": 0.0004460882647943831, "loss": 3.0841, "theoretical_loss": 4.02885876132765, "tokens_seen": 385289216 }, { "epoch": 1.01, "learning_rate": 0.00044607823470411235, "loss": 3.2132, "theoretical_loss": 4.0287816162504475, "tokens_seen": 385354752 }, { "epoch": 1.01, "learning_rate": 0.00044606820461384153, "loss": 3.1214, "theoretical_loss": 4.02870448796477, "tokens_seen": 385420288 }, { "epoch": 1.01, "learning_rate": 0.0004460581745235707, "loss": 3.2069, "theoretical_loss": 4.028627376464108, "tokens_seen": 385485824 }, { "epoch": 1.01, "learning_rate": 0.0004460481444332999, "loss": 3.1153, "theoretical_loss": 4.028550281741957, "tokens_seen": 385551360 }, { "epoch": 1.01, "learning_rate": 0.0004460381143430291, "loss": 3.1108, "theoretical_loss": 4.028473203791813, "tokens_seen": 385616896 }, { "epoch": 1.01, "learning_rate": 0.00044602808425275826, "loss": 3.0822, "theoretical_loss": 4.028396142607179, "tokens_seen": 385682432 }, { "epoch": 1.01, "learning_rate": 0.0004460180541624875, "loss": 3.1935, "theoretical_loss": 4.028319098181561, "tokens_seen": 385747968 }, { "epoch": 1.01, "learning_rate": 0.0004460080240722166, "loss": 3.2821, "theoretical_loss": 4.028242070508467, "tokens_seen": 385813504 }, { "epoch": 1.01, "learning_rate": 0.00044599799398194586, "loss": 3.0238, "theoretical_loss": 4.0281650595814105, "tokens_seen": 385879040 }, { "epoch": 1.01, "learning_rate": 0.000445987963891675, "loss": 3.2164, "theoretical_loss": 4.028088065393907, "tokens_seen": 385944576 }, { "epoch": 1.01, "learning_rate": 0.0004459779338014042, "loss": 3.2096, "theoretical_loss": 4.0280110879394755, "tokens_seen": 386010112 }, { "epoch": 1.01, "learning_rate": 0.0004459679037111334, "loss": 3.2475, "theoretical_loss": 4.027934127211641, "tokens_seen": 386075648 }, { "epoch": 1.01, "learning_rate": 0.0004459578736208626, "loss": 3.2187, "theoretical_loss": 4.027857183203931, "tokens_seen": 386141184 }, { "epoch": 1.01, "learning_rate": 0.00044594784353059176, "loss": 3.099, "theoretical_loss": 4.0277802559098745, "tokens_seen": 386206720 }, { "epoch": 1.01, "learning_rate": 0.000445937813440321, "loss": 3.2083, "theoretical_loss": 4.027703345323006, "tokens_seen": 386272256 }, { "epoch": 1.01, "learning_rate": 0.0004459277833500502, "loss": 3.2668, "theoretical_loss": 4.027626451436864, "tokens_seen": 386337792 }, { "epoch": 1.01, "learning_rate": 0.00044591775325977936, "loss": 3.1032, "theoretical_loss": 4.027549574244989, "tokens_seen": 386403328 }, { "epoch": 1.01, "learning_rate": 0.00044590772316950854, "loss": 3.3184, "theoretical_loss": 4.027472713740927, "tokens_seen": 386468864 }, { "epoch": 1.01, "learning_rate": 0.0004458976930792377, "loss": 3.1716, "theoretical_loss": 4.027395869918227, "tokens_seen": 386534400 }, { "epoch": 1.01, "learning_rate": 0.00044588766298896696, "loss": 3.2013, "theoretical_loss": 4.02731904277044, "tokens_seen": 386599936 }, { "epoch": 1.01, "objective/train/docs_used": 641636, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8756632804870605, "objective/train/theoretical_loss": 4.0272614333486345, "objective/train/tokens_used": 407109088, "theoretical_loss": 4.0272614333486345, "tokens_seen": 386649088 }, { "epoch": 1.01, "learning_rate": 0.0004458776328986961, "loss": 3.0738, "theoretical_loss": 4.027242232291122, "tokens_seen": 386665472 }, { "epoch": 1.01, "learning_rate": 0.0004458676028084253, "loss": 3.2877, "theoretical_loss": 4.0271654384738325, "tokens_seen": 386731008 }, { "epoch": 1.01, "learning_rate": 0.00044585757271815445, "loss": 3.2961, "theoretical_loss": 4.027088661312135, "tokens_seen": 386796544 }, { "epoch": 1.01, "learning_rate": 0.0004458475426278837, "loss": 3.1342, "theoretical_loss": 4.027011900799597, "tokens_seen": 386862080 }, { "epoch": 1.01, "learning_rate": 0.00044583751253761286, "loss": 3.1603, "theoretical_loss": 4.026935156929785, "tokens_seen": 386927616 }, { "epoch": 1.01, "learning_rate": 0.00044582748244734204, "loss": 3.1972, "theoretical_loss": 4.026858429696276, "tokens_seen": 386993152 }, { "epoch": 1.01, "learning_rate": 0.0004458174523570712, "loss": 3.1587, "theoretical_loss": 4.0267817190926465, "tokens_seen": 387058688 }, { "epoch": 1.01, "learning_rate": 0.0004458074222668004, "loss": 3.1363, "theoretical_loss": 4.026705025112476, "tokens_seen": 387124224 }, { "epoch": 1.01, "learning_rate": 0.0004457973921765296, "loss": 3.2598, "theoretical_loss": 4.026628347749351, "tokens_seen": 387189760 }, { "epoch": 1.01, "learning_rate": 0.0004457873620862588, "loss": 3.1693, "theoretical_loss": 4.026551686996857, "tokens_seen": 387255296 }, { "epoch": 1.01, "learning_rate": 0.00044577733199598795, "loss": 3.3725, "theoretical_loss": 4.026475042848588, "tokens_seen": 387320832 }, { "epoch": 1.01, "learning_rate": 0.0004457673019057172, "loss": 3.1516, "theoretical_loss": 4.026398415298138, "tokens_seen": 387386368 }, { "epoch": 1.01, "learning_rate": 0.00044575727181544637, "loss": 3.0204, "theoretical_loss": 4.026321804339105, "tokens_seen": 387451904 }, { "epoch": 1.01, "learning_rate": 0.00044574724172517555, "loss": 3.2431, "theoretical_loss": 4.026245209965092, "tokens_seen": 387517440 }, { "epoch": 1.01, "learning_rate": 0.00044573721163490473, "loss": 3.298, "theoretical_loss": 4.026168632169703, "tokens_seen": 387582976 }, { "epoch": 1.01, "learning_rate": 0.0004457271815446339, "loss": 3.1526, "theoretical_loss": 4.02609207094655, "tokens_seen": 387648512 }, { "epoch": 1.01, "learning_rate": 0.0004457171514543631, "loss": 3.1765, "theoretical_loss": 4.026015526289244, "tokens_seen": 387714048 }, { "epoch": 1.01, "learning_rate": 0.0004457071213640923, "loss": 3.3351, "theoretical_loss": 4.0259389981914016, "tokens_seen": 387779584 }, { "epoch": 1.01, "learning_rate": 0.00044569709127382145, "loss": 3.3395, "theoretical_loss": 4.025862486646643, "tokens_seen": 387845120 }, { "epoch": 1.01, "learning_rate": 0.0004456870611835507, "loss": 3.013, "theoretical_loss": 4.025785991648592, "tokens_seen": 387910656 }, { "epoch": 1.01, "learning_rate": 0.0004456770310932798, "loss": 3.1303, "theoretical_loss": 4.025709513190874, "tokens_seen": 387976192 }, { "epoch": 1.01, "learning_rate": 0.00044566700100300905, "loss": 3.187, "theoretical_loss": 4.025633051267121, "tokens_seen": 388041728 }, { "epoch": 1.01, "learning_rate": 0.00044565697091273823, "loss": 3.1807, "theoretical_loss": 4.025556605870966, "tokens_seen": 388107264 }, { "epoch": 1.01, "learning_rate": 0.0004456469408224674, "loss": 3.241, "theoretical_loss": 4.025480176996047, "tokens_seen": 388172800 }, { "epoch": 1.01, "learning_rate": 0.0004456369107321966, "loss": 3.2389, "theoretical_loss": 4.025403764636005, "tokens_seen": 388238336 }, { "epoch": 1.01, "objective/train/docs_used": 644548, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.338693857192993, "objective/train/theoretical_loss": 4.025346466200038, "objective/train/tokens_used": 408747488, "theoretical_loss": 4.025346466200038, "tokens_seen": 388287488 }, { "epoch": 1.01, "learning_rate": 0.00044562688064192583, "loss": 3.1083, "theoretical_loss": 4.025327368784485, "tokens_seen": 388303872 }, { "epoch": 1.01, "learning_rate": 0.00044561685055165496, "loss": 3.2245, "theoretical_loss": 4.0252509894351345, "tokens_seen": 388369408 }, { "epoch": 1.01, "learning_rate": 0.0004456068204613842, "loss": 3.3237, "theoretical_loss": 4.025174626581606, "tokens_seen": 388434944 }, { "epoch": 1.01, "learning_rate": 0.0004455967903711133, "loss": 3.1219, "theoretical_loss": 4.025098280217552, "tokens_seen": 388500480 }, { "epoch": 1.01, "learning_rate": 0.00044558676028084255, "loss": 3.2262, "theoretical_loss": 4.025021950336635, "tokens_seen": 388566016 }, { "epoch": 1.01, "learning_rate": 0.00044557673019057173, "loss": 3.2031, "theoretical_loss": 4.0249456369325145, "tokens_seen": 388631552 }, { "epoch": 1.01, "learning_rate": 0.0004455667001003009, "loss": 3.1753, "theoretical_loss": 4.024869339998856, "tokens_seen": 388697088 }, { "epoch": 1.01, "learning_rate": 0.0004455566700100301, "loss": 3.1525, "theoretical_loss": 4.024793059529331, "tokens_seen": 388762624 }, { "epoch": 1.01, "learning_rate": 0.0004455466399197593, "loss": 3.1837, "theoretical_loss": 4.02471679551761, "tokens_seen": 388828160 }, { "epoch": 1.01, "learning_rate": 0.00044553660982948846, "loss": 3.1845, "theoretical_loss": 4.024640547957369, "tokens_seen": 388893696 }, { "epoch": 1.01, "learning_rate": 0.0004455265797392177, "loss": 3.1545, "theoretical_loss": 4.024564316842289, "tokens_seen": 388959232 }, { "epoch": 1.01, "learning_rate": 0.0004455165496489468, "loss": 3.2623, "theoretical_loss": 4.024488102166052, "tokens_seen": 389024768 }, { "epoch": 1.01, "learning_rate": 0.00044550651955867606, "loss": 3.2009, "theoretical_loss": 4.024411903922346, "tokens_seen": 389090304 }, { "epoch": 1.01, "learning_rate": 0.0004454964894684052, "loss": 3.1885, "theoretical_loss": 4.02433572210486, "tokens_seen": 389155840 }, { "epoch": 1.01, "learning_rate": 0.0004454864593781344, "loss": 3.1609, "theoretical_loss": 4.024259556707287, "tokens_seen": 389221376 }, { "epoch": 1.01, "learning_rate": 0.0004454764292878636, "loss": 3.1481, "theoretical_loss": 4.024183407723326, "tokens_seen": 389286912 }, { "epoch": 1.01, "learning_rate": 0.0004454663991975928, "loss": 2.8893, "theoretical_loss": 4.024107275146676, "tokens_seen": 389352448 }, { "epoch": 1.01, "learning_rate": 0.00044545636910732196, "loss": 3.0705, "theoretical_loss": 4.024031158971042, "tokens_seen": 389417984 }, { "epoch": 1.01, "learning_rate": 0.0004454463390170512, "loss": 3.0589, "theoretical_loss": 4.02395505919013, "tokens_seen": 389483520 }, { "epoch": 1.01, "learning_rate": 0.0004454363089267803, "loss": 3.1369, "theoretical_loss": 4.023878975797652, "tokens_seen": 389549056 }, { "epoch": 1.01, "learning_rate": 0.00044542627883650956, "loss": 3.0674, "theoretical_loss": 4.0238029087873235, "tokens_seen": 389614592 }, { "epoch": 1.01, "learning_rate": 0.0004454162487462387, "loss": 3.1196, "theoretical_loss": 4.023726858152861, "tokens_seen": 389680128 }, { "epoch": 1.01, "learning_rate": 0.0004454062186559679, "loss": 3.0751, "theoretical_loss": 4.023650823887985, "tokens_seen": 389745664 }, { "epoch": 1.01, "learning_rate": 0.0004453961885656971, "loss": 3.1393, "theoretical_loss": 4.023574805986423, "tokens_seen": 389811200 }, { "epoch": 1.01, "learning_rate": 0.0004453861584754263, "loss": 3.2249, "theoretical_loss": 4.0234988044419016, "tokens_seen": 389876736 }, { "epoch": 1.01, "objective/train/docs_used": 647342, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.482271671295166, "objective/train/theoretical_loss": 4.023441814014048, "objective/train/tokens_used": 410385888, "theoretical_loss": 4.023441814014048, "tokens_seen": 389925888 }, { "epoch": 1.01, "learning_rate": 0.00044537612838515547, "loss": 3.4005, "theoretical_loss": 4.023422819248153, "tokens_seen": 389942272 }, { "epoch": 1.01, "learning_rate": 0.00044536609829488465, "loss": 3.1046, "theoretical_loss": 4.023346850398912, "tokens_seen": 390007808 }, { "epoch": 1.01, "learning_rate": 0.00044535606820461383, "loss": 3.1477, "theoretical_loss": 4.023270897887917, "tokens_seen": 390073344 }, { "epoch": 1.01, "learning_rate": 0.00044534603811434306, "loss": 3.1029, "theoretical_loss": 4.023194961708912, "tokens_seen": 390138880 }, { "epoch": 1.01, "learning_rate": 0.0004453360080240722, "loss": 3.2536, "theoretical_loss": 4.02311904185564, "tokens_seen": 390204416 }, { "epoch": 1.01, "learning_rate": 0.0004453259779338014, "loss": 3.2746, "theoretical_loss": 4.023043138321851, "tokens_seen": 390269952 }, { "epoch": 1.01, "learning_rate": 0.00044531594784353055, "loss": 3.3431, "theoretical_loss": 4.022967251101298, "tokens_seen": 390335488 }, { "epoch": 1.01, "learning_rate": 0.0004453059177532598, "loss": 3.178, "theoretical_loss": 4.022891380187737, "tokens_seen": 390401024 }, { "epoch": 1.01, "learning_rate": 0.00044529588766298897, "loss": 3.1621, "theoretical_loss": 4.022815525574927, "tokens_seen": 390466560 }, { "epoch": 1.01, "learning_rate": 0.00044528585757271815, "loss": 3.1188, "theoretical_loss": 4.02273968725663, "tokens_seen": 390532096 }, { "epoch": 1.01, "learning_rate": 0.00044527582748244733, "loss": 3.154, "theoretical_loss": 4.022663865226614, "tokens_seen": 390597632 }, { "epoch": 1.01, "learning_rate": 0.00044526579739217657, "loss": 3.1501, "theoretical_loss": 4.022588059478647, "tokens_seen": 390663168 }, { "epoch": 1.01, "learning_rate": 0.0004452557673019057, "loss": 3.1358, "theoretical_loss": 4.0225122700065015, "tokens_seen": 390728704 }, { "epoch": 1.01, "learning_rate": 0.00044524573721163493, "loss": 3.2348, "theoretical_loss": 4.022436496803956, "tokens_seen": 390794240 }, { "epoch": 1.01, "learning_rate": 0.00044523570712136406, "loss": 3.209, "theoretical_loss": 4.022360739864789, "tokens_seen": 390859776 }, { "epoch": 1.01, "learning_rate": 0.0004452256770310933, "loss": 3.1227, "theoretical_loss": 4.022284999182785, "tokens_seen": 390925312 }, { "epoch": 1.01, "learning_rate": 0.00044521564694082247, "loss": 3.1945, "theoretical_loss": 4.02220927475173, "tokens_seen": 390990848 }, { "epoch": 1.01, "learning_rate": 0.00044520561685055165, "loss": 3.17, "theoretical_loss": 4.022133566565413, "tokens_seen": 391056384 }, { "epoch": 1.01, "learning_rate": 0.00044519558676028083, "loss": 3.28, "theoretical_loss": 4.02205787461763, "tokens_seen": 391121920 }, { "epoch": 1.01, "learning_rate": 0.00044518555667001, "loss": 3.0713, "theoretical_loss": 4.021982198902176, "tokens_seen": 391187456 }, { "epoch": 1.02, "learning_rate": 0.00044517552657973925, "loss": 3.1661, "theoretical_loss": 4.0219065394128535, "tokens_seen": 391252992 }, { "epoch": 1.02, "learning_rate": 0.00044516549648946843, "loss": 3.1705, "theoretical_loss": 4.021830896143463, "tokens_seen": 391318528 }, { "epoch": 1.02, "learning_rate": 0.0004451554663991976, "loss": 3.1827, "theoretical_loss": 4.021755269087815, "tokens_seen": 391384064 }, { "epoch": 1.02, "learning_rate": 0.0004451454363089268, "loss": 3.0362, "theoretical_loss": 4.0216796582397185, "tokens_seen": 391449600 }, { "epoch": 1.02, "learning_rate": 0.00044513540621865603, "loss": 3.2966, "theoretical_loss": 4.021604063592988, "tokens_seen": 391515136 }, { "epoch": 1.02, "objective/train/docs_used": 649632, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0831313133239746, "objective/train/theoretical_loss": 4.021547378236367, "objective/train/tokens_used": 412024288, "theoretical_loss": 4.021547378236367, "tokens_seen": 391564288 }, { "epoch": 1.02, "learning_rate": 0.00044512537612838516, "loss": 3.1815, "theoretical_loss": 4.02152848514144, "tokens_seen": 391580672 }, { "epoch": 1.02, "learning_rate": 0.0004451153460381144, "loss": 3.057, "theoretical_loss": 4.021452922878896, "tokens_seen": 391646208 }, { "epoch": 1.02, "learning_rate": 0.0004451053159478435, "loss": 3.251, "theoretical_loss": 4.0213773767991805, "tokens_seen": 391711744 }, { "epoch": 1.02, "learning_rate": 0.00044509528585757275, "loss": 3.0871, "theoretical_loss": 4.021301846896121, "tokens_seen": 391777280 }, { "epoch": 1.02, "learning_rate": 0.00044508525576730194, "loss": 3.2716, "theoretical_loss": 4.021226333163547, "tokens_seen": 391842816 }, { "epoch": 1.02, "learning_rate": 0.0004450752256770311, "loss": 2.9601, "theoretical_loss": 4.021150835595295, "tokens_seen": 391908352 }, { "epoch": 1.02, "learning_rate": 0.0004450651955867603, "loss": 3.1007, "theoretical_loss": 4.021075354185201, "tokens_seen": 391973888 }, { "epoch": 1.02, "learning_rate": 0.0004450551654964895, "loss": 3.142, "theoretical_loss": 4.020999888927107, "tokens_seen": 392039424 }, { "epoch": 1.02, "learning_rate": 0.00044504513540621866, "loss": 3.0993, "theoretical_loss": 4.020924439814857, "tokens_seen": 392104960 }, { "epoch": 1.02, "learning_rate": 0.0004450351053159479, "loss": 3.2119, "theoretical_loss": 4.0208490068423, "tokens_seen": 392170496 }, { "epoch": 1.02, "learning_rate": 0.000445025075225677, "loss": 3.2581, "theoretical_loss": 4.0207735900032855, "tokens_seen": 392236032 }, { "epoch": 1.02, "learning_rate": 0.00044501504513540626, "loss": 3.2317, "theoretical_loss": 4.02069818929167, "tokens_seen": 392301568 }, { "epoch": 1.02, "learning_rate": 0.0004450050150451354, "loss": 3.1351, "theoretical_loss": 4.0206228047013095, "tokens_seen": 392367104 }, { "epoch": 1.02, "learning_rate": 0.0004449949849548646, "loss": 3.2002, "theoretical_loss": 4.020547436226067, "tokens_seen": 392432640 }, { "epoch": 1.02, "learning_rate": 0.0004449849548645938, "loss": 3.1583, "theoretical_loss": 4.020472083859806, "tokens_seen": 392498176 }, { "epoch": 1.02, "learning_rate": 0.000444974924774323, "loss": 3.1493, "theoretical_loss": 4.020396747596395, "tokens_seen": 392563712 }, { "epoch": 1.02, "learning_rate": 0.00044496489468405216, "loss": 3.2375, "theoretical_loss": 4.020321427429705, "tokens_seen": 392629248 }, { "epoch": 1.02, "learning_rate": 0.0004449548645937814, "loss": 3.1719, "theoretical_loss": 4.020246123353612, "tokens_seen": 392694784 }, { "epoch": 1.02, "learning_rate": 0.0004449448345035105, "loss": 3.1373, "theoretical_loss": 4.020170835361992, "tokens_seen": 392760320 }, { "epoch": 1.02, "learning_rate": 0.00044493480441323976, "loss": 3.1245, "theoretical_loss": 4.020095563448729, "tokens_seen": 392825856 }, { "epoch": 1.02, "learning_rate": 0.0004449247743229689, "loss": 3.1288, "theoretical_loss": 4.020020307607706, "tokens_seen": 392891392 }, { "epoch": 1.02, "learning_rate": 0.0004449147442326981, "loss": 3.0218, "theoretical_loss": 4.019945067832811, "tokens_seen": 392956928 }, { "epoch": 1.02, "learning_rate": 0.0004449047141424273, "loss": 3.0215, "theoretical_loss": 4.019869844117938, "tokens_seen": 393022464 }, { "epoch": 1.02, "learning_rate": 0.0004448946840521565, "loss": 3.1933, "theoretical_loss": 4.019794636456979, "tokens_seen": 393088000 }, { "epoch": 1.02, "learning_rate": 0.00044488465396188567, "loss": 3.0746, "theoretical_loss": 4.019719444843833, "tokens_seen": 393153536 }, { "epoch": 1.02, "objective/train/docs_used": 652342, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.425205945968628, "objective/train/theoretical_loss": 4.0196630616616815, "objective/train/tokens_used": 413662688, "theoretical_loss": 4.0196630616616815, "tokens_seen": 393202688 }, { "epoch": 1.02, "learning_rate": 0.00044487462387161485, "loss": 3.1477, "theoretical_loss": 4.019644269272401, "tokens_seen": 393219072 }, { "epoch": 1.02, "learning_rate": 0.00044486459378134403, "loss": 3.1995, "theoretical_loss": 4.01956910973659, "tokens_seen": 393284608 }, { "epoch": 1.02, "learning_rate": 0.00044485456369107326, "loss": 3.3738, "theoretical_loss": 4.019493966230306, "tokens_seen": 393350144 }, { "epoch": 1.02, "learning_rate": 0.0004448445336008024, "loss": 3.1653, "theoretical_loss": 4.019418838747462, "tokens_seen": 393415680 }, { "epoch": 1.02, "learning_rate": 0.0004448345035105316, "loss": 3.1426, "theoretical_loss": 4.019343727281971, "tokens_seen": 393481216 }, { "epoch": 1.02, "learning_rate": 0.00044482447342026075, "loss": 3.1297, "theoretical_loss": 4.019268631827752, "tokens_seen": 393546752 }, { "epoch": 1.02, "learning_rate": 0.00044481444332999, "loss": 3.2405, "theoretical_loss": 4.019193552378728, "tokens_seen": 393612288 }, { "epoch": 1.02, "learning_rate": 0.00044480441323971917, "loss": 3.184, "theoretical_loss": 4.019118488928822, "tokens_seen": 393677824 }, { "epoch": 1.02, "learning_rate": 0.00044479438314944835, "loss": 3.2742, "theoretical_loss": 4.019043441471962, "tokens_seen": 393743360 }, { "epoch": 1.02, "learning_rate": 0.00044478435305917753, "loss": 3.078, "theoretical_loss": 4.01896841000208, "tokens_seen": 393808896 }, { "epoch": 1.02, "learning_rate": 0.00044477432296890677, "loss": 3.0352, "theoretical_loss": 4.018893394513112, "tokens_seen": 393874432 }, { "epoch": 1.02, "learning_rate": 0.0004447642928786359, "loss": 3.1681, "theoretical_loss": 4.018818394998994, "tokens_seen": 393939968 }, { "epoch": 1.02, "learning_rate": 0.00044475426278836513, "loss": 3.2009, "theoretical_loss": 4.018743411453668, "tokens_seen": 394005504 }, { "epoch": 1.02, "learning_rate": 0.00044474423269809426, "loss": 3.1461, "theoretical_loss": 4.018668443871079, "tokens_seen": 394071040 }, { "epoch": 1.02, "learning_rate": 0.0004447342026078235, "loss": 3.1544, "theoretical_loss": 4.018593492245175, "tokens_seen": 394136576 }, { "epoch": 1.02, "learning_rate": 0.00044472417251755267, "loss": 3.0311, "theoretical_loss": 4.018518556569908, "tokens_seen": 394202112 }, { "epoch": 1.02, "learning_rate": 0.00044471414242728185, "loss": 3.1817, "theoretical_loss": 4.018443636839231, "tokens_seen": 394267648 }, { "epoch": 1.02, "learning_rate": 0.00044470411233701103, "loss": 3.0491, "theoretical_loss": 4.018368733047102, "tokens_seen": 394333184 }, { "epoch": 1.02, "learning_rate": 0.0004446940822467402, "loss": 3.2832, "theoretical_loss": 4.018293845187483, "tokens_seen": 394398720 }, { "epoch": 1.02, "learning_rate": 0.0004446840521564694, "loss": 3.1093, "theoretical_loss": 4.018218973254338, "tokens_seen": 394464256 }, { "epoch": 1.02, "learning_rate": 0.00044467402206619863, "loss": 3.1159, "theoretical_loss": 4.018144117241635, "tokens_seen": 394529792 }, { "epoch": 1.02, "learning_rate": 0.00044466399197592776, "loss": 3.0352, "theoretical_loss": 4.018069277143344, "tokens_seen": 394595328 }, { "epoch": 1.02, "learning_rate": 0.000444653961885657, "loss": 3.2365, "theoretical_loss": 4.017994452953441, "tokens_seen": 394660864 }, { "epoch": 1.02, "learning_rate": 0.0004446439317953861, "loss": 3.1296, "theoretical_loss": 4.017919644665903, "tokens_seen": 394726400 }, { "epoch": 1.02, "learning_rate": 0.00044463390170511536, "loss": 3.2051, "theoretical_loss": 4.01784485227471, "tokens_seen": 394791936 }, { "epoch": 1.02, "objective/train/docs_used": 655042, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.976278781890869, "objective/train/theoretical_loss": 4.017788768409673, "objective/train/tokens_used": 415301088, "theoretical_loss": 4.017788768409673, "tokens_seen": 394841088 }, { "epoch": 1.02, "learning_rate": 0.00044462387161484454, "loss": 3.1336, "theoretical_loss": 4.017770075773846, "tokens_seen": 394857472 }, { "epoch": 1.02, "learning_rate": 0.0004446138415245737, "loss": 3.0651, "theoretical_loss": 4.017695315157301, "tokens_seen": 394923008 }, { "epoch": 1.02, "learning_rate": 0.0004446038114343029, "loss": 3.2821, "theoretical_loss": 4.017620570419063, "tokens_seen": 394988544 }, { "epoch": 1.02, "learning_rate": 0.00044459378134403214, "loss": 3.1694, "theoretical_loss": 4.017545841553127, "tokens_seen": 395054080 }, { "epoch": 1.02, "learning_rate": 0.00044458375125376126, "loss": 3.3314, "theoretical_loss": 4.01747112855349, "tokens_seen": 395119616 }, { "epoch": 1.02, "learning_rate": 0.0004445737211634905, "loss": 3.1524, "theoretical_loss": 4.017396431414152, "tokens_seen": 395185152 }, { "epoch": 1.02, "learning_rate": 0.0004445636910732196, "loss": 3.2209, "theoretical_loss": 4.017321750129118, "tokens_seen": 395250688 }, { "epoch": 1.02, "learning_rate": 0.00044455366098294886, "loss": 3.0883, "theoretical_loss": 4.017247084692394, "tokens_seen": 395316224 }, { "epoch": 1.02, "learning_rate": 0.00044454363089267804, "loss": 3.0281, "theoretical_loss": 4.01717243509799, "tokens_seen": 395381760 }, { "epoch": 1.02, "learning_rate": 0.0004445336008024072, "loss": 3.2338, "theoretical_loss": 4.01709780133992, "tokens_seen": 395447296 }, { "epoch": 1.02, "learning_rate": 0.0004445235707121364, "loss": 3.1416, "theoretical_loss": 4.017023183412203, "tokens_seen": 395512832 }, { "epoch": 1.02, "learning_rate": 0.0004445135406218656, "loss": 3.3052, "theoretical_loss": 4.016948581308855, "tokens_seen": 395578368 }, { "epoch": 1.02, "learning_rate": 0.00044450351053159477, "loss": 3.0801, "theoretical_loss": 4.016873995023902, "tokens_seen": 395643904 }, { "epoch": 1.02, "learning_rate": 0.000444493480441324, "loss": 3.3004, "theoretical_loss": 4.016799424551369, "tokens_seen": 395709440 }, { "epoch": 1.02, "learning_rate": 0.00044448345035105313, "loss": 3.1106, "theoretical_loss": 4.016724869885286, "tokens_seen": 395774976 }, { "epoch": 1.02, "learning_rate": 0.00044447342026078236, "loss": 3.1639, "theoretical_loss": 4.016650331019688, "tokens_seen": 395840512 }, { "epoch": 1.02, "learning_rate": 0.0004444633901705115, "loss": 3.1595, "theoretical_loss": 4.016575807948609, "tokens_seen": 395906048 }, { "epoch": 1.02, "learning_rate": 0.0004444533600802407, "loss": 3.1512, "theoretical_loss": 4.016501300666089, "tokens_seen": 395971584 }, { "epoch": 1.02, "learning_rate": 0.00044444332998996996, "loss": 3.1545, "theoretical_loss": 4.016426809166172, "tokens_seen": 396037120 }, { "epoch": 1.02, "learning_rate": 0.0004444332998996991, "loss": 3.1667, "theoretical_loss": 4.016352333442902, "tokens_seen": 396102656 }, { "epoch": 1.02, "learning_rate": 0.0004444232698094283, "loss": 3.3667, "theoretical_loss": 4.0162778734903295, "tokens_seen": 396168192 }, { "epoch": 1.02, "learning_rate": 0.0004444132397191575, "loss": 3.0118, "theoretical_loss": 4.016203429302507, "tokens_seen": 396233728 }, { "epoch": 1.02, "learning_rate": 0.0004444032096288867, "loss": 3.2139, "theoretical_loss": 4.016129000873489, "tokens_seen": 396299264 }, { "epoch": 1.02, "learning_rate": 0.00044439317953861587, "loss": 3.2505, "theoretical_loss": 4.016054588197336, "tokens_seen": 396364800 }, { "epoch": 1.02, "learning_rate": 0.00044438314944834505, "loss": 3.1907, "theoretical_loss": 4.015980191268109, "tokens_seen": 396430336 }, { "epoch": 1.02, "objective/train/docs_used": 657795, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1850392818450928, "objective/train/theoretical_loss": 4.015924403901538, "objective/train/tokens_used": 416939488, "theoretical_loss": 4.015924403901538, "tokens_seen": 396479488 }, { "epoch": 1.02, "learning_rate": 0.00044437311935807423, "loss": 3.2413, "theoretical_loss": 4.015905810079873, "tokens_seen": 396495872 }, { "epoch": 1.02, "learning_rate": 0.00044436308926780346, "loss": 3.1881, "theoretical_loss": 4.015831444626697, "tokens_seen": 396561408 }, { "epoch": 1.02, "learning_rate": 0.0004443530591775326, "loss": 3.0779, "theoretical_loss": 4.0157570949026535, "tokens_seen": 396626944 }, { "epoch": 1.02, "learning_rate": 0.0004443430290872618, "loss": 3.3068, "theoretical_loss": 4.015682760901816, "tokens_seen": 396692480 }, { "epoch": 1.02, "learning_rate": 0.00044433299899699095, "loss": 3.1219, "theoretical_loss": 4.015608442618264, "tokens_seen": 396758016 }, { "epoch": 1.02, "learning_rate": 0.0004443229689067202, "loss": 3.2288, "theoretical_loss": 4.015534140046078, "tokens_seen": 396823552 }, { "epoch": 1.02, "learning_rate": 0.00044431293881644937, "loss": 3.1498, "theoretical_loss": 4.015459853179342, "tokens_seen": 396889088 }, { "epoch": 1.02, "learning_rate": 0.00044430290872617855, "loss": 3.1391, "theoretical_loss": 4.015385582012146, "tokens_seen": 396954624 }, { "epoch": 1.02, "learning_rate": 0.00044429287863590773, "loss": 3.0391, "theoretical_loss": 4.01531132653858, "tokens_seen": 397020160 }, { "epoch": 1.02, "learning_rate": 0.00044428284854563697, "loss": 3.1169, "theoretical_loss": 4.0152370867527365, "tokens_seen": 397085696 }, { "epoch": 1.02, "learning_rate": 0.0004442728184553661, "loss": 3.1967, "theoretical_loss": 4.015162862648714, "tokens_seen": 397151232 }, { "epoch": 1.02, "learning_rate": 0.00044426278836509533, "loss": 3.1238, "theoretical_loss": 4.015088654220614, "tokens_seen": 397216768 }, { "epoch": 1.02, "learning_rate": 0.00044425275827482446, "loss": 3.2535, "theoretical_loss": 4.01501446146254, "tokens_seen": 397282304 }, { "epoch": 1.02, "learning_rate": 0.0004442427281845537, "loss": 3.1461, "theoretical_loss": 4.014940284368598, "tokens_seen": 397347840 }, { "epoch": 1.02, "learning_rate": 0.00044423269809428287, "loss": 3.0547, "theoretical_loss": 4.014866122932899, "tokens_seen": 397413376 }, { "epoch": 1.02, "learning_rate": 0.00044422266800401205, "loss": 3.1003, "theoretical_loss": 4.014791977149556, "tokens_seen": 397478912 }, { "epoch": 1.02, "learning_rate": 0.00044421263791374123, "loss": 3.1201, "theoretical_loss": 4.014717847012685, "tokens_seen": 397544448 }, { "epoch": 1.02, "learning_rate": 0.0004442026078234704, "loss": 3.1644, "theoretical_loss": 4.014643732516407, "tokens_seen": 397609984 }, { "epoch": 1.02, "learning_rate": 0.0004441925777331996, "loss": 3.2568, "theoretical_loss": 4.014569633654844, "tokens_seen": 397675520 }, { "epoch": 1.02, "learning_rate": 0.00044418254764292883, "loss": 3.1901, "theoretical_loss": 4.014495550422121, "tokens_seen": 397741056 }, { "epoch": 1.02, "learning_rate": 0.00044417251755265796, "loss": 3.0878, "theoretical_loss": 4.01442148281237, "tokens_seen": 397806592 }, { "epoch": 1.02, "learning_rate": 0.0004441624874623872, "loss": 3.0368, "theoretical_loss": 4.01434743081972, "tokens_seen": 397872128 }, { "epoch": 1.02, "learning_rate": 0.0004441524573721163, "loss": 3.0908, "theoretical_loss": 4.01427339443831, "tokens_seen": 397937664 }, { "epoch": 1.02, "learning_rate": 0.00044414242728184556, "loss": 3.1215, "theoretical_loss": 4.014199373662277, "tokens_seen": 398003200 }, { "epoch": 1.02, "learning_rate": 0.00044413239719157474, "loss": 2.8899, "theoretical_loss": 4.014125368485762, "tokens_seen": 398068736 }, { "epoch": 1.02, "objective/train/docs_used": 659073, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1963558197021484, "objective/train/theoretical_loss": 4.014069874837038, "objective/train/tokens_used": 418577888, "theoretical_loss": 4.014069874837038, "tokens_seen": 398117888 }, { "epoch": 1.02, "learning_rate": 0.0004441223671013039, "loss": 3.0919, "theoretical_loss": 4.014051378902911, "tokens_seen": 398134272 }, { "epoch": 1.02, "learning_rate": 0.0004441123370110331, "loss": 3.2818, "theoretical_loss": 4.013977404907873, "tokens_seen": 398199808 }, { "epoch": 1.02, "learning_rate": 0.00044410230692076234, "loss": 3.1324, "theoretical_loss": 4.0139034464947985, "tokens_seen": 398265344 }, { "epoch": 1.02, "learning_rate": 0.00044409227683049146, "loss": 3.082, "theoretical_loss": 4.013829503657842, "tokens_seen": 398330880 }, { "epoch": 1.02, "learning_rate": 0.0004440822467402207, "loss": 3.1842, "theoretical_loss": 4.013755576391161, "tokens_seen": 398396416 }, { "epoch": 1.02, "learning_rate": 0.0004440722166499498, "loss": 3.1254, "theoretical_loss": 4.013681664688917, "tokens_seen": 398461952 }, { "epoch": 1.02, "learning_rate": 0.00044406218655967906, "loss": 3.0077, "theoretical_loss": 4.013607768545274, "tokens_seen": 398527488 }, { "epoch": 1.02, "learning_rate": 0.00044405215646940824, "loss": 3.072, "theoretical_loss": 4.013533887954399, "tokens_seen": 398593024 }, { "epoch": 1.02, "learning_rate": 0.0004440421263791374, "loss": 2.9596, "theoretical_loss": 4.013460022910461, "tokens_seen": 398658560 }, { "epoch": 1.02, "learning_rate": 0.0004440320962888666, "loss": 3.2128, "theoretical_loss": 4.013386173407636, "tokens_seen": 398724096 }, { "epoch": 1.02, "learning_rate": 0.0004440220661985958, "loss": 2.98, "theoretical_loss": 4.013312339440099, "tokens_seen": 398789632 }, { "epoch": 1.02, "learning_rate": 0.00044401203610832497, "loss": 3.2338, "theoretical_loss": 4.013238521002029, "tokens_seen": 398855168 }, { "epoch": 1.02, "learning_rate": 0.0004440020060180542, "loss": 3.1396, "theoretical_loss": 4.01316471808761, "tokens_seen": 398920704 }, { "epoch": 1.02, "learning_rate": 0.00044399197592778333, "loss": 3.0749, "theoretical_loss": 4.013090930691028, "tokens_seen": 398986240 }, { "epoch": 1.02, "learning_rate": 0.00044398194583751256, "loss": 2.9859, "theoretical_loss": 4.0130171588064725, "tokens_seen": 399051776 }, { "epoch": 1.02, "learning_rate": 0.0004439719157472417, "loss": 2.9431, "theoretical_loss": 4.012943402428134, "tokens_seen": 399117312 }, { "epoch": 1.02, "learning_rate": 0.0004439618856569709, "loss": 3.1454, "theoretical_loss": 4.0128696615502095, "tokens_seen": 399182848 }, { "epoch": 1.02, "learning_rate": 0.0004439518555667001, "loss": 3.2182, "theoretical_loss": 4.012795936166897, "tokens_seen": 399248384 }, { "epoch": 1.02, "learning_rate": 0.0004439418254764293, "loss": 3.0838, "theoretical_loss": 4.012722226272397, "tokens_seen": 399313920 }, { "epoch": 1.02, "learning_rate": 0.00044393179538615847, "loss": 3.2453, "theoretical_loss": 4.012648531860917, "tokens_seen": 399379456 }, { "epoch": 1.02, "learning_rate": 0.0004439217652958877, "loss": 3.2477, "theoretical_loss": 4.012574852926662, "tokens_seen": 399444992 }, { "epoch": 1.02, "learning_rate": 0.00044391173520561683, "loss": 3.1862, "theoretical_loss": 4.012501189463843, "tokens_seen": 399510528 }, { "epoch": 1.02, "learning_rate": 0.00044390170511534607, "loss": 3.1929, "theoretical_loss": 4.012427541466677, "tokens_seen": 399576064 }, { "epoch": 1.02, "learning_rate": 0.0004438916750250752, "loss": 3.2371, "theoretical_loss": 4.012353908929379, "tokens_seen": 399641600 }, { "epoch": 1.02, "learning_rate": 0.00044388164493480443, "loss": 3.1941, "theoretical_loss": 4.012280291846169, "tokens_seen": 399707136 }, { "epoch": 1.02, "objective/train/docs_used": 661929, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1316635608673096, "objective/train/theoretical_loss": 4.012225089172033, "objective/train/tokens_used": 420216288, "theoretical_loss": 4.012225089172033, "tokens_seen": 399756288 }, { "epoch": 1.02, "learning_rate": 0.0004438716148445336, "loss": 3.2071, "theoretical_loss": 4.012206690211272, "tokens_seen": 399772672 }, { "epoch": 1.02, "learning_rate": 0.0004438615847542628, "loss": 3.1421, "theoretical_loss": 4.012133104018914, "tokens_seen": 399838208 }, { "epoch": 1.02, "learning_rate": 0.00044385155466399197, "loss": 3.2449, "theoretical_loss": 4.012059533263323, "tokens_seen": 399903744 }, { "epoch": 1.02, "learning_rate": 0.00044384152457372115, "loss": 3.1714, "theoretical_loss": 4.011985977938735, "tokens_seen": 399969280 }, { "epoch": 1.02, "learning_rate": 0.00044383149448345033, "loss": 3.0767, "theoretical_loss": 4.011912438039381, "tokens_seen": 400034816 }, { "epoch": 1.02, "learning_rate": 0.00044382146439317957, "loss": 3.3464, "theoretical_loss": 4.011838913559505, "tokens_seen": 400100352 }, { "epoch": 1.02, "learning_rate": 0.0004438114343029087, "loss": 3.0958, "theoretical_loss": 4.011765404493346, "tokens_seen": 400165888 }, { "epoch": 1.02, "learning_rate": 0.00044380140421263793, "loss": 3.2756, "theoretical_loss": 4.01169191083515, "tokens_seen": 400231424 }, { "epoch": 1.02, "learning_rate": 0.0004437913741223671, "loss": 3.2101, "theoretical_loss": 4.011618432579166, "tokens_seen": 400296960 }, { "epoch": 1.02, "learning_rate": 0.0004437813440320963, "loss": 3.1358, "theoretical_loss": 4.011544969719644, "tokens_seen": 400362496 }, { "epoch": 1.02, "learning_rate": 0.0004437713139418255, "loss": 3.1691, "theoretical_loss": 4.011471522250838, "tokens_seen": 400428032 }, { "epoch": 1.02, "learning_rate": 0.00044376128385155466, "loss": 3.1723, "theoretical_loss": 4.011398090167007, "tokens_seen": 400493568 }, { "epoch": 1.02, "learning_rate": 0.00044375125376128384, "loss": 3.1188, "theoretical_loss": 4.011324673462411, "tokens_seen": 400559104 }, { "epoch": 1.02, "learning_rate": 0.00044374122367101307, "loss": 3.1563, "theoretical_loss": 4.011251272131313, "tokens_seen": 400624640 }, { "epoch": 1.02, "learning_rate": 0.0004437311935807422, "loss": 3.0824, "theoretical_loss": 4.01117788616798, "tokens_seen": 400690176 }, { "epoch": 1.02, "learning_rate": 0.00044372116349047143, "loss": 3.0572, "theoretical_loss": 4.011104515566682, "tokens_seen": 400755712 }, { "epoch": 1.02, "learning_rate": 0.00044371113340020056, "loss": 3.1788, "theoretical_loss": 4.011031160321693, "tokens_seen": 400821248 }, { "epoch": 1.02, "learning_rate": 0.0004437011033099298, "loss": 3.1684, "theoretical_loss": 4.010957820427286, "tokens_seen": 400886784 }, { "epoch": 1.02, "learning_rate": 0.00044369107321965903, "loss": 3.182, "theoretical_loss": 4.010884495877743, "tokens_seen": 400952320 }, { "epoch": 1.02, "learning_rate": 0.00044368104312938816, "loss": 3.1687, "theoretical_loss": 4.010811186667344, "tokens_seen": 401017856 }, { "epoch": 1.02, "learning_rate": 0.0004436710130391174, "loss": 3.2799, "theoretical_loss": 4.010737892790376, "tokens_seen": 401083392 }, { "epoch": 1.02, "learning_rate": 0.0004436609829488465, "loss": 3.2692, "theoretical_loss": 4.010664614241124, "tokens_seen": 401148928 }, { "epoch": 1.02, "learning_rate": 0.00044365095285857576, "loss": 3.1671, "theoretical_loss": 4.010591351013883, "tokens_seen": 401214464 }, { "epoch": 1.02, "learning_rate": 0.00044364092276830494, "loss": 3.1197, "theoretical_loss": 4.010518103102945, "tokens_seen": 401280000 }, { "epoch": 1.02, "learning_rate": 0.0004436308926780341, "loss": 3.0354, "theoretical_loss": 4.010444870502608, "tokens_seen": 401345536 }, { "epoch": 1.02, "objective/train/docs_used": 664949, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1559255123138428, "objective/train/theoretical_loss": 4.010389956096509, "objective/train/tokens_used": 421854688, "theoretical_loss": 4.010389956096509, "tokens_seen": 401394688 }, { "epoch": 1.02, "learning_rate": 0.0004436208625877633, "loss": 3.1059, "theoretical_loss": 4.010371653207173, "tokens_seen": 401411072 }, { "epoch": 1.02, "learning_rate": 0.00044361083249749254, "loss": 3.004, "theoretical_loss": 4.010298451210942, "tokens_seen": 401476608 }, { "epoch": 1.02, "learning_rate": 0.00044360080240722166, "loss": 3.1094, "theoretical_loss": 4.010225264508223, "tokens_seen": 401542144 }, { "epoch": 1.02, "learning_rate": 0.0004435907723169509, "loss": 3.1524, "theoretical_loss": 4.010152093093325, "tokens_seen": 401607680 }, { "epoch": 1.02, "learning_rate": 0.00044358074222668, "loss": 3.0404, "theoretical_loss": 4.01007893696056, "tokens_seen": 401673216 }, { "epoch": 1.02, "learning_rate": 0.00044357071213640926, "loss": 3.1441, "theoretical_loss": 4.010005796104245, "tokens_seen": 401738752 }, { "epoch": 1.02, "learning_rate": 0.00044356068204613844, "loss": 3.0723, "theoretical_loss": 4.0099326705186975, "tokens_seen": 401804288 }, { "epoch": 1.02, "learning_rate": 0.0004435506519558676, "loss": 3.1296, "theoretical_loss": 4.009859560198239, "tokens_seen": 401869824 }, { "epoch": 1.02, "learning_rate": 0.0004435406218655968, "loss": 3.0601, "theoretical_loss": 4.0097864651371955, "tokens_seen": 401935360 }, { "epoch": 1.02, "learning_rate": 0.000443530591775326, "loss": 3.1076, "theoretical_loss": 4.009713385329894, "tokens_seen": 402000896 }, { "epoch": 1.02, "learning_rate": 0.00044352056168505517, "loss": 3.2014, "theoretical_loss": 4.009640320770666, "tokens_seen": 402066432 }, { "epoch": 1.02, "learning_rate": 0.0004435105315947844, "loss": 3.2282, "theoretical_loss": 4.009567271453845, "tokens_seen": 402131968 }, { "epoch": 1.02, "learning_rate": 0.00044350050150451353, "loss": 3.0772, "theoretical_loss": 4.009494237373768, "tokens_seen": 402197504 }, { "epoch": 1.02, "learning_rate": 0.00044349047141424276, "loss": 3.1647, "theoretical_loss": 4.009421218524774, "tokens_seen": 402263040 }, { "epoch": 1.02, "learning_rate": 0.0004434804413239719, "loss": 3.0952, "theoretical_loss": 4.009348214901207, "tokens_seen": 402328576 }, { "epoch": 1.02, "learning_rate": 0.0004434704112337011, "loss": 3.2656, "theoretical_loss": 4.0092752264974125, "tokens_seen": 402394112 }, { "epoch": 1.02, "learning_rate": 0.0004434603811434303, "loss": 3.1997, "theoretical_loss": 4.00920225330774, "tokens_seen": 402459648 }, { "epoch": 1.02, "learning_rate": 0.0004434503510531595, "loss": 3.1141, "theoretical_loss": 4.009129295326542, "tokens_seen": 402525184 }, { "epoch": 1.02, "learning_rate": 0.00044344032096288867, "loss": 3.1395, "theoretical_loss": 4.009056352548171, "tokens_seen": 402590720 }, { "epoch": 1.02, "learning_rate": 0.0004434302908726179, "loss": 2.9297, "theoretical_loss": 4.008983424966988, "tokens_seen": 402656256 }, { "epoch": 1.02, "learning_rate": 0.00044342026078234703, "loss": 2.8848, "theoretical_loss": 4.008910512577351, "tokens_seen": 402721792 }, { "epoch": 1.02, "learning_rate": 0.00044341023069207627, "loss": 3.0209, "theoretical_loss": 4.008837615373627, "tokens_seen": 402787328 }, { "epoch": 1.02, "learning_rate": 0.0004434002006018054, "loss": 3.0426, "theoretical_loss": 4.008764733350183, "tokens_seen": 402852864 }, { "epoch": 1.02, "learning_rate": 0.00044339017051153463, "loss": 3.107, "theoretical_loss": 4.008691866501387, "tokens_seen": 402918400 }, { "epoch": 1.02, "learning_rate": 0.0004433801404212638, "loss": 3.0955, "theoretical_loss": 4.008619014821613, "tokens_seen": 402983936 }, { "epoch": 1.02, "objective/train/docs_used": 667718, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.441606283187866, "objective/train/theoretical_loss": 4.008564386013069, "objective/train/tokens_used": 423493088, "theoretical_loss": 4.008564386013069, "tokens_seen": 403033088 }, { "epoch": 1.02, "learning_rate": 0.000443370110330993, "loss": 3.2576, "theoretical_loss": 4.008546178305236, "tokens_seen": 403049472 }, { "epoch": 1.02, "learning_rate": 0.00044336008024072217, "loss": 3.2349, "theoretical_loss": 4.008473356946638, "tokens_seen": 403115008 }, { "epoch": 1.02, "learning_rate": 0.00044335005015045135, "loss": 3.2431, "theoretical_loss": 4.008400550740198, "tokens_seen": 403180544 }, { "epoch": 1.02, "learning_rate": 0.00044334002006018053, "loss": 3.1016, "theoretical_loss": 4.008327759680304, "tokens_seen": 403246080 }, { "epoch": 1.02, "learning_rate": 0.00044332998996990977, "loss": 3.1774, "theoretical_loss": 4.008254983761341, "tokens_seen": 403311616 }, { "epoch": 1.02, "learning_rate": 0.0004433199598796389, "loss": 3.2086, "theoretical_loss": 4.008182222977702, "tokens_seen": 403377152 }, { "epoch": 1.02, "learning_rate": 0.00044330992978936813, "loss": 3.0634, "theoretical_loss": 4.00810947732378, "tokens_seen": 403442688 }, { "epoch": 1.02, "learning_rate": 0.0004432998996990973, "loss": 3.2066, "theoretical_loss": 4.008036746793973, "tokens_seen": 403508224 }, { "epoch": 1.02, "learning_rate": 0.0004432898696088265, "loss": 3.2646, "theoretical_loss": 4.007964031382681, "tokens_seen": 403573760 }, { "epoch": 1.02, "learning_rate": 0.0004432798395185557, "loss": 3.0483, "theoretical_loss": 4.007891331084306, "tokens_seen": 403639296 }, { "epoch": 1.02, "learning_rate": 0.00044326980942828486, "loss": 3.0592, "theoretical_loss": 4.007818645893254, "tokens_seen": 403704832 }, { "epoch": 1.02, "learning_rate": 0.00044325977933801404, "loss": 3.1277, "theoretical_loss": 4.007745975803934, "tokens_seen": 403770368 }, { "epoch": 1.02, "learning_rate": 0.00044324974924774327, "loss": 3.2363, "theoretical_loss": 4.00767332081076, "tokens_seen": 403835904 }, { "epoch": 1.02, "learning_rate": 0.0004432397191574724, "loss": 3.1623, "theoretical_loss": 4.007600680908144, "tokens_seen": 403901440 }, { "epoch": 1.02, "learning_rate": 0.00044322968906720163, "loss": 3.1713, "theoretical_loss": 4.007528056090505, "tokens_seen": 403966976 }, { "epoch": 1.02, "learning_rate": 0.00044321965897693076, "loss": 3.1795, "theoretical_loss": 4.007455446352266, "tokens_seen": 404032512 }, { "epoch": 1.02, "learning_rate": 0.00044320962888666, "loss": 3.1872, "theoretical_loss": 4.007382851687847, "tokens_seen": 404098048 }, { "epoch": 1.02, "learning_rate": 0.0004431995987963892, "loss": 3.0736, "theoretical_loss": 4.007310272091677, "tokens_seen": 404163584 }, { "epoch": 1.02, "learning_rate": 0.00044318956870611836, "loss": 3.0984, "theoretical_loss": 4.007237707558185, "tokens_seen": 404229120 }, { "epoch": 1.02, "learning_rate": 0.00044317953861584754, "loss": 3.2079, "theoretical_loss": 4.007165158081804, "tokens_seen": 404294656 }, { "epoch": 1.02, "learning_rate": 0.0004431695085255767, "loss": 3.0979, "theoretical_loss": 4.007092623656971, "tokens_seen": 404360192 }, { "epoch": 1.02, "learning_rate": 0.0004431594784353059, "loss": 3.0986, "theoretical_loss": 4.007020104278122, "tokens_seen": 404425728 }, { "epoch": 1.02, "learning_rate": 0.00044314944834503514, "loss": 3.024, "theoretical_loss": 4.0069475999397, "tokens_seen": 404491264 }, { "epoch": 1.02, "learning_rate": 0.00044313941825476427, "loss": 3.2362, "theoretical_loss": 4.00687511063615, "tokens_seen": 404556800 }, { "epoch": 1.02, "learning_rate": 0.0004431293881644935, "loss": 2.9359, "theoretical_loss": 4.006802636361918, "tokens_seen": 404622336 }, { "epoch": 1.02, "objective/train/docs_used": 670599, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3230090141296387, "objective/train/theoretical_loss": 4.0067482905158975, "objective/train/tokens_used": 425131488, "theoretical_loss": 4.0067482905158975, "tokens_seen": 404671488 }, { "epoch": 1.02, "learning_rate": 0.0004431193580742227, "loss": 3.2055, "theoretical_loss": 4.006730177111456, "tokens_seen": 404687872 }, { "epoch": 1.02, "learning_rate": 0.00044310932798395186, "loss": 3.2107, "theoretical_loss": 4.0066577328792174, "tokens_seen": 404753408 }, { "epoch": 1.02, "learning_rate": 0.00044309929789368104, "loss": 2.997, "theoretical_loss": 4.006585303659657, "tokens_seen": 404818944 }, { "epoch": 1.02, "learning_rate": 0.0004430892678034102, "loss": 3.1885, "theoretical_loss": 4.006512889447235, "tokens_seen": 404884480 }, { "epoch": 1.02, "learning_rate": 0.0004430792377131394, "loss": 3.1671, "theoretical_loss": 4.006440490236414, "tokens_seen": 404950016 }, { "epoch": 1.02, "learning_rate": 0.00044306920762286864, "loss": 2.9834, "theoretical_loss": 4.006368106021657, "tokens_seen": 405015552 }, { "epoch": 1.02, "learning_rate": 0.00044305917753259777, "loss": 3.2602, "theoretical_loss": 4.006295736797436, "tokens_seen": 405081088 }, { "epoch": 1.02, "learning_rate": 0.000443049147442327, "loss": 3.0788, "theoretical_loss": 4.006223382558218, "tokens_seen": 405146624 }, { "epoch": 1.02, "learning_rate": 0.00044303911735205613, "loss": 3.2234, "theoretical_loss": 4.00615104329848, "tokens_seen": 405212160 }, { "epoch": 1.02, "learning_rate": 0.00044302908726178537, "loss": 3.1383, "theoretical_loss": 4.006078719012697, "tokens_seen": 405277696 }, { "epoch": 1.02, "learning_rate": 0.00044301905717151455, "loss": 3.1129, "theoretical_loss": 4.00600640969535, "tokens_seen": 405343232 }, { "epoch": 1.02, "learning_rate": 0.00044300902708124373, "loss": 3.2178, "theoretical_loss": 4.005934115340921, "tokens_seen": 405408768 }, { "epoch": 1.02, "learning_rate": 0.0004429989969909729, "loss": 3.2129, "theoretical_loss": 4.005861835943895, "tokens_seen": 405474304 }, { "epoch": 1.02, "learning_rate": 0.0004429889669007021, "loss": 3.1967, "theoretical_loss": 4.005789571498761, "tokens_seen": 405539840 }, { "epoch": 1.02, "learning_rate": 0.00044297893681043127, "loss": 3.0363, "theoretical_loss": 4.005717322000012, "tokens_seen": 405605376 }, { "epoch": 1.02, "learning_rate": 0.0004429689067201605, "loss": 3.218, "theoretical_loss": 4.005645087442142, "tokens_seen": 405670912 }, { "epoch": 1.02, "learning_rate": 0.00044295887662988963, "loss": 2.8683, "theoretical_loss": 4.005572867819646, "tokens_seen": 405736448 }, { "epoch": 1.02, "learning_rate": 0.00044294884653961887, "loss": 3.168, "theoretical_loss": 4.0055006631270285, "tokens_seen": 405801984 }, { "epoch": 1.02, "learning_rate": 0.0004429388164493481, "loss": 3.1383, "theoretical_loss": 4.005428473358788, "tokens_seen": 405867520 }, { "epoch": 1.02, "learning_rate": 0.00044292878635907723, "loss": 3.2121, "theoretical_loss": 4.005356298509433, "tokens_seen": 405933056 }, { "epoch": 1.02, "learning_rate": 0.00044291875626880647, "loss": 3.269, "theoretical_loss": 4.005284138573473, "tokens_seen": 405998592 }, { "epoch": 1.02, "learning_rate": 0.0004429087261785356, "loss": 3.2273, "theoretical_loss": 4.0052119935454185, "tokens_seen": 406064128 }, { "epoch": 1.02, "learning_rate": 0.00044289869608826483, "loss": 3.0237, "theoretical_loss": 4.005139863419785, "tokens_seen": 406129664 }, { "epoch": 1.02, "learning_rate": 0.000442888665997994, "loss": 3.2761, "theoretical_loss": 4.00506774819109, "tokens_seen": 406195200 }, { "epoch": 1.02, "learning_rate": 0.0004428786359077232, "loss": 3.1452, "theoretical_loss": 4.004995647853855, "tokens_seen": 406260736 }, { "epoch": 1.02, "objective/train/docs_used": 673420, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3555471897125244, "objective/train/theoretical_loss": 4.004941582370154, "objective/train/tokens_used": 426769888, "theoretical_loss": 4.004941582370154, "tokens_seen": 406309888 }, { "epoch": 1.02, "learning_rate": 0.00044286860581745237, "loss": 3.0677, "theoretical_loss": 4.0049235624026025, "tokens_seen": 406326272 }, { "epoch": 1.02, "learning_rate": 0.00044285857572718155, "loss": 3.1551, "theoretical_loss": 4.004851491831859, "tokens_seen": 406391808 }, { "epoch": 1.02, "learning_rate": 0.00044284854563691073, "loss": 3.2058, "theoretical_loss": 4.004779436136154, "tokens_seen": 406457344 }, { "epoch": 1.02, "learning_rate": 0.00044283851554663997, "loss": 3.263, "theoretical_loss": 4.004707395310019, "tokens_seen": 406522880 }, { "epoch": 1.02, "learning_rate": 0.0004428284854563691, "loss": 3.1822, "theoretical_loss": 4.004635369347991, "tokens_seen": 406588416 }, { "epoch": 1.02, "learning_rate": 0.00044281845536609833, "loss": 3.2681, "theoretical_loss": 4.0045633582446065, "tokens_seen": 406653952 }, { "epoch": 1.02, "learning_rate": 0.0004428084252758275, "loss": 3.2651, "theoretical_loss": 4.004491361994406, "tokens_seen": 406719488 }, { "epoch": 1.02, "learning_rate": 0.0004427983951855567, "loss": 3.1931, "theoretical_loss": 4.0044193805919335, "tokens_seen": 406785024 }, { "epoch": 1.02, "learning_rate": 0.0004427883650952859, "loss": 3.1803, "theoretical_loss": 4.004347414031736, "tokens_seen": 406850560 }, { "epoch": 1.02, "learning_rate": 0.00044277833500501506, "loss": 3.1794, "theoretical_loss": 4.004275462308364, "tokens_seen": 406916096 }, { "epoch": 1.02, "learning_rate": 0.00044276830491474424, "loss": 3.2176, "theoretical_loss": 4.004203525416369, "tokens_seen": 406981632 }, { "epoch": 1.02, "learning_rate": 0.00044275827482447347, "loss": 3.0228, "theoretical_loss": 4.004131603350305, "tokens_seen": 407047168 }, { "epoch": 1.02, "learning_rate": 0.0004427482447342026, "loss": 3.0775, "theoretical_loss": 4.004059696104732, "tokens_seen": 407112704 }, { "epoch": 1.02, "learning_rate": 0.00044273821464393184, "loss": 3.2313, "theoretical_loss": 4.003987803674209, "tokens_seen": 407178240 }, { "epoch": 1.02, "learning_rate": 0.00044272818455366096, "loss": 3.0529, "theoretical_loss": 4.003915926053303, "tokens_seen": 407243776 }, { "epoch": 1.02, "learning_rate": 0.0004427181544633902, "loss": 3.2632, "theoretical_loss": 4.003844063236578, "tokens_seen": 407309312 }, { "epoch": 1.02, "learning_rate": 0.0004427081243731194, "loss": 3.2198, "theoretical_loss": 4.003772215218604, "tokens_seen": 407374848 }, { "epoch": 1.02, "learning_rate": 0.00044269809428284856, "loss": 3.2137, "theoretical_loss": 4.003700381993955, "tokens_seen": 407440384 }, { "epoch": 1.02, "learning_rate": 0.00044268806419257774, "loss": 3.2018, "theoretical_loss": 4.003628563557205, "tokens_seen": 407505920 }, { "epoch": 1.02, "learning_rate": 0.0004426780341023069, "loss": 3.0985, "theoretical_loss": 4.003556759902933, "tokens_seen": 407571456 }, { "epoch": 1.02, "learning_rate": 0.0004426680040120361, "loss": 3.0047, "theoretical_loss": 4.00348497102572, "tokens_seen": 407636992 }, { "epoch": 1.02, "learning_rate": 0.00044265797392176534, "loss": 3.4497, "theoretical_loss": 4.003413196920148, "tokens_seen": 407702528 }, { "epoch": 1.02, "learning_rate": 0.00044264794383149447, "loss": 3.0903, "theoretical_loss": 4.003341437580806, "tokens_seen": 407768064 }, { "epoch": 1.02, "learning_rate": 0.0004426379137412237, "loss": 3.0692, "theoretical_loss": 4.0032696930022835, "tokens_seen": 407833600 }, { "epoch": 1.02, "learning_rate": 0.0004426278836509529, "loss": 3.107, "theoretical_loss": 4.003197963179172, "tokens_seen": 407899136 }, { "epoch": 1.02, "objective/train/docs_used": 674928, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.161184787750244, "objective/train/theoretical_loss": 4.003144175491826, "objective/train/tokens_used": 428408288, "theoretical_loss": 4.003144175491826, "tokens_seen": 407948288 }, { "epoch": 1.02, "learning_rate": 0.00044261785356068206, "loss": 3.1824, "theoretical_loss": 4.003126248106068, "tokens_seen": 407964672 }, { "epoch": 1.02, "learning_rate": 0.00044260782347041124, "loss": 3.1982, "theoretical_loss": 4.003054547777569, "tokens_seen": 408030208 }, { "epoch": 1.02, "learning_rate": 0.0004425977933801404, "loss": 3.0532, "theoretical_loss": 4.002982862188276, "tokens_seen": 408095744 }, { "epoch": 1.02, "learning_rate": 0.0004425877632898696, "loss": 3.1955, "theoretical_loss": 4.002911191332792, "tokens_seen": 408161280 }, { "epoch": 1.02, "learning_rate": 0.00044257773319959884, "loss": 3.026, "theoretical_loss": 4.002839535205725, "tokens_seen": 408226816 }, { "epoch": 1.02, "learning_rate": 0.00044256770310932797, "loss": 3.0578, "theoretical_loss": 4.002767893801685, "tokens_seen": 408292352 }, { "epoch": 1.02, "learning_rate": 0.0004425576730190572, "loss": 3.0882, "theoretical_loss": 4.002696267115282, "tokens_seen": 408357888 }, { "epoch": 1.02, "learning_rate": 0.00044254764292878633, "loss": 3.2744, "theoretical_loss": 4.002624655141134, "tokens_seen": 408423424 }, { "epoch": 1.02, "learning_rate": 0.00044253761283851557, "loss": 3.0509, "theoretical_loss": 4.0025530578738575, "tokens_seen": 408488960 }, { "epoch": 1.02, "learning_rate": 0.00044252758274824475, "loss": 3.2021, "theoretical_loss": 4.002481475308074, "tokens_seen": 408554496 }, { "epoch": 1.02, "learning_rate": 0.00044251755265797393, "loss": 3.1834, "theoretical_loss": 4.002409907438407, "tokens_seen": 408620032 }, { "epoch": 1.02, "learning_rate": 0.0004425075225677031, "loss": 3.2989, "theoretical_loss": 4.002338354259483, "tokens_seen": 408685568 }, { "epoch": 1.02, "learning_rate": 0.0004424974924774323, "loss": 3.2241, "theoretical_loss": 4.002266815765931, "tokens_seen": 408751104 }, { "epoch": 1.02, "learning_rate": 0.00044248746238716147, "loss": 3.1832, "theoretical_loss": 4.002195291952384, "tokens_seen": 408816640 }, { "epoch": 1.02, "learning_rate": 0.0004424774322968907, "loss": 3.2157, "theoretical_loss": 4.002123782813476, "tokens_seen": 408882176 }, { "epoch": 1.02, "learning_rate": 0.00044246740220661983, "loss": 3.0849, "theoretical_loss": 4.0020522883438465, "tokens_seen": 408947712 }, { "epoch": 1.02, "learning_rate": 0.00044245737211634907, "loss": 3.2961, "theoretical_loss": 4.001980808538135, "tokens_seen": 409013248 }, { "epoch": 1.02, "learning_rate": 0.00044244734202607825, "loss": 3.1781, "theoretical_loss": 4.0019093433909845, "tokens_seen": 409078784 }, { "epoch": 1.02, "learning_rate": 0.00044243731193580743, "loss": 3.0967, "theoretical_loss": 4.001837892897042, "tokens_seen": 409144320 }, { "epoch": 1.02, "learning_rate": 0.0004424272818455366, "loss": 3.1355, "theoretical_loss": 4.001766457050957, "tokens_seen": 409209856 }, { "epoch": 1.02, "learning_rate": 0.0004424172517552658, "loss": 3.1921, "theoretical_loss": 4.00169503584738, "tokens_seen": 409275392 }, { "epoch": 1.02, "learning_rate": 0.000442407221664995, "loss": 3.1715, "theoretical_loss": 4.001623629280967, "tokens_seen": 409340928 }, { "epoch": 1.02, "learning_rate": 0.0004423971915747242, "loss": 3.0265, "theoretical_loss": 4.001552237346376, "tokens_seen": 409406464 }, { "epoch": 1.02, "learning_rate": 0.00044238716148445334, "loss": 3.2878, "theoretical_loss": 4.001480860038265, "tokens_seen": 409472000 }, { "epoch": 1.02, "learning_rate": 0.00044237713139418257, "loss": 3.0034, "theoretical_loss": 4.0014094973513, "tokens_seen": 409537536 }, { "epoch": 1.02, "objective/train/docs_used": 678633, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.349667549133301, "objective/train/theoretical_loss": 4.001355984927994, "objective/train/tokens_used": 430046688, "theoretical_loss": 4.001355984927994, "tokens_seen": 409586688 }, { "epoch": 1.02, "learning_rate": 0.0004423671013039117, "loss": 3.2148, "theoretical_loss": 4.001338149280146, "tokens_seen": 409603072 }, { "epoch": 1.02, "learning_rate": 0.00044235707121364093, "loss": 3.137, "theoretical_loss": 4.00126681581947, "tokens_seen": 409668608 }, { "epoch": 1.02, "learning_rate": 0.0004423470411233701, "loss": 3.1011, "theoretical_loss": 4.001195496963946, "tokens_seen": 409734144 }, { "epoch": 1.02, "learning_rate": 0.0004423370110330993, "loss": 3.2545, "theoretical_loss": 4.001124192708247, "tokens_seen": 409799680 }, { "epoch": 1.02, "learning_rate": 0.0004423269809428285, "loss": 3.3253, "theoretical_loss": 4.001052903047049, "tokens_seen": 409865216 }, { "epoch": 1.02, "learning_rate": 0.0004423169508525577, "loss": 3.113, "theoretical_loss": 4.000981627975034, "tokens_seen": 409930752 }, { "epoch": 1.02, "learning_rate": 0.00044230692076228684, "loss": 3.3433, "theoretical_loss": 4.000910367486885, "tokens_seen": 409996288 }, { "epoch": 1.02, "learning_rate": 0.0004422968906720161, "loss": 3.1803, "theoretical_loss": 4.000839121577285, "tokens_seen": 410061824 }, { "epoch": 1.02, "learning_rate": 0.0004422868605817452, "loss": 3.1116, "theoretical_loss": 4.000767890240924, "tokens_seen": 410127360 }, { "epoch": 1.02, "learning_rate": 0.00044227683049147444, "loss": 2.9406, "theoretical_loss": 4.000696673472493, "tokens_seen": 410192896 }, { "epoch": 1.02, "learning_rate": 0.0004422668004012036, "loss": 3.1632, "theoretical_loss": 4.0006254712666856, "tokens_seen": 410258432 }, { "epoch": 1.02, "learning_rate": 0.0004422567703109328, "loss": 3.1464, "theoretical_loss": 4.000554283618198, "tokens_seen": 410323968 }, { "epoch": 1.02, "learning_rate": 0.000442246740220662, "loss": 3.0979, "theoretical_loss": 4.000483110521731, "tokens_seen": 410389504 }, { "epoch": 1.02, "learning_rate": 0.00044223671013039116, "loss": 3.2003, "theoretical_loss": 4.000411951971985, "tokens_seen": 410455040 }, { "epoch": 1.02, "learning_rate": 0.00044222668004012034, "loss": 3.2259, "theoretical_loss": 4.000340807963666, "tokens_seen": 410520576 }, { "epoch": 1.02, "learning_rate": 0.0004422166499498496, "loss": 3.3083, "theoretical_loss": 4.000269678491482, "tokens_seen": 410586112 }, { "epoch": 1.02, "learning_rate": 0.0004422066198595787, "loss": 2.9151, "theoretical_loss": 4.000198563550143, "tokens_seen": 410651648 }, { "epoch": 1.02, "learning_rate": 0.00044219658976930794, "loss": 3.1303, "theoretical_loss": 4.000127463134361, "tokens_seen": 410717184 }, { "epoch": 1.02, "learning_rate": 0.0004421865596790371, "loss": 3.2767, "theoretical_loss": 4.000056377238854, "tokens_seen": 410782720 }, { "epoch": 1.02, "learning_rate": 0.0004421765295887663, "loss": 3.1408, "theoretical_loss": 3.99998530585834, "tokens_seen": 410848256 }, { "epoch": 1.02, "learning_rate": 0.00044216649949849554, "loss": 3.2617, "theoretical_loss": 3.999914248987541, "tokens_seen": 410913792 }, { "epoch": 1.02, "learning_rate": 0.00044215646940822467, "loss": 3.1651, "theoretical_loss": 3.999843206621181, "tokens_seen": 410979328 }, { "epoch": 1.02, "learning_rate": 0.0004421464393179539, "loss": 3.1085, "theoretical_loss": 3.999772178753987, "tokens_seen": 411044864 }, { "epoch": 1.02, "learning_rate": 0.0004421364092276831, "loss": 3.163, "theoretical_loss": 3.999701165380688, "tokens_seen": 411110400 }, { "epoch": 1.02, "learning_rate": 0.00044212637913741226, "loss": 3.2184, "theoretical_loss": 3.9996301664960185, "tokens_seen": 411175936 }, { "epoch": 1.02, "objective/train/docs_used": 679975, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.998873472213745, "objective/train/theoretical_loss": 3.999576926837511, "objective/train/tokens_used": 431685088, "theoretical_loss": 3.999576926837511, "tokens_seen": 411225088 }, { "epoch": 1.02, "learning_rate": 0.00044211634904714144, "loss": 2.998, "theoretical_loss": 3.9995591820947123, "tokens_seen": 411241472 }, { "epoch": 1.02, "learning_rate": 0.0004421063189568706, "loss": 3.1675, "theoretical_loss": 3.9994882121715083, "tokens_seen": 411307008 }, { "epoch": 1.02, "learning_rate": 0.0004420962888665998, "loss": 3.3106, "theoretical_loss": 3.999417256721147, "tokens_seen": 411372544 }, { "epoch": 1.02, "learning_rate": 0.00044208625877632904, "loss": 3.128, "theoretical_loss": 3.9993463157383715, "tokens_seen": 411438080 }, { "epoch": 1.02, "learning_rate": 0.00044207622868605817, "loss": 3.1528, "theoretical_loss": 3.999275389217929, "tokens_seen": 411503616 }, { "epoch": 1.02, "learning_rate": 0.0004420661985957874, "loss": 3.2449, "theoretical_loss": 3.999204477154568, "tokens_seen": 411569152 }, { "epoch": 1.02, "learning_rate": 0.00044205616850551653, "loss": 3.1256, "theoretical_loss": 3.9991335795430407, "tokens_seen": 411634688 }, { "epoch": 1.02, "learning_rate": 0.00044204613841524577, "loss": 3.0463, "theoretical_loss": 3.9990626963781017, "tokens_seen": 411700224 }, { "epoch": 1.02, "learning_rate": 0.00044203610832497495, "loss": 3.1384, "theoretical_loss": 3.998991827654508, "tokens_seen": 411765760 }, { "epoch": 1.02, "learning_rate": 0.00044202607823470413, "loss": 3.2745, "theoretical_loss": 3.99892097336702, "tokens_seen": 411831296 }, { "epoch": 1.02, "learning_rate": 0.0004420160481444333, "loss": 3.2424, "theoretical_loss": 3.9988501335104, "tokens_seen": 411896832 }, { "epoch": 1.02, "learning_rate": 0.0004420060180541625, "loss": 3.0784, "theoretical_loss": 3.9987793080794134, "tokens_seen": 411962368 }, { "epoch": 1.02, "learning_rate": 0.00044199598796389167, "loss": 3.2675, "theoretical_loss": 3.9987084970688294, "tokens_seen": 412027904 }, { "epoch": 1.02, "learning_rate": 0.0004419859578736209, "loss": 3.1045, "theoretical_loss": 3.9986377004734184, "tokens_seen": 412093440 }, { "epoch": 1.02, "learning_rate": 0.00044197592778335003, "loss": 3.1295, "theoretical_loss": 3.9985669182879535, "tokens_seen": 412158976 }, { "epoch": 1.02, "learning_rate": 0.00044196589769307927, "loss": 3.1375, "theoretical_loss": 3.9984961505072123, "tokens_seen": 412224512 }, { "epoch": 1.02, "learning_rate": 0.00044195586760280845, "loss": 3.0849, "theoretical_loss": 3.998425397125973, "tokens_seen": 412290048 }, { "epoch": 1.02, "learning_rate": 0.00044194583751253763, "loss": 3.1641, "theoretical_loss": 3.998354658139018, "tokens_seen": 412355584 }, { "epoch": 1.02, "learning_rate": 0.0004419358074222668, "loss": 3.0879, "theoretical_loss": 3.9982839335411313, "tokens_seen": 412421120 }, { "epoch": 1.02, "learning_rate": 0.000441925777331996, "loss": 3.2206, "theoretical_loss": 3.9982132233271006, "tokens_seen": 412486656 }, { "epoch": 1.02, "learning_rate": 0.0004419157472417252, "loss": 3.1495, "theoretical_loss": 3.9981425274917166, "tokens_seen": 412552192 }, { "epoch": 1.02, "learning_rate": 0.0004419057171514544, "loss": 3.1057, "theoretical_loss": 3.998071846029771, "tokens_seen": 412617728 }, { "epoch": 1.02, "learning_rate": 0.00044189568706118354, "loss": 3.253, "theoretical_loss": 3.998001178936059, "tokens_seen": 412683264 }, { "epoch": 1.02, "learning_rate": 0.00044188565697091277, "loss": 3.1884, "theoretical_loss": 3.99793052620538, "tokens_seen": 412748800 }, { "epoch": 1.02, "learning_rate": 0.0004418756268806419, "loss": 3.1416, "theoretical_loss": 3.9978598878325338, "tokens_seen": 412814336 }, { "epoch": 1.02, "objective/train/docs_used": 682814, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.243901014328003, "objective/train/theoretical_loss": 3.9978069184721012, "objective/train/tokens_used": 433323488, "theoretical_loss": 3.9978069184721012, "tokens_seen": 412863488 }, { "epoch": 1.02, "learning_rate": 0.00044186559679037113, "loss": 3.2848, "theoretical_loss": 3.997789263812325, "tokens_seen": 412879872 }, { "epoch": 1.02, "learning_rate": 0.0004418555667001003, "loss": 3.3098, "theoretical_loss": 3.9977186541395584, "tokens_seen": 412945408 }, { "epoch": 1.02, "learning_rate": 0.0004418455366098295, "loss": 3.0319, "theoretical_loss": 3.9976480588090446, "tokens_seen": 413010944 }, { "epoch": 1.02, "learning_rate": 0.0004418355065195587, "loss": 3.1171, "theoretical_loss": 3.997577477815594, "tokens_seen": 413076480 }, { "epoch": 1.02, "learning_rate": 0.0004418254764292879, "loss": 3.1089, "theoretical_loss": 3.997506911154022, "tokens_seen": 413142016 }, { "epoch": 1.02, "learning_rate": 0.00044181544633901704, "loss": 3.1321, "theoretical_loss": 3.9974363588191446, "tokens_seen": 413207552 }, { "epoch": 1.02, "learning_rate": 0.0004418054162487463, "loss": 3.2199, "theoretical_loss": 3.9973658208057827, "tokens_seen": 413273088 }, { "epoch": 1.02, "learning_rate": 0.0004417953861584754, "loss": 3.0403, "theoretical_loss": 3.997295297108758, "tokens_seen": 413338624 }, { "epoch": 1.02, "learning_rate": 0.00044178535606820464, "loss": 3.1421, "theoretical_loss": 3.997224787722896, "tokens_seen": 413404160 }, { "epoch": 1.02, "learning_rate": 0.0004417753259779338, "loss": 3.0815, "theoretical_loss": 3.9971542926430246, "tokens_seen": 413469696 }, { "epoch": 1.02, "learning_rate": 0.000441765295887663, "loss": 3.1736, "theoretical_loss": 3.9970838118639733, "tokens_seen": 413535232 }, { "epoch": 1.02, "learning_rate": 0.0004417552657973922, "loss": 3.1421, "theoretical_loss": 3.9970133453805774, "tokens_seen": 413600768 }, { "epoch": 1.02, "learning_rate": 0.00044174523570712136, "loss": 3.0979, "theoretical_loss": 3.996942893187671, "tokens_seen": 413666304 }, { "epoch": 1.02, "learning_rate": 0.00044173520561685054, "loss": 3.192, "theoretical_loss": 3.9968724552800934, "tokens_seen": 413731840 }, { "epoch": 1.02, "learning_rate": 0.0004417251755265798, "loss": 3.1487, "theoretical_loss": 3.9968020316526855, "tokens_seen": 413797376 }, { "epoch": 1.02, "learning_rate": 0.0004417151454363089, "loss": 3.1742, "theoretical_loss": 3.9967316223002918, "tokens_seen": 413862912 }, { "epoch": 1.02, "learning_rate": 0.00044170511534603814, "loss": 3.1394, "theoretical_loss": 3.9966612272177593, "tokens_seen": 413928448 }, { "epoch": 1.02, "learning_rate": 0.00044169508525576727, "loss": 3.0694, "theoretical_loss": 3.9965908463999362, "tokens_seen": 413993984 }, { "epoch": 1.02, "learning_rate": 0.0004416850551654965, "loss": 3.1334, "theoretical_loss": 3.996520479841675, "tokens_seen": 414059520 }, { "epoch": 1.02, "learning_rate": 0.0004416750250752257, "loss": 2.9864, "theoretical_loss": 3.9964501275378304, "tokens_seen": 414125056 }, { "epoch": 1.02, "learning_rate": 0.00044166499498495487, "loss": 3.0386, "theoretical_loss": 3.9963797894832602, "tokens_seen": 414190592 }, { "epoch": 1.02, "learning_rate": 0.00044165496489468405, "loss": 3.2317, "theoretical_loss": 3.9963094656728235, "tokens_seen": 414256128 }, { "epoch": 1.02, "learning_rate": 0.0004416449348044133, "loss": 3.3034, "theoretical_loss": 3.9962391561013826, "tokens_seen": 414321664 }, { "epoch": 1.02, "learning_rate": 0.0004416349047141424, "loss": 3.0092, "theoretical_loss": 3.996168860763805, "tokens_seen": 414387200 }, { "epoch": 1.02, "learning_rate": 0.00044162487462387164, "loss": 3.1165, "theoretical_loss": 3.9960985796549564, "tokens_seen": 414452736 }, { "epoch": 1.02, "objective/train/docs_used": 685529, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.978787660598755, "objective/train/theoretical_loss": 3.9960458781578385, "objective/train/tokens_used": 434961888, "theoretical_loss": 3.9960458781578385, "tokens_seen": 414501888 }, { "epoch": 1.02, "learning_rate": 0.00044161484453360077, "loss": 2.9954, "theoretical_loss": 3.9960283127697087, "tokens_seen": 414518272 }, { "epoch": 1.02, "learning_rate": 0.00044160481444333, "loss": 3.0338, "theoretical_loss": 3.9959580601029345, "tokens_seen": 414583808 }, { "epoch": 1.02, "learning_rate": 0.0004415947843530592, "loss": 3.2726, "theoretical_loss": 3.99588782164951, "tokens_seen": 414649344 }, { "epoch": 1.02, "learning_rate": 0.00044158475426278837, "loss": 3.0841, "theoretical_loss": 3.9958175974043146, "tokens_seen": 414714880 }, { "epoch": 1.02, "learning_rate": 0.00044157472417251755, "loss": 3.1591, "theoretical_loss": 3.9957473873622287, "tokens_seen": 414780416 }, { "epoch": 1.02, "learning_rate": 0.00044156469408224673, "loss": 3.2047, "theoretical_loss": 3.995677191518136, "tokens_seen": 414845952 }, { "epoch": 1.02, "learning_rate": 0.0004415546639919759, "loss": 3.1431, "theoretical_loss": 3.9956070098669243, "tokens_seen": 414911488 }, { "epoch": 1.02, "learning_rate": 0.00044154463390170515, "loss": 3.236, "theoretical_loss": 3.9955368424034816, "tokens_seen": 414977024 }, { "epoch": 1.02, "learning_rate": 0.0004415346038114343, "loss": 3.1339, "theoretical_loss": 3.9954666891227, "tokens_seen": 415042560 }, { "epoch": 1.02, "learning_rate": 0.0004415245737211635, "loss": 3.2363, "theoretical_loss": 3.995396550019475, "tokens_seen": 415108096 }, { "epoch": 1.02, "learning_rate": 0.00044151454363089264, "loss": 3.0702, "theoretical_loss": 3.995326425088703, "tokens_seen": 415173632 }, { "epoch": 1.02, "learning_rate": 0.00044150451354062187, "loss": 3.1602, "theoretical_loss": 3.9952563143252835, "tokens_seen": 415239168 }, { "epoch": 1.02, "learning_rate": 0.00044149448345035105, "loss": 3.2896, "theoretical_loss": 3.995186217724119, "tokens_seen": 415304704 }, { "epoch": 1.02, "learning_rate": 0.00044148445336008023, "loss": 3.1839, "theoretical_loss": 3.9951161352801154, "tokens_seen": 415370240 }, { "epoch": 1.02, "learning_rate": 0.0004414744232698094, "loss": 3.3364, "theoretical_loss": 3.99504606698818, "tokens_seen": 415435776 }, { "epoch": 1.02, "learning_rate": 0.00044146439317953865, "loss": 3.1019, "theoretical_loss": 3.9949760128432232, "tokens_seen": 415501312 }, { "epoch": 1.02, "learning_rate": 0.0004414543630892678, "loss": 3.2287, "theoretical_loss": 3.994905972840158, "tokens_seen": 415566848 }, { "epoch": 1.02, "learning_rate": 0.000441444332998997, "loss": 3.178, "theoretical_loss": 3.9948359469738994, "tokens_seen": 415632384 }, { "epoch": 1.02, "learning_rate": 0.0004414343029087262, "loss": 3.1397, "theoretical_loss": 3.994765935239367, "tokens_seen": 415697920 }, { "epoch": 1.02, "learning_rate": 0.0004414242728184554, "loss": 3.1414, "theoretical_loss": 3.9946959376314797, "tokens_seen": 415763456 }, { "epoch": 1.02, "learning_rate": 0.0004414142427281846, "loss": 3.1424, "theoretical_loss": 3.994625954145163, "tokens_seen": 415828992 }, { "epoch": 1.02, "learning_rate": 0.00044140421263791374, "loss": 2.9848, "theoretical_loss": 3.9945559847753422, "tokens_seen": 415894528 }, { "epoch": 1.02, "learning_rate": 0.00044139418254764297, "loss": 3.3037, "theoretical_loss": 3.9944860295169455, "tokens_seen": 415960064 }, { "epoch": 1.02, "learning_rate": 0.0004413841524573721, "loss": 3.0974, "theoretical_loss": 3.9944160883649054, "tokens_seen": 416025600 }, { "epoch": 1.02, "learning_rate": 0.00044137412236710133, "loss": 3.2604, "theoretical_loss": 3.994346161314155, "tokens_seen": 416091136 }, { "epoch": 1.02, "objective/train/docs_used": 688429, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0733585357666016, "objective/train/theoretical_loss": 3.994293725277018, "objective/train/tokens_used": 436600288, "theoretical_loss": 3.994293725277018, "tokens_seen": 416140288 }, { "epoch": 1.02, "learning_rate": 0.0004413640922768305, "loss": 3.0635, "theoretical_loss": 3.9942762483596312, "tokens_seen": 416156672 }, { "epoch": 1.02, "learning_rate": 0.0004413540621865597, "loss": 3.1777, "theoretical_loss": 3.994206349496274, "tokens_seen": 416222208 }, { "epoch": 1.02, "learning_rate": 0.0004413440320962889, "loss": 3.0123, "theoretical_loss": 3.9941364647190234, "tokens_seen": 416287744 }, { "epoch": 1.02, "learning_rate": 0.0004413340020060181, "loss": 3.0035, "theoretical_loss": 3.9940665940228257, "tokens_seen": 416353280 }, { "epoch": 1.02, "learning_rate": 0.00044132397191574724, "loss": 3.1437, "theoretical_loss": 3.993996737402627, "tokens_seen": 416418816 }, { "epoch": 1.02, "learning_rate": 0.0004413139418254765, "loss": 2.9565, "theoretical_loss": 3.9939268948533773, "tokens_seen": 416484352 }, { "epoch": 1.02, "learning_rate": 0.0004413039117352056, "loss": 3.132, "theoretical_loss": 3.9938570663700284, "tokens_seen": 416549888 }, { "epoch": 1.02, "learning_rate": 0.00044129388164493484, "loss": 3.1392, "theoretical_loss": 3.993787251947536, "tokens_seen": 416615424 }, { "epoch": 1.02, "learning_rate": 0.000441283851554664, "loss": 3.0798, "theoretical_loss": 3.9937174515808564, "tokens_seen": 416680960 }, { "epoch": 1.02, "learning_rate": 0.0004412738214643932, "loss": 3.263, "theoretical_loss": 3.993647665264951, "tokens_seen": 416746496 }, { "epoch": 1.02, "learning_rate": 0.0004412637913741224, "loss": 3.1196, "theoretical_loss": 3.9935778929947814, "tokens_seen": 416812032 }, { "epoch": 1.02, "learning_rate": 0.00044125376128385156, "loss": 3.354, "theoretical_loss": 3.993508134765314, "tokens_seen": 416877568 }, { "epoch": 1.02, "learning_rate": 0.00044124373119358074, "loss": 3.1284, "theoretical_loss": 3.9934383905715154, "tokens_seen": 416943104 }, { "epoch": 1.02, "learning_rate": 0.00044123370110331, "loss": 2.988, "theoretical_loss": 3.9933686604083576, "tokens_seen": 417008640 }, { "epoch": 1.02, "learning_rate": 0.0004412236710130391, "loss": 3.1118, "theoretical_loss": 3.993298944270812, "tokens_seen": 417074176 }, { "epoch": 1.02, "learning_rate": 0.00044121364092276834, "loss": 3.124, "theoretical_loss": 3.993229242153855, "tokens_seen": 417139712 }, { "epoch": 1.02, "learning_rate": 0.00044120361083249747, "loss": 3.1149, "theoretical_loss": 3.993159554052465, "tokens_seen": 417205248 }, { "epoch": 1.02, "learning_rate": 0.0004411935807422267, "loss": 3.1666, "theoretical_loss": 3.993089879961623, "tokens_seen": 417270784 }, { "epoch": 1.02, "learning_rate": 0.0004411835506519559, "loss": 3.2613, "theoretical_loss": 3.9930202198763114, "tokens_seen": 417336320 }, { "epoch": 1.02, "learning_rate": 0.00044117352056168507, "loss": 3.1762, "theoretical_loss": 3.992950573791518, "tokens_seen": 417401856 }, { "epoch": 1.02, "learning_rate": 0.00044116349047141425, "loss": 3.0867, "theoretical_loss": 3.9928809417022295, "tokens_seen": 417467392 }, { "epoch": 1.02, "learning_rate": 0.0004411534603811435, "loss": 3.0774, "theoretical_loss": 3.992811323603438, "tokens_seen": 417532928 }, { "epoch": 1.02, "learning_rate": 0.0004411434302908726, "loss": 3.0896, "theoretical_loss": 3.992741719490137, "tokens_seen": 417598464 }, { "epoch": 1.02, "learning_rate": 0.00044113340020060184, "loss": 3.1982, "theoretical_loss": 3.992672129357323, "tokens_seen": 417664000 }, { "epoch": 1.02, "learning_rate": 0.00044112337011033097, "loss": 3.2352, "theoretical_loss": 3.9926025531999945, "tokens_seen": 417729536 }, { "epoch": 1.02, "objective/train/docs_used": 690993, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2612905502319336, "objective/train/theoretical_loss": 3.992550380250404, "objective/train/tokens_used": 438238688, "theoretical_loss": 3.992550380250404, "tokens_seen": 417778688 }, { "epoch": 1.02, "learning_rate": 0.0004411133400200602, "loss": 3.128, "theoretical_loss": 3.9925329910131535, "tokens_seen": 417795072 }, { "epoch": 1.02, "learning_rate": 0.0004411033099297894, "loss": 3.1155, "theoretical_loss": 3.992463442791804, "tokens_seen": 417860608 }, { "epoch": 1.02, "learning_rate": 0.00044109327983951857, "loss": 3.0679, "theoretical_loss": 3.9923939085309517, "tokens_seen": 417926144 }, { "epoch": 1.02, "learning_rate": 0.00044108324974924775, "loss": 3.1543, "theoretical_loss": 3.992324388225607, "tokens_seen": 417991680 }, { "epoch": 1.02, "learning_rate": 0.00044107321965897693, "loss": 3.0113, "theoretical_loss": 3.992254881870781, "tokens_seen": 418057216 }, { "epoch": 1.02, "learning_rate": 0.0004410631895687061, "loss": 3.1952, "theoretical_loss": 3.9921853894614885, "tokens_seen": 418122752 }, { "epoch": 1.02, "learning_rate": 0.00044105315947843535, "loss": 3.1021, "theoretical_loss": 3.9921159109927453, "tokens_seen": 418188288 }, { "epoch": 1.02, "learning_rate": 0.0004410431293881645, "loss": 2.9445, "theoretical_loss": 3.9920464464595717, "tokens_seen": 418253824 }, { "epoch": 1.02, "learning_rate": 0.0004410330992978937, "loss": 3.2122, "theoretical_loss": 3.99197699585699, "tokens_seen": 418319360 }, { "epoch": 1.02, "learning_rate": 0.00044102306920762284, "loss": 3.2492, "theoretical_loss": 3.9919075591800235, "tokens_seen": 418384896 }, { "epoch": 1.02, "learning_rate": 0.00044101303911735207, "loss": 3.2392, "theoretical_loss": 3.9918381364237003, "tokens_seen": 418450432 }, { "epoch": 1.02, "learning_rate": 0.00044100300902708125, "loss": 3.1782, "theoretical_loss": 3.9917687275830493, "tokens_seen": 418515968 }, { "epoch": 1.02, "learning_rate": 0.00044099297893681043, "loss": 2.9854, "theoretical_loss": 3.991699332653104, "tokens_seen": 418581504 }, { "epoch": 1.02, "learning_rate": 0.0004409829488465396, "loss": 3.058, "theoretical_loss": 3.991629951628898, "tokens_seen": 418647040 }, { "epoch": 1.02, "learning_rate": 0.00044097291875626885, "loss": 3.1785, "theoretical_loss": 3.991560584505469, "tokens_seen": 418712576 }, { "epoch": 1.02, "learning_rate": 0.000440962888665998, "loss": 2.9504, "theoretical_loss": 3.9914912312778568, "tokens_seen": 418778112 }, { "epoch": 1.02, "learning_rate": 0.0004409528585757272, "loss": 3.2487, "theoretical_loss": 3.9914218919411035, "tokens_seen": 418843648 }, { "epoch": 1.02, "learning_rate": 0.00044094282848545634, "loss": 3.1567, "theoretical_loss": 3.9913525664902547, "tokens_seen": 418909184 }, { "epoch": 1.02, "learning_rate": 0.0004409327983951856, "loss": 3.1663, "theoretical_loss": 3.991283254920358, "tokens_seen": 418974720 }, { "epoch": 1.02, "learning_rate": 0.00044092276830491476, "loss": 3.195, "theoretical_loss": 3.9912139572264618, "tokens_seen": 419040256 }, { "epoch": 1.02, "learning_rate": 0.00044091273821464394, "loss": 3.0917, "theoretical_loss": 3.9911446734036207, "tokens_seen": 419105792 }, { "epoch": 1.02, "learning_rate": 0.0004409027081243731, "loss": 3.1884, "theoretical_loss": 3.9910754034468887, "tokens_seen": 419171328 }, { "epoch": 1.02, "learning_rate": 0.0004408926780341023, "loss": 3.1763, "theoretical_loss": 3.9910061473513236, "tokens_seen": 419236864 }, { "epoch": 1.02, "learning_rate": 0.0004408826479438315, "loss": 3.1159, "theoretical_loss": 3.990936905111986, "tokens_seen": 419302400 }, { "epoch": 1.02, "learning_rate": 0.0004408726178535607, "loss": 3.2048, "theoretical_loss": 3.990867676723938, "tokens_seen": 419367936 }, { "epoch": 1.02, "objective/train/docs_used": 693733, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1483314037323, "objective/train/theoretical_loss": 3.9908157645198425, "objective/train/tokens_used": 439877088, "theoretical_loss": 3.9908157645198425, "tokens_seen": 419417088 }, { "epoch": 1.02, "learning_rate": 0.00044086258776328984, "loss": 3.2121, "theoretical_loss": 3.990798462182245, "tokens_seen": 419433472 }, { "epoch": 1.02, "learning_rate": 0.0004408525576730191, "loss": 3.2749, "theoretical_loss": 3.990729261481975, "tokens_seen": 419499008 }, { "epoch": 1.02, "learning_rate": 0.0004408425275827482, "loss": 3.0562, "theoretical_loss": 3.9906600746181984, "tokens_seen": 419564544 }, { "epoch": 1.02, "learning_rate": 0.00044083249749247744, "loss": 3.2224, "theoretical_loss": 3.9905909015859873, "tokens_seen": 419630080 }, { "epoch": 1.02, "learning_rate": 0.0004408224674022066, "loss": 3.1345, "theoretical_loss": 3.990521742380418, "tokens_seen": 419695616 }, { "epoch": 1.02, "learning_rate": 0.0004408124373119358, "loss": 3.1249, "theoretical_loss": 3.990452596996567, "tokens_seen": 419761152 }, { "epoch": 1.02, "learning_rate": 0.000440802407221665, "loss": 3.2106, "theoretical_loss": 3.9903834654295167, "tokens_seen": 419826688 }, { "epoch": 1.02, "learning_rate": 0.0004407923771313942, "loss": 3.1307, "theoretical_loss": 3.9903143476743486, "tokens_seen": 419892224 }, { "epoch": 1.02, "learning_rate": 0.00044078234704112335, "loss": 3.0096, "theoretical_loss": 3.9902452437261475, "tokens_seen": 419957760 }, { "epoch": 1.02, "learning_rate": 0.0004407723169508526, "loss": 2.9944, "theoretical_loss": 3.990176153580003, "tokens_seen": 420023296 }, { "epoch": 1.02, "learning_rate": 0.0004407622868605817, "loss": 3.0833, "theoretical_loss": 3.9901070772310048, "tokens_seen": 420088832 }, { "epoch": 1.02, "learning_rate": 0.00044075225677031094, "loss": 3.2234, "theoretical_loss": 3.9900380146742456, "tokens_seen": 420154368 }, { "epoch": 1.02, "learning_rate": 0.0004407422266800401, "loss": 3.0489, "theoretical_loss": 3.989968965904821, "tokens_seen": 420219904 }, { "epoch": 1.02, "learning_rate": 0.0004407321965897693, "loss": 3.1436, "theoretical_loss": 3.9898999309178294, "tokens_seen": 420285440 }, { "epoch": 1.02, "learning_rate": 0.0004407221664994985, "loss": 3.1146, "theoretical_loss": 3.989830909708371, "tokens_seen": 420350976 }, { "epoch": 1.02, "learning_rate": 0.00044071213640922767, "loss": 3.0191, "theoretical_loss": 3.9897619022715483, "tokens_seen": 420416512 }, { "epoch": 1.02, "learning_rate": 0.00044070210631895685, "loss": 2.9673, "theoretical_loss": 3.9896929086024677, "tokens_seen": 420482048 }, { "epoch": 1.02, "learning_rate": 0.0004406920762286861, "loss": 3.1182, "theoretical_loss": 3.9896239286962367, "tokens_seen": 420547584 }, { "epoch": 1.02, "learning_rate": 0.00044068204613841527, "loss": 3.0893, "theoretical_loss": 3.9895549625479654, "tokens_seen": 420613120 }, { "epoch": 1.02, "learning_rate": 0.00044067201604814445, "loss": 3.1506, "theoretical_loss": 3.989486010152768, "tokens_seen": 420678656 }, { "epoch": 1.02, "learning_rate": 0.0004406619859578737, "loss": 3.246, "theoretical_loss": 3.9894170715057586, "tokens_seen": 420744192 }, { "epoch": 1.02, "learning_rate": 0.0004406519558676028, "loss": 3.0786, "theoretical_loss": 3.989348146602056, "tokens_seen": 420809728 }, { "epoch": 1.02, "learning_rate": 0.00044064192577733204, "loss": 3.0989, "theoretical_loss": 3.9892792354367805, "tokens_seen": 420875264 }, { "epoch": 1.02, "learning_rate": 0.00044063189568706117, "loss": 2.9882, "theoretical_loss": 3.989210338005055, "tokens_seen": 420940800 }, { "epoch": 1.02, "learning_rate": 0.0004406218655967904, "loss": 3.1926, "theoretical_loss": 3.9891414543020054, "tokens_seen": 421006336 }, { "epoch": 1.02, "objective/train/docs_used": 695092, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.298571825027466, "objective/train/theoretical_loss": 3.98908980053123, "objective/train/tokens_used": 441515488, "theoretical_loss": 3.98908980053123, "tokens_seen": 421055488 }, { "epoch": 1.02, "learning_rate": 0.0004406118355065196, "loss": 3.2257, "theoretical_loss": 3.9890725843227592, "tokens_seen": 421071872 }, { "epoch": 1.02, "learning_rate": 0.00044060180541624877, "loss": 3.1983, "theoretical_loss": 3.989003728062446, "tokens_seen": 421137408 }, { "epoch": 1.02, "learning_rate": 0.00044059177532597795, "loss": 3.1381, "theoretical_loss": 3.988934885516201, "tokens_seen": 421202944 }, { "epoch": 1.02, "learning_rate": 0.00044058174523570713, "loss": 3.0604, "theoretical_loss": 3.9888660566791576, "tokens_seen": 421268480 }, { "epoch": 1.02, "learning_rate": 0.0004405717151454363, "loss": 3.1226, "theoretical_loss": 3.9887972415464548, "tokens_seen": 421334016 }, { "epoch": 1.02, "learning_rate": 0.00044056168505516555, "loss": 3.1859, "theoretical_loss": 3.988728440113232, "tokens_seen": 421399552 }, { "epoch": 1.02, "learning_rate": 0.0004405516549648947, "loss": 3.1316, "theoretical_loss": 3.9886596523746327, "tokens_seen": 421465088 }, { "epoch": 1.02, "learning_rate": 0.0004405416248746239, "loss": 3.1494, "theoretical_loss": 3.9885908783258026, "tokens_seen": 421530624 }, { "epoch": 1.02, "learning_rate": 0.00044053159478435304, "loss": 3.072, "theoretical_loss": 3.988522117961888, "tokens_seen": 421596160 }, { "epoch": 1.02, "learning_rate": 0.00044052156469408227, "loss": 3.1146, "theoretical_loss": 3.988453371278041, "tokens_seen": 421661696 }, { "epoch": 1.02, "learning_rate": 0.00044051153460381145, "loss": 3.2451, "theoretical_loss": 3.9883846382694133, "tokens_seen": 421727232 }, { "epoch": 1.02, "learning_rate": 0.00044050150451354063, "loss": 3.1252, "theoretical_loss": 3.9883159189311606, "tokens_seen": 421792768 }, { "epoch": 1.02, "learning_rate": 0.0004404914744232698, "loss": 3.0432, "theoretical_loss": 3.9882472132584397, "tokens_seen": 421858304 }, { "epoch": 1.02, "learning_rate": 0.00044048144433299905, "loss": 3.0595, "theoretical_loss": 3.988178521246412, "tokens_seen": 421923840 }, { "epoch": 1.02, "learning_rate": 0.0004404714142427282, "loss": 3.2388, "theoretical_loss": 3.988109842890239, "tokens_seen": 421989376 }, { "epoch": 1.02, "learning_rate": 0.0004404613841524574, "loss": 2.9641, "theoretical_loss": 3.988041178185087, "tokens_seen": 422054912 }, { "epoch": 1.02, "learning_rate": 0.00044045135406218654, "loss": 3.076, "theoretical_loss": 3.987972527126122, "tokens_seen": 422120448 }, { "epoch": 1.02, "learning_rate": 0.0004404413239719158, "loss": 3.1732, "theoretical_loss": 3.987903889708515, "tokens_seen": 422185984 }, { "epoch": 1.02, "learning_rate": 0.00044043129388164496, "loss": 3.0582, "theoretical_loss": 3.987835265927439, "tokens_seen": 422251520 }, { "epoch": 1.02, "learning_rate": 0.00044042126379137414, "loss": 3.2205, "theoretical_loss": 3.9877666557780675, "tokens_seen": 422317056 }, { "epoch": 1.02, "learning_rate": 0.0004404112337011033, "loss": 3.0438, "theoretical_loss": 3.987698059255579, "tokens_seen": 422382592 }, { "epoch": 1.02, "learning_rate": 0.0004404012036108325, "loss": 3.2013, "theoretical_loss": 3.987629476355153, "tokens_seen": 422448128 }, { "epoch": 1.02, "learning_rate": 0.0004403911735205617, "loss": 3.205, "theoretical_loss": 3.9875609070719715, "tokens_seen": 422513664 }, { "epoch": 1.02, "learning_rate": 0.0004403811434302909, "loss": 3.0674, "theoretical_loss": 3.9874923514012193, "tokens_seen": 422579200 }, { "epoch": 1.02, "learning_rate": 0.00044037111334002004, "loss": 3.2892, "theoretical_loss": 3.9874238093380843, "tokens_seen": 422644736 }, { "epoch": 1.02, "objective/train/docs_used": 697758, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.20353627204895, "objective/train/theoretical_loss": 3.9873724117178373, "objective/train/tokens_used": 443153888, "theoretical_loss": 3.9873724117178373, "tokens_seen": 422693888 }, { "epoch": 1.02, "learning_rate": 0.0004403610832497493, "loss": 3.0299, "theoretical_loss": 3.987355280877755, "tokens_seen": 422710272 }, { "epoch": 1.02, "learning_rate": 0.0004403510531594784, "loss": 3.1412, "theoretical_loss": 3.9872867660154245, "tokens_seen": 422775808 }, { "epoch": 1.02, "learning_rate": 0.00044034102306920764, "loss": 3.2232, "theoretical_loss": 3.9872182647462866, "tokens_seen": 422841344 }, { "epoch": 1.02, "learning_rate": 0.0004403309929789368, "loss": 3.2636, "theoretical_loss": 3.9871497770655386, "tokens_seen": 422906880 }, { "epoch": 1.02, "learning_rate": 0.000440320962888666, "loss": 3.0843, "theoretical_loss": 3.9870813029683796, "tokens_seen": 422972416 }, { "epoch": 1.02, "learning_rate": 0.0004403109327983952, "loss": 3.0793, "theoretical_loss": 3.987012842450012, "tokens_seen": 423037952 }, { "epoch": 1.02, "learning_rate": 0.0004403009027081244, "loss": 3.2152, "theoretical_loss": 3.98694439550564, "tokens_seen": 423103488 }, { "epoch": 1.02, "learning_rate": 0.00044029087261785355, "loss": 3.1375, "theoretical_loss": 3.98687596213047, "tokens_seen": 423169024 }, { "epoch": 1.02, "learning_rate": 0.0004402808425275828, "loss": 3.0998, "theoretical_loss": 3.9868075423197107, "tokens_seen": 423234560 }, { "epoch": 1.02, "learning_rate": 0.0004402708124373119, "loss": 3.1082, "theoretical_loss": 3.986739136068574, "tokens_seen": 423300096 }, { "epoch": 1.02, "learning_rate": 0.00044026078234704114, "loss": 3.0246, "theoretical_loss": 3.986670743372275, "tokens_seen": 423365632 }, { "epoch": 1.02, "learning_rate": 0.0004402507522567703, "loss": 3.2945, "theoretical_loss": 3.9866023642260293, "tokens_seen": 423431168 }, { "epoch": 1.02, "learning_rate": 0.0004402407221664995, "loss": 3.1111, "theoretical_loss": 3.986533998625056, "tokens_seen": 423496704 }, { "epoch": 1.02, "learning_rate": 0.0004402306920762287, "loss": 3.25, "theoretical_loss": 3.986465646564575, "tokens_seen": 423562240 }, { "epoch": 1.02, "learning_rate": 0.00044022066198595787, "loss": 3.1124, "theoretical_loss": 3.9863973080398125, "tokens_seen": 423627776 }, { "epoch": 1.02, "learning_rate": 0.00044021063189568705, "loss": 3.1663, "theoretical_loss": 3.9863289830459925, "tokens_seen": 423693312 }, { "epoch": 1.02, "learning_rate": 0.0004402006018054163, "loss": 3.1158, "theoretical_loss": 3.986260671578345, "tokens_seen": 423758848 }, { "epoch": 1.02, "learning_rate": 0.0004401905717151454, "loss": 3.2005, "theoretical_loss": 3.9861923736321003, "tokens_seen": 423824384 }, { "epoch": 1.02, "learning_rate": 0.00044018054162487465, "loss": 3.076, "theoretical_loss": 3.986124089202492, "tokens_seen": 423889920 }, { "epoch": 1.02, "learning_rate": 0.00044017051153460383, "loss": 3.2516, "theoretical_loss": 3.9860558182847554, "tokens_seen": 423955456 }, { "epoch": 1.02, "learning_rate": 0.000440160481444333, "loss": 3.1715, "theoretical_loss": 3.9859875608741295, "tokens_seen": 424020992 }, { "epoch": 1.02, "learning_rate": 0.0004401504513540622, "loss": 3.101, "theoretical_loss": 3.985919316965855, "tokens_seen": 424086528 }, { "epoch": 1.02, "learning_rate": 0.00044014042126379137, "loss": 3.1703, "theoretical_loss": 3.985851086555174, "tokens_seen": 424152064 }, { "epoch": 1.03, "learning_rate": 0.00044013039117352055, "loss": 3.1643, "theoretical_loss": 3.985782869637333, "tokens_seen": 424217600 }, { "epoch": 1.03, "learning_rate": 0.0004401203610832498, "loss": 3.219, "theoretical_loss": 3.9857146662075795, "tokens_seen": 424283136 }, { "epoch": 1.03, "objective/train/docs_used": 700507, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3581738471984863, "objective/train/theoretical_loss": 3.9856635224839643, "objective/train/tokens_used": 444792288, "theoretical_loss": 3.9856635224839643, "tokens_seen": 424332288 }, { "epoch": 1.03, "learning_rate": 0.0004401103309929789, "loss": 3.0885, "theoretical_loss": 3.985646476261164, "tokens_seen": 424348672 }, { "epoch": 1.03, "learning_rate": 0.00044010030090270815, "loss": 3.0664, "theoretical_loss": 3.985578299793339, "tokens_seen": 424414208 }, { "epoch": 1.03, "learning_rate": 0.0004400902708124373, "loss": 3.1536, "theoretical_loss": 3.985510136799359, "tokens_seen": 424479744 }, { "epoch": 1.03, "learning_rate": 0.0004400802407221665, "loss": 3.0815, "theoretical_loss": 3.985441987274483, "tokens_seen": 424545280 }, { "epoch": 1.03, "learning_rate": 0.0004400702106318957, "loss": 3.156, "theoretical_loss": 3.9853738512139696, "tokens_seen": 424610816 }, { "epoch": 1.03, "learning_rate": 0.0004400601805416249, "loss": 3.0356, "theoretical_loss": 3.9853057286130813, "tokens_seen": 424676352 }, { "epoch": 1.03, "learning_rate": 0.00044005015045135406, "loss": 2.8745, "theoretical_loss": 3.9852376194670835, "tokens_seen": 424741888 }, { "epoch": 1.03, "learning_rate": 0.00044004012036108324, "loss": 3.0801, "theoretical_loss": 3.985169523771243, "tokens_seen": 424807424 }, { "epoch": 1.03, "learning_rate": 0.0004400300902708124, "loss": 3.1231, "theoretical_loss": 3.9851014415208286, "tokens_seen": 424872960 }, { "epoch": 1.03, "learning_rate": 0.00044002006018054165, "loss": 3.1273, "theoretical_loss": 3.985033372711113, "tokens_seen": 424938496 }, { "epoch": 1.03, "learning_rate": 0.0004400100300902708, "loss": 3.1891, "theoretical_loss": 3.9849653173373705, "tokens_seen": 425004032 }, { "epoch": 1.03, "learning_rate": 0.00044, "loss": 3.2649, "theoretical_loss": 3.984897275394877, "tokens_seen": 425069568 }, { "epoch": 1.03, "learning_rate": 0.0004399899699097292, "loss": 3.2069, "theoretical_loss": 3.9848292468789124, "tokens_seen": 425135104 }, { "epoch": 1.03, "learning_rate": 0.0004399799398194584, "loss": 3.1408, "theoretical_loss": 3.9847612317847574, "tokens_seen": 425200640 }, { "epoch": 1.03, "learning_rate": 0.00043996990972918756, "loss": 3.0255, "theoretical_loss": 3.9846932301076965, "tokens_seen": 425266176 }, { "epoch": 1.03, "learning_rate": 0.00043995987963891674, "loss": 3.107, "theoretical_loss": 3.984625241843016, "tokens_seen": 425331712 }, { "epoch": 1.03, "learning_rate": 0.0004399498495486459, "loss": 3.3499, "theoretical_loss": 3.984557266986004, "tokens_seen": 425397248 }, { "epoch": 1.03, "learning_rate": 0.00043993981945837516, "loss": 3.1551, "theoretical_loss": 3.9844893055319517, "tokens_seen": 425462784 }, { "epoch": 1.03, "learning_rate": 0.00043992978936810434, "loss": 3.0646, "theoretical_loss": 3.984421357476152, "tokens_seen": 425528320 }, { "epoch": 1.03, "learning_rate": 0.0004399197592778335, "loss": 3.2017, "theoretical_loss": 3.9843534228139017, "tokens_seen": 425593856 }, { "epoch": 1.03, "learning_rate": 0.0004399097291875627, "loss": 3.1262, "theoretical_loss": 3.984285501540498, "tokens_seen": 425659392 }, { "epoch": 1.03, "learning_rate": 0.0004398996990972919, "loss": 3.1903, "theoretical_loss": 3.9842175936512416, "tokens_seen": 425724928 }, { "epoch": 1.03, "learning_rate": 0.0004398896690070211, "loss": 3.1037, "theoretical_loss": 3.9841496991414354, "tokens_seen": 425790464 }, { "epoch": 1.03, "learning_rate": 0.00043987963891675024, "loss": 3.1689, "theoretical_loss": 3.984081818006384, "tokens_seen": 425856000 }, { "epoch": 1.03, "learning_rate": 0.0004398696088264795, "loss": 3.173, "theoretical_loss": 3.9840139502413967, "tokens_seen": 425921536 }, { "epoch": 1.03, "objective/train/docs_used": 703399, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.798731565475464, "objective/train/theoretical_loss": 3.9839630581889383, "objective/train/tokens_used": 446430688, "theoretical_loss": 3.9839630581889383, "tokens_seen": 425970688 }, { "epoch": 1.03, "learning_rate": 0.0004398595787362086, "loss": 3.0592, "theoretical_loss": 3.983946095841782, "tokens_seen": 425987072 }, { "epoch": 1.03, "learning_rate": 0.00043984954864593784, "loss": 2.9474, "theoretical_loss": 3.9838782548028524, "tokens_seen": 426052608 }, { "epoch": 1.03, "learning_rate": 0.000439839518555667, "loss": 3.2015, "theoretical_loss": 3.983810427119923, "tokens_seen": 426118144 }, { "epoch": 1.03, "learning_rate": 0.0004398294884653962, "loss": 3.222, "theoretical_loss": 3.983742612788311, "tokens_seen": 426183680 }, { "epoch": 1.03, "learning_rate": 0.0004398194583751254, "loss": 3.0617, "theoretical_loss": 3.983674811803335, "tokens_seen": 426249216 }, { "epoch": 1.03, "learning_rate": 0.0004398094282848546, "loss": 3.0085, "theoretical_loss": 3.9836070241603174, "tokens_seen": 426314752 }, { "epoch": 1.03, "learning_rate": 0.00043979939819458375, "loss": 3.1878, "theoretical_loss": 3.9835392498545827, "tokens_seen": 426380288 }, { "epoch": 1.03, "learning_rate": 0.000439789368104313, "loss": 3.138, "theoretical_loss": 3.983471488881456, "tokens_seen": 426445824 }, { "epoch": 1.03, "learning_rate": 0.0004397793380140421, "loss": 3.1293, "theoretical_loss": 3.983403741236268, "tokens_seen": 426511360 }, { "epoch": 1.03, "learning_rate": 0.00043976930792377134, "loss": 3.1207, "theoretical_loss": 3.9833360069143486, "tokens_seen": 426576896 }, { "epoch": 1.03, "learning_rate": 0.0004397592778335005, "loss": 3.2384, "theoretical_loss": 3.983268285911032, "tokens_seen": 426642432 }, { "epoch": 1.03, "learning_rate": 0.0004397492477432297, "loss": 3.257, "theoretical_loss": 3.9832005782216537, "tokens_seen": 426707968 }, { "epoch": 1.03, "learning_rate": 0.0004397392176529589, "loss": 3.0819, "theoretical_loss": 3.9831328838415523, "tokens_seen": 426773504 }, { "epoch": 1.03, "learning_rate": 0.00043972918756268807, "loss": 3.1549, "theoretical_loss": 3.9830652027660682, "tokens_seen": 426839040 }, { "epoch": 1.03, "learning_rate": 0.00043971915747241725, "loss": 3.2315, "theoretical_loss": 3.9829975349905444, "tokens_seen": 426904576 }, { "epoch": 1.03, "learning_rate": 0.0004397091273821465, "loss": 2.9996, "theoretical_loss": 3.9829298805103264, "tokens_seen": 426970112 }, { "epoch": 1.03, "learning_rate": 0.0004396990972918756, "loss": 3.2968, "theoretical_loss": 3.9828622393207613, "tokens_seen": 427035648 }, { "epoch": 1.03, "learning_rate": 0.00043968906720160485, "loss": 3.2766, "theoretical_loss": 3.9827946114171997, "tokens_seen": 427101184 }, { "epoch": 1.03, "learning_rate": 0.00043967903711133403, "loss": 3.1527, "theoretical_loss": 3.9827269967949936, "tokens_seen": 427166720 }, { "epoch": 1.03, "learning_rate": 0.0004396690070210632, "loss": 3.0997, "theoretical_loss": 3.982659395449497, "tokens_seen": 427232256 }, { "epoch": 1.03, "learning_rate": 0.0004396589769307924, "loss": 3.1321, "theoretical_loss": 3.982591807376069, "tokens_seen": 427297792 }, { "epoch": 1.03, "learning_rate": 0.00043964894684052157, "loss": 3.1435, "theoretical_loss": 3.9825242325700665, "tokens_seen": 427363328 }, { "epoch": 1.03, "learning_rate": 0.00043963891675025075, "loss": 3.1008, "theoretical_loss": 3.9824566710268527, "tokens_seen": 427428864 }, { "epoch": 1.03, "learning_rate": 0.00043962888665998, "loss": 3.2363, "theoretical_loss": 3.982389122741791, "tokens_seen": 427494400 }, { "epoch": 1.03, "learning_rate": 0.0004396188565697091, "loss": 3.033, "theoretical_loss": 3.9823215877102482, "tokens_seen": 427559936 }, { "epoch": 1.03, "objective/train/docs_used": 706426, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.070389986038208, "objective/train/theoretical_loss": 3.9822709451314258, "objective/train/tokens_used": 448069088, "theoretical_loss": 3.9822709451314258, "tokens_seen": 427609088 }, { "epoch": 1.03, "learning_rate": 0.00043960882647943835, "loss": 3.0188, "theoretical_loss": 3.9822540659275916, "tokens_seen": 427625472 }, { "epoch": 1.03, "learning_rate": 0.0004395987963891675, "loss": 3.2456, "theoretical_loss": 3.982186557389194, "tokens_seen": 427691008 }, { "epoch": 1.03, "learning_rate": 0.0004395887662988967, "loss": 3.1243, "theoretical_loss": 3.982119062090428, "tokens_seen": 427756544 }, { "epoch": 1.03, "learning_rate": 0.0004395787362086259, "loss": 3.2795, "theoretical_loss": 3.982051580026669, "tokens_seen": 427822080 }, { "epoch": 1.03, "learning_rate": 0.0004395687061183551, "loss": 3.2217, "theoretical_loss": 3.981984111193295, "tokens_seen": 427887616 }, { "epoch": 1.03, "learning_rate": 0.00043955867602808426, "loss": 3.0455, "theoretical_loss": 3.981916655585687, "tokens_seen": 427953152 }, { "epoch": 1.03, "learning_rate": 0.00043954864593781344, "loss": 3.1099, "theoretical_loss": 3.981849213199227, "tokens_seen": 428018688 }, { "epoch": 1.03, "learning_rate": 0.0004395386158475426, "loss": 3.0564, "theoretical_loss": 3.9817817840293, "tokens_seen": 428084224 }, { "epoch": 1.03, "learning_rate": 0.00043952858575727185, "loss": 3.1605, "theoretical_loss": 3.9817143680712928, "tokens_seen": 428149760 }, { "epoch": 1.03, "learning_rate": 0.000439518555667001, "loss": 3.1137, "theoretical_loss": 3.9816469653205955, "tokens_seen": 428215296 }, { "epoch": 1.03, "learning_rate": 0.0004395085255767302, "loss": 3.1088, "theoretical_loss": 3.9815795757725994, "tokens_seen": 428280832 }, { "epoch": 1.03, "learning_rate": 0.0004394984954864594, "loss": 3.161, "theoretical_loss": 3.9815121994226996, "tokens_seen": 428346368 }, { "epoch": 1.03, "learning_rate": 0.0004394884653961886, "loss": 3.0708, "theoretical_loss": 3.9814448362662924, "tokens_seen": 428411904 }, { "epoch": 1.03, "learning_rate": 0.00043947843530591776, "loss": 3.0856, "theoretical_loss": 3.9813774862987756, "tokens_seen": 428477440 }, { "epoch": 1.03, "learning_rate": 0.00043946840521564694, "loss": 3.2036, "theoretical_loss": 3.9813101495155516, "tokens_seen": 428542976 }, { "epoch": 1.03, "learning_rate": 0.0004394583751253761, "loss": 3.1747, "theoretical_loss": 3.981242825912023, "tokens_seen": 428608512 }, { "epoch": 1.03, "learning_rate": 0.00043944834503510536, "loss": 3.0239, "theoretical_loss": 3.981175515483596, "tokens_seen": 428674048 }, { "epoch": 1.03, "learning_rate": 0.0004394383149448345, "loss": 3.1993, "theoretical_loss": 3.9811082182256783, "tokens_seen": 428739584 }, { "epoch": 1.03, "learning_rate": 0.0004394282848545637, "loss": 3.1978, "theoretical_loss": 3.9810409341336808, "tokens_seen": 428805120 }, { "epoch": 1.03, "learning_rate": 0.00043941825476429285, "loss": 3.1091, "theoretical_loss": 3.9809736632030153, "tokens_seen": 428870656 }, { "epoch": 1.03, "learning_rate": 0.0004394082246740221, "loss": 3.0431, "theoretical_loss": 3.9809064054290975, "tokens_seen": 428936192 }, { "epoch": 1.03, "learning_rate": 0.00043939819458375126, "loss": 3.0137, "theoretical_loss": 3.980839160807344, "tokens_seen": 429001728 }, { "epoch": 1.03, "learning_rate": 0.00043938816449348044, "loss": 3.0254, "theoretical_loss": 3.9807719293331743, "tokens_seen": 429067264 }, { "epoch": 1.03, "learning_rate": 0.0004393781344032096, "loss": 3.1963, "theoretical_loss": 3.9807047110020104, "tokens_seen": 429132800 }, { "epoch": 1.03, "learning_rate": 0.0004393681043129388, "loss": 3.1074, "theoretical_loss": 3.980637505809277, "tokens_seen": 429198336 }, { "epoch": 1.03, "objective/train/docs_used": 707874, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.155245065689087, "objective/train/theoretical_loss": 3.9805871105340698, "objective/train/tokens_used": 449707488, "theoretical_loss": 3.9805871105340698, "tokens_seen": 429247488 }, { "epoch": 1.03, "learning_rate": 0.000439358074222668, "loss": 3.1545, "theoretical_loss": 3.9805703137503996, "tokens_seen": 429263872 }, { "epoch": 1.03, "learning_rate": 0.0004393480441323972, "loss": 3.3067, "theoretical_loss": 3.9805031348208075, "tokens_seen": 429329408 }, { "epoch": 1.03, "learning_rate": 0.00043933801404212635, "loss": 3.0935, "theoretical_loss": 3.9804359690159314, "tokens_seen": 429394944 }, { "epoch": 1.03, "learning_rate": 0.0004393279839518556, "loss": 2.976, "theoretical_loss": 3.9803688163312048, "tokens_seen": 429460480 }, { "epoch": 1.03, "learning_rate": 0.00043931795386158477, "loss": 3.0641, "theoretical_loss": 3.980301676762063, "tokens_seen": 429526016 }, { "epoch": 1.03, "learning_rate": 0.00043930792377131395, "loss": 3.1522, "theoretical_loss": 3.9802345503039445, "tokens_seen": 429591552 }, { "epoch": 1.03, "learning_rate": 0.00043929789368104313, "loss": 3.0431, "theoretical_loss": 3.9801674369522884, "tokens_seen": 429657088 }, { "epoch": 1.03, "learning_rate": 0.0004392878635907723, "loss": 3.1101, "theoretical_loss": 3.980100336702537, "tokens_seen": 429722624 }, { "epoch": 1.03, "learning_rate": 0.0004392778335005015, "loss": 3.1331, "theoretical_loss": 3.9800332495501367, "tokens_seen": 429788160 }, { "epoch": 1.03, "learning_rate": 0.0004392678034102307, "loss": 3.2053, "theoretical_loss": 3.979966175490533, "tokens_seen": 429853696 }, { "epoch": 1.03, "learning_rate": 0.00043925777331995985, "loss": 3.0603, "theoretical_loss": 3.979899114519175, "tokens_seen": 429919232 }, { "epoch": 1.03, "learning_rate": 0.0004392477432296891, "loss": 3.1515, "theoretical_loss": 3.9798320666315146, "tokens_seen": 429984768 }, { "epoch": 1.03, "learning_rate": 0.0004392377131394182, "loss": 2.9697, "theoretical_loss": 3.979765031823006, "tokens_seen": 430050304 }, { "epoch": 1.03, "learning_rate": 0.00043922768304914745, "loss": 3.2294, "theoretical_loss": 3.979698010089105, "tokens_seen": 430115840 }, { "epoch": 1.03, "learning_rate": 0.00043921765295887663, "loss": 3.0905, "theoretical_loss": 3.97963100142527, "tokens_seen": 430181376 }, { "epoch": 1.03, "learning_rate": 0.0004392076228686058, "loss": 2.9662, "theoretical_loss": 3.979564005826961, "tokens_seen": 430246912 }, { "epoch": 1.03, "learning_rate": 0.000439197592778335, "loss": 3.1102, "theoretical_loss": 3.9794970232896416, "tokens_seen": 430312448 }, { "epoch": 1.03, "learning_rate": 0.00043918756268806423, "loss": 2.9511, "theoretical_loss": 3.979430053808777, "tokens_seen": 430377984 }, { "epoch": 1.03, "learning_rate": 0.0004391775325977934, "loss": 3.1616, "theoretical_loss": 3.9793630973798333, "tokens_seen": 430443520 }, { "epoch": 1.03, "learning_rate": 0.0004391675025075226, "loss": 3.0792, "theoretical_loss": 3.979296153998282, "tokens_seen": 430509056 }, { "epoch": 1.03, "learning_rate": 0.00043915747241725177, "loss": 3.059, "theoretical_loss": 3.979229223659593, "tokens_seen": 430574592 }, { "epoch": 1.03, "learning_rate": 0.00043914744232698095, "loss": 3.094, "theoretical_loss": 3.9791623063592425, "tokens_seen": 430640128 }, { "epoch": 1.03, "learning_rate": 0.0004391374122367102, "loss": 3.0665, "theoretical_loss": 3.979095402092706, "tokens_seen": 430705664 }, { "epoch": 1.03, "learning_rate": 0.0004391273821464393, "loss": 3.1293, "theoretical_loss": 3.9790285108554624, "tokens_seen": 430771200 }, { "epoch": 1.03, "learning_rate": 0.00043911735205616855, "loss": 3.0814, "theoretical_loss": 3.978961632642992, "tokens_seen": 430836736 }, { "epoch": 1.03, "objective/train/docs_used": 710735, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0522050857543945, "objective/train/theoretical_loss": 3.9789114825284297, "objective/train/tokens_used": 451345888, "theoretical_loss": 3.9789114825284297, "tokens_seen": 430885888 }, { "epoch": 1.03, "learning_rate": 0.0004391073219658977, "loss": 3.218, "theoretical_loss": 3.978894767450779, "tokens_seen": 430902272 }, { "epoch": 1.03, "learning_rate": 0.0004390972918756269, "loss": 3.0734, "theoretical_loss": 3.9788279152743082, "tokens_seen": 430967808 }, { "epoch": 1.03, "learning_rate": 0.0004390872617853561, "loss": 3.2898, "theoretical_loss": 3.978761076109067, "tokens_seen": 431033344 }, { "epoch": 1.03, "learning_rate": 0.0004390772316950853, "loss": 2.984, "theoretical_loss": 3.9786942499505464, "tokens_seen": 431098880 }, { "epoch": 1.03, "learning_rate": 0.00043906720160481446, "loss": 2.9732, "theoretical_loss": 3.978627436794238, "tokens_seen": 431164416 }, { "epoch": 1.03, "learning_rate": 0.00043905717151454364, "loss": 3.1965, "theoretical_loss": 3.9785606366356365, "tokens_seen": 431229952 }, { "epoch": 1.03, "learning_rate": 0.0004390471414242728, "loss": 3.2068, "theoretical_loss": 3.978493849470238, "tokens_seen": 431295488 }, { "epoch": 1.03, "learning_rate": 0.00043903711133400205, "loss": 3.1117, "theoretical_loss": 3.978427075293542, "tokens_seen": 431361024 }, { "epoch": 1.03, "learning_rate": 0.0004390270812437312, "loss": 3.0743, "theoretical_loss": 3.9783603141010495, "tokens_seen": 431426560 }, { "epoch": 1.03, "learning_rate": 0.0004390170511534604, "loss": 3.213, "theoretical_loss": 3.978293565888264, "tokens_seen": 431492096 }, { "epoch": 1.03, "learning_rate": 0.0004390070210631896, "loss": 3.1224, "theoretical_loss": 3.9782268306506916, "tokens_seen": 431557632 }, { "epoch": 1.03, "learning_rate": 0.0004389969909729188, "loss": 3.1234, "theoretical_loss": 3.9781601083838396, "tokens_seen": 431623168 }, { "epoch": 1.03, "learning_rate": 0.00043898696088264796, "loss": 3.2483, "theoretical_loss": 3.9780933990832184, "tokens_seen": 431688704 }, { "epoch": 1.03, "learning_rate": 0.00043897693079237714, "loss": 3.1373, "theoretical_loss": 3.97802670274434, "tokens_seen": 431754240 }, { "epoch": 1.03, "learning_rate": 0.0004389669007021063, "loss": 3.0827, "theoretical_loss": 3.97796001936272, "tokens_seen": 431819776 }, { "epoch": 1.03, "learning_rate": 0.00043895687061183556, "loss": 3.128, "theoretical_loss": 3.9778933489338737, "tokens_seen": 431885312 }, { "epoch": 1.03, "learning_rate": 0.0004389468405215647, "loss": 3.1801, "theoretical_loss": 3.9778266914533216, "tokens_seen": 431950848 }, { "epoch": 1.03, "learning_rate": 0.0004389368104312939, "loss": 3.0848, "theoretical_loss": 3.9777600469165844, "tokens_seen": 432016384 }, { "epoch": 1.03, "learning_rate": 0.00043892678034102305, "loss": 3.0674, "theoretical_loss": 3.9776934153191856, "tokens_seen": 432081920 }, { "epoch": 1.03, "learning_rate": 0.0004389167502507523, "loss": 3.2251, "theoretical_loss": 3.977626796656651, "tokens_seen": 432147456 }, { "epoch": 1.03, "learning_rate": 0.00043890672016048146, "loss": 3.2743, "theoretical_loss": 3.977560190924509, "tokens_seen": 432212992 }, { "epoch": 1.03, "learning_rate": 0.00043889669007021064, "loss": 3.0438, "theoretical_loss": 3.9774935981182895, "tokens_seen": 432278528 }, { "epoch": 1.03, "learning_rate": 0.0004388866599799398, "loss": 3.0744, "theoretical_loss": 3.977427018233525, "tokens_seen": 432344064 }, { "epoch": 1.03, "learning_rate": 0.000438876629889669, "loss": 3.0849, "theoretical_loss": 3.97736045126575, "tokens_seen": 432409600 }, { "epoch": 1.03, "learning_rate": 0.0004388665997993982, "loss": 2.9926, "theoretical_loss": 3.977293897210501, "tokens_seen": 432475136 }, { "epoch": 1.03, "objective/train/docs_used": 713758, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.004768133163452, "objective/train/theoretical_loss": 3.977243990140227, "objective/train/tokens_used": 452984288, "theoretical_loss": 3.977243990140227, "tokens_seen": 432524288 }, { "epoch": 1.03, "learning_rate": 0.0004388565697091274, "loss": 3.1513, "theoretical_loss": 3.9772273560633185, "tokens_seen": 432540672 }, { "epoch": 1.03, "learning_rate": 0.00043884653961885655, "loss": 3.1272, "theoretical_loss": 3.977160827819742, "tokens_seen": 432606208 }, { "epoch": 1.03, "learning_rate": 0.0004388365095285858, "loss": 3.1901, "theoretical_loss": 3.977094312475317, "tokens_seen": 432671744 }, { "epoch": 1.03, "learning_rate": 0.00043882647943831497, "loss": 3.0427, "theoretical_loss": 3.9770278100255867, "tokens_seen": 432737280 }, { "epoch": 1.03, "learning_rate": 0.00043881644934804415, "loss": 3.1451, "theoretical_loss": 3.976961320466102, "tokens_seen": 432802816 }, { "epoch": 1.03, "learning_rate": 0.00043880641925777333, "loss": 3.0918, "theoretical_loss": 3.9768948437924108, "tokens_seen": 432868352 }, { "epoch": 1.03, "learning_rate": 0.0004387963891675025, "loss": 3.0868, "theoretical_loss": 3.9768283800000663, "tokens_seen": 432933888 }, { "epoch": 1.03, "learning_rate": 0.0004387863590772317, "loss": 3.0919, "theoretical_loss": 3.976761929084623, "tokens_seen": 432999424 }, { "epoch": 1.03, "learning_rate": 0.0004387763289869609, "loss": 3.2026, "theoretical_loss": 3.976695491041638, "tokens_seen": 433064960 }, { "epoch": 1.03, "learning_rate": 0.00043876629889669005, "loss": 3.1139, "theoretical_loss": 3.97662906586667, "tokens_seen": 433130496 }, { "epoch": 1.03, "learning_rate": 0.0004387562688064193, "loss": 3.043, "theoretical_loss": 3.97656265355528, "tokens_seen": 433196032 }, { "epoch": 1.03, "learning_rate": 0.0004387462387161484, "loss": 3.0764, "theoretical_loss": 3.976496254103032, "tokens_seen": 433261568 }, { "epoch": 1.03, "learning_rate": 0.00043873620862587765, "loss": 3.2043, "theoretical_loss": 3.976429867505491, "tokens_seen": 433327104 }, { "epoch": 1.03, "learning_rate": 0.00043872617853560683, "loss": 3.053, "theoretical_loss": 3.9763634937582246, "tokens_seen": 433392640 }, { "epoch": 1.03, "learning_rate": 0.000438716148445336, "loss": 3.2239, "theoretical_loss": 3.976297132856804, "tokens_seen": 433458176 }, { "epoch": 1.03, "learning_rate": 0.0004387061183550652, "loss": 3.1189, "theoretical_loss": 3.9762307847968, "tokens_seen": 433523712 }, { "epoch": 1.03, "learning_rate": 0.00043869608826479443, "loss": 3.0972, "theoretical_loss": 3.976164449573788, "tokens_seen": 433589248 }, { "epoch": 1.03, "learning_rate": 0.00043868605817452356, "loss": 3.1546, "theoretical_loss": 3.976098127183344, "tokens_seen": 433654784 }, { "epoch": 1.03, "learning_rate": 0.0004386760280842528, "loss": 3.0664, "theoretical_loss": 3.9760318176210476, "tokens_seen": 433720320 }, { "epoch": 1.03, "learning_rate": 0.0004386659979939819, "loss": 3.2972, "theoretical_loss": 3.975965520882479, "tokens_seen": 433785856 }, { "epoch": 1.03, "learning_rate": 0.00043865596790371115, "loss": 3.2854, "theoretical_loss": 3.9758992369632207, "tokens_seen": 433851392 }, { "epoch": 1.03, "learning_rate": 0.00043864593781344033, "loss": 3.1481, "theoretical_loss": 3.97583296585886, "tokens_seen": 433916928 }, { "epoch": 1.03, "learning_rate": 0.0004386359077231695, "loss": 3.087, "theoretical_loss": 3.9757667075649827, "tokens_seen": 433982464 }, { "epoch": 1.03, "learning_rate": 0.0004386258776328987, "loss": 3.079, "theoretical_loss": 3.975700462077179, "tokens_seen": 434048000 }, { "epoch": 1.03, "learning_rate": 0.0004386158475426279, "loss": 3.1472, "theoretical_loss": 3.975634229391041, "tokens_seen": 434113536 }, { "epoch": 1.03, "objective/train/docs_used": 716642, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4787890911102295, "objective/train/theoretical_loss": 3.9755845632748805, "objective/train/tokens_used": 454622688, "theoretical_loss": 3.9755845632748805, "tokens_seen": 434162688 }, { "epoch": 1.03, "learning_rate": 0.00043860581745235706, "loss": 3.0079, "theoretical_loss": 3.9755680095021635, "tokens_seen": 434179072 }, { "epoch": 1.03, "learning_rate": 0.0004385957873620863, "loss": 3.118, "theoretical_loss": 3.975501802406141, "tokens_seen": 434244608 }, { "epoch": 1.03, "learning_rate": 0.0004385857572718154, "loss": 3.1769, "theoretical_loss": 3.9754356080985733, "tokens_seen": 434310144 }, { "epoch": 1.03, "learning_rate": 0.00043857572718154466, "loss": 3.2515, "theoretical_loss": 3.9753694265750603, "tokens_seen": 434375680 }, { "epoch": 1.03, "learning_rate": 0.0004385656970912738, "loss": 3.1564, "theoretical_loss": 3.9753032578312055, "tokens_seen": 434441216 }, { "epoch": 1.03, "learning_rate": 0.000438555667001003, "loss": 3.2031, "theoretical_loss": 3.975237101862614, "tokens_seen": 434506752 }, { "epoch": 1.03, "learning_rate": 0.0004385456369107322, "loss": 3.0593, "theoretical_loss": 3.975170958664892, "tokens_seen": 434572288 }, { "epoch": 1.03, "learning_rate": 0.0004385356068204614, "loss": 3.1893, "theoretical_loss": 3.9751048282336496, "tokens_seen": 434637824 }, { "epoch": 1.03, "learning_rate": 0.00043852557673019056, "loss": 3.0476, "theoretical_loss": 3.9750387105644975, "tokens_seen": 434703360 }, { "epoch": 1.03, "learning_rate": 0.0004385155466399198, "loss": 3.0353, "theoretical_loss": 3.9749726056530506, "tokens_seen": 434768896 }, { "epoch": 1.03, "learning_rate": 0.0004385055165496489, "loss": 3.1244, "theoretical_loss": 3.9749065134949233, "tokens_seen": 434834432 }, { "epoch": 1.03, "learning_rate": 0.00043849548645937816, "loss": 3.0275, "theoretical_loss": 3.974840434085735, "tokens_seen": 434899968 }, { "epoch": 1.03, "learning_rate": 0.0004384854563691073, "loss": 3.1958, "theoretical_loss": 3.9747743674211042, "tokens_seen": 434965504 }, { "epoch": 1.03, "learning_rate": 0.0004384754262788365, "loss": 3.0832, "theoretical_loss": 3.974708313496655, "tokens_seen": 435031040 }, { "epoch": 1.03, "learning_rate": 0.0004384653961885657, "loss": 3.1328, "theoretical_loss": 3.974642272308011, "tokens_seen": 435096576 }, { "epoch": 1.03, "learning_rate": 0.0004384553660982949, "loss": 3.1795, "theoretical_loss": 3.974576243850799, "tokens_seen": 435162112 }, { "epoch": 1.03, "learning_rate": 0.00043844533600802407, "loss": 3.0561, "theoretical_loss": 3.9745102281206477, "tokens_seen": 435227648 }, { "epoch": 1.03, "learning_rate": 0.00043843530591775325, "loss": 3.198, "theoretical_loss": 3.9744442251131877, "tokens_seen": 435293184 }, { "epoch": 1.03, "learning_rate": 0.0004384252758274825, "loss": 3.1628, "theoretical_loss": 3.9743782348240533, "tokens_seen": 435358720 }, { "epoch": 1.03, "learning_rate": 0.00043841524573721166, "loss": 2.9717, "theoretical_loss": 3.9743122572488785, "tokens_seen": 435424256 }, { "epoch": 1.03, "learning_rate": 0.00043840521564694084, "loss": 3.2474, "theoretical_loss": 3.9742462923833015, "tokens_seen": 435489792 }, { "epoch": 1.03, "learning_rate": 0.00043839518555667, "loss": 3.0985, "theoretical_loss": 3.9741803402229623, "tokens_seen": 435555328 }, { "epoch": 1.03, "learning_rate": 0.0004383851554663992, "loss": 3.2707, "theoretical_loss": 3.9741144007635016, "tokens_seen": 435620864 }, { "epoch": 1.03, "learning_rate": 0.0004383751253761284, "loss": 3.0472, "theoretical_loss": 3.974048474000564, "tokens_seen": 435686400 }, { "epoch": 1.03, "learning_rate": 0.0004383650952858576, "loss": 3.1873, "theoretical_loss": 3.9739825599297944, "tokens_seen": 435751936 }, { "epoch": 1.03, "objective/train/docs_used": 718634, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.106482982635498, "objective/train/theoretical_loss": 3.973933132703336, "objective/train/tokens_used": 456261088, "theoretical_loss": 3.973933132703336, "tokens_seen": 435801088 }, { "epoch": 1.03, "learning_rate": 0.00043835506519558675, "loss": 3.0825, "theoretical_loss": 3.973916658546843, "tokens_seen": 435817472 }, { "epoch": 1.03, "learning_rate": 0.000438345035105316, "loss": 3.2499, "theoretical_loss": 3.9738507698473584, "tokens_seen": 435883008 }, { "epoch": 1.03, "learning_rate": 0.00043833500501504517, "loss": 3.1747, "theoretical_loss": 3.973784893826994, "tokens_seen": 435948544 }, { "epoch": 1.03, "learning_rate": 0.00043832497492477435, "loss": 2.9031, "theoretical_loss": 3.973719030481404, "tokens_seen": 436014080 }, { "epoch": 1.03, "learning_rate": 0.00043831494483450353, "loss": 3.101, "theoretical_loss": 3.9736531798062456, "tokens_seen": 436079616 }, { "epoch": 1.03, "learning_rate": 0.0004383049147442327, "loss": 3.1472, "theoretical_loss": 3.9735873417971774, "tokens_seen": 436145152 }, { "epoch": 1.03, "learning_rate": 0.0004382948846539619, "loss": 3.283, "theoretical_loss": 3.97352151644986, "tokens_seen": 436210688 }, { "epoch": 1.03, "learning_rate": 0.0004382848545636911, "loss": 3.0234, "theoretical_loss": 3.9734557037599574, "tokens_seen": 436276224 }, { "epoch": 1.03, "learning_rate": 0.00043827482447342025, "loss": 3.1258, "theoretical_loss": 3.9733899037231346, "tokens_seen": 436341760 }, { "epoch": 1.03, "learning_rate": 0.0004382647943831495, "loss": 3.0267, "theoretical_loss": 3.9733241163350597, "tokens_seen": 436407296 }, { "epoch": 1.03, "learning_rate": 0.0004382547642928786, "loss": 3.1996, "theoretical_loss": 3.973258341591401, "tokens_seen": 436472832 }, { "epoch": 1.03, "learning_rate": 0.00043824473420260785, "loss": 3.0465, "theoretical_loss": 3.9731925794878307, "tokens_seen": 436538368 }, { "epoch": 1.03, "learning_rate": 0.00043823470411233703, "loss": 3.1547, "theoretical_loss": 3.973126830020023, "tokens_seen": 436603904 }, { "epoch": 1.03, "learning_rate": 0.0004382246740220662, "loss": 3.0577, "theoretical_loss": 3.9730610931836536, "tokens_seen": 436669440 }, { "epoch": 1.03, "learning_rate": 0.0004382146439317954, "loss": 3.1932, "theoretical_loss": 3.9729953689744013, "tokens_seen": 436734976 }, { "epoch": 1.03, "learning_rate": 0.00043820461384152463, "loss": 2.9773, "theoretical_loss": 3.972929657387945, "tokens_seen": 436800512 }, { "epoch": 1.03, "learning_rate": 0.00043819458375125376, "loss": 3.0692, "theoretical_loss": 3.9728639584199685, "tokens_seen": 436866048 }, { "epoch": 1.03, "learning_rate": 0.000438184553660983, "loss": 3.2792, "theoretical_loss": 3.9727982720661554, "tokens_seen": 436931584 }, { "epoch": 1.03, "learning_rate": 0.0004381745235707121, "loss": 3.1396, "theoretical_loss": 3.972732598322193, "tokens_seen": 436997120 }, { "epoch": 1.03, "learning_rate": 0.00043816449348044135, "loss": 3.2172, "theoretical_loss": 3.972666937183769, "tokens_seen": 437062656 }, { "epoch": 1.03, "learning_rate": 0.00043815446339017053, "loss": 3.0531, "theoretical_loss": 3.972601288646575, "tokens_seen": 437128192 }, { "epoch": 1.03, "learning_rate": 0.0004381444332998997, "loss": 3.1641, "theoretical_loss": 3.9725356527063043, "tokens_seen": 437193728 }, { "epoch": 1.03, "learning_rate": 0.0004381344032096289, "loss": 3.2524, "theoretical_loss": 3.972470029358651, "tokens_seen": 437259264 }, { "epoch": 1.03, "learning_rate": 0.0004381243731193581, "loss": 3.0821, "theoretical_loss": 3.972404418599313, "tokens_seen": 437324800 }, { "epoch": 1.03, "learning_rate": 0.00043811434302908726, "loss": 3.1287, "theoretical_loss": 3.97233882042399, "tokens_seen": 437390336 }, { "epoch": 1.03, "objective/train/docs_used": 721367, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3387866020202637, "objective/train/theoretical_loss": 3.9722896300481705, "objective/train/tokens_used": 457899488, "theoretical_loss": 3.9722896300481705, "tokens_seen": 437439488 }, { "epoch": 1.03, "learning_rate": 0.0004381043129388165, "loss": 3.2029, "theoretical_loss": 3.972273234828382, "tokens_seen": 437455872 }, { "epoch": 1.03, "learning_rate": 0.0004380942828485456, "loss": 3.1183, "theoretical_loss": 3.972207661808194, "tokens_seen": 437521408 }, { "epoch": 1.03, "learning_rate": 0.00043808425275827486, "loss": 3.1637, "theoretical_loss": 3.9721421013591307, "tokens_seen": 437586944 }, { "epoch": 1.03, "learning_rate": 0.000438074222668004, "loss": 3.2299, "theoretical_loss": 3.9720765534769007, "tokens_seen": 437652480 }, { "epoch": 1.03, "learning_rate": 0.0004380641925777332, "loss": 3.0543, "theoretical_loss": 3.9720110181572132, "tokens_seen": 437718016 }, { "epoch": 1.03, "learning_rate": 0.0004380541624874624, "loss": 3.1047, "theoretical_loss": 3.971945495395781, "tokens_seen": 437783552 }, { "epoch": 1.03, "learning_rate": 0.0004380441323971916, "loss": 3.2321, "theoretical_loss": 3.971879985188317, "tokens_seen": 437849088 }, { "epoch": 1.03, "learning_rate": 0.00043803410230692076, "loss": 3.0497, "theoretical_loss": 3.971814487530538, "tokens_seen": 437914624 }, { "epoch": 1.03, "learning_rate": 0.00043802407221665, "loss": 3.2455, "theoretical_loss": 3.9717490024181625, "tokens_seen": 437980160 }, { "epoch": 1.03, "learning_rate": 0.0004380140421263791, "loss": 2.9482, "theoretical_loss": 3.971683529846911, "tokens_seen": 438045696 }, { "epoch": 1.03, "learning_rate": 0.00043800401203610836, "loss": 3.1951, "theoretical_loss": 3.971618069812506, "tokens_seen": 438111232 }, { "epoch": 1.03, "learning_rate": 0.0004379939819458375, "loss": 3.0687, "theoretical_loss": 3.9715526223106714, "tokens_seen": 438176768 }, { "epoch": 1.03, "learning_rate": 0.0004379839518555667, "loss": 3.0672, "theoretical_loss": 3.971487187337134, "tokens_seen": 438242304 }, { "epoch": 1.03, "learning_rate": 0.0004379739217652959, "loss": 3.1796, "theoretical_loss": 3.9714217648876238, "tokens_seen": 438307840 }, { "epoch": 1.03, "learning_rate": 0.0004379638916750251, "loss": 3.3583, "theoretical_loss": 3.97135635495787, "tokens_seen": 438373376 }, { "epoch": 1.03, "learning_rate": 0.00043795386158475427, "loss": 3.1284, "theoretical_loss": 3.9712909575436064, "tokens_seen": 438438912 }, { "epoch": 1.03, "learning_rate": 0.00043794383149448345, "loss": 3.0721, "theoretical_loss": 3.9712255726405683, "tokens_seen": 438504448 }, { "epoch": 1.03, "learning_rate": 0.00043793380140421263, "loss": 3.192, "theoretical_loss": 3.9711602002444923, "tokens_seen": 438569984 }, { "epoch": 1.03, "learning_rate": 0.00043792377131394186, "loss": 3.0468, "theoretical_loss": 3.9710948403511184, "tokens_seen": 438635520 }, { "epoch": 1.03, "learning_rate": 0.000437913741223671, "loss": 3.1196, "theoretical_loss": 3.9710294929561876, "tokens_seen": 438701056 }, { "epoch": 1.03, "learning_rate": 0.0004379037111334002, "loss": 3.0753, "theoretical_loss": 3.970964158055443, "tokens_seen": 438766592 }, { "epoch": 1.03, "learning_rate": 0.00043789368104312935, "loss": 3.0955, "theoretical_loss": 3.97089883564463, "tokens_seen": 438832128 }, { "epoch": 1.03, "learning_rate": 0.0004378836509528586, "loss": 3.0597, "theoretical_loss": 3.9708335257194967, "tokens_seen": 438897664 }, { "epoch": 1.03, "learning_rate": 0.00043787362086258777, "loss": 3.0941, "theoretical_loss": 3.9707682282757926, "tokens_seen": 438963200 }, { "epoch": 1.03, "learning_rate": 0.00043786359077231695, "loss": 3.1006, "theoretical_loss": 3.9707029433092695, "tokens_seen": 439028736 }, { "epoch": 1.03, "objective/train/docs_used": 724037, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9973323345184326, "objective/train/theoretical_loss": 3.970653987769973, "objective/train/tokens_used": 459537888, "theoretical_loss": 3.970653987769973, "tokens_seen": 439077888 }, { "epoch": 1.03, "learning_rate": 0.00043785356068204613, "loss": 3.0785, "theoretical_loss": 3.970637670815681, "tokens_seen": 439094272 }, { "epoch": 1.03, "learning_rate": 0.00043784353059177537, "loss": 3.2686, "theoretical_loss": 3.9705724107907834, "tokens_seen": 439159808 }, { "epoch": 1.03, "learning_rate": 0.0004378335005015045, "loss": 3.0649, "theoretical_loss": 3.9705071632303346, "tokens_seen": 439225344 }, { "epoch": 1.03, "learning_rate": 0.00043782347041123373, "loss": 2.945, "theoretical_loss": 3.970441928130094, "tokens_seen": 439290880 }, { "epoch": 1.03, "learning_rate": 0.00043781344032096286, "loss": 3.0722, "theoretical_loss": 3.9703767054858248, "tokens_seen": 439356416 }, { "epoch": 1.03, "learning_rate": 0.0004378034102306921, "loss": 2.8581, "theoretical_loss": 3.9703114952932905, "tokens_seen": 439421952 }, { "epoch": 1.03, "learning_rate": 0.00043779338014042127, "loss": 3.1644, "theoretical_loss": 3.9702462975482575, "tokens_seen": 439487488 }, { "epoch": 1.03, "learning_rate": 0.00043778335005015045, "loss": 2.9703, "theoretical_loss": 3.9701811122464945, "tokens_seen": 439553024 }, { "epoch": 1.03, "learning_rate": 0.00043777331995987963, "loss": 3.0964, "theoretical_loss": 3.9701159393837715, "tokens_seen": 439618560 }, { "epoch": 1.03, "learning_rate": 0.0004377632898696088, "loss": 3.1616, "theoretical_loss": 3.9700507789558612, "tokens_seen": 439684096 }, { "epoch": 1.03, "learning_rate": 0.000437753259779338, "loss": 3.0308, "theoretical_loss": 3.969985630958538, "tokens_seen": 439749632 }, { "epoch": 1.03, "learning_rate": 0.00043774322968906723, "loss": 2.8996, "theoretical_loss": 3.969920495387579, "tokens_seen": 439815168 }, { "epoch": 1.03, "learning_rate": 0.00043773319959879636, "loss": 3.3063, "theoretical_loss": 3.9698553722387624, "tokens_seen": 439880704 }, { "epoch": 1.03, "learning_rate": 0.0004377231695085256, "loss": 3.1414, "theoretical_loss": 3.969790261507869, "tokens_seen": 439946240 }, { "epoch": 1.03, "learning_rate": 0.0004377131394182547, "loss": 2.9645, "theoretical_loss": 3.969725163190682, "tokens_seen": 440011776 }, { "epoch": 1.03, "learning_rate": 0.00043770310932798396, "loss": 3.0608, "theoretical_loss": 3.9696600772829855, "tokens_seen": 440077312 }, { "epoch": 1.03, "learning_rate": 0.00043769307923771314, "loss": 3.13, "theoretical_loss": 3.9695950037805674, "tokens_seen": 440142848 }, { "epoch": 1.03, "learning_rate": 0.0004376830491474423, "loss": 3.1811, "theoretical_loss": 3.969529942679216, "tokens_seen": 440208384 }, { "epoch": 1.03, "learning_rate": 0.00043767301905717155, "loss": 3.0546, "theoretical_loss": 3.9694648939747226, "tokens_seen": 440273920 }, { "epoch": 1.03, "learning_rate": 0.00043766298896690073, "loss": 3.0169, "theoretical_loss": 3.9693998576628795, "tokens_seen": 440339456 }, { "epoch": 1.03, "learning_rate": 0.0004376529588766299, "loss": 2.9858, "theoretical_loss": 3.9693348337394836, "tokens_seen": 440404992 }, { "epoch": 1.03, "learning_rate": 0.0004376429287863591, "loss": 2.9495, "theoretical_loss": 3.9692698222003306, "tokens_seen": 440470528 }, { "epoch": 1.03, "learning_rate": 0.0004376328986960883, "loss": 3.1535, "theoretical_loss": 3.969204823041221, "tokens_seen": 440536064 }, { "epoch": 1.03, "learning_rate": 0.00043762286860581746, "loss": 2.8363, "theoretical_loss": 3.9691398362579546, "tokens_seen": 440601600 }, { "epoch": 1.03, "learning_rate": 0.0004376128385155467, "loss": 3.0777, "theoretical_loss": 3.9690748618463356, "tokens_seen": 440667136 }, { "epoch": 1.03, "objective/train/docs_used": 726896, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9894087314605713, "objective/train/theoretical_loss": 3.969026139153992, "objective/train/tokens_used": 461176288, "theoretical_loss": 3.969026139153992, "tokens_seen": 440716288 }, { "epoch": 1.03, "learning_rate": 0.0004376028084252758, "loss": 3.179, "theoretical_loss": 3.9690098998021694, "tokens_seen": 440732672 }, { "epoch": 1.03, "learning_rate": 0.00043759277833500506, "loss": 2.9444, "theoretical_loss": 3.968944950121264, "tokens_seen": 440798208 }, { "epoch": 1.03, "learning_rate": 0.0004375827482447342, "loss": 3.0705, "theoretical_loss": 3.9688800127994277, "tokens_seen": 440863744 }, { "epoch": 1.03, "learning_rate": 0.0004375727181544634, "loss": 3.1125, "theoretical_loss": 3.968815087832473, "tokens_seen": 440929280 }, { "epoch": 1.03, "learning_rate": 0.0004375626880641926, "loss": 3.2499, "theoretical_loss": 3.968750175216213, "tokens_seen": 440994816 }, { "epoch": 1.03, "learning_rate": 0.0004375526579739218, "loss": 3.1515, "theoretical_loss": 3.9686852749464627, "tokens_seen": 441060352 }, { "epoch": 1.03, "learning_rate": 0.00043754262788365096, "loss": 3.103, "theoretical_loss": 3.9686203870190413, "tokens_seen": 441125888 }, { "epoch": 1.03, "learning_rate": 0.0004375325977933802, "loss": 3.1605, "theoretical_loss": 3.9685555114297673, "tokens_seen": 441191424 }, { "epoch": 1.03, "learning_rate": 0.0004375225677031093, "loss": 3.2653, "theoretical_loss": 3.968490648174463, "tokens_seen": 441256960 }, { "epoch": 1.03, "learning_rate": 0.00043751253761283856, "loss": 2.9712, "theoretical_loss": 3.968425797248952, "tokens_seen": 441322496 }, { "epoch": 1.03, "learning_rate": 0.0004375025075225677, "loss": 3.0595, "theoretical_loss": 3.96836095864906, "tokens_seen": 441388032 }, { "epoch": 1.03, "learning_rate": 0.0004374924774322969, "loss": 3.0846, "theoretical_loss": 3.9682961323706145, "tokens_seen": 441453568 }, { "epoch": 1.03, "learning_rate": 0.0004374824473420261, "loss": 3.1547, "theoretical_loss": 3.968231318409446, "tokens_seen": 441519104 }, { "epoch": 1.03, "learning_rate": 0.0004374724172517553, "loss": 3.0701, "theoretical_loss": 3.968166516761386, "tokens_seen": 441584640 }, { "epoch": 1.03, "learning_rate": 0.00043746238716148447, "loss": 3.0007, "theoretical_loss": 3.968101727422269, "tokens_seen": 441650176 }, { "epoch": 1.03, "learning_rate": 0.00043745235707121365, "loss": 3.1455, "theoretical_loss": 3.9680369503879303, "tokens_seen": 441715712 }, { "epoch": 1.03, "learning_rate": 0.00043744232698094283, "loss": 2.9984, "theoretical_loss": 3.967972185654208, "tokens_seen": 441781248 }, { "epoch": 1.03, "learning_rate": 0.00043743229689067206, "loss": 2.9989, "theoretical_loss": 3.967907433216942, "tokens_seen": 441846784 }, { "epoch": 1.03, "learning_rate": 0.0004374222668004012, "loss": 2.9853, "theoretical_loss": 3.967842693071975, "tokens_seen": 441912320 }, { "epoch": 1.03, "learning_rate": 0.0004374122367101304, "loss": 3.2408, "theoretical_loss": 3.9677779652151504, "tokens_seen": 441977856 }, { "epoch": 1.03, "learning_rate": 0.00043740220661985955, "loss": 2.946, "theoretical_loss": 3.9677132496423146, "tokens_seen": 442043392 }, { "epoch": 1.03, "learning_rate": 0.0004373921765295888, "loss": 3.0315, "theoretical_loss": 3.967648546349315, "tokens_seen": 442108928 }, { "epoch": 1.03, "learning_rate": 0.00043738214643931797, "loss": 3.1292, "theoretical_loss": 3.967583855332003, "tokens_seen": 442174464 }, { "epoch": 1.03, "learning_rate": 0.00043737211634904715, "loss": 3.1536, "theoretical_loss": 3.9675191765862294, "tokens_seen": 442240000 }, { "epoch": 1.03, "learning_rate": 0.00043736208625877633, "loss": 3.019, "theoretical_loss": 3.967454510107849, "tokens_seen": 442305536 }, { "epoch": 1.03, "objective/train/docs_used": 729712, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3081696033477783, "objective/train/theoretical_loss": 3.967406018297048, "objective/train/tokens_used": 462814688, "theoretical_loss": 3.967406018297048, "tokens_seen": 442354688 }, { "epoch": 1.03, "learning_rate": 0.00043735205616850557, "loss": 3.2172, "theoretical_loss": 3.9673898558927183, "tokens_seen": 442371072 }, { "epoch": 1.03, "learning_rate": 0.0004373420260782347, "loss": 3.1941, "theoretical_loss": 3.967325213936695, "tokens_seen": 442436608 }, { "epoch": 1.03, "learning_rate": 0.00043733199598796393, "loss": 3.034, "theoretical_loss": 3.9672605842356385, "tokens_seen": 442502144 }, { "epoch": 1.03, "learning_rate": 0.00043732196589769306, "loss": 3.2344, "theoretical_loss": 3.9671959667854124, "tokens_seen": 442567680 }, { "epoch": 1.03, "learning_rate": 0.0004373119358074223, "loss": 3.0461, "theoretical_loss": 3.96713136158188, "tokens_seen": 442633216 }, { "epoch": 1.03, "learning_rate": 0.00043730190571715147, "loss": 3.056, "theoretical_loss": 3.967066768620908, "tokens_seen": 442698752 }, { "epoch": 1.03, "learning_rate": 0.00043729187562688065, "loss": 3.0838, "theoretical_loss": 3.9670021878983643, "tokens_seen": 442764288 }, { "epoch": 1.03, "learning_rate": 0.00043728184553660983, "loss": 3.0305, "theoretical_loss": 3.9669376194101194, "tokens_seen": 442829824 }, { "epoch": 1.03, "learning_rate": 0.000437271815446339, "loss": 3.2808, "theoretical_loss": 3.9668730631520455, "tokens_seen": 442895360 }, { "epoch": 1.03, "learning_rate": 0.0004372617853560682, "loss": 3.043, "theoretical_loss": 3.9668085191200166, "tokens_seen": 442960896 }, { "epoch": 1.03, "learning_rate": 0.00043725175526579743, "loss": 3.0392, "theoretical_loss": 3.9667439873099086, "tokens_seen": 443026432 }, { "epoch": 1.03, "learning_rate": 0.00043724172517552656, "loss": 3.0119, "theoretical_loss": 3.9666794677176007, "tokens_seen": 443091968 }, { "epoch": 1.03, "learning_rate": 0.0004372316950852558, "loss": 3.0442, "theoretical_loss": 3.9666149603389727, "tokens_seen": 443157504 }, { "epoch": 1.03, "learning_rate": 0.0004372216649949849, "loss": 3.1902, "theoretical_loss": 3.966550465169906, "tokens_seen": 443223040 }, { "epoch": 1.03, "learning_rate": 0.00043721163490471416, "loss": 3.1468, "theoretical_loss": 3.9664859822062866, "tokens_seen": 443288576 }, { "epoch": 1.03, "learning_rate": 0.00043720160481444334, "loss": 3.2135, "theoretical_loss": 3.9664215114439987, "tokens_seen": 443354112 }, { "epoch": 1.03, "learning_rate": 0.0004371915747241725, "loss": 3.0502, "theoretical_loss": 3.966357052878932, "tokens_seen": 443419648 }, { "epoch": 1.03, "learning_rate": 0.0004371815446339017, "loss": 2.9608, "theoretical_loss": 3.9662926065069763, "tokens_seen": 443485184 }, { "epoch": 1.03, "learning_rate": 0.00043717151454363093, "loss": 2.8271, "theoretical_loss": 3.9662281723240236, "tokens_seen": 443550720 }, { "epoch": 1.03, "learning_rate": 0.00043716148445336006, "loss": 3.0963, "theoretical_loss": 3.966163750325968, "tokens_seen": 443616256 }, { "epoch": 1.03, "learning_rate": 0.0004371514543630893, "loss": 3.2018, "theoretical_loss": 3.966099340508706, "tokens_seen": 443681792 }, { "epoch": 1.03, "learning_rate": 0.0004371414242728184, "loss": 3.1151, "theoretical_loss": 3.9660349428681356, "tokens_seen": 443747328 }, { "epoch": 1.03, "learning_rate": 0.00043713139418254766, "loss": 3.084, "theoretical_loss": 3.9659705574001567, "tokens_seen": 443812864 }, { "epoch": 1.03, "learning_rate": 0.00043712136409227684, "loss": 2.9549, "theoretical_loss": 3.9659061841006724, "tokens_seen": 443878400 }, { "epoch": 1.03, "learning_rate": 0.000437111334002006, "loss": 3.0677, "theoretical_loss": 3.9658418229655856, "tokens_seen": 443943936 }, { "epoch": 1.03, "objective/train/docs_used": 731084, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1077213287353516, "objective/train/theoretical_loss": 3.9657935600946943, "objective/train/tokens_used": 464453088, "theoretical_loss": 3.9657935600946943, "tokens_seen": 443993088 }, { "epoch": 1.03, "learning_rate": 0.0004371013039117352, "loss": 3.1679, "theoretical_loss": 3.9657774739908036, "tokens_seen": 444009472 }, { "epoch": 1.03, "learning_rate": 0.0004370912738214644, "loss": 2.9745, "theoretical_loss": 3.9657131371722336, "tokens_seen": 444075008 }, { "epoch": 1.03, "learning_rate": 0.00043708124373119356, "loss": 3.1011, "theoretical_loss": 3.965648812505786, "tokens_seen": 444140544 }, { "epoch": 1.03, "learning_rate": 0.0004370712136409228, "loss": 3.2076, "theoretical_loss": 3.9655844999873726, "tokens_seen": 444206080 }, { "epoch": 1.03, "learning_rate": 0.0004370611835506519, "loss": 3.0919, "theoretical_loss": 3.965520199612908, "tokens_seen": 444271616 }, { "epoch": 1.03, "learning_rate": 0.00043705115346038116, "loss": 3.005, "theoretical_loss": 3.9654559113783074, "tokens_seen": 444337152 }, { "epoch": 1.03, "learning_rate": 0.0004370411233701103, "loss": 3.2174, "theoretical_loss": 3.9653916352794893, "tokens_seen": 444402688 }, { "epoch": 1.03, "learning_rate": 0.0004370310932798395, "loss": 3.1861, "theoretical_loss": 3.9653273713123736, "tokens_seen": 444468224 }, { "epoch": 1.03, "learning_rate": 0.0004370210631895687, "loss": 3.1526, "theoretical_loss": 3.9652631194728825, "tokens_seen": 444533760 }, { "epoch": 1.03, "learning_rate": 0.0004370110330992979, "loss": 3.207, "theoretical_loss": 3.965198879756939, "tokens_seen": 444599296 }, { "epoch": 1.03, "learning_rate": 0.00043700100300902707, "loss": 3.1095, "theoretical_loss": 3.9651346521604696, "tokens_seen": 444664832 }, { "epoch": 1.03, "learning_rate": 0.0004369909729187563, "loss": 3.0896, "theoretical_loss": 3.9650704366794027, "tokens_seen": 444730368 }, { "epoch": 1.03, "learning_rate": 0.00043698094282848543, "loss": 3.0761, "theoretical_loss": 3.9650062333096674, "tokens_seen": 444795904 }, { "epoch": 1.03, "learning_rate": 0.00043697091273821467, "loss": 2.9629, "theoretical_loss": 3.9649420420471957, "tokens_seen": 444861440 }, { "epoch": 1.03, "learning_rate": 0.0004369608826479438, "loss": 3.2579, "theoretical_loss": 3.9648778628879207, "tokens_seen": 444926976 }, { "epoch": 1.03, "learning_rate": 0.00043695085255767303, "loss": 3.1086, "theoretical_loss": 3.964813695827779, "tokens_seen": 444992512 }, { "epoch": 1.03, "learning_rate": 0.0004369408224674022, "loss": 3.0106, "theoretical_loss": 3.964749540862708, "tokens_seen": 445058048 }, { "epoch": 1.03, "learning_rate": 0.0004369307923771314, "loss": 2.9465, "theoretical_loss": 3.9646853979886467, "tokens_seen": 445123584 }, { "epoch": 1.03, "learning_rate": 0.0004369207622868606, "loss": 3.3021, "theoretical_loss": 3.9646212672015375, "tokens_seen": 445189120 }, { "epoch": 1.03, "learning_rate": 0.00043691073219658975, "loss": 3.0231, "theoretical_loss": 3.964557148497324, "tokens_seen": 445254656 }, { "epoch": 1.03, "learning_rate": 0.000436900702106319, "loss": 3.2187, "theoretical_loss": 3.964493041871951, "tokens_seen": 445320192 }, { "epoch": 1.03, "learning_rate": 0.00043689067201604817, "loss": 3.2205, "theoretical_loss": 3.964428947321366, "tokens_seen": 445385728 }, { "epoch": 1.03, "learning_rate": 0.00043688064192577735, "loss": 3.1018, "theoretical_loss": 3.96436486484152, "tokens_seen": 445451264 }, { "epoch": 1.03, "learning_rate": 0.00043687061183550653, "loss": 3.2986, "theoretical_loss": 3.964300794428362, "tokens_seen": 445516800 }, { "epoch": 1.03, "learning_rate": 0.00043686058174523577, "loss": 2.9632, "theoretical_loss": 3.964236736077847, "tokens_seen": 445582336 }, { "epoch": 1.03, "objective/train/docs_used": 733955, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8105826377868652, "objective/train/theoretical_loss": 3.964188700228636, "objective/train/tokens_used": 466091488, "theoretical_loss": 3.964188700228636, "tokens_seen": 445631488 }, { "epoch": 1.03, "learning_rate": 0.0004368505516549649, "loss": 3.16, "theoretical_loss": 3.9641726897859293, "tokens_seen": 445647872 }, { "epoch": 1.03, "learning_rate": 0.00043684052156469413, "loss": 3.18, "theoretical_loss": 3.964108655548567, "tokens_seen": 445713408 }, { "epoch": 1.03, "learning_rate": 0.00043683049147442326, "loss": 3.121, "theoretical_loss": 3.964044633361719, "tokens_seen": 445778944 }, { "epoch": 1.03, "learning_rate": 0.0004368204613841525, "loss": 3.1703, "theoretical_loss": 3.963980623221346, "tokens_seen": 445844480 }, { "epoch": 1.03, "learning_rate": 0.00043681043129388167, "loss": 3.2165, "theoretical_loss": 3.963916625123412, "tokens_seen": 445910016 }, { "epoch": 1.03, "learning_rate": 0.00043680040120361085, "loss": 3.0194, "theoretical_loss": 3.96385263906388, "tokens_seen": 445975552 }, { "epoch": 1.03, "learning_rate": 0.00043679037111334003, "loss": 3.1505, "theoretical_loss": 3.9637886650387197, "tokens_seen": 446041088 }, { "epoch": 1.03, "learning_rate": 0.0004367803410230692, "loss": 3.0851, "theoretical_loss": 3.963724703043898, "tokens_seen": 446106624 }, { "epoch": 1.03, "learning_rate": 0.0004367703109327984, "loss": 3.249, "theoretical_loss": 3.963660753075387, "tokens_seen": 446172160 }, { "epoch": 1.03, "learning_rate": 0.00043676028084252763, "loss": 3.1248, "theoretical_loss": 3.9635968151291583, "tokens_seen": 446237696 }, { "epoch": 1.03, "learning_rate": 0.00043675025075225676, "loss": 3.1677, "theoretical_loss": 3.9635328892011876, "tokens_seen": 446303232 }, { "epoch": 1.03, "learning_rate": 0.000436740220661986, "loss": 3.1615, "theoretical_loss": 3.9634689752874515, "tokens_seen": 446368768 }, { "epoch": 1.03, "learning_rate": 0.0004367301905717151, "loss": 3.1047, "theoretical_loss": 3.9634050733839272, "tokens_seen": 446434304 }, { "epoch": 1.03, "learning_rate": 0.00043672016048144436, "loss": 3.1133, "theoretical_loss": 3.9633411834865977, "tokens_seen": 446499840 }, { "epoch": 1.03, "learning_rate": 0.00043671013039117354, "loss": 3.1281, "theoretical_loss": 3.9632773055914434, "tokens_seen": 446565376 }, { "epoch": 1.03, "learning_rate": 0.0004367001003009027, "loss": 3.1018, "theoretical_loss": 3.96321343969445, "tokens_seen": 446630912 }, { "epoch": 1.03, "learning_rate": 0.0004366900702106319, "loss": 3.0571, "theoretical_loss": 3.963149585791603, "tokens_seen": 446696448 }, { "epoch": 1.03, "learning_rate": 0.00043668004012036113, "loss": 3.3605, "theoretical_loss": 3.963085743878891, "tokens_seen": 446761984 }, { "epoch": 1.03, "learning_rate": 0.00043667001003009026, "loss": 3.0923, "theoretical_loss": 3.963021913952304, "tokens_seen": 446827520 }, { "epoch": 1.03, "learning_rate": 0.0004366599799398195, "loss": 2.9627, "theoretical_loss": 3.962958096007835, "tokens_seen": 446893056 }, { "epoch": 1.03, "learning_rate": 0.0004366499498495486, "loss": 2.8403, "theoretical_loss": 3.9628942900414765, "tokens_seen": 446958592 }, { "epoch": 1.03, "learning_rate": 0.00043663991975927786, "loss": 3.1739, "theoretical_loss": 3.962830496049226, "tokens_seen": 447024128 }, { "epoch": 1.03, "learning_rate": 0.00043662988966900704, "loss": 3.1698, "theoretical_loss": 3.962766714027081, "tokens_seen": 447089664 }, { "epoch": 1.03, "learning_rate": 0.0004366198595787362, "loss": 3.0927, "theoretical_loss": 3.9627029439710406, "tokens_seen": 447155200 }, { "epoch": 1.03, "learning_rate": 0.0004366098294884654, "loss": 2.9655, "theoretical_loss": 3.9626391858771077, "tokens_seen": 447220736 }, { "epoch": 1.03, "objective/train/docs_used": 736807, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.283816337585449, "objective/train/theoretical_loss": 3.962591375154386, "objective/train/tokens_used": 467729888, "theoretical_loss": 3.962591375154386, "tokens_seen": 447269888 }, { "epoch": 1.03, "learning_rate": 0.0004365997993981946, "loss": 3.1663, "theoretical_loss": 3.9625754397412845, "tokens_seen": 447286272 }, { "epoch": 1.03, "learning_rate": 0.00043658976930792376, "loss": 3.273, "theoretical_loss": 3.9625117055595784, "tokens_seen": 447351808 }, { "epoch": 1.03, "learning_rate": 0.000436579739217653, "loss": 3.2378, "theoretical_loss": 3.962447983327996, "tokens_seen": 447417344 }, { "epoch": 1.03, "learning_rate": 0.00043656970912738213, "loss": 3.1366, "theoretical_loss": 3.962384273042546, "tokens_seen": 447482880 }, { "epoch": 1.03, "learning_rate": 0.00043655967903711136, "loss": 3.1426, "theoretical_loss": 3.962320574699241, "tokens_seen": 447548416 }, { "epoch": 1.03, "learning_rate": 0.0004365496489468405, "loss": 3.0789, "theoretical_loss": 3.9622568882940943, "tokens_seen": 447613952 }, { "epoch": 1.03, "learning_rate": 0.0004365396188565697, "loss": 3.1678, "theoretical_loss": 3.9621932138231197, "tokens_seen": 447679488 }, { "epoch": 1.03, "learning_rate": 0.0004365295887662989, "loss": 3.0757, "theoretical_loss": 3.9621295512823353, "tokens_seen": 447745024 }, { "epoch": 1.03, "learning_rate": 0.0004365195586760281, "loss": 3.0726, "theoretical_loss": 3.9620659006677608, "tokens_seen": 447810560 }, { "epoch": 1.03, "learning_rate": 0.00043650952858575727, "loss": 3.0681, "theoretical_loss": 3.9620022619754156, "tokens_seen": 447876096 }, { "epoch": 1.03, "learning_rate": 0.0004364994984954865, "loss": 3.0265, "theoretical_loss": 3.9619386352013235, "tokens_seen": 447941632 }, { "epoch": 1.03, "learning_rate": 0.00043648946840521563, "loss": 2.9398, "theoretical_loss": 3.961875020341509, "tokens_seen": 448007168 }, { "epoch": 1.03, "learning_rate": 0.00043647943831494487, "loss": 3.2244, "theoretical_loss": 3.961811417391999, "tokens_seen": 448072704 }, { "epoch": 1.03, "learning_rate": 0.000436469408224674, "loss": 2.9585, "theoretical_loss": 3.9617478263488213, "tokens_seen": 448138240 }, { "epoch": 1.03, "learning_rate": 0.00043645937813440323, "loss": 3.1659, "theoretical_loss": 3.961684247208008, "tokens_seen": 448203776 }, { "epoch": 1.03, "learning_rate": 0.0004364493480441324, "loss": 3.1841, "theoretical_loss": 3.961620679965589, "tokens_seen": 448269312 }, { "epoch": 1.03, "learning_rate": 0.0004364393179538616, "loss": 3.0676, "theoretical_loss": 3.9615571246176002, "tokens_seen": 448334848 }, { "epoch": 1.03, "learning_rate": 0.00043642928786359077, "loss": 3.2919, "theoretical_loss": 3.961493581160078, "tokens_seen": 448400384 }, { "epoch": 1.03, "learning_rate": 0.00043641925777331995, "loss": 3.0796, "theoretical_loss": 3.96143004958906, "tokens_seen": 448465920 }, { "epoch": 1.03, "learning_rate": 0.00043640922768304913, "loss": 2.9765, "theoretical_loss": 3.9613665299005856, "tokens_seen": 448531456 }, { "epoch": 1.03, "learning_rate": 0.00043639919759277837, "loss": 3.2413, "theoretical_loss": 3.961303022090697, "tokens_seen": 448596992 }, { "epoch": 1.03, "learning_rate": 0.0004363891675025075, "loss": 3.1787, "theoretical_loss": 3.9612395261554383, "tokens_seen": 448662528 }, { "epoch": 1.03, "learning_rate": 0.00043637913741223673, "loss": 3.025, "theoretical_loss": 3.9611760420908553, "tokens_seen": 448728064 }, { "epoch": 1.03, "learning_rate": 0.0004363691073219659, "loss": 2.8705, "theoretical_loss": 3.961112569892995, "tokens_seen": 448793600 }, { "epoch": 1.03, "learning_rate": 0.0004363590772316951, "loss": 3.1238, "theoretical_loss": 3.9610491095579072, "tokens_seen": 448859136 }, { "epoch": 1.03, "objective/train/docs_used": 739629, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0830881595611572, "objective/train/theoretical_loss": 3.96100152208916, "objective/train/tokens_used": 469368288, "theoretical_loss": 3.96100152208916, "tokens_seen": 448908288 }, { "epoch": 1.03, "learning_rate": 0.0004363490471414243, "loss": 3.221, "theoretical_loss": 3.9609856610816427, "tokens_seen": 448924672 }, { "epoch": 1.03, "learning_rate": 0.00043633901705115346, "loss": 3.02, "theoretical_loss": 3.9609222244602558, "tokens_seen": 448990208 }, { "epoch": 1.03, "learning_rate": 0.00043632898696088264, "loss": 3.177, "theoretical_loss": 3.9608587996898006, "tokens_seen": 449055744 }, { "epoch": 1.03, "learning_rate": 0.00043631895687061187, "loss": 3.1105, "theoretical_loss": 3.9607953867663346, "tokens_seen": 449121280 }, { "epoch": 1.03, "learning_rate": 0.000436308926780341, "loss": 3.199, "theoretical_loss": 3.960731985685916, "tokens_seen": 449186816 }, { "epoch": 1.03, "learning_rate": 0.00043629889669007023, "loss": 3.1961, "theoretical_loss": 3.9606685964446067, "tokens_seen": 449252352 }, { "epoch": 1.03, "learning_rate": 0.00043628886659979936, "loss": 3.1275, "theoretical_loss": 3.960605219038469, "tokens_seen": 449317888 }, { "epoch": 1.03, "learning_rate": 0.0004362788365095286, "loss": 3.1535, "theoretical_loss": 3.9605418534635666, "tokens_seen": 449383424 }, { "epoch": 1.03, "learning_rate": 0.0004362688064192578, "loss": 3.2752, "theoretical_loss": 3.9604784997159665, "tokens_seen": 449448960 }, { "epoch": 1.03, "learning_rate": 0.00043625877632898696, "loss": 2.989, "theoretical_loss": 3.960415157791738, "tokens_seen": 449514496 }, { "epoch": 1.03, "learning_rate": 0.00043624874623871614, "loss": 2.9966, "theoretical_loss": 3.96035182768695, "tokens_seen": 449580032 }, { "epoch": 1.03, "learning_rate": 0.0004362387161484453, "loss": 3.2282, "theoretical_loss": 3.9602885093976745, "tokens_seen": 449645568 }, { "epoch": 1.03, "learning_rate": 0.0004362286860581745, "loss": 3.1468, "theoretical_loss": 3.960225202919986, "tokens_seen": 449711104 }, { "epoch": 1.03, "learning_rate": 0.00043621865596790374, "loss": 2.9478, "theoretical_loss": 3.96016190824996, "tokens_seen": 449776640 }, { "epoch": 1.03, "learning_rate": 0.00043620862587763286, "loss": 3.1353, "theoretical_loss": 3.9600986253836745, "tokens_seen": 449842176 }, { "epoch": 1.03, "learning_rate": 0.0004361985957873621, "loss": 3.2107, "theoretical_loss": 3.960035354317209, "tokens_seen": 449907712 }, { "epoch": 1.03, "learning_rate": 0.0004361885656970913, "loss": 3.0862, "theoretical_loss": 3.959972095046645, "tokens_seen": 449973248 }, { "epoch": 1.03, "learning_rate": 0.00043617853560682046, "loss": 3.1323, "theoretical_loss": 3.9599088475680655, "tokens_seen": 450038784 }, { "epoch": 1.03, "learning_rate": 0.0004361685055165497, "loss": 3.2467, "theoretical_loss": 3.959845611877556, "tokens_seen": 450104320 }, { "epoch": 1.03, "learning_rate": 0.0004361584754262788, "loss": 3.0905, "theoretical_loss": 3.9597823879712033, "tokens_seen": 450169856 }, { "epoch": 1.03, "learning_rate": 0.00043614844533600806, "loss": 3.061, "theoretical_loss": 3.9597191758450965, "tokens_seen": 450235392 }, { "epoch": 1.03, "learning_rate": 0.00043613841524573724, "loss": 3.0274, "theoretical_loss": 3.959655975495326, "tokens_seen": 450300928 }, { "epoch": 1.03, "learning_rate": 0.0004361283851554664, "loss": 3.075, "theoretical_loss": 3.9595927869179857, "tokens_seen": 450366464 }, { "epoch": 1.03, "learning_rate": 0.0004361183550651956, "loss": 3.0915, "theoretical_loss": 3.9595296101091684, "tokens_seen": 450432000 }, { "epoch": 1.03, "learning_rate": 0.0004361083249749248, "loss": 2.8523, "theoretical_loss": 3.9594664450649724, "tokens_seen": 450497536 }, { "epoch": 1.03, "objective/train/docs_used": 742386, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.3907384872436523, "objective/train/theoretical_loss": 3.9594190790000097, "objective/train/tokens_used": 471006688, "theoretical_loss": 3.9594190790000097, "tokens_seen": 450546688 }, { "epoch": 1.03, "learning_rate": 0.00043609829488465397, "loss": 2.9343, "theoretical_loss": 3.959403291781494, "tokens_seen": 450563072 }, { "epoch": 1.03, "learning_rate": 0.0004360882647943832, "loss": 3.0965, "theoretical_loss": 3.959340150254834, "tokens_seen": 450628608 }, { "epoch": 1.03, "learning_rate": 0.00043607823470411233, "loss": 3.2028, "theoretical_loss": 3.959277020481095, "tokens_seen": 450694144 }, { "epoch": 1.03, "learning_rate": 0.00043606820461384156, "loss": 3.0057, "theoretical_loss": 3.95921390245638, "tokens_seen": 450759680 }, { "epoch": 1.03, "learning_rate": 0.0004360581745235707, "loss": 3.2648, "theoretical_loss": 3.959150796176795, "tokens_seen": 450825216 }, { "epoch": 1.03, "learning_rate": 0.0004360481444332999, "loss": 3.0009, "theoretical_loss": 3.959087701638448, "tokens_seen": 450890752 }, { "epoch": 1.03, "learning_rate": 0.0004360381143430291, "loss": 3.1427, "theoretical_loss": 3.9590246188374474, "tokens_seen": 450956288 }, { "epoch": 1.03, "learning_rate": 0.0004360280842527583, "loss": 3.0199, "theoretical_loss": 3.958961547769906, "tokens_seen": 451021824 }, { "epoch": 1.03, "learning_rate": 0.00043601805416248747, "loss": 3.0832, "theoretical_loss": 3.958898488431935, "tokens_seen": 451087360 }, { "epoch": 1.03, "learning_rate": 0.0004360080240722167, "loss": 3.1187, "theoretical_loss": 3.9588354408196507, "tokens_seen": 451152896 }, { "epoch": 1.03, "learning_rate": 0.00043599799398194583, "loss": 3.2197, "theoretical_loss": 3.9587724049291695, "tokens_seen": 451218432 }, { "epoch": 1.03, "learning_rate": 0.00043598796389167507, "loss": 3.0204, "theoretical_loss": 3.95870938075661, "tokens_seen": 451283968 }, { "epoch": 1.03, "learning_rate": 0.0004359779338014042, "loss": 2.9938, "theoretical_loss": 3.9586463682980924, "tokens_seen": 451349504 }, { "epoch": 1.03, "learning_rate": 0.00043596790371113343, "loss": 2.9968, "theoretical_loss": 3.95858336754974, "tokens_seen": 451415040 }, { "epoch": 1.03, "learning_rate": 0.0004359578736208626, "loss": 3.1034, "theoretical_loss": 3.958520378507676, "tokens_seen": 451480576 }, { "epoch": 1.03, "learning_rate": 0.0004359478435305918, "loss": 3.0715, "theoretical_loss": 3.958457401168027, "tokens_seen": 451546112 }, { "epoch": 1.03, "learning_rate": 0.00043593781344032097, "loss": 3.1405, "theoretical_loss": 3.958394435526921, "tokens_seen": 451611648 }, { "epoch": 1.03, "learning_rate": 0.00043592778335005015, "loss": 3.159, "theoretical_loss": 3.958331481580487, "tokens_seen": 451677184 }, { "epoch": 1.03, "learning_rate": 0.00043591775325977933, "loss": 3.072, "theoretical_loss": 3.9582685393248576, "tokens_seen": 451742720 }, { "epoch": 1.03, "learning_rate": 0.00043590772316950857, "loss": 3.0018, "theoretical_loss": 3.9582056087561655, "tokens_seen": 451808256 }, { "epoch": 1.03, "learning_rate": 0.0004358976930792377, "loss": 3.0598, "theoretical_loss": 3.958142689870546, "tokens_seen": 451873792 }, { "epoch": 1.03, "learning_rate": 0.00043588766298896693, "loss": 3.1773, "theoretical_loss": 3.958079782664136, "tokens_seen": 451939328 }, { "epoch": 1.03, "learning_rate": 0.0004358776328986961, "loss": 2.9877, "theoretical_loss": 3.958016887133075, "tokens_seen": 452004864 }, { "epoch": 1.03, "learning_rate": 0.0004358676028084253, "loss": 3.0967, "theoretical_loss": 3.957954003273504, "tokens_seen": 452070400 }, { "epoch": 1.03, "learning_rate": 0.0004358575727181545, "loss": 3.0288, "theoretical_loss": 3.9578911310815643, "tokens_seen": 452135936 }, { "epoch": 1.03, "objective/train/docs_used": 743743, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1820547580718994, "objective/train/theoretical_loss": 3.957843984592174, "objective/train/tokens_used": 472645088, "theoretical_loss": 3.957843984592174, "tokens_seen": 452185088 }, { "epoch": 1.03, "learning_rate": 0.00043584754262788366, "loss": 3.1255, "theoretical_loss": 3.957828270553402, "tokens_seen": 452201472 }, { "epoch": 1.03, "learning_rate": 0.00043583751253761284, "loss": 2.992, "theoretical_loss": 3.9577654216851617, "tokens_seen": 452267008 }, { "epoch": 1.03, "learning_rate": 0.00043582748244734207, "loss": 3.0898, "theoretical_loss": 3.957702584472993, "tokens_seen": 452332544 }, { "epoch": 1.03, "learning_rate": 0.0004358174523570712, "loss": 3.0389, "theoretical_loss": 3.9576397589130448, "tokens_seen": 452398080 }, { "epoch": 1.03, "learning_rate": 0.00043580742226680043, "loss": 3.1379, "theoretical_loss": 3.9575769450014686, "tokens_seen": 452463616 }, { "epoch": 1.03, "learning_rate": 0.00043579739217652956, "loss": 3.1602, "theoretical_loss": 3.9575141427344196, "tokens_seen": 452529152 }, { "epoch": 1.03, "learning_rate": 0.0004357873620862588, "loss": 3.228, "theoretical_loss": 3.957451352108052, "tokens_seen": 452594688 }, { "epoch": 1.03, "learning_rate": 0.000435777331995988, "loss": 3.0949, "theoretical_loss": 3.9573885731185223, "tokens_seen": 452660224 }, { "epoch": 1.03, "learning_rate": 0.00043576730190571716, "loss": 3.2199, "theoretical_loss": 3.9573258057619913, "tokens_seen": 452725760 }, { "epoch": 1.03, "learning_rate": 0.00043575727181544634, "loss": 2.9964, "theoretical_loss": 3.957263050034619, "tokens_seen": 452791296 }, { "epoch": 1.03, "learning_rate": 0.0004357472417251755, "loss": 3.0462, "theoretical_loss": 3.957200305932568, "tokens_seen": 452856832 }, { "epoch": 1.03, "learning_rate": 0.0004357372116349047, "loss": 3.0923, "theoretical_loss": 3.957137573452003, "tokens_seen": 452922368 }, { "epoch": 1.03, "learning_rate": 0.00043572718154463394, "loss": 3.0885, "theoretical_loss": 3.957074852589091, "tokens_seen": 452987904 }, { "epoch": 1.03, "learning_rate": 0.00043571715145436306, "loss": 3.0941, "theoretical_loss": 3.9570121433399987, "tokens_seen": 453053440 }, { "epoch": 1.03, "learning_rate": 0.0004357071213640923, "loss": 3.106, "theoretical_loss": 3.9569494457008973, "tokens_seen": 453118976 }, { "epoch": 1.03, "learning_rate": 0.0004356970912738215, "loss": 2.9483, "theoretical_loss": 3.9568867596679578, "tokens_seen": 453184512 }, { "epoch": 1.03, "learning_rate": 0.00043568706118355066, "loss": 3.156, "theoretical_loss": 3.956824085237355, "tokens_seen": 453250048 }, { "epoch": 1.03, "learning_rate": 0.00043567703109327984, "loss": 3.1673, "theoretical_loss": 3.956761422405263, "tokens_seen": 453315584 }, { "epoch": 1.03, "learning_rate": 0.000435667001003009, "loss": 3.2627, "theoretical_loss": 3.9566987711678596, "tokens_seen": 453381120 }, { "epoch": 1.03, "learning_rate": 0.0004356569709127382, "loss": 3.066, "theoretical_loss": 3.956636131521324, "tokens_seen": 453446656 }, { "epoch": 1.03, "learning_rate": 0.00043564694082246744, "loss": 2.9352, "theoretical_loss": 3.9565735034618372, "tokens_seen": 453512192 }, { "epoch": 1.03, "learning_rate": 0.00043563691073219657, "loss": 2.9197, "theoretical_loss": 3.9565108869855816, "tokens_seen": 453577728 }, { "epoch": 1.03, "learning_rate": 0.0004356268806419258, "loss": 3.2756, "theoretical_loss": 3.9564482820887417, "tokens_seen": 453643264 }, { "epoch": 1.03, "learning_rate": 0.00043561685055165493, "loss": 2.8515, "theoretical_loss": 3.9563856887675035, "tokens_seen": 453708800 }, { "epoch": 1.03, "learning_rate": 0.00043560682046138417, "loss": 3.0239, "theoretical_loss": 3.956323107018056, "tokens_seen": 453774336 }, { "epoch": 1.03, "objective/train/docs_used": 746173, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2255635261535645, "objective/train/theoretical_loss": 3.956276178297665, "objective/train/tokens_used": 474283488, "theoretical_loss": 3.956276178297665, "tokens_seen": 453823488 }, { "epoch": 1.03, "learning_rate": 0.00043559679037111335, "loss": 3.0309, "theoretical_loss": 3.956260536836588, "tokens_seen": 453839872 }, { "epoch": 1.03, "learning_rate": 0.00043558676028084253, "loss": 2.9358, "theoretical_loss": 3.9561979782192918, "tokens_seen": 453905408 }, { "epoch": 1.03, "learning_rate": 0.0004355767301905717, "loss": 3.1249, "theoretical_loss": 3.956135431162361, "tokens_seen": 453970944 }, { "epoch": 1.03, "learning_rate": 0.0004355667001003009, "loss": 3.0785, "theoretical_loss": 3.9560728956619906, "tokens_seen": 454036480 }, { "epoch": 1.03, "learning_rate": 0.00043555667001003007, "loss": 3.0232, "theoretical_loss": 3.956010371714378, "tokens_seen": 454102016 }, { "epoch": 1.03, "learning_rate": 0.0004355466399197593, "loss": 3.0226, "theoretical_loss": 3.9559478593157222, "tokens_seen": 454167552 }, { "epoch": 1.03, "learning_rate": 0.00043553660982948843, "loss": 3.0769, "theoretical_loss": 3.9558853584622238, "tokens_seen": 454233088 }, { "epoch": 1.03, "learning_rate": 0.00043552657973921767, "loss": 3.1928, "theoretical_loss": 3.955822869150085, "tokens_seen": 454298624 }, { "epoch": 1.03, "learning_rate": 0.00043551654964894685, "loss": 3.2443, "theoretical_loss": 3.9557603913755104, "tokens_seen": 454364160 }, { "epoch": 1.03, "learning_rate": 0.00043550651955867603, "loss": 3.0513, "theoretical_loss": 3.9556979251347064, "tokens_seen": 454429696 }, { "epoch": 1.03, "learning_rate": 0.0004354964894684052, "loss": 3.0337, "theoretical_loss": 3.9556354704238803, "tokens_seen": 454495232 }, { "epoch": 1.03, "learning_rate": 0.0004354864593781344, "loss": 3.0413, "theoretical_loss": 3.955573027239242, "tokens_seen": 454560768 }, { "epoch": 1.03, "learning_rate": 0.0004354764292878636, "loss": 3.2681, "theoretical_loss": 3.955510595577003, "tokens_seen": 454626304 }, { "epoch": 1.03, "learning_rate": 0.0004354663991975928, "loss": 3.0896, "theoretical_loss": 3.9554481754333772, "tokens_seen": 454691840 }, { "epoch": 1.03, "learning_rate": 0.00043545636910732194, "loss": 3.0583, "theoretical_loss": 3.955385766804579, "tokens_seen": 454757376 }, { "epoch": 1.03, "learning_rate": 0.00043544633901705117, "loss": 2.9799, "theoretical_loss": 3.9553233696868255, "tokens_seen": 454822912 }, { "epoch": 1.03, "learning_rate": 0.0004354363089267803, "loss": 3.2505, "theoretical_loss": 3.9552609840763346, "tokens_seen": 454888448 }, { "epoch": 1.03, "learning_rate": 0.00043542627883650953, "loss": 3.0916, "theoretical_loss": 3.9551986099693277, "tokens_seen": 454953984 }, { "epoch": 1.03, "learning_rate": 0.00043541624874623877, "loss": 3.0238, "theoretical_loss": 3.9551362473620273, "tokens_seen": 455019520 }, { "epoch": 1.03, "learning_rate": 0.0004354062186559679, "loss": 3.1702, "theoretical_loss": 3.9550738962506564, "tokens_seen": 455085056 }, { "epoch": 1.03, "learning_rate": 0.00043539618856569713, "loss": 3.03, "theoretical_loss": 3.9550115566314403, "tokens_seen": 455150592 }, { "epoch": 1.03, "learning_rate": 0.0004353861584754263, "loss": 3.1086, "theoretical_loss": 3.9549492285006087, "tokens_seen": 455216128 }, { "epoch": 1.03, "learning_rate": 0.0004353761283851555, "loss": 3.1261, "theoretical_loss": 3.9548869118543895, "tokens_seen": 455281664 }, { "epoch": 1.03, "learning_rate": 0.0004353660982948847, "loss": 3.1259, "theoretical_loss": 3.954824606689013, "tokens_seen": 455347200 }, { "epoch": 1.03, "learning_rate": 0.00043535606820461386, "loss": 3.0582, "theoretical_loss": 3.9547623130007143, "tokens_seen": 455412736 }, { "epoch": 1.03, "objective/train/docs_used": 748965, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0852136611938477, "objective/train/theoretical_loss": 3.9547156002640564, "objective/train/tokens_used": 475921888, "theoretical_loss": 3.9547156002640564, "tokens_seen": 455461888 }, { "epoch": 1.03, "learning_rate": 0.00043534603811434304, "loss": 3.1052, "theoretical_loss": 3.9547000307857267, "tokens_seen": 455478272 }, { "epoch": 1.03, "learning_rate": 0.00043533600802407227, "loss": 3.0883, "theoretical_loss": 3.9546377600402867, "tokens_seen": 455543808 }, { "epoch": 1.03, "learning_rate": 0.0004353259779338014, "loss": 3.2308, "theoretical_loss": 3.954575500760633, "tokens_seen": 455609344 }, { "epoch": 1.03, "learning_rate": 0.00043531594784353063, "loss": 3.1014, "theoretical_loss": 3.954513252943005, "tokens_seen": 455674880 }, { "epoch": 1.03, "learning_rate": 0.00043530591775325976, "loss": 3.1171, "theoretical_loss": 3.9544510165836453, "tokens_seen": 455740416 }, { "epoch": 1.03, "learning_rate": 0.000435295887662989, "loss": 3.1076, "theoretical_loss": 3.954388791678796, "tokens_seen": 455805952 }, { "epoch": 1.03, "learning_rate": 0.0004352858575727182, "loss": 3.1042, "theoretical_loss": 3.9543265782247046, "tokens_seen": 455871488 }, { "epoch": 1.03, "learning_rate": 0.00043527582748244736, "loss": 3.2141, "theoretical_loss": 3.9542643762176164, "tokens_seen": 455937024 }, { "epoch": 1.03, "learning_rate": 0.00043526579739217654, "loss": 2.9826, "theoretical_loss": 3.9542021856537817, "tokens_seen": 456002560 }, { "epoch": 1.03, "learning_rate": 0.0004352557673019057, "loss": 3.0912, "theoretical_loss": 3.9541400065294496, "tokens_seen": 456068096 }, { "epoch": 1.03, "learning_rate": 0.0004352457372116349, "loss": 3.1054, "theoretical_loss": 3.9540778388408735, "tokens_seen": 456133632 }, { "epoch": 1.03, "learning_rate": 0.00043523570712136414, "loss": 3.0115, "theoretical_loss": 3.9540156825843074, "tokens_seen": 456199168 }, { "epoch": 1.03, "learning_rate": 0.00043522567703109326, "loss": 3.0468, "theoretical_loss": 3.953953537756007, "tokens_seen": 456264704 }, { "epoch": 1.03, "learning_rate": 0.0004352156469408225, "loss": 3.0914, "theoretical_loss": 3.9538914043522304, "tokens_seen": 456330240 }, { "epoch": 1.03, "learning_rate": 0.0004352056168505517, "loss": 3.0032, "theoretical_loss": 3.953829282369237, "tokens_seen": 456395776 }, { "epoch": 1.03, "learning_rate": 0.00043519558676028086, "loss": 3.0811, "theoretical_loss": 3.9537671718032876, "tokens_seen": 456461312 }, { "epoch": 1.03, "learning_rate": 0.00043518555667001004, "loss": 3.1532, "theoretical_loss": 3.9537050726506457, "tokens_seen": 456526848 }, { "epoch": 1.03, "learning_rate": 0.0004351755265797392, "loss": 3.1281, "theoretical_loss": 3.9536429849075754, "tokens_seen": 456592384 }, { "epoch": 1.03, "learning_rate": 0.0004351654964894684, "loss": 3.0724, "theoretical_loss": 3.953580908570344, "tokens_seen": 456657920 }, { "epoch": 1.03, "learning_rate": 0.00043515546639919764, "loss": 3.1153, "theoretical_loss": 3.9535188436352193, "tokens_seen": 456723456 }, { "epoch": 1.03, "learning_rate": 0.00043514543630892677, "loss": 3.0357, "theoretical_loss": 3.9534567900984716, "tokens_seen": 456788992 }, { "epoch": 1.03, "learning_rate": 0.000435135406218656, "loss": 3.1078, "theoretical_loss": 3.9533947479563722, "tokens_seen": 456854528 }, { "epoch": 1.03, "learning_rate": 0.00043512537612838513, "loss": 3.1212, "theoretical_loss": 3.953332717205195, "tokens_seen": 456920064 }, { "epoch": 1.03, "learning_rate": 0.00043511534603811437, "loss": 3.0997, "theoretical_loss": 3.953270697841215, "tokens_seen": 456985600 }, { "epoch": 1.03, "learning_rate": 0.00043510531594784355, "loss": 3.0323, "theoretical_loss": 3.95320868986071, "tokens_seen": 457051136 }, { "epoch": 1.03, "objective/train/docs_used": 751562, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3233184814453125, "objective/train/theoretical_loss": 3.953162191343497, "objective/train/tokens_used": 477560288, "theoretical_loss": 3.953162191343497, "tokens_seen": 457100288 }, { "epoch": 1.03, "learning_rate": 0.00043509528585757273, "loss": 3.127, "theoretical_loss": 3.9531466932599573, "tokens_seen": 457116672 }, { "epoch": 1.03, "learning_rate": 0.0004350852557673019, "loss": 3.0917, "theoretical_loss": 3.953084708035239, "tokens_seen": 457182208 }, { "epoch": 1.04, "learning_rate": 0.0004350752256770311, "loss": 3.0461, "theoretical_loss": 3.9530227341828366, "tokens_seen": 457247744 }, { "epoch": 1.04, "learning_rate": 0.00043506519558676027, "loss": 2.936, "theoretical_loss": 3.9529607716990336, "tokens_seen": 457313280 }, { "epoch": 1.04, "learning_rate": 0.0004350551654964895, "loss": 3.1068, "theoretical_loss": 3.952898820580117, "tokens_seen": 457378816 }, { "epoch": 1.04, "learning_rate": 0.00043504513540621863, "loss": 3.0951, "theoretical_loss": 3.9528368808223737, "tokens_seen": 457444352 }, { "epoch": 1.04, "learning_rate": 0.00043503510531594787, "loss": 3.0245, "theoretical_loss": 3.952774952422093, "tokens_seen": 457509888 }, { "epoch": 1.04, "learning_rate": 0.00043502507522567705, "loss": 3.1378, "theoretical_loss": 3.952713035375566, "tokens_seen": 457575424 }, { "epoch": 1.04, "learning_rate": 0.00043501504513540623, "loss": 3.0419, "theoretical_loss": 3.9526511296790856, "tokens_seen": 457640960 }, { "epoch": 1.04, "learning_rate": 0.0004350050150451354, "loss": 3.0536, "theoretical_loss": 3.9525892353289453, "tokens_seen": 457706496 }, { "epoch": 1.04, "learning_rate": 0.0004349949849548646, "loss": 3.0442, "theoretical_loss": 3.952527352321443, "tokens_seen": 457772032 }, { "epoch": 1.04, "learning_rate": 0.0004349849548645938, "loss": 3.0715, "theoretical_loss": 3.952465480652875, "tokens_seen": 457837568 }, { "epoch": 1.04, "learning_rate": 0.000434974924774323, "loss": 3.0861, "theoretical_loss": 3.952403620319542, "tokens_seen": 457903104 }, { "epoch": 1.04, "learning_rate": 0.00043496489468405214, "loss": 2.9815, "theoretical_loss": 3.9523417713177453, "tokens_seen": 457968640 }, { "epoch": 1.04, "learning_rate": 0.00043495486459378137, "loss": 3.1267, "theoretical_loss": 3.952279933643788, "tokens_seen": 458034176 }, { "epoch": 1.04, "learning_rate": 0.0004349448345035105, "loss": 3.1558, "theoretical_loss": 3.952218107293975, "tokens_seen": 458099712 }, { "epoch": 1.04, "learning_rate": 0.00043493480441323973, "loss": 3.295, "theoretical_loss": 3.952156292264613, "tokens_seen": 458165248 }, { "epoch": 1.04, "learning_rate": 0.0004349247743229689, "loss": 3.0681, "theoretical_loss": 3.95209448855201, "tokens_seen": 458230784 }, { "epoch": 1.04, "learning_rate": 0.0004349147442326981, "loss": 3.2285, "theoretical_loss": 3.952032696152477, "tokens_seen": 458296320 }, { "epoch": 1.04, "learning_rate": 0.0004349047141424273, "loss": 3.0005, "theoretical_loss": 3.951970915062325, "tokens_seen": 458361856 }, { "epoch": 1.04, "learning_rate": 0.0004348946840521565, "loss": 2.9493, "theoretical_loss": 3.9519091452778676, "tokens_seen": 458427392 }, { "epoch": 1.04, "learning_rate": 0.00043488465396188564, "loss": 3.109, "theoretical_loss": 3.951847386795421, "tokens_seen": 458492928 }, { "epoch": 1.04, "learning_rate": 0.0004348746238716149, "loss": 3.0296, "theoretical_loss": 3.9517856396113014, "tokens_seen": 458558464 }, { "epoch": 1.04, "learning_rate": 0.000434864593781344, "loss": 2.8756, "theoretical_loss": 3.9517239037218275, "tokens_seen": 458624000 }, { "epoch": 1.04, "learning_rate": 0.00043485456369107324, "loss": 3.1288, "theoretical_loss": 3.9516621791233204, "tokens_seen": 458689536 }, { "epoch": 1.04, "objective/train/docs_used": 754498, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0338852405548096, "objective/train/theoretical_loss": 3.9516158930819243, "objective/train/tokens_used": 479198688, "theoretical_loss": 3.9516158930819243, "tokens_seen": 458738688 }, { "epoch": 1.04, "learning_rate": 0.0004348445336008024, "loss": 3.0936, "theoretical_loss": 3.951600465812102, "tokens_seen": 458755072 }, { "epoch": 1.04, "learning_rate": 0.0004348345035105316, "loss": 3.215, "theoretical_loss": 3.9515387637844963, "tokens_seen": 458820608 }, { "epoch": 1.04, "learning_rate": 0.0004348244734202608, "loss": 3.0651, "theoretical_loss": 3.9514770730368283, "tokens_seen": 458886144 }, { "epoch": 1.04, "learning_rate": 0.00043481444332998996, "loss": 3.2743, "theoretical_loss": 3.9514153935654264, "tokens_seen": 458951680 }, { "epoch": 1.04, "learning_rate": 0.00043480441323971914, "loss": 3.1002, "theoretical_loss": 3.9513537253666184, "tokens_seen": 459017216 }, { "epoch": 1.04, "learning_rate": 0.0004347943831494484, "loss": 3.1074, "theoretical_loss": 3.9512920684367367, "tokens_seen": 459082752 }, { "epoch": 1.04, "learning_rate": 0.0004347843530591775, "loss": 3.0103, "theoretical_loss": 3.951230422772113, "tokens_seen": 459148288 }, { "epoch": 1.04, "learning_rate": 0.00043477432296890674, "loss": 3.1277, "theoretical_loss": 3.9511687883690816, "tokens_seen": 459213824 }, { "epoch": 1.04, "learning_rate": 0.00043476429287863587, "loss": 3.0367, "theoretical_loss": 3.951107165223978, "tokens_seen": 459279360 }, { "epoch": 1.04, "learning_rate": 0.0004347542627883651, "loss": 3.1912, "theoretical_loss": 3.951045553333141, "tokens_seen": 459344896 }, { "epoch": 1.04, "learning_rate": 0.0004347442326980943, "loss": 3.1773, "theoretical_loss": 3.950983952692909, "tokens_seen": 459410432 }, { "epoch": 1.04, "learning_rate": 0.00043473420260782346, "loss": 3.1973, "theoretical_loss": 3.950922363299623, "tokens_seen": 459475968 }, { "epoch": 1.04, "learning_rate": 0.00043472417251755265, "loss": 3.0523, "theoretical_loss": 3.9508607851496267, "tokens_seen": 459541504 }, { "epoch": 1.04, "learning_rate": 0.0004347141424272819, "loss": 3.215, "theoretical_loss": 3.9507992182392644, "tokens_seen": 459607040 }, { "epoch": 1.04, "learning_rate": 0.000434704112337011, "loss": 3.1779, "theoretical_loss": 3.9507376625648822, "tokens_seen": 459672576 }, { "epoch": 1.04, "learning_rate": 0.00043469408224674024, "loss": 3.1813, "theoretical_loss": 3.9506761181228276, "tokens_seen": 459738112 }, { "epoch": 1.04, "learning_rate": 0.0004346840521564694, "loss": 3.0482, "theoretical_loss": 3.9506145849094505, "tokens_seen": 459803648 }, { "epoch": 1.04, "learning_rate": 0.0004346740220661986, "loss": 3.1358, "theoretical_loss": 3.9505530629211023, "tokens_seen": 459869184 }, { "epoch": 1.04, "learning_rate": 0.00043466399197592784, "loss": 3.0956, "theoretical_loss": 3.950491552154136, "tokens_seen": 459934720 }, { "epoch": 1.04, "learning_rate": 0.00043465396188565697, "loss": 3.0783, "theoretical_loss": 3.950430052604907, "tokens_seen": 460000256 }, { "epoch": 1.04, "learning_rate": 0.0004346439317953862, "loss": 3.0599, "theoretical_loss": 3.9503685642697706, "tokens_seen": 460065792 }, { "epoch": 1.04, "learning_rate": 0.00043463390170511533, "loss": 2.9846, "theoretical_loss": 3.9503070871450863, "tokens_seen": 460131328 }, { "epoch": 1.04, "learning_rate": 0.00043462387161484457, "loss": 3.1418, "theoretical_loss": 3.950245621227213, "tokens_seen": 460196864 }, { "epoch": 1.04, "learning_rate": 0.00043461384152457375, "loss": 3.0109, "theoretical_loss": 3.9501841665125124, "tokens_seen": 460262400 }, { "epoch": 1.04, "learning_rate": 0.00043460381143430293, "loss": 3.0264, "theoretical_loss": 3.9501227229973486, "tokens_seen": 460327936 }, { "epoch": 1.04, "objective/train/docs_used": 757423, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1190121173858643, "objective/train/theoretical_loss": 3.9500766477084843, "objective/train/tokens_used": 480837088, "theoretical_loss": 3.9500766477084843, "tokens_seen": 460377088 }, { "epoch": 1.04, "learning_rate": 0.0004345937813440321, "loss": 3.0822, "theoretical_loss": 3.950061290678085, "tokens_seen": 460393472 }, { "epoch": 1.04, "learning_rate": 0.0004345837512537613, "loss": 3.2233, "theoretical_loss": 3.94999986955109, "tokens_seen": 460459008 }, { "epoch": 1.04, "learning_rate": 0.00043457372116349047, "loss": 3.1982, "theoretical_loss": 3.9499384596127305, "tokens_seen": 460524544 }, { "epoch": 1.04, "learning_rate": 0.0004345636910732197, "loss": 3.167, "theoretical_loss": 3.949877060859378, "tokens_seen": 460590080 }, { "epoch": 1.04, "learning_rate": 0.00043455366098294883, "loss": 3.0159, "theoretical_loss": 3.949815673287403, "tokens_seen": 460655616 }, { "epoch": 1.04, "learning_rate": 0.00043454363089267807, "loss": 3.0793, "theoretical_loss": 3.949754296893179, "tokens_seen": 460721152 }, { "epoch": 1.04, "learning_rate": 0.00043453360080240725, "loss": 3.2051, "theoretical_loss": 3.949692931673082, "tokens_seen": 460786688 }, { "epoch": 1.04, "learning_rate": 0.00043452357071213643, "loss": 3.0682, "theoretical_loss": 3.9496315776234883, "tokens_seen": 460852224 }, { "epoch": 1.04, "learning_rate": 0.0004345135406218656, "loss": 3.1235, "theoretical_loss": 3.949570234740776, "tokens_seen": 460917760 }, { "epoch": 1.04, "learning_rate": 0.0004345035105315948, "loss": 3.1865, "theoretical_loss": 3.949508903021327, "tokens_seen": 460983296 }, { "epoch": 1.04, "learning_rate": 0.000434493480441324, "loss": 3.0924, "theoretical_loss": 3.949447582461521, "tokens_seen": 461048832 }, { "epoch": 1.04, "learning_rate": 0.0004344834503510532, "loss": 3.0796, "theoretical_loss": 3.9493862730577427, "tokens_seen": 461114368 }, { "epoch": 1.04, "learning_rate": 0.00043447342026078234, "loss": 3.1662, "theoretical_loss": 3.9493249748063777, "tokens_seen": 461179904 }, { "epoch": 1.04, "learning_rate": 0.00043446339017051157, "loss": 3.0444, "theoretical_loss": 3.949263687703812, "tokens_seen": 461245440 }, { "epoch": 1.04, "learning_rate": 0.0004344533600802407, "loss": 2.9707, "theoretical_loss": 3.949202411746435, "tokens_seen": 461310976 }, { "epoch": 1.04, "learning_rate": 0.00043444332998996993, "loss": 2.9537, "theoretical_loss": 3.9491411469306366, "tokens_seen": 461376512 }, { "epoch": 1.04, "learning_rate": 0.0004344332998996991, "loss": 3.151, "theoretical_loss": 3.949079893252809, "tokens_seen": 461442048 }, { "epoch": 1.04, "learning_rate": 0.0004344232698094283, "loss": 3.0158, "theoretical_loss": 3.9490186507093457, "tokens_seen": 461507584 }, { "epoch": 1.04, "learning_rate": 0.0004344132397191575, "loss": 2.94, "theoretical_loss": 3.9489574192966423, "tokens_seen": 461573120 }, { "epoch": 1.04, "learning_rate": 0.0004344032096288867, "loss": 3.1863, "theoretical_loss": 3.948896199011096, "tokens_seen": 461638656 }, { "epoch": 1.04, "learning_rate": 0.00043439317953861584, "loss": 2.9414, "theoretical_loss": 3.9488349898491046, "tokens_seen": 461704192 }, { "epoch": 1.04, "learning_rate": 0.0004343831494483451, "loss": 3.089, "theoretical_loss": 3.9487737918070698, "tokens_seen": 461769728 }, { "epoch": 1.04, "learning_rate": 0.0004343731193580742, "loss": 3.12, "theoretical_loss": 3.9487126048813925, "tokens_seen": 461835264 }, { "epoch": 1.04, "learning_rate": 0.00043436308926780344, "loss": 2.9896, "theoretical_loss": 3.9486514290684767, "tokens_seen": 461900800 }, { "epoch": 1.04, "learning_rate": 0.0004343530591775326, "loss": 3.2405, "theoretical_loss": 3.9485902643647286, "tokens_seen": 461966336 }, { "epoch": 1.04, "objective/train/docs_used": 760065, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3742308616638184, "objective/train/theoretical_loss": 3.948544398125147, "objective/train/tokens_used": 482475488, "theoretical_loss": 3.948544398125147, "tokens_seen": 462015488 }, { "epoch": 1.04, "learning_rate": 0.0004343430290872618, "loss": 3.1982, "theoretical_loss": 3.9485291107665548, "tokens_seen": 462031872 }, { "epoch": 1.04, "learning_rate": 0.000434332998996991, "loss": 3.0618, "theoretical_loss": 3.948467968270364, "tokens_seen": 462097408 }, { "epoch": 1.04, "learning_rate": 0.00043432296890672016, "loss": 3.099, "theoretical_loss": 3.948406836872566, "tokens_seen": 462162944 }, { "epoch": 1.04, "learning_rate": 0.00043431293881644934, "loss": 2.881, "theoretical_loss": 3.948345716569574, "tokens_seen": 462228480 }, { "epoch": 1.04, "learning_rate": 0.0004343029087261786, "loss": 3.0009, "theoretical_loss": 3.9482846073578015, "tokens_seen": 462294016 }, { "epoch": 1.04, "learning_rate": 0.0004342928786359077, "loss": 3.078, "theoretical_loss": 3.9482235092336637, "tokens_seen": 462359552 }, { "epoch": 1.04, "learning_rate": 0.00043428284854563694, "loss": 3.1531, "theoretical_loss": 3.948162422193578, "tokens_seen": 462425088 }, { "epoch": 1.04, "learning_rate": 0.00043427281845536607, "loss": 3.1367, "theoretical_loss": 3.948101346233962, "tokens_seen": 462490624 }, { "epoch": 1.04, "learning_rate": 0.0004342627883650953, "loss": 3.2557, "theoretical_loss": 3.9480402813512376, "tokens_seen": 462556160 }, { "epoch": 1.04, "learning_rate": 0.0004342527582748245, "loss": 3.1364, "theoretical_loss": 3.9479792275418264, "tokens_seen": 462621696 }, { "epoch": 1.04, "learning_rate": 0.00043424272818455367, "loss": 3.0144, "theoretical_loss": 3.947918184802152, "tokens_seen": 462687232 }, { "epoch": 1.04, "learning_rate": 0.00043423269809428285, "loss": 3.1794, "theoretical_loss": 3.9478571531286395, "tokens_seen": 462752768 }, { "epoch": 1.04, "learning_rate": 0.0004342226680040121, "loss": 3.0798, "theoretical_loss": 3.947796132517717, "tokens_seen": 462818304 }, { "epoch": 1.04, "learning_rate": 0.0004342126379137412, "loss": 3.0446, "theoretical_loss": 3.9477351229658124, "tokens_seen": 462883840 }, { "epoch": 1.04, "learning_rate": 0.00043420260782347044, "loss": 2.8434, "theoretical_loss": 3.9476741244693567, "tokens_seen": 462949376 }, { "epoch": 1.04, "learning_rate": 0.00043419257773319957, "loss": 3.1147, "theoretical_loss": 3.947613137024781, "tokens_seen": 463014912 }, { "epoch": 1.04, "learning_rate": 0.0004341825476429288, "loss": 3.0595, "theoretical_loss": 3.9475521606285198, "tokens_seen": 463080448 }, { "epoch": 1.04, "learning_rate": 0.000434172517552658, "loss": 3.1797, "theoretical_loss": 3.947491195277008, "tokens_seen": 463145984 }, { "epoch": 1.04, "learning_rate": 0.00043416248746238717, "loss": 3.1783, "theoretical_loss": 3.9474302409666837, "tokens_seen": 463211520 }, { "epoch": 1.04, "learning_rate": 0.00043415245737211635, "loss": 3.1493, "theoretical_loss": 3.9473692976939843, "tokens_seen": 463277056 }, { "epoch": 1.04, "learning_rate": 0.00043414242728184553, "loss": 3.0437, "theoretical_loss": 3.94730836545535, "tokens_seen": 463342592 }, { "epoch": 1.04, "learning_rate": 0.0004341323971915747, "loss": 3.0412, "theoretical_loss": 3.947247444247224, "tokens_seen": 463408128 }, { "epoch": 1.04, "learning_rate": 0.00043412236710130395, "loss": 3.0252, "theoretical_loss": 3.947186534066049, "tokens_seen": 463473664 }, { "epoch": 1.04, "learning_rate": 0.0004341123370110331, "loss": 3.0224, "theoretical_loss": 3.9471256349082706, "tokens_seen": 463539200 }, { "epoch": 1.04, "learning_rate": 0.0004341023069207623, "loss": 3.0975, "theoretical_loss": 3.9470647467703364, "tokens_seen": 463604736 }, { "epoch": 1.04, "objective/train/docs_used": 762852, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.840392827987671, "objective/train/theoretical_loss": 3.9470190878965203, "objective/train/tokens_used": 484113888, "theoretical_loss": 3.9470190878965203, "tokens_seen": 463653888 }, { "epoch": 1.04, "learning_rate": 0.00043409227683049144, "loss": 3.0406, "theoretical_loss": 3.947003869648693, "tokens_seen": 463670272 }, { "epoch": 1.04, "learning_rate": 0.00043408224674022067, "loss": 3.2034, "theoretical_loss": 3.9469430035397925, "tokens_seen": 463735808 }, { "epoch": 1.04, "learning_rate": 0.00043407221664994985, "loss": 3.3001, "theoretical_loss": 3.946882148440086, "tokens_seen": 463801344 }, { "epoch": 1.04, "learning_rate": 0.00043406218655967903, "loss": 3.0384, "theoretical_loss": 3.9468213043460274, "tokens_seen": 463866880 }, { "epoch": 1.04, "learning_rate": 0.0004340521564694082, "loss": 3.0862, "theoretical_loss": 3.9467604712540716, "tokens_seen": 463932416 }, { "epoch": 1.04, "learning_rate": 0.00043404212637913745, "loss": 3.0801, "theoretical_loss": 3.9466996491606747, "tokens_seen": 463997952 }, { "epoch": 1.04, "learning_rate": 0.0004340320962888666, "loss": 3.052, "theoretical_loss": 3.9466388380622965, "tokens_seen": 464063488 }, { "epoch": 1.04, "learning_rate": 0.0004340220661985958, "loss": 3.2115, "theoretical_loss": 3.946578037955396, "tokens_seen": 464129024 }, { "epoch": 1.04, "learning_rate": 0.00043401203610832494, "loss": 3.0832, "theoretical_loss": 3.946517248836436, "tokens_seen": 464194560 }, { "epoch": 1.04, "learning_rate": 0.0004340020060180542, "loss": 3.0554, "theoretical_loss": 3.9464564707018788, "tokens_seen": 464260096 }, { "epoch": 1.04, "learning_rate": 0.00043399197592778336, "loss": 3.2922, "theoretical_loss": 3.946395703548189, "tokens_seen": 464325632 }, { "epoch": 1.04, "learning_rate": 0.00043398194583751254, "loss": 3.0847, "theoretical_loss": 3.9463349473718345, "tokens_seen": 464391168 }, { "epoch": 1.04, "learning_rate": 0.0004339719157472417, "loss": 3.0102, "theoretical_loss": 3.9462742021692834, "tokens_seen": 464456704 }, { "epoch": 1.04, "learning_rate": 0.0004339618856569709, "loss": 2.8999, "theoretical_loss": 3.946213467937005, "tokens_seen": 464522240 }, { "epoch": 1.04, "learning_rate": 0.0004339518555667001, "loss": 3.1698, "theoretical_loss": 3.946152744671471, "tokens_seen": 464587776 }, { "epoch": 1.04, "learning_rate": 0.0004339418254764293, "loss": 3.182, "theoretical_loss": 3.9460920323691546, "tokens_seen": 464653312 }, { "epoch": 1.04, "learning_rate": 0.0004339317953861585, "loss": 2.9476, "theoretical_loss": 3.9460313310265307, "tokens_seen": 464718848 }, { "epoch": 1.04, "learning_rate": 0.0004339217652958877, "loss": 3.0925, "theoretical_loss": 3.9459706406400756, "tokens_seen": 464784384 }, { "epoch": 1.04, "learning_rate": 0.0004339117352056169, "loss": 3.0488, "theoretical_loss": 3.945909961206267, "tokens_seen": 464849920 }, { "epoch": 1.04, "learning_rate": 0.00043390170511534604, "loss": 3.1733, "theoretical_loss": 3.9458492927215856, "tokens_seen": 464915456 }, { "epoch": 1.04, "learning_rate": 0.0004338916750250753, "loss": 3.3351, "theoretical_loss": 3.9457886351825118, "tokens_seen": 464980992 }, { "epoch": 1.04, "learning_rate": 0.0004338816449348044, "loss": 2.9573, "theoretical_loss": 3.9457279885855288, "tokens_seen": 465046528 }, { "epoch": 1.04, "learning_rate": 0.00043387161484453364, "loss": 3.0252, "theoretical_loss": 3.9456673529271216, "tokens_seen": 465112064 }, { "epoch": 1.04, "learning_rate": 0.0004338615847542628, "loss": 2.9458, "theoretical_loss": 3.9456067282037752, "tokens_seen": 465177600 }, { "epoch": 1.04, "learning_rate": 0.000433851554663992, "loss": 3.0397, "theoretical_loss": 3.9455461144119788, "tokens_seen": 465243136 }, { "epoch": 1.04, "objective/train/docs_used": 765522, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.244889974594116, "objective/train/theoretical_loss": 3.9455006612398487, "objective/train/tokens_used": 485752288, "theoretical_loss": 3.9455006612398487, "tokens_seen": 465292288 }, { "epoch": 1.04, "learning_rate": 0.0004338415245737212, "loss": 3.2208, "theoretical_loss": 3.9454855115482212, "tokens_seen": 465308672 }, { "epoch": 1.04, "learning_rate": 0.00043383149448345036, "loss": 3.1022, "theoretical_loss": 3.9454249196089934, "tokens_seen": 465374208 }, { "epoch": 1.04, "learning_rate": 0.00043382146439317954, "loss": 3.2222, "theoretical_loss": 3.9453643385907875, "tokens_seen": 465439744 }, { "epoch": 1.04, "learning_rate": 0.0004338114343029088, "loss": 3.017, "theoretical_loss": 3.945303768490099, "tokens_seen": 465505280 }, { "epoch": 1.04, "learning_rate": 0.0004338014042126379, "loss": 2.8533, "theoretical_loss": 3.9452432093034235, "tokens_seen": 465570816 }, { "epoch": 1.04, "learning_rate": 0.00043379137412236714, "loss": 3.1373, "theoretical_loss": 3.945182661027258, "tokens_seen": 465636352 }, { "epoch": 1.04, "learning_rate": 0.00043378134403209627, "loss": 2.9059, "theoretical_loss": 3.9451221236581016, "tokens_seen": 465701888 }, { "epoch": 1.04, "learning_rate": 0.0004337713139418255, "loss": 3.0835, "theoretical_loss": 3.945061597192456, "tokens_seen": 465767424 }, { "epoch": 1.04, "learning_rate": 0.0004337612838515547, "loss": 2.9864, "theoretical_loss": 3.9450010816268226, "tokens_seen": 465832960 }, { "epoch": 1.04, "learning_rate": 0.00043375125376128387, "loss": 3.1095, "theoretical_loss": 3.9449405769577055, "tokens_seen": 465898496 }, { "epoch": 1.04, "learning_rate": 0.00043374122367101305, "loss": 3.2117, "theoretical_loss": 3.9448800831816113, "tokens_seen": 465964032 }, { "epoch": 1.04, "learning_rate": 0.0004337311935807423, "loss": 2.922, "theoretical_loss": 3.9448196002950455, "tokens_seen": 466029568 }, { "epoch": 1.04, "learning_rate": 0.0004337211634904714, "loss": 3.1253, "theoretical_loss": 3.9447591282945185, "tokens_seen": 466095104 }, { "epoch": 1.04, "learning_rate": 0.00043371113340020064, "loss": 3.1975, "theoretical_loss": 3.94469866717654, "tokens_seen": 466160640 }, { "epoch": 1.04, "learning_rate": 0.00043370110330992977, "loss": 3.1417, "theoretical_loss": 3.9446382169376224, "tokens_seen": 466226176 }, { "epoch": 1.04, "learning_rate": 0.000433691073219659, "loss": 3.1786, "theoretical_loss": 3.944577777574279, "tokens_seen": 466291712 }, { "epoch": 1.04, "learning_rate": 0.0004336810431293882, "loss": 3.1991, "theoretical_loss": 3.944517349083025, "tokens_seen": 466357248 }, { "epoch": 1.04, "learning_rate": 0.00043367101303911737, "loss": 3.1642, "theoretical_loss": 3.944456931460378, "tokens_seen": 466422784 }, { "epoch": 1.04, "learning_rate": 0.00043366098294884655, "loss": 3.2128, "theoretical_loss": 3.9443965247028556, "tokens_seen": 466488320 }, { "epoch": 1.04, "learning_rate": 0.00043365095285857573, "loss": 3.065, "theoretical_loss": 3.9443361288069783, "tokens_seen": 466553856 }, { "epoch": 1.04, "learning_rate": 0.0004336409227683049, "loss": 3.3006, "theoretical_loss": 3.9442757437692673, "tokens_seen": 466619392 }, { "epoch": 1.04, "learning_rate": 0.00043363089267803415, "loss": 3.0397, "theoretical_loss": 3.9442153695862467, "tokens_seen": 466684928 }, { "epoch": 1.04, "learning_rate": 0.0004336208625877633, "loss": 3.1724, "theoretical_loss": 3.944155006254441, "tokens_seen": 466750464 }, { "epoch": 1.04, "learning_rate": 0.0004336108324974925, "loss": 3.1182, "theoretical_loss": 3.9440946537703767, "tokens_seen": 466816000 }, { "epoch": 1.04, "learning_rate": 0.00043360080240722164, "loss": 2.9468, "theoretical_loss": 3.9440343121305816, "tokens_seen": 466881536 }, { "epoch": 1.04, "objective/train/docs_used": 766913, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.080779790878296, "objective/train/theoretical_loss": 3.9439890630151995, "objective/train/tokens_used": 487390688, "theoretical_loss": 3.9439890630151995, "tokens_seen": 466930688 }, { "epoch": 1.04, "learning_rate": 0.00043359077231695087, "loss": 2.9054, "theoretical_loss": 3.9439739813315855, "tokens_seen": 466947072 }, { "epoch": 1.04, "learning_rate": 0.00043358074222668005, "loss": 3.0481, "theoretical_loss": 3.94391366136992, "tokens_seen": 467012608 }, { "epoch": 1.04, "learning_rate": 0.00043357071213640923, "loss": 2.9532, "theoretical_loss": 3.943853352242118, "tokens_seen": 467078144 }, { "epoch": 1.04, "learning_rate": 0.0004335606820461384, "loss": 3.0794, "theoretical_loss": 3.943793053944713, "tokens_seen": 467143680 }, { "epoch": 1.04, "learning_rate": 0.00043355065195586765, "loss": 3.1549, "theoretical_loss": 3.943732766474242, "tokens_seen": 467209216 }, { "epoch": 1.04, "learning_rate": 0.0004335406218655968, "loss": 3.1075, "theoretical_loss": 3.943672489827243, "tokens_seen": 467274752 }, { "epoch": 1.04, "learning_rate": 0.000433530591775326, "loss": 3.0237, "theoretical_loss": 3.9436122240002547, "tokens_seen": 467340288 }, { "epoch": 1.04, "learning_rate": 0.00043352056168505514, "loss": 3.0536, "theoretical_loss": 3.9435519689898175, "tokens_seen": 467405824 }, { "epoch": 1.04, "learning_rate": 0.0004335105315947844, "loss": 3.1921, "theoretical_loss": 3.943491724792475, "tokens_seen": 467471360 }, { "epoch": 1.04, "learning_rate": 0.00043350050150451356, "loss": 3.0609, "theoretical_loss": 3.9434314914047697, "tokens_seen": 467536896 }, { "epoch": 1.04, "learning_rate": 0.00043349047141424274, "loss": 3.0035, "theoretical_loss": 3.943371268823248, "tokens_seen": 467602432 }, { "epoch": 1.04, "learning_rate": 0.0004334804413239719, "loss": 3.0652, "theoretical_loss": 3.9433110570444576, "tokens_seen": 467667968 }, { "epoch": 1.04, "learning_rate": 0.0004334704112337011, "loss": 3.2898, "theoretical_loss": 3.943250856064947, "tokens_seen": 467733504 }, { "epoch": 1.04, "learning_rate": 0.0004334603811434303, "loss": 3.1399, "theoretical_loss": 3.9431906658812657, "tokens_seen": 467799040 }, { "epoch": 1.04, "learning_rate": 0.0004334503510531595, "loss": 3.0661, "theoretical_loss": 3.9431304864899666, "tokens_seen": 467864576 }, { "epoch": 1.04, "learning_rate": 0.00043344032096288864, "loss": 2.8745, "theoretical_loss": 3.943070317887603, "tokens_seen": 467930112 }, { "epoch": 1.04, "learning_rate": 0.0004334302908726179, "loss": 3.1979, "theoretical_loss": 3.94301016007073, "tokens_seen": 467995648 }, { "epoch": 1.04, "learning_rate": 0.000433420260782347, "loss": 2.9536, "theoretical_loss": 3.9429500130359045, "tokens_seen": 468061184 }, { "epoch": 1.04, "learning_rate": 0.00043341023069207624, "loss": 3.1356, "theoretical_loss": 3.9428898767796845, "tokens_seen": 468126720 }, { "epoch": 1.04, "learning_rate": 0.0004334002006018054, "loss": 3.1619, "theoretical_loss": 3.94282975129863, "tokens_seen": 468192256 }, { "epoch": 1.04, "learning_rate": 0.0004333901705115346, "loss": 3.0608, "theoretical_loss": 3.9427696365893024, "tokens_seen": 468257792 }, { "epoch": 1.04, "learning_rate": 0.0004333801404212638, "loss": 3.1102, "theoretical_loss": 3.942709532648265, "tokens_seen": 468323328 }, { "epoch": 1.04, "learning_rate": 0.000433370110330993, "loss": 3.2623, "theoretical_loss": 3.9426494394720812, "tokens_seen": 468388864 }, { "epoch": 1.04, "learning_rate": 0.00043336008024072215, "loss": 3.1436, "theoretical_loss": 3.942589357057319, "tokens_seen": 468454400 }, { "epoch": 1.04, "learning_rate": 0.0004333500501504514, "loss": 2.9431, "theoretical_loss": 3.9425292854005454, "tokens_seen": 468519936 }, { "epoch": 1.04, "objective/train/docs_used": 769781, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9375522136688232, "objective/train/theoretical_loss": 3.942484238715831, "objective/train/tokens_used": 489029088, "theoretical_loss": 3.942484238715831, "tokens_seen": 468569088 }, { "epoch": 1.04, "learning_rate": 0.0004333400200601805, "loss": 2.9748, "theoretical_loss": 3.9424692244983293, "tokens_seen": 468585472 }, { "epoch": 1.04, "learning_rate": 0.00043332998996990974, "loss": 3.0888, "theoretical_loss": 3.942409174347242, "tokens_seen": 468651008 }, { "epoch": 1.04, "learning_rate": 0.0004333199598796389, "loss": 3.0612, "theoretical_loss": 3.9423491349438557, "tokens_seen": 468716544 }, { "epoch": 1.04, "learning_rate": 0.0004333099297893681, "loss": 3.1567, "theoretical_loss": 3.942289106284745, "tokens_seen": 468782080 }, { "epoch": 1.04, "learning_rate": 0.0004332998996990973, "loss": 3.0675, "theoretical_loss": 3.942229088366485, "tokens_seen": 468847616 }, { "epoch": 1.04, "learning_rate": 0.00043328986960882647, "loss": 3.2188, "theoretical_loss": 3.9421690811856527, "tokens_seen": 468913152 }, { "epoch": 1.04, "learning_rate": 0.00043327983951855565, "loss": 3.0607, "theoretical_loss": 3.942109084738828, "tokens_seen": 468978688 }, { "epoch": 1.04, "learning_rate": 0.0004332698094282849, "loss": 2.9642, "theoretical_loss": 3.9420490990225896, "tokens_seen": 469044224 }, { "epoch": 1.04, "learning_rate": 0.000433259779338014, "loss": 3.0181, "theoretical_loss": 3.9419891240335208, "tokens_seen": 469109760 }, { "epoch": 1.04, "learning_rate": 0.00043324974924774325, "loss": 3.2527, "theoretical_loss": 3.941929159768204, "tokens_seen": 469175296 }, { "epoch": 1.04, "learning_rate": 0.00043323971915747243, "loss": 3.1428, "theoretical_loss": 3.941869206223225, "tokens_seen": 469240832 }, { "epoch": 1.04, "learning_rate": 0.0004332296890672016, "loss": 3.0645, "theoretical_loss": 3.94180926339517, "tokens_seen": 469306368 }, { "epoch": 1.04, "learning_rate": 0.0004332196589769308, "loss": 3.1603, "theoretical_loss": 3.9417493312806275, "tokens_seen": 469371904 }, { "epoch": 1.04, "learning_rate": 0.00043320962888665997, "loss": 2.9786, "theoretical_loss": 3.9416894098761865, "tokens_seen": 469437440 }, { "epoch": 1.04, "learning_rate": 0.00043319959879638915, "loss": 3.0286, "theoretical_loss": 3.9416294991784393, "tokens_seen": 469502976 }, { "epoch": 1.04, "learning_rate": 0.0004331895687061184, "loss": 3.1459, "theoretical_loss": 3.941569599183978, "tokens_seen": 469568512 }, { "epoch": 1.04, "learning_rate": 0.00043317953861584757, "loss": 3.1475, "theoretical_loss": 3.941509709889397, "tokens_seen": 469634048 }, { "epoch": 1.04, "learning_rate": 0.00043316950852557675, "loss": 3.1425, "theoretical_loss": 3.9414498312912927, "tokens_seen": 469699584 }, { "epoch": 1.04, "learning_rate": 0.00043315947843530593, "loss": 3.0547, "theoretical_loss": 3.9413899633862624, "tokens_seen": 469765120 }, { "epoch": 1.04, "learning_rate": 0.0004331494483450351, "loss": 3.3151, "theoretical_loss": 3.9413301061709047, "tokens_seen": 469830656 }, { "epoch": 1.04, "learning_rate": 0.00043313941825476435, "loss": 3.1183, "theoretical_loss": 3.941270259641821, "tokens_seen": 469896192 }, { "epoch": 1.04, "learning_rate": 0.0004331293881644935, "loss": 3.2518, "theoretical_loss": 3.9412104237956127, "tokens_seen": 469961728 }, { "epoch": 1.04, "learning_rate": 0.0004331193580742227, "loss": 3.0986, "theoretical_loss": 3.9411505986288846, "tokens_seen": 470027264 }, { "epoch": 1.04, "learning_rate": 0.00043310932798395184, "loss": 3.1862, "theoretical_loss": 3.9410907841382405, "tokens_seen": 470092800 }, { "epoch": 1.04, "learning_rate": 0.00043309929789368107, "loss": 3.043, "theoretical_loss": 3.941030980320289, "tokens_seen": 470158336 }, { "epoch": 1.04, "objective/train/docs_used": 772206, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2538249492645264, "objective/train/theoretical_loss": 3.9409861344587385, "objective/train/tokens_used": 490667488, "theoretical_loss": 3.9409861344587385, "tokens_seen": 470207488 }, { "epoch": 1.04, "learning_rate": 0.00043308926780341025, "loss": 3.0912, "theoretical_loss": 3.940971187171637, "tokens_seen": 470223872 }, { "epoch": 1.04, "learning_rate": 0.00043307923771313943, "loss": 3.1458, "theoretical_loss": 3.940911404688895, "tokens_seen": 470289408 }, { "epoch": 1.04, "learning_rate": 0.0004330692076228686, "loss": 3.1057, "theoretical_loss": 3.940851632868675, "tokens_seen": 470354944 }, { "epoch": 1.04, "learning_rate": 0.00043305917753259785, "loss": 2.9108, "theoretical_loss": 3.940791871707589, "tokens_seen": 470420480 }, { "epoch": 1.04, "learning_rate": 0.000433049147442327, "loss": 3.1044, "theoretical_loss": 3.9407321212022524, "tokens_seen": 470486016 }, { "epoch": 1.04, "learning_rate": 0.0004330391173520562, "loss": 3.0324, "theoretical_loss": 3.9406723813492808, "tokens_seen": 470551552 }, { "epoch": 1.04, "learning_rate": 0.00043302908726178534, "loss": 3.1229, "theoretical_loss": 3.940612652145292, "tokens_seen": 470617088 }, { "epoch": 1.04, "learning_rate": 0.0004330190571715146, "loss": 3.1705, "theoretical_loss": 3.9405529335869063, "tokens_seen": 470682624 }, { "epoch": 1.04, "learning_rate": 0.00043300902708124376, "loss": 3.1326, "theoretical_loss": 3.9404932256707426, "tokens_seen": 470748160 }, { "epoch": 1.04, "learning_rate": 0.00043299899699097294, "loss": 3.0719, "theoretical_loss": 3.9404335283934246, "tokens_seen": 470813696 }, { "epoch": 1.04, "learning_rate": 0.0004329889669007021, "loss": 3.1393, "theoretical_loss": 3.9403738417515757, "tokens_seen": 470879232 }, { "epoch": 1.04, "learning_rate": 0.0004329789368104313, "loss": 3.1966, "theoretical_loss": 3.940314165741821, "tokens_seen": 470944768 }, { "epoch": 1.04, "learning_rate": 0.0004329689067201605, "loss": 3.0993, "theoretical_loss": 3.940254500360788, "tokens_seen": 471010304 }, { "epoch": 1.04, "learning_rate": 0.0004329588766298897, "loss": 3.1211, "theoretical_loss": 3.9401948456051046, "tokens_seen": 471075840 }, { "epoch": 1.04, "learning_rate": 0.00043294884653961884, "loss": 2.849, "theoretical_loss": 3.940135201471401, "tokens_seen": 471141376 }, { "epoch": 1.04, "learning_rate": 0.0004329388164493481, "loss": 3.0892, "theoretical_loss": 3.940075567956309, "tokens_seen": 471206912 }, { "epoch": 1.04, "learning_rate": 0.0004329287863590772, "loss": 3.0852, "theoretical_loss": 3.9400159450564614, "tokens_seen": 471272448 }, { "epoch": 1.04, "learning_rate": 0.00043291875626880644, "loss": 3.1919, "theoretical_loss": 3.939956332768493, "tokens_seen": 471337984 }, { "epoch": 1.04, "learning_rate": 0.0004329087261785356, "loss": 3.1656, "theoretical_loss": 3.939896731089041, "tokens_seen": 471403520 }, { "epoch": 1.04, "learning_rate": 0.0004328986960882648, "loss": 3.1502, "theoretical_loss": 3.939837140014741, "tokens_seen": 471469056 }, { "epoch": 1.04, "learning_rate": 0.000432888665997994, "loss": 2.8872, "theoretical_loss": 3.939777559542233, "tokens_seen": 471534592 }, { "epoch": 1.04, "learning_rate": 0.0004328786359077232, "loss": 3.1672, "theoretical_loss": 3.939717989668158, "tokens_seen": 471600128 }, { "epoch": 1.04, "learning_rate": 0.00043286860581745235, "loss": 2.9765, "theoretical_loss": 3.9396584303891586, "tokens_seen": 471665664 }, { "epoch": 1.04, "learning_rate": 0.0004328585757271816, "loss": 3.1092, "theoretical_loss": 3.939598881701878, "tokens_seen": 471731200 }, { "epoch": 1.04, "learning_rate": 0.0004328485456369107, "loss": 3.241, "theoretical_loss": 3.939539343602962, "tokens_seen": 471796736 }, { "epoch": 1.04, "objective/train/docs_used": 774963, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9391541481018066, "objective/train/theoretical_loss": 3.939494696975372, "objective/train/tokens_used": 492305888, "theoretical_loss": 3.939494696975372, "tokens_seen": 471845888 }, { "epoch": 1.04, "learning_rate": 0.00043283851554663994, "loss": 3.0603, "theoretical_loss": 3.939479816089057, "tokens_seen": 471862272 }, { "epoch": 1.04, "learning_rate": 0.0004328284854563691, "loss": 2.9927, "theoretical_loss": 3.9394202991568124, "tokens_seen": 471927808 }, { "epoch": 1.04, "learning_rate": 0.0004328184553660983, "loss": 3.0131, "theoretical_loss": 3.9393607928028764, "tokens_seen": 471993344 }, { "epoch": 1.04, "learning_rate": 0.0004328084252758275, "loss": 3.0894, "theoretical_loss": 3.9393012970239023, "tokens_seen": 472058880 }, { "epoch": 1.04, "learning_rate": 0.00043279839518555667, "loss": 2.8415, "theoretical_loss": 3.939241811816542, "tokens_seen": 472124416 }, { "epoch": 1.04, "learning_rate": 0.00043278836509528585, "loss": 3.0248, "theoretical_loss": 3.93918233717745, "tokens_seen": 472189952 }, { "epoch": 1.04, "learning_rate": 0.0004327783350050151, "loss": 3.1546, "theoretical_loss": 3.939122873103283, "tokens_seen": 472255488 }, { "epoch": 1.04, "learning_rate": 0.0004327683049147442, "loss": 2.9692, "theoretical_loss": 3.9390634195906973, "tokens_seen": 472321024 }, { "epoch": 1.04, "learning_rate": 0.00043275827482447345, "loss": 3.138, "theoretical_loss": 3.9390039766363536, "tokens_seen": 472386560 }, { "epoch": 1.04, "learning_rate": 0.00043274824473420263, "loss": 2.9727, "theoretical_loss": 3.9389445442369113, "tokens_seen": 472452096 }, { "epoch": 1.04, "learning_rate": 0.0004327382146439318, "loss": 3.1136, "theoretical_loss": 3.938885122389033, "tokens_seen": 472517632 }, { "epoch": 1.04, "learning_rate": 0.000432728184553661, "loss": 3.0867, "theoretical_loss": 3.938825711089382, "tokens_seen": 472583168 }, { "epoch": 1.04, "learning_rate": 0.00043271815446339017, "loss": 3.0501, "theoretical_loss": 3.938766310334624, "tokens_seen": 472648704 }, { "epoch": 1.04, "learning_rate": 0.00043270812437311935, "loss": 3.2778, "theoretical_loss": 3.9387069201214246, "tokens_seen": 472714240 }, { "epoch": 1.04, "learning_rate": 0.0004326980942828486, "loss": 3.2417, "theoretical_loss": 3.9386475404464534, "tokens_seen": 472779776 }, { "epoch": 1.04, "learning_rate": 0.0004326880641925777, "loss": 3.1389, "theoretical_loss": 3.938588171306379, "tokens_seen": 472845312 }, { "epoch": 1.04, "learning_rate": 0.00043267803410230695, "loss": 3.0841, "theoretical_loss": 3.938528812697873, "tokens_seen": 472910848 }, { "epoch": 1.04, "learning_rate": 0.0004326680040120361, "loss": 3.0664, "theoretical_loss": 3.938469464617608, "tokens_seen": 472976384 }, { "epoch": 1.04, "learning_rate": 0.0004326579739217653, "loss": 3.1907, "theoretical_loss": 3.938410127062258, "tokens_seen": 473041920 }, { "epoch": 1.04, "learning_rate": 0.0004326479438314945, "loss": 3.2589, "theoretical_loss": 3.9383508000284997, "tokens_seen": 473107456 }, { "epoch": 1.04, "learning_rate": 0.0004326379137412237, "loss": 3.0644, "theoretical_loss": 3.938291483513009, "tokens_seen": 473172992 }, { "epoch": 1.04, "learning_rate": 0.00043262788365095286, "loss": 3.0912, "theoretical_loss": 3.938232177512466, "tokens_seen": 473238528 }, { "epoch": 1.04, "learning_rate": 0.00043261785356068204, "loss": 3.102, "theoretical_loss": 3.93817288202355, "tokens_seen": 473304064 }, { "epoch": 1.04, "learning_rate": 0.0004326078234704112, "loss": 3.1853, "theoretical_loss": 3.9381135970429426, "tokens_seen": 473369600 }, { "epoch": 1.04, "learning_rate": 0.00043259779338014045, "loss": 3.0938, "theoretical_loss": 3.938054322567328, "tokens_seen": 473435136 }, { "epoch": 1.04, "objective/train/docs_used": 777810, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0396034717559814, "objective/train/theoretical_loss": 3.9380098736025237, "objective/train/tokens_used": 493944288, "theoretical_loss": 3.9380098736025237, "tokens_seen": 473484288 }, { "epoch": 1.04, "learning_rate": 0.0004325877632898696, "loss": 3.1399, "theoretical_loss": 3.9379950585933905, "tokens_seen": 473500672 }, { "epoch": 1.04, "learning_rate": 0.0004325777331995988, "loss": 3.0815, "theoretical_loss": 3.9379358051178164, "tokens_seen": 473566208 }, { "epoch": 1.04, "learning_rate": 0.000432567703109328, "loss": 3.008, "theoretical_loss": 3.937876562137294, "tokens_seen": 473631744 }, { "epoch": 1.04, "learning_rate": 0.0004325576730190572, "loss": 3.2199, "theoretical_loss": 3.9378173296485115, "tokens_seen": 473697280 }, { "epoch": 1.04, "learning_rate": 0.00043254764292878636, "loss": 2.9883, "theoretical_loss": 3.9377581076481603, "tokens_seen": 473762816 }, { "epoch": 1.04, "learning_rate": 0.00043253761283851554, "loss": 3.1705, "theoretical_loss": 3.9376988961329333, "tokens_seen": 473828352 }, { "epoch": 1.04, "learning_rate": 0.0004325275827482447, "loss": 3.2875, "theoretical_loss": 3.937639695099523, "tokens_seen": 473893888 }, { "epoch": 1.04, "learning_rate": 0.00043251755265797396, "loss": 2.9823, "theoretical_loss": 3.937580504544626, "tokens_seen": 473959424 }, { "epoch": 1.04, "learning_rate": 0.0004325075225677031, "loss": 3.0959, "theoretical_loss": 3.9375213244649383, "tokens_seen": 474024960 }, { "epoch": 1.04, "learning_rate": 0.0004324974924774323, "loss": 3.0515, "theoretical_loss": 3.9374621548571582, "tokens_seen": 474090496 }, { "epoch": 1.04, "learning_rate": 0.00043248746238716145, "loss": 2.9292, "theoretical_loss": 3.937402995717986, "tokens_seen": 474156032 }, { "epoch": 1.04, "learning_rate": 0.0004324774322968907, "loss": 3.0058, "theoretical_loss": 3.937343847044123, "tokens_seen": 474221568 }, { "epoch": 1.04, "learning_rate": 0.00043246740220661986, "loss": 3.0388, "theoretical_loss": 3.937284708832271, "tokens_seen": 474287104 }, { "epoch": 1.04, "learning_rate": 0.00043245737211634904, "loss": 3.1298, "theoretical_loss": 3.9372255810791357, "tokens_seen": 474352640 }, { "epoch": 1.04, "learning_rate": 0.0004324473420260782, "loss": 3.111, "theoretical_loss": 3.9371664637814217, "tokens_seen": 474418176 }, { "epoch": 1.04, "learning_rate": 0.0004324373119358074, "loss": 3.1555, "theoretical_loss": 3.9371073569358366, "tokens_seen": 474483712 }, { "epoch": 1.04, "learning_rate": 0.00043242728184553664, "loss": 3.0273, "theoretical_loss": 3.9370482605390897, "tokens_seen": 474549248 }, { "epoch": 1.04, "learning_rate": 0.0004324172517552658, "loss": 3.2185, "theoretical_loss": 3.9369891745878904, "tokens_seen": 474614784 }, { "epoch": 1.04, "learning_rate": 0.000432407221664995, "loss": 3.0258, "theoretical_loss": 3.9369300990789515, "tokens_seen": 474680320 }, { "epoch": 1.04, "learning_rate": 0.0004323971915747242, "loss": 3.2292, "theoretical_loss": 3.936871034008985, "tokens_seen": 474745856 }, { "epoch": 1.04, "learning_rate": 0.0004323871614844534, "loss": 3.0595, "theoretical_loss": 3.9368119793747063, "tokens_seen": 474811392 }, { "epoch": 1.04, "learning_rate": 0.00043237713139418255, "loss": 3.0249, "theoretical_loss": 3.936752935172832, "tokens_seen": 474876928 }, { "epoch": 1.04, "learning_rate": 0.0004323671013039118, "loss": 3.1485, "theoretical_loss": 3.9366939014000786, "tokens_seen": 474942464 }, { "epoch": 1.04, "learning_rate": 0.0004323570712136409, "loss": 3.1262, "theoretical_loss": 3.936634878053166, "tokens_seen": 475008000 }, { "epoch": 1.04, "learning_rate": 0.00043234704112337014, "loss": 3.0896, "theoretical_loss": 3.936575865128815, "tokens_seen": 475073536 }, { "epoch": 1.04, "objective/train/docs_used": 780678, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1282413005828857, "objective/train/theoretical_loss": 3.936531612273386, "objective/train/tokens_used": 495582688, "theoretical_loss": 3.936531612273386, "tokens_seen": 475122688 }, { "epoch": 1.04, "learning_rate": 0.0004323370110330993, "loss": 3.1683, "theoretical_loss": 3.9365168626237477, "tokens_seen": 475139072 }, { "epoch": 1.04, "learning_rate": 0.0004323269809428285, "loss": 3.1602, "theoretical_loss": 3.9364578705346878, "tokens_seen": 475204608 }, { "epoch": 1.04, "learning_rate": 0.0004323169508525577, "loss": 3.2389, "theoretical_loss": 3.93639888885836, "tokens_seen": 475270144 }, { "epoch": 1.04, "learning_rate": 0.00043230692076228687, "loss": 3.1383, "theoretical_loss": 3.936339917591491, "tokens_seen": 475335680 }, { "epoch": 1.04, "learning_rate": 0.00043229689067201605, "loss": 3.1458, "theoretical_loss": 3.9362809567308092, "tokens_seen": 475401216 }, { "epoch": 1.04, "learning_rate": 0.0004322868605817453, "loss": 3.0747, "theoretical_loss": 3.9362220062730437, "tokens_seen": 475466752 }, { "epoch": 1.04, "learning_rate": 0.0004322768304914744, "loss": 3.0098, "theoretical_loss": 3.9361630662149256, "tokens_seen": 475532288 }, { "epoch": 1.04, "learning_rate": 0.00043226680040120365, "loss": 3.1878, "theoretical_loss": 3.9361041365531877, "tokens_seen": 475597824 }, { "epoch": 1.04, "learning_rate": 0.00043225677031093283, "loss": 3.0491, "theoretical_loss": 3.9360452172845637, "tokens_seen": 475663360 }, { "epoch": 1.04, "learning_rate": 0.000432246740220662, "loss": 3.136, "theoretical_loss": 3.9359863084057896, "tokens_seen": 475728896 }, { "epoch": 1.04, "learning_rate": 0.0004322367101303912, "loss": 3.1902, "theoretical_loss": 3.9359274099136012, "tokens_seen": 475794432 }, { "epoch": 1.04, "learning_rate": 0.00043222668004012037, "loss": 3.1639, "theoretical_loss": 3.9358685218047382, "tokens_seen": 475859968 }, { "epoch": 1.04, "learning_rate": 0.00043221664994984955, "loss": 3.1092, "theoretical_loss": 3.93580964407594, "tokens_seen": 475925504 }, { "epoch": 1.04, "learning_rate": 0.0004322066198595788, "loss": 3.1471, "theoretical_loss": 3.935750776723947, "tokens_seen": 475991040 }, { "epoch": 1.04, "learning_rate": 0.0004321965897693079, "loss": 3.2449, "theoretical_loss": 3.935691919745504, "tokens_seen": 476056576 }, { "epoch": 1.04, "learning_rate": 0.00043218655967903715, "loss": 3.021, "theoretical_loss": 3.9356330731373537, "tokens_seen": 476122112 }, { "epoch": 1.04, "learning_rate": 0.0004321765295887663, "loss": 2.9936, "theoretical_loss": 3.935574236896242, "tokens_seen": 476187648 }, { "epoch": 1.04, "learning_rate": 0.0004321664994984955, "loss": 2.9751, "theoretical_loss": 3.9355154110189163, "tokens_seen": 476253184 }, { "epoch": 1.04, "learning_rate": 0.0004321564694082247, "loss": 3.1343, "theoretical_loss": 3.935456595502126, "tokens_seen": 476318720 }, { "epoch": 1.04, "learning_rate": 0.0004321464393179539, "loss": 3.1437, "theoretical_loss": 3.9353977903426207, "tokens_seen": 476384256 }, { "epoch": 1.04, "learning_rate": 0.00043213640922768306, "loss": 3.1941, "theoretical_loss": 3.935338995537152, "tokens_seen": 476449792 }, { "epoch": 1.04, "learning_rate": 0.00043212637913741224, "loss": 3.1306, "theoretical_loss": 3.935280211082473, "tokens_seen": 476515328 }, { "epoch": 1.04, "learning_rate": 0.0004321163490471414, "loss": 3.1594, "theoretical_loss": 3.935221436975338, "tokens_seen": 476580864 }, { "epoch": 1.04, "learning_rate": 0.00043210631895687065, "loss": 3.0774, "theoretical_loss": 3.9351626732125036, "tokens_seen": 476646400 }, { "epoch": 1.04, "learning_rate": 0.0004320962888665998, "loss": 3.1837, "theoretical_loss": 3.935103919790727, "tokens_seen": 476711936 }, { "epoch": 1.04, "objective/train/docs_used": 783710, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0771515369415283, "objective/train/theoretical_loss": 3.935059861508764, "objective/train/tokens_used": 497221088, "theoretical_loss": 3.935059861508764, "tokens_seen": 476761088 }, { "epoch": 1.04, "learning_rate": 0.000432086258776329, "loss": 3.1931, "theoretical_loss": 3.935045176706767, "tokens_seen": 476777472 }, { "epoch": 1.04, "learning_rate": 0.0004320762286860582, "loss": 3.0519, "theoretical_loss": 3.934986443957385, "tokens_seen": 476843008 }, { "epoch": 1.04, "learning_rate": 0.0004320661985957874, "loss": 3.0795, "theoretical_loss": 3.9349277215393412, "tokens_seen": 476908544 }, { "epoch": 1.04, "learning_rate": 0.00043205616850551656, "loss": 2.9907, "theoretical_loss": 3.9348690094494, "tokens_seen": 476974080 }, { "epoch": 1.04, "learning_rate": 0.00043204613841524574, "loss": 3.0556, "theoretical_loss": 3.934810307684326, "tokens_seen": 477039616 }, { "epoch": 1.04, "learning_rate": 0.0004320361083249749, "loss": 2.9697, "theoretical_loss": 3.9347516162408858, "tokens_seen": 477105152 }, { "epoch": 1.04, "learning_rate": 0.00043202607823470416, "loss": 3.2209, "theoretical_loss": 3.934692935115846, "tokens_seen": 477170688 }, { "epoch": 1.04, "learning_rate": 0.0004320160481444333, "loss": 3.2466, "theoretical_loss": 3.934634264305977, "tokens_seen": 477236224 }, { "epoch": 1.04, "learning_rate": 0.0004320060180541625, "loss": 2.9162, "theoretical_loss": 3.9345756038080495, "tokens_seen": 477301760 }, { "epoch": 1.04, "learning_rate": 0.00043199598796389165, "loss": 3.0471, "theoretical_loss": 3.934516953618834, "tokens_seen": 477367296 }, { "epoch": 1.04, "learning_rate": 0.0004319859578736209, "loss": 3.1269, "theoretical_loss": 3.9344583137351057, "tokens_seen": 477432832 }, { "epoch": 1.04, "learning_rate": 0.00043197592778335006, "loss": 3.2286, "theoretical_loss": 3.9343996841536386, "tokens_seen": 477498368 }, { "epoch": 1.04, "learning_rate": 0.00043196589769307924, "loss": 3.1762, "theoretical_loss": 3.934341064871209, "tokens_seen": 477563904 }, { "epoch": 1.04, "learning_rate": 0.0004319558676028084, "loss": 3.0367, "theoretical_loss": 3.9342824558845955, "tokens_seen": 477629440 }, { "epoch": 1.04, "learning_rate": 0.0004319458375125376, "loss": 3.1073, "theoretical_loss": 3.934223857190578, "tokens_seen": 477694976 }, { "epoch": 1.04, "learning_rate": 0.0004319358074222668, "loss": 3.0244, "theoretical_loss": 3.9341652687859354, "tokens_seen": 477760512 }, { "epoch": 1.04, "learning_rate": 0.000431925777331996, "loss": 3.1287, "theoretical_loss": 3.934106690667451, "tokens_seen": 477826048 }, { "epoch": 1.04, "learning_rate": 0.00043191574724172515, "loss": 3.1331, "theoretical_loss": 3.934048122831909, "tokens_seen": 477891584 }, { "epoch": 1.04, "learning_rate": 0.0004319057171514544, "loss": 3.1553, "theoretical_loss": 3.9339895652760934, "tokens_seen": 477957120 }, { "epoch": 1.04, "learning_rate": 0.00043189568706118357, "loss": 2.9848, "theoretical_loss": 3.9339310179967915, "tokens_seen": 478022656 }, { "epoch": 1.04, "learning_rate": 0.00043188565697091275, "loss": 2.9807, "theoretical_loss": 3.9338724809907912, "tokens_seen": 478088192 }, { "epoch": 1.04, "learning_rate": 0.00043187562688064193, "loss": 3.0146, "theoretical_loss": 3.9338139542548816, "tokens_seen": 478153728 }, { "epoch": 1.04, "learning_rate": 0.0004318655967903711, "loss": 3.2054, "theoretical_loss": 3.933755437785854, "tokens_seen": 478219264 }, { "epoch": 1.04, "learning_rate": 0.0004318555667001003, "loss": 3.0219, "theoretical_loss": 3.9336969315805006, "tokens_seen": 478284800 }, { "epoch": 1.04, "learning_rate": 0.0004318455366098295, "loss": 3.0996, "theoretical_loss": 3.933638435635615, "tokens_seen": 478350336 }, { "epoch": 1.04, "objective/train/docs_used": 785886, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.762563943862915, "objective/train/theoretical_loss": 3.9335945704084554, "objective/train/tokens_used": 498859488, "theoretical_loss": 3.9335945704084554, "tokens_seen": 478399488 }, { "epoch": 1.04, "learning_rate": 0.00043183550651955865, "loss": 3.0269, "theoretical_loss": 3.933579949947993, "tokens_seen": 478415872 }, { "epoch": 1.04, "learning_rate": 0.0004318254764292879, "loss": 3.2255, "theoretical_loss": 3.9335214745144307, "tokens_seen": 478481408 }, { "epoch": 1.04, "learning_rate": 0.000431815446339017, "loss": 2.9606, "theoretical_loss": 3.933463009331726, "tokens_seen": 478546944 }, { "epoch": 1.04, "learning_rate": 0.00043180541624874625, "loss": 3.1516, "theoretical_loss": 3.9334045543966796, "tokens_seen": 478612480 }, { "epoch": 1.04, "learning_rate": 0.00043179538615847543, "loss": 3.2093, "theoretical_loss": 3.9333461097060916, "tokens_seen": 478678016 }, { "epoch": 1.04, "learning_rate": 0.0004317853560682046, "loss": 3.0557, "theoretical_loss": 3.9332876752567643, "tokens_seen": 478743552 }, { "epoch": 1.04, "learning_rate": 0.0004317753259779338, "loss": 2.9932, "theoretical_loss": 3.933229251045501, "tokens_seen": 478809088 }, { "epoch": 1.04, "learning_rate": 0.00043176529588766303, "loss": 3.0865, "theoretical_loss": 3.9331708370691087, "tokens_seen": 478874624 }, { "epoch": 1.04, "learning_rate": 0.00043175526579739215, "loss": 3.0459, "theoretical_loss": 3.9331124333243928, "tokens_seen": 478940160 }, { "epoch": 1.04, "learning_rate": 0.0004317452357071214, "loss": 3.1407, "theoretical_loss": 3.933054039808162, "tokens_seen": 479005696 }, { "epoch": 1.04, "learning_rate": 0.0004317352056168505, "loss": 2.9587, "theoretical_loss": 3.9329956565172255, "tokens_seen": 479071232 }, { "epoch": 1.04, "learning_rate": 0.00043172517552657975, "loss": 3.1053, "theoretical_loss": 3.9329372834483944, "tokens_seen": 479136768 }, { "epoch": 1.04, "learning_rate": 0.00043171514543630893, "loss": 2.9545, "theoretical_loss": 3.9328789205984815, "tokens_seen": 479202304 }, { "epoch": 1.04, "learning_rate": 0.0004317051153460381, "loss": 3.0784, "theoretical_loss": 3.9328205679643, "tokens_seen": 479267840 }, { "epoch": 1.04, "learning_rate": 0.0004316950852557673, "loss": 3.086, "theoretical_loss": 3.9327622255426666, "tokens_seen": 479333376 }, { "epoch": 1.04, "learning_rate": 0.0004316850551654965, "loss": 2.9491, "theoretical_loss": 3.9327038933303964, "tokens_seen": 479398912 }, { "epoch": 1.04, "learning_rate": 0.0004316750250752257, "loss": 3.0634, "theoretical_loss": 3.932645571324308, "tokens_seen": 479464448 }, { "epoch": 1.04, "learning_rate": 0.0004316649949849549, "loss": 3.0268, "theoretical_loss": 3.9325872595212217, "tokens_seen": 479529984 }, { "epoch": 1.04, "learning_rate": 0.0004316549648946841, "loss": 3.2341, "theoretical_loss": 3.932528957917958, "tokens_seen": 479595520 }, { "epoch": 1.04, "learning_rate": 0.00043164493480441326, "loss": 3.1136, "theoretical_loss": 3.932470666511339, "tokens_seen": 479661056 }, { "epoch": 1.04, "learning_rate": 0.00043163490471414244, "loss": 3.0795, "theoretical_loss": 3.932412385298189, "tokens_seen": 479726592 }, { "epoch": 1.04, "learning_rate": 0.0004316248746238716, "loss": 3.0234, "theoretical_loss": 3.9323541142753333, "tokens_seen": 479792128 }, { "epoch": 1.04, "learning_rate": 0.00043161484453360085, "loss": 3.1954, "theoretical_loss": 3.932295853439599, "tokens_seen": 479857664 }, { "epoch": 1.04, "learning_rate": 0.00043160481444333, "loss": 3.1776, "theoretical_loss": 3.9322376027878128, "tokens_seen": 479923200 }, { "epoch": 1.04, "learning_rate": 0.0004315947843530592, "loss": 3.1729, "theoretical_loss": 3.9321793623168055, "tokens_seen": 479988736 }, { "epoch": 1.04, "objective/train/docs_used": 788662, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.919154405593872, "objective/train/theoretical_loss": 3.93213568864278, "objective/train/tokens_used": 500497888, "theoretical_loss": 3.93213568864278, "tokens_seen": 480037888 }, { "epoch": 1.04, "learning_rate": 0.0004315847542627884, "loss": 2.9948, "theoretical_loss": 3.9321211320234077, "tokens_seen": 480054272 }, { "epoch": 1.04, "learning_rate": 0.0004315747241725176, "loss": 3.1325, "theoretical_loss": 3.9320629119044526, "tokens_seen": 480119808 }, { "epoch": 1.04, "learning_rate": 0.00043156469408224676, "loss": 3.0228, "theoretical_loss": 3.9320047019567728, "tokens_seen": 480185344 }, { "epoch": 1.04, "learning_rate": 0.00043155466399197594, "loss": 3.1872, "theoretical_loss": 3.9319465021772033, "tokens_seen": 480250880 }, { "epoch": 1.04, "learning_rate": 0.0004315446339017051, "loss": 3.1122, "theoretical_loss": 3.931888312562582, "tokens_seen": 480316416 }, { "epoch": 1.04, "learning_rate": 0.00043153460381143436, "loss": 3.1618, "theoretical_loss": 3.931830133109746, "tokens_seen": 480381952 }, { "epoch": 1.04, "learning_rate": 0.0004315245737211635, "loss": 3.185, "theoretical_loss": 3.9317719638155353, "tokens_seen": 480447488 }, { "epoch": 1.04, "learning_rate": 0.0004315145436308927, "loss": 3.0807, "theoretical_loss": 3.931713804676791, "tokens_seen": 480513024 }, { "epoch": 1.04, "learning_rate": 0.00043150451354062185, "loss": 3.0036, "theoretical_loss": 3.9316556556903546, "tokens_seen": 480578560 }, { "epoch": 1.04, "learning_rate": 0.0004314944834503511, "loss": 3.1422, "theoretical_loss": 3.9315975168530706, "tokens_seen": 480644096 }, { "epoch": 1.04, "learning_rate": 0.00043148445336008026, "loss": 2.9932, "theoretical_loss": 3.9315393881617835, "tokens_seen": 480709632 }, { "epoch": 1.04, "learning_rate": 0.00043147442326980944, "loss": 3.182, "theoretical_loss": 3.9314812696133394, "tokens_seen": 480775168 }, { "epoch": 1.04, "learning_rate": 0.0004314643931795386, "loss": 3.1497, "theoretical_loss": 3.9314231612045876, "tokens_seen": 480840704 }, { "epoch": 1.04, "learning_rate": 0.0004314543630892678, "loss": 2.8415, "theoretical_loss": 3.9313650629323766, "tokens_seen": 480906240 }, { "epoch": 1.04, "learning_rate": 0.000431444332998997, "loss": 3.1565, "theoretical_loss": 3.931306974793557, "tokens_seen": 480971776 }, { "epoch": 1.04, "learning_rate": 0.0004314343029087262, "loss": 3.0613, "theoretical_loss": 3.9312488967849815, "tokens_seen": 481037312 }, { "epoch": 1.04, "learning_rate": 0.00043142427281845535, "loss": 3.1546, "theoretical_loss": 3.931190828903504, "tokens_seen": 481102848 }, { "epoch": 1.04, "learning_rate": 0.0004314142427281846, "loss": 3.0987, "theoretical_loss": 3.931132771145978, "tokens_seen": 481168384 }, { "epoch": 1.04, "learning_rate": 0.00043140421263791377, "loss": 3.0724, "theoretical_loss": 3.931074723509261, "tokens_seen": 481233920 }, { "epoch": 1.04, "learning_rate": 0.00043139418254764295, "loss": 3.1835, "theoretical_loss": 3.931016685990211, "tokens_seen": 481299456 }, { "epoch": 1.04, "learning_rate": 0.00043138415245737213, "loss": 3.2034, "theoretical_loss": 3.930958658585687, "tokens_seen": 481364992 }, { "epoch": 1.04, "learning_rate": 0.0004313741223671013, "loss": 3.0828, "theoretical_loss": 3.9309006412925482, "tokens_seen": 481430528 }, { "epoch": 1.04, "learning_rate": 0.0004313640922768305, "loss": 3.1024, "theoretical_loss": 3.930842634107658, "tokens_seen": 481496064 }, { "epoch": 1.04, "learning_rate": 0.0004313540621865597, "loss": 3.0768, "theoretical_loss": 3.9307846370278803, "tokens_seen": 481561600 }, { "epoch": 1.04, "learning_rate": 0.00043134403209628885, "loss": 3.1284, "theoretical_loss": 3.9307266500500786, "tokens_seen": 481627136 }, { "epoch": 1.04, "objective/train/docs_used": 791380, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3158342838287354, "objective/train/theoretical_loss": 3.9306831664442643, "objective/train/tokens_used": 502136288, "theoretical_loss": 3.9306831664442643, "tokens_seen": 481676288 }, { "epoch": 1.04, "learning_rate": 0.0004313340020060181, "loss": 3.2919, "theoretical_loss": 3.9306686731711196, "tokens_seen": 481692672 }, { "epoch": 1.04, "learning_rate": 0.0004313239719157472, "loss": 2.9144, "theoretical_loss": 3.9306107063878715, "tokens_seen": 481758208 }, { "epoch": 1.04, "learning_rate": 0.00043131394182547645, "loss": 3.1914, "theoretical_loss": 3.9305527496972017, "tokens_seen": 481823744 }, { "epoch": 1.04, "learning_rate": 0.00043130391173520563, "loss": 3.1032, "theoretical_loss": 3.930494803095982, "tokens_seen": 481889280 }, { "epoch": 1.04, "learning_rate": 0.0004312938816449348, "loss": 3.0799, "theoretical_loss": 3.9304368665810845, "tokens_seen": 481954816 }, { "epoch": 1.04, "learning_rate": 0.000431283851554664, "loss": 2.9655, "theoretical_loss": 3.9303789401493807, "tokens_seen": 482020352 }, { "epoch": 1.04, "learning_rate": 0.00043127382146439323, "loss": 3.2712, "theoretical_loss": 3.9303210237977466, "tokens_seen": 482085888 }, { "epoch": 1.04, "learning_rate": 0.00043126379137412236, "loss": 3.0439, "theoretical_loss": 3.930263117523057, "tokens_seen": 482151424 }, { "epoch": 1.04, "learning_rate": 0.0004312537612838516, "loss": 3.0102, "theoretical_loss": 3.9302052213221907, "tokens_seen": 482216960 }, { "epoch": 1.04, "learning_rate": 0.0004312437311935807, "loss": 3.1782, "theoretical_loss": 3.9301473351920255, "tokens_seen": 482282496 }, { "epoch": 1.04, "learning_rate": 0.00043123370110330995, "loss": 2.9696, "theoretical_loss": 3.9300894591294413, "tokens_seen": 482348032 }, { "epoch": 1.04, "learning_rate": 0.00043122367101303913, "loss": 3.0608, "theoretical_loss": 3.9300315931313206, "tokens_seen": 482413568 }, { "epoch": 1.04, "learning_rate": 0.0004312136409227683, "loss": 3.1794, "theoretical_loss": 3.9299737371945453, "tokens_seen": 482479104 }, { "epoch": 1.04, "learning_rate": 0.0004312036108324975, "loss": 2.9659, "theoretical_loss": 3.9299158913160004, "tokens_seen": 482544640 }, { "epoch": 1.04, "learning_rate": 0.0004311935807422267, "loss": 3.0571, "theoretical_loss": 3.9298580554925717, "tokens_seen": 482610176 }, { "epoch": 1.04, "learning_rate": 0.00043118355065195586, "loss": 2.9442, "theoretical_loss": 3.9298002297211454, "tokens_seen": 482675712 }, { "epoch": 1.04, "learning_rate": 0.0004311735205616851, "loss": 2.9497, "theoretical_loss": 3.9297424139986106, "tokens_seen": 482741248 }, { "epoch": 1.04, "learning_rate": 0.0004311634904714142, "loss": 3.0578, "theoretical_loss": 3.9296846083218573, "tokens_seen": 482806784 }, { "epoch": 1.04, "learning_rate": 0.00043115346038114346, "loss": 2.9408, "theoretical_loss": 3.9296268126877765, "tokens_seen": 482872320 }, { "epoch": 1.04, "learning_rate": 0.0004311434302908726, "loss": 3.1082, "theoretical_loss": 3.9295690270932604, "tokens_seen": 482937856 }, { "epoch": 1.04, "learning_rate": 0.0004311334002006018, "loss": 3.1356, "theoretical_loss": 3.929511251535204, "tokens_seen": 483003392 }, { "epoch": 1.04, "learning_rate": 0.000431123370110331, "loss": 3.1261, "theoretical_loss": 3.9294534860105017, "tokens_seen": 483068928 }, { "epoch": 1.04, "learning_rate": 0.0004311133400200602, "loss": 3.2035, "theoretical_loss": 3.9293957305160507, "tokens_seen": 483134464 }, { "epoch": 1.04, "learning_rate": 0.00043110330992978936, "loss": 2.9251, "theoretical_loss": 3.9293379850487495, "tokens_seen": 483200000 }, { "epoch": 1.04, "learning_rate": 0.0004310932798395186, "loss": 3.2062, "theoretical_loss": 3.929280249605497, "tokens_seen": 483265536 }, { "epoch": 1.04, "objective/train/docs_used": 794373, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.001875877380371, "objective/train/theoretical_loss": 3.9292369545994754, "objective/train/tokens_used": 503774688, "theoretical_loss": 3.9292369545994754, "tokens_seen": 483314688 }, { "epoch": 1.04, "learning_rate": 0.0004310832497492477, "loss": 2.9276, "theoretical_loss": 3.929222524183195, "tokens_seen": 483331072 }, { "epoch": 1.04, "learning_rate": 0.00043107321965897696, "loss": 3.1922, "theoretical_loss": 3.9291648087787445, "tokens_seen": 483396608 }, { "epoch": 1.04, "learning_rate": 0.0004310631895687061, "loss": 3.0558, "theoretical_loss": 3.9291071033890494, "tokens_seen": 483462144 }, { "epoch": 1.04, "learning_rate": 0.0004310531594784353, "loss": 3.1318, "theoretical_loss": 3.929049408011016, "tokens_seen": 483527680 }, { "epoch": 1.04, "learning_rate": 0.0004310431293881645, "loss": 3.088, "theoretical_loss": 3.928991722641549, "tokens_seen": 483593216 }, { "epoch": 1.04, "learning_rate": 0.0004310330992978937, "loss": 3.1044, "theoretical_loss": 3.9289340472775582, "tokens_seen": 483658752 }, { "epoch": 1.04, "learning_rate": 0.00043102306920762286, "loss": 3.2538, "theoretical_loss": 3.928876381915951, "tokens_seen": 483724288 }, { "epoch": 1.04, "learning_rate": 0.00043101303911735205, "loss": 3.1124, "theoretical_loss": 3.928818726553639, "tokens_seen": 483789824 }, { "epoch": 1.04, "learning_rate": 0.0004310030090270812, "loss": 3.0456, "theoretical_loss": 3.9287610811875333, "tokens_seen": 483855360 }, { "epoch": 1.04, "learning_rate": 0.00043099297893681046, "loss": 3.1305, "theoretical_loss": 3.9287034458145476, "tokens_seen": 483920896 }, { "epoch": 1.04, "learning_rate": 0.0004309829488465396, "loss": 3.2602, "theoretical_loss": 3.928645820431597, "tokens_seen": 483986432 }, { "epoch": 1.04, "learning_rate": 0.0004309729187562688, "loss": 3.0673, "theoretical_loss": 3.928588205035596, "tokens_seen": 484051968 }, { "epoch": 1.04, "learning_rate": 0.00043096288866599795, "loss": 2.9022, "theoretical_loss": 3.928530599623464, "tokens_seen": 484117504 }, { "epoch": 1.04, "learning_rate": 0.0004309528585757272, "loss": 3.2916, "theoretical_loss": 3.9284730041921185, "tokens_seen": 484183040 }, { "epoch": 1.04, "learning_rate": 0.00043094282848545637, "loss": 3.2393, "theoretical_loss": 3.92841541873848, "tokens_seen": 484248576 }, { "epoch": 1.04, "learning_rate": 0.00043093279839518555, "loss": 2.8838, "theoretical_loss": 3.92835784325947, "tokens_seen": 484314112 }, { "epoch": 1.04, "learning_rate": 0.0004309227683049148, "loss": 3.0029, "theoretical_loss": 3.928300277752011, "tokens_seen": 484379648 }, { "epoch": 1.04, "learning_rate": 0.00043091273821464397, "loss": 3.2437, "theoretical_loss": 3.928242722213028, "tokens_seen": 484445184 }, { "epoch": 1.04, "learning_rate": 0.00043090270812437315, "loss": 2.9971, "theoretical_loss": 3.9281851766394453, "tokens_seen": 484510720 }, { "epoch": 1.04, "learning_rate": 0.00043089267803410233, "loss": 3.2291, "theoretical_loss": 3.9281276410281913, "tokens_seen": 484576256 }, { "epoch": 1.04, "learning_rate": 0.0004308826479438315, "loss": 3.0737, "theoretical_loss": 3.928070115376194, "tokens_seen": 484641792 }, { "epoch": 1.04, "learning_rate": 0.0004308726178535607, "loss": 3.0394, "theoretical_loss": 3.9280125996803825, "tokens_seen": 484707328 }, { "epoch": 1.04, "learning_rate": 0.0004308625877632899, "loss": 2.9338, "theoretical_loss": 3.9279550939376877, "tokens_seen": 484772864 }, { "epoch": 1.04, "learning_rate": 0.00043085255767301905, "loss": 3.1693, "theoretical_loss": 3.9278975981450426, "tokens_seen": 484838400 }, { "epoch": 1.04, "learning_rate": 0.0004308425275827483, "loss": 3.0729, "theoretical_loss": 3.927840112299381, "tokens_seen": 484903936 }, { "epoch": 1.04, "objective/train/docs_used": 795638, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.839747667312622, "objective/train/theoretical_loss": 3.927797004440999, "objective/train/tokens_used": 505413088, "theoretical_loss": 3.927797004440999, "tokens_seen": 484953088 }, { "epoch": 1.04, "learning_rate": 0.0004308324974924774, "loss": 3.0361, "theoretical_loss": 3.9277826363976382, "tokens_seen": 484969472 }, { "epoch": 1.04, "learning_rate": 0.00043082246740220665, "loss": 3.117, "theoretical_loss": 3.9277251704367497, "tokens_seen": 485035008 }, { "epoch": 1.04, "learning_rate": 0.00043081243731193583, "loss": 3.1916, "theoretical_loss": 3.927667714413654, "tokens_seen": 485100544 }, { "epoch": 1.04, "learning_rate": 0.000430802407221665, "loss": 3.2029, "theoretical_loss": 3.9276102683252905, "tokens_seen": 485166080 }, { "epoch": 1.04, "learning_rate": 0.0004307923771313942, "loss": 3.1574, "theoretical_loss": 3.9275528321685993, "tokens_seen": 485231616 }, { "epoch": 1.04, "learning_rate": 0.00043078234704112343, "loss": 2.9471, "theoretical_loss": 3.9274954059405225, "tokens_seen": 485297152 }, { "epoch": 1.04, "learning_rate": 0.00043077231695085256, "loss": 3.0377, "theoretical_loss": 3.927437989638004, "tokens_seen": 485362688 }, { "epoch": 1.04, "learning_rate": 0.0004307622868605818, "loss": 3.2519, "theoretical_loss": 3.927380583257987, "tokens_seen": 485428224 }, { "epoch": 1.04, "learning_rate": 0.0004307522567703109, "loss": 2.7976, "theoretical_loss": 3.9273231867974183, "tokens_seen": 485493760 }, { "epoch": 1.04, "learning_rate": 0.00043074222668004015, "loss": 3.1047, "theoretical_loss": 3.9272658002532452, "tokens_seen": 485559296 }, { "epoch": 1.04, "learning_rate": 0.00043073219658976933, "loss": 3.0013, "theoretical_loss": 3.9272084236224165, "tokens_seen": 485624832 }, { "epoch": 1.04, "learning_rate": 0.0004307221664994985, "loss": 3.0266, "theoretical_loss": 3.9271510569018817, "tokens_seen": 485690368 }, { "epoch": 1.04, "learning_rate": 0.0004307121364092277, "loss": 3.1738, "theoretical_loss": 3.9270937000885935, "tokens_seen": 485755904 }, { "epoch": 1.04, "learning_rate": 0.0004307021063189569, "loss": 3.1193, "theoretical_loss": 3.9270363531795027, "tokens_seen": 485821440 }, { "epoch": 1.04, "learning_rate": 0.00043069207622868606, "loss": 3.0409, "theoretical_loss": 3.9269790161715648, "tokens_seen": 485886976 }, { "epoch": 1.04, "learning_rate": 0.0004306820461384153, "loss": 2.9125, "theoretical_loss": 3.926921689061735, "tokens_seen": 485952512 }, { "epoch": 1.04, "learning_rate": 0.0004306720160481444, "loss": 3.0166, "theoretical_loss": 3.9268643718469693, "tokens_seen": 486018048 }, { "epoch": 1.04, "learning_rate": 0.00043066198595787366, "loss": 3.0578, "theoretical_loss": 3.926807064524226, "tokens_seen": 486083584 }, { "epoch": 1.04, "learning_rate": 0.0004306519558676028, "loss": 3.1915, "theoretical_loss": 3.926749767090466, "tokens_seen": 486149120 }, { "epoch": 1.04, "learning_rate": 0.000430641925777332, "loss": 2.9221, "theoretical_loss": 3.9266924795426483, "tokens_seen": 486214656 }, { "epoch": 1.04, "learning_rate": 0.0004306318956870612, "loss": 3.1473, "theoretical_loss": 3.9266352018777355, "tokens_seen": 486280192 }, { "epoch": 1.04, "learning_rate": 0.0004306218655967904, "loss": 3.2375, "theoretical_loss": 3.926577934092691, "tokens_seen": 486345728 }, { "epoch": 1.04, "learning_rate": 0.00043061183550651956, "loss": 3.1146, "theoretical_loss": 3.926520676184481, "tokens_seen": 486411264 }, { "epoch": 1.04, "learning_rate": 0.0004306018054162488, "loss": 2.9712, "theoretical_loss": 3.92646342815007, "tokens_seen": 486476800 }, { "epoch": 1.04, "learning_rate": 0.0004305917753259779, "loss": 3.2296, "theoretical_loss": 3.9264061899864258, "tokens_seen": 486542336 }, { "epoch": 1.04, "objective/train/docs_used": 798621, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1069557666778564, "objective/train/theoretical_loss": 3.9263632678395606, "objective/train/tokens_used": 507051488, "theoretical_loss": 3.9263632678395606, "tokens_seen": 486591488 }, { "epoch": 1.04, "learning_rate": 0.00043058174523570716, "loss": 3.0567, "theoretical_loss": 3.9263489616905183, "tokens_seen": 486607872 }, { "epoch": 1.04, "learning_rate": 0.0004305717151454363, "loss": 2.9745, "theoretical_loss": 3.9262917432593163, "tokens_seen": 486673408 }, { "epoch": 1.04, "learning_rate": 0.0004305616850551655, "loss": 3.1121, "theoretical_loss": 3.9262345346897924, "tokens_seen": 486738944 }, { "epoch": 1.04, "learning_rate": 0.0004305516549648947, "loss": 3.0603, "theoretical_loss": 3.9261773359789185, "tokens_seen": 486804480 }, { "epoch": 1.04, "learning_rate": 0.0004305416248746239, "loss": 2.9384, "theoretical_loss": 3.9261201471236697, "tokens_seen": 486870016 }, { "epoch": 1.04, "learning_rate": 0.00043053159478435306, "loss": 2.9908, "theoretical_loss": 3.9260629681210215, "tokens_seen": 486935552 }, { "epoch": 1.04, "learning_rate": 0.00043052156469408225, "loss": 2.9735, "theoretical_loss": 3.9260057989679504, "tokens_seen": 487001088 }, { "epoch": 1.04, "learning_rate": 0.0004305115346038114, "loss": 3.0909, "theoretical_loss": 3.925948639661434, "tokens_seen": 487066624 }, { "epoch": 1.04, "learning_rate": 0.00043050150451354066, "loss": 3.1638, "theoretical_loss": 3.925891490198453, "tokens_seen": 487132160 }, { "epoch": 1.04, "learning_rate": 0.0004304914744232698, "loss": 3.2226, "theoretical_loss": 3.925834350575988, "tokens_seen": 487197696 }, { "epoch": 1.04, "learning_rate": 0.000430481444332999, "loss": 2.9565, "theoretical_loss": 3.92577722079102, "tokens_seen": 487263232 }, { "epoch": 1.04, "learning_rate": 0.00043047141424272815, "loss": 3.1256, "theoretical_loss": 3.9257201008405342, "tokens_seen": 487328768 }, { "epoch": 1.04, "learning_rate": 0.0004304613841524574, "loss": 2.911, "theoretical_loss": 3.9256629907215146, "tokens_seen": 487394304 }, { "epoch": 1.04, "learning_rate": 0.00043045135406218657, "loss": 3.1327, "theoretical_loss": 3.9256058904309477, "tokens_seen": 487459840 }, { "epoch": 1.04, "learning_rate": 0.00043044132397191575, "loss": 3.0006, "theoretical_loss": 3.9255487999658207, "tokens_seen": 487525376 }, { "epoch": 1.04, "learning_rate": 0.00043043129388164493, "loss": 2.8924, "theoretical_loss": 3.9254917193231225, "tokens_seen": 487590912 }, { "epoch": 1.04, "learning_rate": 0.00043042126379137417, "loss": 3.0441, "theoretical_loss": 3.9254346484998432, "tokens_seen": 487656448 }, { "epoch": 1.04, "learning_rate": 0.0004304112337011033, "loss": 3.1157, "theoretical_loss": 3.9253775874929753, "tokens_seen": 487721984 }, { "epoch": 1.04, "learning_rate": 0.00043040120361083253, "loss": 2.9985, "theoretical_loss": 3.92532053629951, "tokens_seen": 487787520 }, { "epoch": 1.04, "learning_rate": 0.00043039117352056165, "loss": 3.2291, "theoretical_loss": 3.9252634949164427, "tokens_seen": 487853056 }, { "epoch": 1.04, "learning_rate": 0.0004303811434302909, "loss": 3.0966, "theoretical_loss": 3.9252064633407686, "tokens_seen": 487918592 }, { "epoch": 1.04, "learning_rate": 0.00043037111334002007, "loss": 3.1008, "theoretical_loss": 3.9251494415694843, "tokens_seen": 487984128 }, { "epoch": 1.04, "learning_rate": 0.00043036108324974925, "loss": 2.8941, "theoretical_loss": 3.9250924295995877, "tokens_seen": 488049664 }, { "epoch": 1.04, "learning_rate": 0.00043035105315947843, "loss": 3.0175, "theoretical_loss": 3.925035427428079, "tokens_seen": 488115200 }, { "epoch": 1.04, "learning_rate": 0.0004303410230692076, "loss": 3.041, "theoretical_loss": 3.9249784350519583, "tokens_seen": 488180736 }, { "epoch": 1.04, "objective/train/docs_used": 801255, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.085075616836548, "objective/train/theoretical_loss": 3.924935697196288, "objective/train/tokens_used": 508689888, "theoretical_loss": 3.924935697196288, "tokens_seen": 488229888 }, { "epoch": 1.04, "learning_rate": 0.0004303309929789368, "loss": 3.0545, "theoretical_loss": 3.9249214524682277, "tokens_seen": 488246272 }, { "epoch": 1.04, "learning_rate": 0.00043032096288866603, "loss": 2.9883, "theoretical_loss": 3.924864479673891, "tokens_seen": 488311808 }, { "epoch": 1.04, "learning_rate": 0.00043031093279839516, "loss": 3.1797, "theoretical_loss": 3.924807516665953, "tokens_seen": 488377344 }, { "epoch": 1.04, "learning_rate": 0.0004303009027081244, "loss": 3.0454, "theoretical_loss": 3.9247505634414193, "tokens_seen": 488442880 }, { "epoch": 1.04, "learning_rate": 0.0004302908726178535, "loss": 3.0222, "theoretical_loss": 3.9246936199972975, "tokens_seen": 488508416 }, { "epoch": 1.04, "learning_rate": 0.00043028084252758276, "loss": 3.2153, "theoretical_loss": 3.9246366863305964, "tokens_seen": 488573952 }, { "epoch": 1.04, "learning_rate": 0.00043027081243731194, "loss": 3.0641, "theoretical_loss": 3.924579762438326, "tokens_seen": 488639488 }, { "epoch": 1.04, "learning_rate": 0.0004302607823470411, "loss": 3.0934, "theoretical_loss": 3.9245228483174968, "tokens_seen": 488705024 }, { "epoch": 1.04, "learning_rate": 0.0004302507522567703, "loss": 3.0029, "theoretical_loss": 3.9244659439651226, "tokens_seen": 488770560 }, { "epoch": 1.04, "learning_rate": 0.00043024072216649953, "loss": 3.0866, "theoretical_loss": 3.924409049378217, "tokens_seen": 488836096 }, { "epoch": 1.04, "learning_rate": 0.00043023069207622866, "loss": 2.9941, "theoretical_loss": 3.9243521645537944, "tokens_seen": 488901632 }, { "epoch": 1.04, "learning_rate": 0.0004302206619859579, "loss": 2.9574, "theoretical_loss": 3.9242952894888723, "tokens_seen": 488967168 }, { "epoch": 1.04, "learning_rate": 0.000430210631895687, "loss": 3.0488, "theoretical_loss": 3.9242384241804684, "tokens_seen": 489032704 }, { "epoch": 1.04, "learning_rate": 0.00043020060180541626, "loss": 3.1678, "theoretical_loss": 3.924181568625602, "tokens_seen": 489098240 }, { "epoch": 1.04, "learning_rate": 0.00043019057171514544, "loss": 2.948, "theoretical_loss": 3.9241247228212934, "tokens_seen": 489163776 }, { "epoch": 1.04, "learning_rate": 0.0004301805416248746, "loss": 3.0417, "theoretical_loss": 3.924067886764564, "tokens_seen": 489229312 }, { "epoch": 1.04, "learning_rate": 0.00043017051153460386, "loss": 3.0709, "theoretical_loss": 3.924011060452438, "tokens_seen": 489294848 }, { "epoch": 1.04, "learning_rate": 0.000430160481444333, "loss": 3.1664, "theoretical_loss": 3.923954243881939, "tokens_seen": 489360384 }, { "epoch": 1.04, "learning_rate": 0.0004301504513540622, "loss": 3.0389, "theoretical_loss": 3.9238974370500923, "tokens_seen": 489425920 }, { "epoch": 1.04, "learning_rate": 0.0004301404212637914, "loss": 3.1592, "theoretical_loss": 3.923840639953926, "tokens_seen": 489491456 }, { "epoch": 1.04, "learning_rate": 0.0004301303911735206, "loss": 3.1576, "theoretical_loss": 3.923783852590468, "tokens_seen": 489556992 }, { "epoch": 1.04, "learning_rate": 0.00043012036108324976, "loss": 3.1351, "theoretical_loss": 3.923727074956748, "tokens_seen": 489622528 }, { "epoch": 1.04, "learning_rate": 0.000430110330992979, "loss": 3.1062, "theoretical_loss": 3.9236703070497962, "tokens_seen": 489688064 }, { "epoch": 1.04, "learning_rate": 0.0004301003009027081, "loss": 3.1662, "theoretical_loss": 3.9236135488666464, "tokens_seen": 489753600 }, { "epoch": 1.04, "learning_rate": 0.00043009027081243736, "loss": 3.0285, "theoretical_loss": 3.923556800404331, "tokens_seen": 489819136 }, { "epoch": 1.04, "objective/train/docs_used": 804229, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.072127342224121, "objective/train/theoretical_loss": 3.923514245435108, "objective/train/tokens_used": 510328288, "theoretical_loss": 3.923514245435108, "tokens_seen": 489868288 }, { "epoch": 1.04, "learning_rate": 0.0004300802407221665, "loss": 3.0786, "theoretical_loss": 3.923500061659885, "tokens_seen": 489884672 }, { "epoch": 1.04, "learning_rate": 0.0004300702106318957, "loss": 3.0896, "theoretical_loss": 3.9234433326303444, "tokens_seen": 489950208 }, { "epoch": 1.04, "learning_rate": 0.0004300601805416249, "loss": 3.1916, "theoretical_loss": 3.9233866133127475, "tokens_seen": 490015744 }, { "epoch": 1.04, "learning_rate": 0.0004300501504513541, "loss": 3.1495, "theoretical_loss": 3.9233299037041327, "tokens_seen": 490081280 }, { "epoch": 1.04, "learning_rate": 0.00043004012036108326, "loss": 3.0425, "theoretical_loss": 3.923273203801539, "tokens_seen": 490146816 }, { "epoch": 1.05, "learning_rate": 0.00043003009027081245, "loss": 2.9781, "theoretical_loss": 3.9232165136020094, "tokens_seen": 490212352 }, { "epoch": 1.05, "learning_rate": 0.00043002006018054163, "loss": 3.2052, "theoretical_loss": 3.9231598331025856, "tokens_seen": 490277888 }, { "epoch": 1.05, "learning_rate": 0.00043001003009027086, "loss": 3.1671, "theoretical_loss": 3.923103162300312, "tokens_seen": 490343424 }, { "epoch": 1.05, "learning_rate": 0.00043, "loss": 3.0317, "theoretical_loss": 3.9230465011922333, "tokens_seen": 490408960 }, { "epoch": 1.05, "learning_rate": 0.0004299899699097292, "loss": 3.1664, "theoretical_loss": 3.922989849775396, "tokens_seen": 490474496 }, { "epoch": 1.05, "learning_rate": 0.00042997993981945835, "loss": 3.134, "theoretical_loss": 3.9229332080468486, "tokens_seen": 490540032 }, { "epoch": 1.05, "learning_rate": 0.0004299699097291876, "loss": 3.2218, "theoretical_loss": 3.92287657600364, "tokens_seen": 490605568 }, { "epoch": 1.05, "learning_rate": 0.00042995987963891677, "loss": 2.8293, "theoretical_loss": 3.9228199536428203, "tokens_seen": 490671104 }, { "epoch": 1.05, "learning_rate": 0.00042994984954864595, "loss": 3.1865, "theoretical_loss": 3.922763340961442, "tokens_seen": 490736640 }, { "epoch": 1.05, "learning_rate": 0.00042993981945837513, "loss": 3.1221, "theoretical_loss": 3.922706737956557, "tokens_seen": 490802176 }, { "epoch": 1.05, "learning_rate": 0.00042992978936810437, "loss": 3.0768, "theoretical_loss": 3.92265014462522, "tokens_seen": 490867712 }, { "epoch": 1.05, "learning_rate": 0.0004299197592778335, "loss": 3.1863, "theoretical_loss": 3.922593560964487, "tokens_seen": 490933248 }, { "epoch": 1.05, "learning_rate": 0.00042990972918756273, "loss": 3.1557, "theoretical_loss": 3.922536986971415, "tokens_seen": 490998784 }, { "epoch": 1.05, "learning_rate": 0.00042989969909729185, "loss": 3.1163, "theoretical_loss": 3.9224804226430607, "tokens_seen": 491064320 }, { "epoch": 1.05, "learning_rate": 0.0004298896690070211, "loss": 2.9946, "theoretical_loss": 3.922423867976485, "tokens_seen": 491129856 }, { "epoch": 1.05, "learning_rate": 0.00042987963891675027, "loss": 3.0294, "theoretical_loss": 3.9223673229687486, "tokens_seen": 491195392 }, { "epoch": 1.05, "learning_rate": 0.00042986960882647945, "loss": 3.1109, "theoretical_loss": 3.9223107876169125, "tokens_seen": 491260928 }, { "epoch": 1.05, "learning_rate": 0.00042985957873620863, "loss": 3.0657, "theoretical_loss": 3.9222542619180416, "tokens_seen": 491326464 }, { "epoch": 1.05, "learning_rate": 0.0004298495486459378, "loss": 3.0744, "theoretical_loss": 3.922197745869199, "tokens_seen": 491392000 }, { "epoch": 1.05, "learning_rate": 0.000429839518555667, "loss": 3.0638, "theoretical_loss": 3.922141239467451, "tokens_seen": 491457536 }, { "debugging/Self-BLEU-5": 0.5735121597250681, "debugging/distinct-1-grams": 0.7725484387095595, "debugging/distinct-2-grams": 0.9624432265812667, "debugging/entropy-1-grams": 6.1736279149786775, "debugging/entropy-2-grams": 7.286824547207616, "debugging/length": 527.6190476190476, "debugging/num_segments": 21, "epoch": 1.05, "objective/train/docs_used": 806881, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.990661382675171, "objective/train/theoretical_loss": 3.922098865995282, "objective/train/tokens_used": 511966688, "theoretical_loss": 3.922098865995282, "tokens_seen": 491506688 }, { "epoch": 1.05, "learning_rate": 0.00042982948846539623, "loss": 3.1148, "theoretical_loss": 3.922084742709865, "tokens_seen": 491523072 }, { "epoch": 1.05, "learning_rate": 0.00042981945837512536, "loss": 3.013, "theoretical_loss": 3.922028255593509, "tokens_seen": 491588608 }, { "epoch": 1.05, "learning_rate": 0.0004298094282848546, "loss": 2.9811, "theoretical_loss": 3.921971778115454, "tokens_seen": 491654144 }, { "epoch": 1.05, "learning_rate": 0.0004297993981945837, "loss": 3.1154, "theoretical_loss": 3.9219153102727695, "tokens_seen": 491719680 }, { "epoch": 1.05, "learning_rate": 0.00042978936810431296, "loss": 3.0265, "theoretical_loss": 3.9218588520625284, "tokens_seen": 491785216 }, { "epoch": 1.05, "learning_rate": 0.00042977933801404214, "loss": 3.1929, "theoretical_loss": 3.9218024034818044, "tokens_seen": 491850752 }, { "epoch": 1.05, "learning_rate": 0.0004297693079237713, "loss": 3.0315, "theoretical_loss": 3.921745964527672, "tokens_seen": 491916288 }, { "epoch": 1.05, "learning_rate": 0.0004297592778335005, "loss": 3.0472, "theoretical_loss": 3.921689535197207, "tokens_seen": 491981824 }, { "epoch": 1.05, "learning_rate": 0.00042974924774322973, "loss": 2.918, "theoretical_loss": 3.9216331154874884, "tokens_seen": 492047360 }, { "epoch": 1.05, "learning_rate": 0.00042973921765295886, "loss": 3.2541, "theoretical_loss": 3.9215767053955934, "tokens_seen": 492112896 }, { "epoch": 1.05, "learning_rate": 0.0004297291875626881, "loss": 2.9245, "theoretical_loss": 3.9215203049186016, "tokens_seen": 492178432 }, { "epoch": 1.05, "learning_rate": 0.0004297191574724172, "loss": 3.24, "theoretical_loss": 3.9214639140535956, "tokens_seen": 492243968 }, { "epoch": 1.05, "learning_rate": 0.00042970912738214646, "loss": 2.9029, "theoretical_loss": 3.9214075327976574, "tokens_seen": 492309504 }, { "epoch": 1.05, "learning_rate": 0.00042969909729187564, "loss": 3.0068, "theoretical_loss": 3.92135116114787, "tokens_seen": 492375040 }, { "epoch": 1.05, "learning_rate": 0.0004296890672016048, "loss": 3.1647, "theoretical_loss": 3.92129479910132, "tokens_seen": 492440576 }, { "epoch": 1.05, "learning_rate": 0.000429679037111334, "loss": 3.1013, "theoretical_loss": 3.921238446655092, "tokens_seen": 492506112 }, { "epoch": 1.05, "learning_rate": 0.0004296690070210632, "loss": 3.1736, "theoretical_loss": 3.9211821038062746, "tokens_seen": 492571648 }, { "epoch": 1.05, "learning_rate": 0.00042965897693079236, "loss": 3.0449, "theoretical_loss": 3.9211257705519564, "tokens_seen": 492637184 }, { "epoch": 1.05, "learning_rate": 0.0004296489468405216, "loss": 3.0858, "theoretical_loss": 3.9210694468892275, "tokens_seen": 492702720 }, { "epoch": 1.05, "learning_rate": 0.0004296389167502507, "loss": 2.9477, "theoretical_loss": 3.9210131328151796, "tokens_seen": 492768256 }, { "epoch": 1.05, "learning_rate": 0.00042962888665997996, "loss": 3.1172, "theoretical_loss": 3.920956828326905, "tokens_seen": 492833792 }, { "epoch": 1.05, "learning_rate": 0.0004296188565697091, "loss": 3.0133, "theoretical_loss": 3.9209005334214986, "tokens_seen": 492899328 }, { "epoch": 1.05, "learning_rate": 0.0004296088264794383, "loss": 3.0496, "theoretical_loss": 3.9208442480960537, "tokens_seen": 492964864 }, { "epoch": 1.05, "learning_rate": 0.0004295987963891675, "loss": 3.0614, "theoretical_loss": 3.9207879723476684, "tokens_seen": 493030400 }, { "epoch": 1.05, "learning_rate": 0.0004295887662988967, "loss": 3.0788, "theoretical_loss": 3.9207317061734397, "tokens_seen": 493095936 }, { "epoch": 1.05, "objective/train/docs_used": 809707, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1040475368499756, "objective/train/theoretical_loss": 3.920689512824064, "objective/train/tokens_used": 513605088, "theoretical_loss": 3.920689512824064, "tokens_seen": 493145088 }, { "epoch": 1.05, "learning_rate": 0.00042957873620862587, "loss": 3.2046, "theoretical_loss": 3.920675449570467, "tokens_seen": 493161472 }, { "epoch": 1.05, "learning_rate": 0.0004295687061183551, "loss": 3.2003, "theoretical_loss": 3.9206192025358506, "tokens_seen": 493227008 }, { "epoch": 1.05, "learning_rate": 0.00042955867602808423, "loss": 3.0359, "theoretical_loss": 3.9205629650666918, "tokens_seen": 493292544 }, { "epoch": 1.05, "learning_rate": 0.00042954864593781347, "loss": 3.0085, "theoretical_loss": 3.9205067371600935, "tokens_seen": 493358080 }, { "epoch": 1.05, "learning_rate": 0.0004295386158475426, "loss": 3.1952, "theoretical_loss": 3.9204505188131593, "tokens_seen": 493423616 }, { "epoch": 1.05, "learning_rate": 0.00042952858575727183, "loss": 3.106, "theoretical_loss": 3.9203943100229957, "tokens_seen": 493489152 }, { "epoch": 1.05, "learning_rate": 0.000429518555667001, "loss": 3.1107, "theoretical_loss": 3.9203381107867075, "tokens_seen": 493554688 }, { "epoch": 1.05, "learning_rate": 0.0004295085255767302, "loss": 3.0397, "theoretical_loss": 3.920281921101404, "tokens_seen": 493620224 }, { "epoch": 1.05, "learning_rate": 0.00042949849548645937, "loss": 3.1409, "theoretical_loss": 3.920225740964194, "tokens_seen": 493685760 }, { "epoch": 1.05, "learning_rate": 0.00042948846539618855, "loss": 3.1442, "theoretical_loss": 3.9201695703721873, "tokens_seen": 493751296 }, { "epoch": 1.05, "learning_rate": 0.00042947843530591773, "loss": 3.1429, "theoretical_loss": 3.9201134093224965, "tokens_seen": 493816832 }, { "epoch": 1.05, "learning_rate": 0.00042946840521564697, "loss": 3.223, "theoretical_loss": 3.920057257812233, "tokens_seen": 493882368 }, { "epoch": 1.05, "learning_rate": 0.0004294583751253761, "loss": 2.9504, "theoretical_loss": 3.920001115838512, "tokens_seen": 493947904 }, { "epoch": 1.05, "learning_rate": 0.00042944834503510533, "loss": 3.0647, "theoretical_loss": 3.9199449833984494, "tokens_seen": 494013440 }, { "epoch": 1.05, "learning_rate": 0.0004294383149448345, "loss": 3.0059, "theoretical_loss": 3.919888860489161, "tokens_seen": 494078976 }, { "epoch": 1.05, "learning_rate": 0.0004294282848545637, "loss": 2.9787, "theoretical_loss": 3.9198327471077645, "tokens_seen": 494144512 }, { "epoch": 1.05, "learning_rate": 0.00042941825476429293, "loss": 2.9841, "theoretical_loss": 3.9197766432513794, "tokens_seen": 494210048 }, { "epoch": 1.05, "learning_rate": 0.00042940822467402206, "loss": 3.0192, "theoretical_loss": 3.919720548917126, "tokens_seen": 494275584 }, { "epoch": 1.05, "learning_rate": 0.0004293981945837513, "loss": 2.9567, "theoretical_loss": 3.9196644641021265, "tokens_seen": 494341120 }, { "epoch": 1.05, "learning_rate": 0.00042938816449348047, "loss": 3.0177, "theoretical_loss": 3.919608388803503, "tokens_seen": 494406656 }, { "epoch": 1.05, "learning_rate": 0.00042937813440320965, "loss": 3.3184, "theoretical_loss": 3.9195523230183804, "tokens_seen": 494472192 }, { "epoch": 1.05, "learning_rate": 0.00042936810431293883, "loss": 2.9234, "theoretical_loss": 3.919496266743883, "tokens_seen": 494537728 }, { "epoch": 1.05, "learning_rate": 0.000429358074222668, "loss": 3.2351, "theoretical_loss": 3.9194402199771385, "tokens_seen": 494603264 }, { "epoch": 1.05, "learning_rate": 0.0004293480441323972, "loss": 3.1544, "theoretical_loss": 3.919384182715275, "tokens_seen": 494668800 }, { "epoch": 1.05, "learning_rate": 0.00042933801404212643, "loss": 3.0746, "theoretical_loss": 3.9193281549554206, "tokens_seen": 494734336 }, { "epoch": 1.05, "objective/train/docs_used": 811217, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.937518358230591, "objective/train/theoretical_loss": 3.9192861403694974, "objective/train/tokens_used": 515243488, "theoretical_loss": 3.9192861403694974, "tokens_seen": 494783488 }, { "epoch": 1.05, "learning_rate": 0.00042932798395185556, "loss": 3.0427, "theoretical_loss": 3.9192721366947065, "tokens_seen": 494799872 }, { "epoch": 1.05, "learning_rate": 0.0004293179538615848, "loss": 3.1604, "theoretical_loss": 3.9192161279302637, "tokens_seen": 494865408 }, { "epoch": 1.05, "learning_rate": 0.0004293079237713139, "loss": 3.1389, "theoretical_loss": 3.9191601286592253, "tokens_seen": 494930944 }, { "epoch": 1.05, "learning_rate": 0.00042929789368104316, "loss": 3.0944, "theoretical_loss": 3.919104138878726, "tokens_seen": 494996480 }, { "epoch": 1.05, "learning_rate": 0.00042928786359077234, "loss": 3.0903, "theoretical_loss": 3.919048158585901, "tokens_seen": 495062016 }, { "epoch": 1.05, "learning_rate": 0.0004292778335005015, "loss": 3.1424, "theoretical_loss": 3.918992187777886, "tokens_seen": 495127552 }, { "epoch": 1.05, "learning_rate": 0.0004292678034102307, "loss": 3.2746, "theoretical_loss": 3.9189362264518204, "tokens_seen": 495193088 }, { "epoch": 1.05, "learning_rate": 0.00042925777331995993, "loss": 3.0186, "theoretical_loss": 3.9188802746048417, "tokens_seen": 495258624 }, { "epoch": 1.05, "learning_rate": 0.00042924774322968906, "loss": 3.036, "theoretical_loss": 3.9188243322340917, "tokens_seen": 495324160 }, { "epoch": 1.05, "learning_rate": 0.0004292377131394183, "loss": 3.1071, "theoretical_loss": 3.9187683993367113, "tokens_seen": 495389696 }, { "epoch": 1.05, "learning_rate": 0.0004292276830491474, "loss": 3.0834, "theoretical_loss": 3.9187124759098433, "tokens_seen": 495455232 }, { "epoch": 1.05, "learning_rate": 0.00042921765295887666, "loss": 3.091, "theoretical_loss": 3.9186565619506313, "tokens_seen": 495520768 }, { "epoch": 1.05, "learning_rate": 0.00042920762286860584, "loss": 2.8008, "theoretical_loss": 3.9186006574562215, "tokens_seen": 495586304 }, { "epoch": 1.05, "learning_rate": 0.000429197592778335, "loss": 3.1642, "theoretical_loss": 3.9185447624237604, "tokens_seen": 495651840 }, { "epoch": 1.05, "learning_rate": 0.0004291875626880642, "loss": 2.9861, "theoretical_loss": 3.9184888768503954, "tokens_seen": 495717376 }, { "epoch": 1.05, "learning_rate": 0.0004291775325977934, "loss": 3.1063, "theoretical_loss": 3.918433000733275, "tokens_seen": 495782912 }, { "epoch": 1.05, "learning_rate": 0.00042916750250752256, "loss": 3.0597, "theoretical_loss": 3.918377134069551, "tokens_seen": 495848448 }, { "epoch": 1.05, "learning_rate": 0.0004291574724172518, "loss": 2.998, "theoretical_loss": 3.9183212768563735, "tokens_seen": 495913984 }, { "epoch": 1.05, "learning_rate": 0.0004291474423269809, "loss": 3.0487, "theoretical_loss": 3.9182654290908956, "tokens_seen": 495979520 }, { "epoch": 1.05, "learning_rate": 0.00042913741223671016, "loss": 3.119, "theoretical_loss": 3.9182095907702714, "tokens_seen": 496045056 }, { "epoch": 1.05, "learning_rate": 0.0004291273821464393, "loss": 3.0622, "theoretical_loss": 3.918153761891656, "tokens_seen": 496110592 }, { "epoch": 1.05, "learning_rate": 0.0004291173520561685, "loss": 3.2584, "theoretical_loss": 3.9180979424522064, "tokens_seen": 496176128 }, { "epoch": 1.05, "learning_rate": 0.0004291073219658977, "loss": 3.0769, "theoretical_loss": 3.9180421324490795, "tokens_seen": 496241664 }, { "epoch": 1.05, "learning_rate": 0.0004290972918756269, "loss": 3.1545, "theoretical_loss": 3.917986331879434, "tokens_seen": 496307200 }, { "epoch": 1.05, "learning_rate": 0.00042908726178535607, "loss": 3.0994, "theoretical_loss": 3.9179305407404312, "tokens_seen": 496372736 }, { "epoch": 1.05, "objective/train/docs_used": 815081, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.079500675201416, "objective/train/theoretical_loss": 3.91788870357333, "objective/train/tokens_used": 516881888, "theoretical_loss": 3.91788870357333, "tokens_seen": 496421888 }, { "epoch": 1.05, "learning_rate": 0.0004290772316950853, "loss": 3.0032, "theoretical_loss": 3.9178747590292313, "tokens_seen": 496438272 }, { "epoch": 1.05, "learning_rate": 0.00042906720160481443, "loss": 3.0286, "theoretical_loss": 3.917818986742997, "tokens_seen": 496503808 }, { "epoch": 1.05, "learning_rate": 0.00042905717151454367, "loss": 3.1936, "theoretical_loss": 3.917763223878893, "tokens_seen": 496569344 }, { "epoch": 1.05, "learning_rate": 0.0004290471414242728, "loss": 3.1134, "theoretical_loss": 3.9177074704340833, "tokens_seen": 496634880 }, { "epoch": 1.05, "learning_rate": 0.00042903711133400203, "loss": 3.1409, "theoretical_loss": 3.917651726405735, "tokens_seen": 496700416 }, { "epoch": 1.05, "learning_rate": 0.0004290270812437312, "loss": 3.2717, "theoretical_loss": 3.917595991791015, "tokens_seen": 496765952 }, { "epoch": 1.05, "learning_rate": 0.0004290170511534604, "loss": 3.1203, "theoretical_loss": 3.9175402665870926, "tokens_seen": 496831488 }, { "epoch": 1.05, "learning_rate": 0.00042900702106318957, "loss": 3.0971, "theoretical_loss": 3.9174845507911367, "tokens_seen": 496897024 }, { "epoch": 1.05, "learning_rate": 0.00042899699097291875, "loss": 3.1173, "theoretical_loss": 3.9174288444003196, "tokens_seen": 496962560 }, { "epoch": 1.05, "learning_rate": 0.00042898696088264793, "loss": 3.1351, "theoretical_loss": 3.917373147411813, "tokens_seen": 497028096 }, { "epoch": 1.05, "learning_rate": 0.00042897693079237717, "loss": 3.0687, "theoretical_loss": 3.917317459822791, "tokens_seen": 497093632 }, { "epoch": 1.05, "learning_rate": 0.0004289669007021063, "loss": 3.1984, "theoretical_loss": 3.917261781630428, "tokens_seen": 497159168 }, { "epoch": 1.05, "learning_rate": 0.00042895687061183553, "loss": 2.9289, "theoretical_loss": 3.9172061128319, "tokens_seen": 497224704 }, { "epoch": 1.05, "learning_rate": 0.0004289468405215647, "loss": 3.0247, "theoretical_loss": 3.9171504534243846, "tokens_seen": 497290240 }, { "epoch": 1.05, "learning_rate": 0.0004289368104312939, "loss": 2.9903, "theoretical_loss": 3.9170948034050603, "tokens_seen": 497355776 }, { "epoch": 1.05, "learning_rate": 0.0004289267803410231, "loss": 3.1515, "theoretical_loss": 3.9170391627711068, "tokens_seen": 497421312 }, { "epoch": 1.05, "learning_rate": 0.00042891675025075226, "loss": 2.9902, "theoretical_loss": 3.916983531519705, "tokens_seen": 497486848 }, { "epoch": 1.05, "learning_rate": 0.00042890672016048144, "loss": 3.1241, "theoretical_loss": 3.916927909648037, "tokens_seen": 497552384 }, { "epoch": 1.05, "learning_rate": 0.00042889669007021067, "loss": 3.1422, "theoretical_loss": 3.9168722971532857, "tokens_seen": 497617920 }, { "epoch": 1.05, "learning_rate": 0.0004288866599799398, "loss": 3.0081, "theoretical_loss": 3.9168166940326365, "tokens_seen": 497683456 }, { "epoch": 1.05, "learning_rate": 0.00042887662988966903, "loss": 3.1028, "theoretical_loss": 3.916761100283275, "tokens_seen": 497748992 }, { "epoch": 1.05, "learning_rate": 0.00042886659979939816, "loss": 3.1011, "theoretical_loss": 3.916705515902388, "tokens_seen": 497814528 }, { "epoch": 1.05, "learning_rate": 0.0004288565697091274, "loss": 3.0661, "theoretical_loss": 3.9166499408871633, "tokens_seen": 497880064 }, { "epoch": 1.05, "learning_rate": 0.0004288465396188566, "loss": 3.0889, "theoretical_loss": 3.9165943752347916, "tokens_seen": 497945600 }, { "epoch": 1.05, "learning_rate": 0.00042883650952858576, "loss": 2.9385, "theoretical_loss": 3.9165388189424624, "tokens_seen": 498011136 }, { "epoch": 1.05, "objective/train/docs_used": 816504, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1975326538085938, "objective/train/theoretical_loss": 3.9164971578640544, "objective/train/tokens_used": 518520288, "theoretical_loss": 3.9164971578640544, "tokens_seen": 498060288 }, { "epoch": 1.05, "learning_rate": 0.00042882647943831494, "loss": 3.16, "theoretical_loss": 3.916483272007368, "tokens_seen": 498076672 }, { "epoch": 1.05, "learning_rate": 0.0004288164493480441, "loss": 3.0941, "theoretical_loss": 3.916427734426702, "tokens_seen": 498142208 }, { "epoch": 1.05, "learning_rate": 0.0004288064192577733, "loss": 3.1535, "theoretical_loss": 3.916372206197657, "tokens_seen": 498207744 }, { "epoch": 1.05, "learning_rate": 0.00042879638916750254, "loss": 3.2268, "theoretical_loss": 3.9163166873174307, "tokens_seen": 498273280 }, { "epoch": 1.05, "learning_rate": 0.00042878635907723166, "loss": 3.0498, "theoretical_loss": 3.916261177783219, "tokens_seen": 498338816 }, { "epoch": 1.05, "learning_rate": 0.0004287763289869609, "loss": 3.1019, "theoretical_loss": 3.9162056775922185, "tokens_seen": 498404352 }, { "epoch": 1.05, "learning_rate": 0.0004287662988966901, "loss": 3.1249, "theoretical_loss": 3.91615018674163, "tokens_seen": 498469888 }, { "epoch": 1.05, "learning_rate": 0.00042875626880641926, "loss": 3.1174, "theoretical_loss": 3.916094705228653, "tokens_seen": 498535424 }, { "epoch": 1.05, "learning_rate": 0.00042874623871614844, "loss": 3.1631, "theoretical_loss": 3.9160392330504896, "tokens_seen": 498600960 }, { "epoch": 1.05, "learning_rate": 0.0004287362086258776, "loss": 3.1065, "theoretical_loss": 3.9159837702043427, "tokens_seen": 498666496 }, { "epoch": 1.05, "learning_rate": 0.0004287261785356068, "loss": 3.0978, "theoretical_loss": 3.915928316687415, "tokens_seen": 498732032 }, { "epoch": 1.05, "learning_rate": 0.00042871614844533604, "loss": 3.0886, "theoretical_loss": 3.9158728724969127, "tokens_seen": 498797568 }, { "epoch": 1.05, "learning_rate": 0.00042870611835506517, "loss": 3.1027, "theoretical_loss": 3.915817437630042, "tokens_seen": 498863104 }, { "epoch": 1.05, "learning_rate": 0.0004286960882647944, "loss": 3.0994, "theoretical_loss": 3.915762012084011, "tokens_seen": 498928640 }, { "epoch": 1.05, "learning_rate": 0.00042868605817452353, "loss": 3.2047, "theoretical_loss": 3.9157065958560273, "tokens_seen": 498994176 }, { "epoch": 1.05, "learning_rate": 0.00042867602808425276, "loss": 3.0377, "theoretical_loss": 3.9156511889433014, "tokens_seen": 499059712 }, { "epoch": 1.05, "learning_rate": 0.000428665997993982, "loss": 3.3059, "theoretical_loss": 3.915595791343045, "tokens_seen": 499125248 }, { "epoch": 1.05, "learning_rate": 0.0004286559679037111, "loss": 3.1719, "theoretical_loss": 3.9155404030524696, "tokens_seen": 499190784 }, { "epoch": 1.05, "learning_rate": 0.00042864593781344036, "loss": 3.032, "theoretical_loss": 3.91548502406879, "tokens_seen": 499256320 }, { "epoch": 1.05, "learning_rate": 0.0004286359077231695, "loss": 3.166, "theoretical_loss": 3.9154296543892197, "tokens_seen": 499321856 }, { "epoch": 1.05, "learning_rate": 0.0004286258776328987, "loss": 3.0814, "theoretical_loss": 3.9153742940109755, "tokens_seen": 499387392 }, { "epoch": 1.05, "learning_rate": 0.0004286158475426279, "loss": 3.0878, "theoretical_loss": 3.9153189429312736, "tokens_seen": 499452928 }, { "epoch": 1.05, "learning_rate": 0.0004286058174523571, "loss": 2.8196, "theoretical_loss": 3.915263601147333, "tokens_seen": 499518464 }, { "epoch": 1.05, "learning_rate": 0.00042859578736208627, "loss": 3.1161, "theoretical_loss": 3.9152082686563743, "tokens_seen": 499584000 }, { "epoch": 1.05, "learning_rate": 0.0004285857572718155, "loss": 3.1599, "theoretical_loss": 3.9151529454556164, "tokens_seen": 499649536 }, { "epoch": 1.05, "objective/train/docs_used": 819336, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1839540004730225, "objective/train/theoretical_loss": 3.915111459150072, "objective/train/tokens_used": 520158688, "theoretical_loss": 3.915111459150072, "tokens_seen": 499698688 }, { "epoch": 1.05, "learning_rate": 0.00042857572718154463, "loss": 3.1505, "theoretical_loss": 3.9150976315422827, "tokens_seen": 499715072 }, { "epoch": 1.05, "learning_rate": 0.00042856569709127387, "loss": 3.109, "theoretical_loss": 3.9150423269135954, "tokens_seen": 499780608 }, { "epoch": 1.05, "learning_rate": 0.000428555667001003, "loss": 3.0834, "theoretical_loss": 3.9149870315667794, "tokens_seen": 499846144 }, { "epoch": 1.05, "learning_rate": 0.00042854563691073223, "loss": 3.0828, "theoretical_loss": 3.91493174549906, "tokens_seen": 499911680 }, { "epoch": 1.05, "learning_rate": 0.0004285356068204614, "loss": 3.1885, "theoretical_loss": 3.9148764687076643, "tokens_seen": 499977216 }, { "epoch": 1.05, "learning_rate": 0.0004285255767301906, "loss": 3.1729, "theoretical_loss": 3.9148212011898194, "tokens_seen": 500042752 }, { "epoch": 1.05, "learning_rate": 0.00042851554663991977, "loss": 3.1521, "theoretical_loss": 3.914765942942756, "tokens_seen": 500108288 }, { "epoch": 1.05, "learning_rate": 0.00042850551654964895, "loss": 3.0967, "theoretical_loss": 3.914710693963702, "tokens_seen": 500173824 }, { "epoch": 1.05, "learning_rate": 0.00042849548645937813, "loss": 3.1478, "theoretical_loss": 3.9146554542498913, "tokens_seen": 500239360 }, { "epoch": 1.05, "learning_rate": 0.00042848545636910737, "loss": 2.9949, "theoretical_loss": 3.9146002237985553, "tokens_seen": 500304896 }, { "epoch": 1.05, "learning_rate": 0.0004284754262788365, "loss": 3.13, "theoretical_loss": 3.9145450026069284, "tokens_seen": 500370432 }, { "epoch": 1.05, "learning_rate": 0.00042846539618856573, "loss": 3.3203, "theoretical_loss": 3.9144897906722456, "tokens_seen": 500435968 }, { "epoch": 1.05, "learning_rate": 0.0004284553660982949, "loss": 3.2012, "theoretical_loss": 3.9144345879917424, "tokens_seen": 500501504 }, { "epoch": 1.05, "learning_rate": 0.0004284453360080241, "loss": 3.1464, "theoretical_loss": 3.9143793945626575, "tokens_seen": 500567040 }, { "epoch": 1.05, "learning_rate": 0.0004284353059177533, "loss": 2.9854, "theoretical_loss": 3.9143242103822287, "tokens_seen": 500632576 }, { "epoch": 1.05, "learning_rate": 0.00042842527582748246, "loss": 3.0417, "theoretical_loss": 3.9142690354476954, "tokens_seen": 500698112 }, { "epoch": 1.05, "learning_rate": 0.00042841524573721164, "loss": 3.1064, "theoretical_loss": 3.9142138697563, "tokens_seen": 500763648 }, { "epoch": 1.05, "learning_rate": 0.00042840521564694087, "loss": 3.0999, "theoretical_loss": 3.9141587133052838, "tokens_seen": 500829184 }, { "epoch": 1.05, "learning_rate": 0.00042839518555667, "loss": 3.1579, "theoretical_loss": 3.9141035660918897, "tokens_seen": 500894720 }, { "epoch": 1.05, "learning_rate": 0.00042838515546639923, "loss": 3.1981, "theoretical_loss": 3.9140484281133627, "tokens_seen": 500960256 }, { "epoch": 1.05, "learning_rate": 0.00042837512537612836, "loss": 2.9984, "theoretical_loss": 3.9139932993669495, "tokens_seen": 501025792 }, { "epoch": 1.05, "learning_rate": 0.0004283650952858576, "loss": 3.1578, "theoretical_loss": 3.913938179849896, "tokens_seen": 501091328 }, { "epoch": 1.05, "learning_rate": 0.0004283550651955868, "loss": 2.9664, "theoretical_loss": 3.91388306955945, "tokens_seen": 501156864 }, { "epoch": 1.05, "learning_rate": 0.00042834503510531596, "loss": 3.1478, "theoretical_loss": 3.9138279684928614, "tokens_seen": 501222400 }, { "epoch": 1.05, "learning_rate": 0.00042833500501504514, "loss": 3.1003, "theoretical_loss": 3.9137728766473803, "tokens_seen": 501287936 }, { "epoch": 1.05, "objective/train/docs_used": 822418, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3057937622070312, "objective/train/theoretical_loss": 3.9137315638129686, "objective/train/tokens_used": 521797088, "theoretical_loss": 3.9137315638129686, "tokens_seen": 501337088 }, { "epoch": 1.05, "learning_rate": 0.0004283249749247743, "loss": 3.3623, "theoretical_loss": 3.9137177940202594, "tokens_seen": 501353472 }, { "epoch": 1.05, "learning_rate": 0.0004283149448345035, "loss": 3.2045, "theoretical_loss": 3.91366272060875, "tokens_seen": 501419008 }, { "epoch": 1.05, "learning_rate": 0.00042830491474423274, "loss": 3.014, "theoretical_loss": 3.9136076564101066, "tokens_seen": 501484544 }, { "epoch": 1.05, "learning_rate": 0.00042829488465396186, "loss": 3.2773, "theoretical_loss": 3.9135526014215847, "tokens_seen": 501550080 }, { "epoch": 1.05, "learning_rate": 0.0004282848545636911, "loss": 3.0541, "theoretical_loss": 3.9134975556404403, "tokens_seen": 501615616 }, { "epoch": 1.05, "learning_rate": 0.0004282748244734203, "loss": 3.0142, "theoretical_loss": 3.9134425190639313, "tokens_seen": 501681152 }, { "epoch": 1.05, "learning_rate": 0.00042826479438314946, "loss": 3.0248, "theoretical_loss": 3.913387491689316, "tokens_seen": 501746688 }, { "epoch": 1.05, "learning_rate": 0.00042825476429287864, "loss": 3.1052, "theoretical_loss": 3.9133324735138544, "tokens_seen": 501812224 }, { "epoch": 1.05, "learning_rate": 0.0004282447342026078, "loss": 3.0813, "theoretical_loss": 3.913277464534808, "tokens_seen": 501877760 }, { "epoch": 1.05, "learning_rate": 0.000428234704112337, "loss": 3.2338, "theoretical_loss": 3.9132224647494382, "tokens_seen": 501943296 }, { "epoch": 1.05, "learning_rate": 0.00042822467402206624, "loss": 3.0884, "theoretical_loss": 3.9131674741550087, "tokens_seen": 502008832 }, { "epoch": 1.05, "learning_rate": 0.00042821464393179537, "loss": 3.108, "theoretical_loss": 3.9131124927487844, "tokens_seen": 502074368 }, { "epoch": 1.05, "learning_rate": 0.0004282046138415246, "loss": 3.2271, "theoretical_loss": 3.9130575205280307, "tokens_seen": 502139904 }, { "epoch": 1.05, "learning_rate": 0.00042819458375125373, "loss": 3.0536, "theoretical_loss": 3.9130025574900147, "tokens_seen": 502205440 }, { "epoch": 1.05, "learning_rate": 0.00042818455366098296, "loss": 3.1876, "theoretical_loss": 3.912947603632004, "tokens_seen": 502270976 }, { "epoch": 1.05, "learning_rate": 0.00042817452357071215, "loss": 3.0373, "theoretical_loss": 3.912892658951268, "tokens_seen": 502336512 }, { "epoch": 1.05, "learning_rate": 0.0004281644934804413, "loss": 3.0355, "theoretical_loss": 3.912837723445078, "tokens_seen": 502402048 }, { "epoch": 1.05, "learning_rate": 0.0004281544633901705, "loss": 3.2219, "theoretical_loss": 3.9127827971107045, "tokens_seen": 502467584 }, { "epoch": 1.05, "learning_rate": 0.0004281444332998997, "loss": 3.2192, "theoretical_loss": 3.912727879945421, "tokens_seen": 502533120 }, { "epoch": 1.05, "learning_rate": 0.00042813440320962887, "loss": 2.9169, "theoretical_loss": 3.9126729719465003, "tokens_seen": 502598656 }, { "epoch": 1.05, "learning_rate": 0.0004281243731193581, "loss": 3.0152, "theoretical_loss": 3.9126180731112186, "tokens_seen": 502664192 }, { "epoch": 1.05, "learning_rate": 0.00042811434302908723, "loss": 3.1212, "theoretical_loss": 3.912563183436852, "tokens_seen": 502729728 }, { "epoch": 1.05, "learning_rate": 0.00042810431293881647, "loss": 3.0153, "theoretical_loss": 3.9125083029206777, "tokens_seen": 502795264 }, { "epoch": 1.05, "learning_rate": 0.00042809428284854565, "loss": 3.0166, "theoretical_loss": 3.9124534315599737, "tokens_seen": 502860800 }, { "epoch": 1.05, "learning_rate": 0.00042808425275827483, "loss": 3.0645, "theoretical_loss": 3.912398569352021, "tokens_seen": 502926336 }, { "epoch": 1.05, "objective/train/docs_used": 825354, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2402231693267822, "objective/train/theoretical_loss": 3.9123574287009126, "objective/train/tokens_used": 523435488, "theoretical_loss": 3.9123574287009126, "tokens_seen": 502975488 }, { "epoch": 1.05, "learning_rate": 0.000428074222668004, "loss": 3.1296, "theoretical_loss": 3.912343716294099, "tokens_seen": 502991872 }, { "epoch": 1.05, "learning_rate": 0.0004280641925777332, "loss": 2.957, "theoretical_loss": 3.912288872383491, "tokens_seen": 503057408 }, { "epoch": 1.05, "learning_rate": 0.0004280541624874624, "loss": 3.1483, "theoretical_loss": 3.9122340376174796, "tokens_seen": 503122944 }, { "epoch": 1.05, "learning_rate": 0.0004280441323971916, "loss": 3.0399, "theoretical_loss": 3.912179211993349, "tokens_seen": 503188480 }, { "epoch": 1.05, "learning_rate": 0.00042803410230692074, "loss": 3.1104, "theoretical_loss": 3.9121243955083855, "tokens_seen": 503254016 }, { "epoch": 1.05, "learning_rate": 0.00042802407221664997, "loss": 3.0085, "theoretical_loss": 3.9120695881598753, "tokens_seen": 503319552 }, { "epoch": 1.05, "learning_rate": 0.0004280140421263791, "loss": 3.039, "theoretical_loss": 3.912014789945106, "tokens_seen": 503385088 }, { "epoch": 1.05, "learning_rate": 0.00042800401203610833, "loss": 3.309, "theoretical_loss": 3.9119600008613675, "tokens_seen": 503450624 }, { "epoch": 1.05, "learning_rate": 0.0004279939819458375, "loss": 2.8351, "theoretical_loss": 3.9119052209059486, "tokens_seen": 503516160 }, { "epoch": 1.05, "learning_rate": 0.0004279839518555667, "loss": 3.1173, "theoretical_loss": 3.911850450076142, "tokens_seen": 503581696 }, { "epoch": 1.05, "learning_rate": 0.0004279739217652959, "loss": 3.1199, "theoretical_loss": 3.9117956883692395, "tokens_seen": 503647232 }, { "epoch": 1.05, "learning_rate": 0.0004279638916750251, "loss": 2.9904, "theoretical_loss": 3.911740935782535, "tokens_seen": 503712768 }, { "epoch": 1.05, "learning_rate": 0.00042795386158475424, "loss": 3.0305, "theoretical_loss": 3.911686192313323, "tokens_seen": 503778304 }, { "epoch": 1.05, "learning_rate": 0.0004279438314944835, "loss": 2.9688, "theoretical_loss": 3.911631457958899, "tokens_seen": 503843840 }, { "epoch": 1.05, "learning_rate": 0.0004279338014042126, "loss": 3.2224, "theoretical_loss": 3.911576732716562, "tokens_seen": 503909376 }, { "epoch": 1.05, "learning_rate": 0.00042792377131394184, "loss": 3.0328, "theoretical_loss": 3.911522016583608, "tokens_seen": 503974912 }, { "epoch": 1.05, "learning_rate": 0.00042791374122367107, "loss": 3.0972, "theoretical_loss": 3.911467309557338, "tokens_seen": 504040448 }, { "epoch": 1.05, "learning_rate": 0.0004279037111334002, "loss": 3.0644, "theoretical_loss": 3.9114126116350514, "tokens_seen": 504105984 }, { "epoch": 1.05, "learning_rate": 0.00042789368104312943, "loss": 3.2529, "theoretical_loss": 3.9113579228140507, "tokens_seen": 504171520 }, { "epoch": 1.05, "learning_rate": 0.00042788365095285856, "loss": 2.9588, "theoretical_loss": 3.9113032430916386, "tokens_seen": 504237056 }, { "epoch": 1.05, "learning_rate": 0.0004278736208625878, "loss": 3.1013, "theoretical_loss": 3.911248572465119, "tokens_seen": 504302592 }, { "epoch": 1.05, "learning_rate": 0.000427863590772317, "loss": 3.1503, "theoretical_loss": 3.9111939109317966, "tokens_seen": 504368128 }, { "epoch": 1.05, "learning_rate": 0.00042785356068204616, "loss": 3.0513, "theoretical_loss": 3.911139258488979, "tokens_seen": 504433664 }, { "epoch": 1.05, "learning_rate": 0.00042784353059177534, "loss": 3.0727, "theoretical_loss": 3.9110846151339724, "tokens_seen": 504499200 }, { "epoch": 1.05, "learning_rate": 0.0004278335005015045, "loss": 3.0848, "theoretical_loss": 3.9110299808640856, "tokens_seen": 504564736 }, { "epoch": 1.05, "objective/train/docs_used": 827406, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.914968967437744, "objective/train/theoretical_loss": 3.9109890111221626, "objective/train/tokens_used": 525073888, "theoretical_loss": 3.9109890111221626, "tokens_seen": 504613888 }, { "epoch": 1.05, "learning_rate": 0.0004278234704112337, "loss": 3.101, "theoretical_loss": 3.910975355676629, "tokens_seen": 504630272 }, { "epoch": 1.05, "learning_rate": 0.00042781344032096294, "loss": 3.0594, "theoretical_loss": 3.910920739568913, "tokens_seen": 504695808 }, { "epoch": 1.05, "learning_rate": 0.00042780341023069206, "loss": 3.1423, "theoretical_loss": 3.91086613253825, "tokens_seen": 504761344 }, { "epoch": 1.05, "learning_rate": 0.0004277933801404213, "loss": 2.8887, "theoretical_loss": 3.9108115345819527, "tokens_seen": 504826880 }, { "epoch": 1.05, "learning_rate": 0.0004277833500501505, "loss": 3.1335, "theoretical_loss": 3.9107569456973357, "tokens_seen": 504892416 }, { "epoch": 1.05, "learning_rate": 0.00042777331995987966, "loss": 3.0739, "theoretical_loss": 3.9107023658817144, "tokens_seen": 504957952 }, { "epoch": 1.05, "learning_rate": 0.00042776328986960884, "loss": 3.1434, "theoretical_loss": 3.9106477951324057, "tokens_seen": 505023488 }, { "epoch": 1.05, "learning_rate": 0.000427753259779338, "loss": 3.0997, "theoretical_loss": 3.910593233446727, "tokens_seen": 505089024 }, { "epoch": 1.05, "learning_rate": 0.0004277432296890672, "loss": 3.0876, "theoretical_loss": 3.910538680821997, "tokens_seen": 505154560 }, { "epoch": 1.05, "learning_rate": 0.00042773319959879644, "loss": 3.0555, "theoretical_loss": 3.910484137255536, "tokens_seen": 505220096 }, { "epoch": 1.05, "learning_rate": 0.00042772316950852557, "loss": 3.1934, "theoretical_loss": 3.910429602744666, "tokens_seen": 505285632 }, { "epoch": 1.05, "learning_rate": 0.0004277131394182548, "loss": 3.1542, "theoretical_loss": 3.910375077286708, "tokens_seen": 505351168 }, { "epoch": 1.05, "learning_rate": 0.00042770310932798393, "loss": 3.1072, "theoretical_loss": 3.9103205608789864, "tokens_seen": 505416704 }, { "epoch": 1.05, "learning_rate": 0.00042769307923771316, "loss": 3.1584, "theoretical_loss": 3.910266053518825, "tokens_seen": 505482240 }, { "epoch": 1.05, "learning_rate": 0.00042768304914744235, "loss": 3.0685, "theoretical_loss": 3.91021155520355, "tokens_seen": 505547776 }, { "epoch": 1.05, "learning_rate": 0.00042767301905717153, "loss": 3.118, "theoretical_loss": 3.9101570659304885, "tokens_seen": 505613312 }, { "epoch": 1.05, "learning_rate": 0.0004276629889669007, "loss": 3.017, "theoretical_loss": 3.9101025856969676, "tokens_seen": 505678848 }, { "epoch": 1.05, "learning_rate": 0.0004276529588766299, "loss": 3.1677, "theoretical_loss": 3.9100481145003183, "tokens_seen": 505744384 }, { "epoch": 1.05, "learning_rate": 0.00042764292878635907, "loss": 3.0655, "theoretical_loss": 3.909993652337868, "tokens_seen": 505809920 }, { "epoch": 1.05, "learning_rate": 0.0004276328986960883, "loss": 3.0296, "theoretical_loss": 3.909939199206951, "tokens_seen": 505875456 }, { "epoch": 1.05, "learning_rate": 0.00042762286860581743, "loss": 3.0816, "theoretical_loss": 3.9098847551048985, "tokens_seen": 505940992 }, { "epoch": 1.05, "learning_rate": 0.00042761283851554667, "loss": 3.1191, "theoretical_loss": 3.909830320029044, "tokens_seen": 506006528 }, { "epoch": 1.05, "learning_rate": 0.00042760280842527585, "loss": 3.0988, "theoretical_loss": 3.9097758939767226, "tokens_seen": 506072064 }, { "epoch": 1.05, "learning_rate": 0.00042759277833500503, "loss": 3.0999, "theoretical_loss": 3.909721476945271, "tokens_seen": 506137600 }, { "epoch": 1.05, "learning_rate": 0.0004275827482447342, "loss": 3.0232, "theoretical_loss": 3.9096670689320243, "tokens_seen": 506203136 }, { "epoch": 1.05, "objective/train/docs_used": 830323, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0432839393615723, "objective/train/theoretical_loss": 3.9096262688386867, "objective/train/tokens_used": 526712288, "theoretical_loss": 3.9096262688386867, "tokens_seen": 506252288 }, { "epoch": 1.05, "learning_rate": 0.0004275727181544634, "loss": 3.0708, "theoretical_loss": 3.909612669934323, "tokens_seen": 506268672 }, { "epoch": 1.05, "learning_rate": 0.0004275626880641926, "loss": 3.2065, "theoretical_loss": 3.909558279949505, "tokens_seen": 506334208 }, { "epoch": 1.05, "learning_rate": 0.0004275526579739218, "loss": 2.9584, "theoretical_loss": 3.9095038989749114, "tokens_seen": 506399744 }, { "epoch": 1.05, "learning_rate": 0.00042754262788365094, "loss": 3.1078, "theoretical_loss": 3.909449527007883, "tokens_seen": 506465280 }, { "epoch": 1.05, "learning_rate": 0.00042753259779338017, "loss": 3.0046, "theoretical_loss": 3.909395164045763, "tokens_seen": 506530816 }, { "epoch": 1.05, "learning_rate": 0.0004275225677031093, "loss": 3.0917, "theoretical_loss": 3.909340810085896, "tokens_seen": 506596352 }, { "epoch": 1.05, "learning_rate": 0.00042751253761283853, "loss": 3.0352, "theoretical_loss": 3.909286465125626, "tokens_seen": 506661888 }, { "epoch": 1.05, "learning_rate": 0.0004275025075225677, "loss": 3.1055, "theoretical_loss": 3.9092321291623, "tokens_seen": 506727424 }, { "epoch": 1.05, "learning_rate": 0.0004274924774322969, "loss": 3.0866, "theoretical_loss": 3.9091778021932635, "tokens_seen": 506792960 }, { "epoch": 1.05, "learning_rate": 0.0004274824473420261, "loss": 2.9368, "theoretical_loss": 3.9091234842158666, "tokens_seen": 506858496 }, { "epoch": 1.05, "learning_rate": 0.0004274724172517553, "loss": 3.1562, "theoretical_loss": 3.909069175227458, "tokens_seen": 506924032 }, { "epoch": 1.05, "learning_rate": 0.00042746238716148444, "loss": 3.1789, "theoretical_loss": 3.909014875225389, "tokens_seen": 506989568 }, { "epoch": 1.05, "learning_rate": 0.0004274523570712137, "loss": 3.0132, "theoretical_loss": 3.9089605842070103, "tokens_seen": 507055104 }, { "epoch": 1.05, "learning_rate": 0.0004274423269809428, "loss": 3.1771, "theoretical_loss": 3.908906302169675, "tokens_seen": 507120640 }, { "epoch": 1.05, "learning_rate": 0.00042743229689067204, "loss": 3.1795, "theoretical_loss": 3.9088520291107383, "tokens_seen": 507186176 }, { "epoch": 1.05, "learning_rate": 0.0004274222668004012, "loss": 3.0419, "theoretical_loss": 3.9087977650275536, "tokens_seen": 507251712 }, { "epoch": 1.05, "learning_rate": 0.0004274122367101304, "loss": 3.0892, "theoretical_loss": 3.9087435099174783, "tokens_seen": 507317248 }, { "epoch": 1.05, "learning_rate": 0.0004274022066198596, "loss": 3.1195, "theoretical_loss": 3.9086892637778687, "tokens_seen": 507382784 }, { "epoch": 1.05, "learning_rate": 0.00042739217652958876, "loss": 3.2775, "theoretical_loss": 3.908635026606084, "tokens_seen": 507448320 }, { "epoch": 1.05, "learning_rate": 0.00042738214643931794, "loss": 3.1814, "theoretical_loss": 3.908580798399484, "tokens_seen": 507513856 }, { "epoch": 1.05, "learning_rate": 0.0004273721163490472, "loss": 2.9541, "theoretical_loss": 3.9085265791554287, "tokens_seen": 507579392 }, { "epoch": 1.05, "learning_rate": 0.0004273620862587763, "loss": 3.0354, "theoretical_loss": 3.90847236887128, "tokens_seen": 507644928 }, { "epoch": 1.05, "learning_rate": 0.00042735205616850554, "loss": 3.1376, "theoretical_loss": 3.9084181675444016, "tokens_seen": 507710464 }, { "epoch": 1.05, "learning_rate": 0.00042734202607823467, "loss": 3.1479, "theoretical_loss": 3.9083639751721564, "tokens_seen": 507776000 }, { "epoch": 1.05, "learning_rate": 0.0004273319959879639, "loss": 3.1411, "theoretical_loss": 3.9083097917519103, "tokens_seen": 507841536 }, { "epoch": 1.05, "objective/train/docs_used": 832992, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0635852813720703, "objective/train/theoretical_loss": 3.908269160059891, "objective/train/tokens_used": 528350688, "theoretical_loss": 3.908269160059891, "tokens_seen": 507890688 }, { "epoch": 1.05, "learning_rate": 0.0004273219658976931, "loss": 3.1711, "theoretical_loss": 3.9082556172810294, "tokens_seen": 507907072 }, { "epoch": 1.05, "learning_rate": 0.00042731193580742226, "loss": 2.9704, "theoretical_loss": 3.9082014517568817, "tokens_seen": 507972608 }, { "epoch": 1.05, "learning_rate": 0.00042730190571715145, "loss": 2.9955, "theoretical_loss": 3.9081472951768346, "tokens_seen": 508038144 }, { "epoch": 1.05, "learning_rate": 0.0004272918756268807, "loss": 3.2359, "theoretical_loss": 3.908093147538258, "tokens_seen": 508103680 }, { "epoch": 1.05, "learning_rate": 0.0004272818455366098, "loss": 3.1361, "theoretical_loss": 3.908039008838523, "tokens_seen": 508169216 }, { "epoch": 1.05, "learning_rate": 0.00042727181544633904, "loss": 3.2096, "theoretical_loss": 3.907984879075001, "tokens_seen": 508234752 }, { "epoch": 1.05, "learning_rate": 0.00042726178535606817, "loss": 3.1249, "theoretical_loss": 3.9079307582450657, "tokens_seen": 508300288 }, { "epoch": 1.05, "learning_rate": 0.0004272517552657974, "loss": 3.105, "theoretical_loss": 3.9078766463460903, "tokens_seen": 508365824 }, { "epoch": 1.05, "learning_rate": 0.0004272417251755266, "loss": 3.0212, "theoretical_loss": 3.9078225433754508, "tokens_seen": 508431360 }, { "epoch": 1.05, "learning_rate": 0.00042723169508525577, "loss": 3.0427, "theoretical_loss": 3.907768449330523, "tokens_seen": 508496896 }, { "epoch": 1.05, "learning_rate": 0.00042722166499498495, "loss": 3.0077, "theoretical_loss": 3.907714364208684, "tokens_seen": 508562432 }, { "epoch": 1.05, "learning_rate": 0.00042721163490471413, "loss": 3.0697, "theoretical_loss": 3.907660288007313, "tokens_seen": 508627968 }, { "epoch": 1.05, "learning_rate": 0.0004272016048144433, "loss": 3.077, "theoretical_loss": 3.907606220723789, "tokens_seen": 508693504 }, { "epoch": 1.05, "learning_rate": 0.00042719157472417255, "loss": 3.0293, "theoretical_loss": 3.907552162355493, "tokens_seen": 508759040 }, { "epoch": 1.05, "learning_rate": 0.0004271815446339017, "loss": 2.931, "theoretical_loss": 3.9074981128998068, "tokens_seen": 508824576 }, { "epoch": 1.05, "learning_rate": 0.0004271715145436309, "loss": 3.2318, "theoretical_loss": 3.9074440723541133, "tokens_seen": 508890112 }, { "epoch": 1.05, "learning_rate": 0.0004271614844533601, "loss": 3.1436, "theoretical_loss": 3.9073900407157964, "tokens_seen": 508955648 }, { "epoch": 1.05, "learning_rate": 0.00042715145436308927, "loss": 3.0373, "theoretical_loss": 3.9073360179822414, "tokens_seen": 509021184 }, { "epoch": 1.05, "learning_rate": 0.0004271414242728185, "loss": 3.1994, "theoretical_loss": 3.9072820041508347, "tokens_seen": 509086720 }, { "epoch": 1.05, "learning_rate": 0.00042713139418254763, "loss": 3.0615, "theoretical_loss": 3.9072279992189634, "tokens_seen": 509152256 }, { "epoch": 1.05, "learning_rate": 0.00042712136409227687, "loss": 3.0748, "theoretical_loss": 3.907174003184016, "tokens_seen": 509217792 }, { "epoch": 1.05, "learning_rate": 0.00042711133400200605, "loss": 3.2379, "theoretical_loss": 3.9071200160433825, "tokens_seen": 509283328 }, { "epoch": 1.05, "learning_rate": 0.00042710130391173523, "loss": 3.1387, "theoretical_loss": 3.9070660377944524, "tokens_seen": 509348864 }, { "epoch": 1.05, "learning_rate": 0.0004270912738214644, "loss": 3.19, "theoretical_loss": 3.907012068434619, "tokens_seen": 509414400 }, { "epoch": 1.05, "learning_rate": 0.0004270812437311936, "loss": 3.0876, "theoretical_loss": 3.906958107961273, "tokens_seen": 509479936 }, { "epoch": 1.05, "objective/train/docs_used": 835907, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9011473655700684, "objective/train/theoretical_loss": 3.9069176434364548, "objective/train/tokens_used": 529989088, "theoretical_loss": 3.9069176434364548, "tokens_seen": 509529088 }, { "epoch": 1.05, "learning_rate": 0.0004270712136409228, "loss": 3.032, "theoretical_loss": 3.9069041563718105, "tokens_seen": 509545472 }, { "epoch": 1.05, "learning_rate": 0.000427061183550652, "loss": 3.0967, "theoretical_loss": 3.906850213663626, "tokens_seen": 509611008 }, { "epoch": 1.05, "learning_rate": 0.00042705115346038114, "loss": 3.0053, "theoretical_loss": 3.906796279834115, "tokens_seen": 509676544 }, { "epoch": 1.05, "learning_rate": 0.00042704112337011037, "loss": 3.1519, "theoretical_loss": 3.906742354880675, "tokens_seen": 509742080 }, { "epoch": 1.05, "learning_rate": 0.0004270310932798395, "loss": 2.9599, "theoretical_loss": 3.906688438800705, "tokens_seen": 509807616 }, { "epoch": 1.05, "learning_rate": 0.00042702106318956873, "loss": 3.1933, "theoretical_loss": 3.9066345315916036, "tokens_seen": 509873152 }, { "epoch": 1.05, "learning_rate": 0.0004270110330992979, "loss": 3.1785, "theoretical_loss": 3.9065806332507718, "tokens_seen": 509938688 }, { "epoch": 1.05, "learning_rate": 0.0004270010030090271, "loss": 3.1744, "theoretical_loss": 3.906526743775611, "tokens_seen": 510004224 }, { "epoch": 1.05, "learning_rate": 0.0004269909729187563, "loss": 3.143, "theoretical_loss": 3.9064728631635237, "tokens_seen": 510069760 }, { "epoch": 1.05, "learning_rate": 0.0004269809428284855, "loss": 3.0922, "theoretical_loss": 3.9064189914119143, "tokens_seen": 510135296 }, { "epoch": 1.05, "learning_rate": 0.00042697091273821464, "loss": 3.1909, "theoretical_loss": 3.9063651285181877, "tokens_seen": 510200832 }, { "epoch": 1.05, "learning_rate": 0.0004269608826479439, "loss": 3.1388, "theoretical_loss": 3.9063112744797497, "tokens_seen": 510266368 }, { "epoch": 1.05, "learning_rate": 0.000426950852557673, "loss": 3.0455, "theoretical_loss": 3.9062574292940067, "tokens_seen": 510331904 }, { "epoch": 1.05, "learning_rate": 0.00042694082246740224, "loss": 3.0382, "theoretical_loss": 3.9062035929583683, "tokens_seen": 510397440 }, { "epoch": 1.05, "learning_rate": 0.0004269307923771314, "loss": 3.0652, "theoretical_loss": 3.906149765470243, "tokens_seen": 510462976 }, { "epoch": 1.05, "learning_rate": 0.0004269207622868606, "loss": 3.0088, "theoretical_loss": 3.906095946827041, "tokens_seen": 510528512 }, { "epoch": 1.05, "learning_rate": 0.0004269107321965898, "loss": 3.1339, "theoretical_loss": 3.9060421370261738, "tokens_seen": 510594048 }, { "epoch": 1.05, "learning_rate": 0.00042690070210631896, "loss": 3.1532, "theoretical_loss": 3.9059883360650547, "tokens_seen": 510659584 }, { "epoch": 1.05, "learning_rate": 0.00042689067201604814, "loss": 3.1551, "theoretical_loss": 3.905934543941097, "tokens_seen": 510725120 }, { "epoch": 1.05, "learning_rate": 0.0004268806419257774, "loss": 2.9677, "theoretical_loss": 3.9058807606517147, "tokens_seen": 510790656 }, { "epoch": 1.05, "learning_rate": 0.0004268706118355065, "loss": 2.9826, "theoretical_loss": 3.9058269861943247, "tokens_seen": 510856192 }, { "epoch": 1.05, "learning_rate": 0.00042686058174523574, "loss": 3.0789, "theoretical_loss": 3.905773220566343, "tokens_seen": 510921728 }, { "epoch": 1.05, "learning_rate": 0.00042685055165496487, "loss": 3.1411, "theoretical_loss": 3.905719463765188, "tokens_seen": 510987264 }, { "epoch": 1.05, "learning_rate": 0.0004268405215646941, "loss": 3.0563, "theoretical_loss": 3.9056657157882793, "tokens_seen": 511052800 }, { "epoch": 1.05, "learning_rate": 0.0004268304914744233, "loss": 3.0085, "theoretical_loss": 3.905611976633036, "tokens_seen": 511118336 }, { "epoch": 1.05, "objective/train/docs_used": 838646, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0957727432250977, "objective/train/theoretical_loss": 3.9055716780542706, "objective/train/tokens_used": 531627488, "theoretical_loss": 3.9055716780542706, "tokens_seen": 511167488 }, { "epoch": 1.05, "learning_rate": 0.00042682046138415246, "loss": 3.0371, "theoretical_loss": 3.90555824629688, "tokens_seen": 511183872 }, { "epoch": 1.05, "learning_rate": 0.00042681043129388165, "loss": 3.0999, "theoretical_loss": 3.9055045247772338, "tokens_seen": 511249408 }, { "epoch": 1.05, "learning_rate": 0.0004268004012036109, "loss": 3.0192, "theoretical_loss": 3.9054508120715203, "tokens_seen": 511314944 }, { "epoch": 1.05, "learning_rate": 0.00042679037111334, "loss": 3.1457, "theoretical_loss": 3.905397108177165, "tokens_seen": 511380480 }, { "epoch": 1.05, "learning_rate": 0.00042678034102306924, "loss": 3.0026, "theoretical_loss": 3.9053434130915923, "tokens_seen": 511446016 }, { "epoch": 1.05, "learning_rate": 0.00042677031093279837, "loss": 3.0901, "theoretical_loss": 3.905289726812229, "tokens_seen": 511511552 }, { "epoch": 1.05, "learning_rate": 0.0004267602808425276, "loss": 3.0636, "theoretical_loss": 3.905236049336504, "tokens_seen": 511577088 }, { "epoch": 1.05, "learning_rate": 0.0004267502507522568, "loss": 3.1424, "theoretical_loss": 3.905182380661845, "tokens_seen": 511642624 }, { "epoch": 1.05, "learning_rate": 0.00042674022066198597, "loss": 3.2019, "theoretical_loss": 3.905128720785682, "tokens_seen": 511708160 }, { "epoch": 1.05, "learning_rate": 0.00042673019057171515, "loss": 2.9935, "theoretical_loss": 3.9050750697054464, "tokens_seen": 511773696 }, { "epoch": 1.05, "learning_rate": 0.00042672016048144433, "loss": 3.0954, "theoretical_loss": 3.90502142741857, "tokens_seen": 511839232 }, { "epoch": 1.05, "learning_rate": 0.0004267101303911735, "loss": 3.1706, "theoretical_loss": 3.904967793922486, "tokens_seen": 511904768 }, { "epoch": 1.05, "learning_rate": 0.00042670010030090275, "loss": 3.1784, "theoretical_loss": 3.9049141692146287, "tokens_seen": 511970304 }, { "epoch": 1.05, "learning_rate": 0.0004266900702106319, "loss": 2.9502, "theoretical_loss": 3.904860553292434, "tokens_seen": 512035840 }, { "epoch": 1.05, "learning_rate": 0.0004266800401203611, "loss": 3.1195, "theoretical_loss": 3.9048069461533372, "tokens_seen": 512101376 }, { "epoch": 1.05, "learning_rate": 0.00042667001003009024, "loss": 3.153, "theoretical_loss": 3.9047533477947765, "tokens_seen": 512166912 }, { "epoch": 1.05, "learning_rate": 0.00042665997993981947, "loss": 3.2089, "theoretical_loss": 3.9046997582141896, "tokens_seen": 512232448 }, { "epoch": 1.05, "learning_rate": 0.00042664994984954865, "loss": 3.0248, "theoretical_loss": 3.904646177409017, "tokens_seen": 512297984 }, { "epoch": 1.05, "learning_rate": 0.00042663991975927783, "loss": 3.1934, "theoretical_loss": 3.904592605376699, "tokens_seen": 512363520 }, { "epoch": 1.05, "learning_rate": 0.000426629889669007, "loss": 3.0926, "theoretical_loss": 3.9045390421146777, "tokens_seen": 512429056 }, { "epoch": 1.05, "learning_rate": 0.00042661985957873625, "loss": 3.1614, "theoretical_loss": 3.904485487620396, "tokens_seen": 512494592 }, { "epoch": 1.05, "learning_rate": 0.0004266098294884654, "loss": 3.1171, "theoretical_loss": 3.9044319418912963, "tokens_seen": 512560128 }, { "epoch": 1.05, "learning_rate": 0.0004265997993981946, "loss": 3.0317, "theoretical_loss": 3.9043784049248256, "tokens_seen": 512625664 }, { "epoch": 1.05, "learning_rate": 0.00042658976930792374, "loss": 3.1104, "theoretical_loss": 3.9043248767184293, "tokens_seen": 512691200 }, { "epoch": 1.05, "learning_rate": 0.000426579739217653, "loss": 3.119, "theoretical_loss": 3.9042713572695535, "tokens_seen": 512756736 }, { "epoch": 1.05, "objective/train/docs_used": 840097, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4301371574401855, "objective/train/theoretical_loss": 3.9042312234284853, "objective/train/tokens_used": 533265888, "theoretical_loss": 3.9042312234284853, "tokens_seen": 512805888 }, { "epoch": 1.05, "learning_rate": 0.00042656970912738216, "loss": 3.2996, "theoretical_loss": 3.9042178465756474, "tokens_seen": 512822272 }, { "epoch": 1.05, "learning_rate": 0.00042655967903711134, "loss": 3.2424, "theoretical_loss": 3.90416434463416, "tokens_seen": 512887808 }, { "epoch": 1.05, "learning_rate": 0.0004265496489468405, "loss": 3.0559, "theoretical_loss": 3.904110851442542, "tokens_seen": 512953344 }, { "epoch": 1.05, "learning_rate": 0.0004265396188565697, "loss": 3.0522, "theoretical_loss": 3.904057366998244, "tokens_seen": 513018880 }, { "epoch": 1.05, "learning_rate": 0.0004265295887662989, "loss": 2.9953, "theoretical_loss": 3.9040038912987196, "tokens_seen": 513084416 }, { "epoch": 1.05, "learning_rate": 0.0004265195586760281, "loss": 3.0972, "theoretical_loss": 3.9039504243414207, "tokens_seen": 513149952 }, { "epoch": 1.05, "learning_rate": 0.00042650952858575724, "loss": 3.0482, "theoretical_loss": 3.9038969661238037, "tokens_seen": 513215488 }, { "epoch": 1.05, "learning_rate": 0.0004264994984954865, "loss": 3.2283, "theoretical_loss": 3.903843516643323, "tokens_seen": 513281024 }, { "epoch": 1.05, "learning_rate": 0.0004264894684052156, "loss": 3.1986, "theoretical_loss": 3.9037900758974353, "tokens_seen": 513346560 }, { "epoch": 1.05, "learning_rate": 0.00042647943831494484, "loss": 3.0628, "theoretical_loss": 3.903736643883599, "tokens_seen": 513412096 }, { "epoch": 1.05, "learning_rate": 0.000426469408224674, "loss": 3.0618, "theoretical_loss": 3.903683220599273, "tokens_seen": 513477632 }, { "epoch": 1.05, "learning_rate": 0.0004264593781344032, "loss": 3.1329, "theoretical_loss": 3.9036298060419172, "tokens_seen": 513543168 }, { "epoch": 1.05, "learning_rate": 0.0004264493480441324, "loss": 2.9031, "theoretical_loss": 3.9035764002089923, "tokens_seen": 513608704 }, { "epoch": 1.05, "learning_rate": 0.0004264393179538616, "loss": 3.132, "theoretical_loss": 3.90352300309796, "tokens_seen": 513674240 }, { "epoch": 1.05, "learning_rate": 0.00042642928786359075, "loss": 3.131, "theoretical_loss": 3.903469614706284, "tokens_seen": 513739776 }, { "epoch": 1.05, "learning_rate": 0.00042641925777332, "loss": 3.0464, "theoretical_loss": 3.903416235031428, "tokens_seen": 513805312 }, { "epoch": 1.05, "learning_rate": 0.00042640922768304916, "loss": 3.076, "theoretical_loss": 3.903362864070858, "tokens_seen": 513870848 }, { "epoch": 1.05, "learning_rate": 0.00042639919759277834, "loss": 3.0657, "theoretical_loss": 3.9033095018220396, "tokens_seen": 513936384 }, { "epoch": 1.05, "learning_rate": 0.0004263891675025076, "loss": 3.0444, "theoretical_loss": 3.9032561482824404, "tokens_seen": 514001920 }, { "epoch": 1.05, "learning_rate": 0.0004263791374122367, "loss": 2.9897, "theoretical_loss": 3.9032028034495285, "tokens_seen": 514067456 }, { "epoch": 1.05, "learning_rate": 0.00042636910732196594, "loss": 3.1804, "theoretical_loss": 3.9031494673207736, "tokens_seen": 514132992 }, { "epoch": 1.05, "learning_rate": 0.00042635907723169507, "loss": 3.1533, "theoretical_loss": 3.9030961398936466, "tokens_seen": 514198528 }, { "epoch": 1.05, "learning_rate": 0.0004263490471414243, "loss": 3.0255, "theoretical_loss": 3.9030428211656183, "tokens_seen": 514264064 }, { "epoch": 1.05, "learning_rate": 0.0004263390170511535, "loss": 3.0741, "theoretical_loss": 3.9029895111341624, "tokens_seen": 514329600 }, { "epoch": 1.05, "learning_rate": 0.00042632898696088266, "loss": 2.8332, "theoretical_loss": 3.902936209796751, "tokens_seen": 514395136 }, { "epoch": 1.05, "objective/train/docs_used": 844055, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2781200408935547, "objective/train/theoretical_loss": 3.9028962394976414, "objective/train/tokens_used": 534904288, "theoretical_loss": 3.9028962394976414, "tokens_seen": 514444288 }, { "epoch": 1.05, "learning_rate": 0.00042631895687061185, "loss": 3.1072, "theoretical_loss": 3.9028829171508606, "tokens_seen": 514460672 }, { "epoch": 1.05, "learning_rate": 0.0004263089267803411, "loss": 3.1674, "theoretical_loss": 3.9028296331939654, "tokens_seen": 514526208 }, { "epoch": 1.05, "learning_rate": 0.0004262988966900702, "loss": 2.9852, "theoretical_loss": 3.9027763579235435, "tokens_seen": 514591744 }, { "epoch": 1.05, "learning_rate": 0.00042628886659979944, "loss": 2.9854, "theoretical_loss": 3.902723091337073, "tokens_seen": 514657280 }, { "epoch": 1.05, "learning_rate": 0.00042627883650952857, "loss": 3.0684, "theoretical_loss": 3.9026698334320313, "tokens_seen": 514722816 }, { "epoch": 1.05, "learning_rate": 0.0004262688064192578, "loss": 3.0865, "theoretical_loss": 3.9026165842058997, "tokens_seen": 514788352 }, { "epoch": 1.05, "learning_rate": 0.000426258776328987, "loss": 3.2123, "theoretical_loss": 3.9025633436561593, "tokens_seen": 514853888 }, { "epoch": 1.05, "learning_rate": 0.00042624874623871617, "loss": 3.1955, "theoretical_loss": 3.902510111780291, "tokens_seen": 514919424 }, { "epoch": 1.05, "learning_rate": 0.00042623871614844535, "loss": 3.0547, "theoretical_loss": 3.90245688857578, "tokens_seen": 514984960 }, { "epoch": 1.05, "learning_rate": 0.00042622868605817453, "loss": 3.1071, "theoretical_loss": 3.9024036740401087, "tokens_seen": 515050496 }, { "epoch": 1.05, "learning_rate": 0.0004262186559679037, "loss": 3.0738, "theoretical_loss": 3.902350468170763, "tokens_seen": 515116032 }, { "epoch": 1.05, "learning_rate": 0.00042620862587763295, "loss": 3.0174, "theoretical_loss": 3.9022972709652293, "tokens_seen": 515181568 }, { "epoch": 1.05, "learning_rate": 0.0004261985957873621, "loss": 3.1743, "theoretical_loss": 3.902244082420995, "tokens_seen": 515247104 }, { "epoch": 1.05, "learning_rate": 0.0004261885656970913, "loss": 3.0023, "theoretical_loss": 3.9021909025355486, "tokens_seen": 515312640 }, { "epoch": 1.05, "learning_rate": 0.00042617853560682044, "loss": 3.1448, "theoretical_loss": 3.9021377313063788, "tokens_seen": 515378176 }, { "epoch": 1.05, "learning_rate": 0.00042616850551654967, "loss": 2.9857, "theoretical_loss": 3.9020845687309773, "tokens_seen": 515443712 }, { "epoch": 1.05, "learning_rate": 0.00042615847542627885, "loss": 3.1198, "theoretical_loss": 3.902031414806835, "tokens_seen": 515509248 }, { "epoch": 1.05, "learning_rate": 0.00042614844533600803, "loss": 2.9897, "theoretical_loss": 3.901978269531445, "tokens_seen": 515574784 }, { "epoch": 1.05, "learning_rate": 0.0004261384152457372, "loss": 3.0285, "theoretical_loss": 3.9019251329022997, "tokens_seen": 515640320 }, { "epoch": 1.05, "learning_rate": 0.00042612838515546645, "loss": 3.2397, "theoretical_loss": 3.9018720049168953, "tokens_seen": 515705856 }, { "epoch": 1.05, "learning_rate": 0.0004261183550651956, "loss": 3.2872, "theoretical_loss": 3.9018188855727267, "tokens_seen": 515771392 }, { "epoch": 1.05, "learning_rate": 0.0004261083249749248, "loss": 3.1496, "theoretical_loss": 3.9017657748672905, "tokens_seen": 515836928 }, { "epoch": 1.05, "learning_rate": 0.00042609829488465394, "loss": 3.1687, "theoretical_loss": 3.9017126727980855, "tokens_seen": 515902464 }, { "epoch": 1.05, "learning_rate": 0.0004260882647943832, "loss": 2.8337, "theoretical_loss": 3.9016595793626094, "tokens_seen": 515968000 }, { "epoch": 1.05, "learning_rate": 0.00042607823470411236, "loss": 2.9147, "theoretical_loss": 3.901606494558363, "tokens_seen": 516033536 }, { "epoch": 1.05, "objective/train/docs_used": 845414, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.028416395187378, "objective/train/theoretical_loss": 3.901566686617919, "objective/train/tokens_used": 536542688, "theoretical_loss": 3.901566686617919, "tokens_seen": 516082688 }, { "epoch": 1.05, "learning_rate": 0.00042606820461384154, "loss": 2.9688, "theoretical_loss": 3.901553418382847, "tokens_seen": 516099072 }, { "epoch": 1.05, "learning_rate": 0.0004260581745235707, "loss": 3.0708, "theoretical_loss": 3.9015003508335635, "tokens_seen": 516164608 }, { "epoch": 1.05, "learning_rate": 0.0004260481444332999, "loss": 3.0869, "theoretical_loss": 3.9014472919080148, "tokens_seen": 516230144 }, { "epoch": 1.05, "learning_rate": 0.0004260381143430291, "loss": 3.0234, "theoretical_loss": 3.9013942416037057, "tokens_seen": 516295680 }, { "epoch": 1.05, "learning_rate": 0.0004260280842527583, "loss": 3.0826, "theoretical_loss": 3.9013411999181415, "tokens_seen": 516361216 }, { "epoch": 1.05, "learning_rate": 0.00042601805416248744, "loss": 3.0811, "theoretical_loss": 3.901288166848828, "tokens_seen": 516426752 }, { "epoch": 1.05, "learning_rate": 0.0004260080240722167, "loss": 3.0233, "theoretical_loss": 3.901235142393272, "tokens_seen": 516492288 }, { "epoch": 1.05, "learning_rate": 0.0004259979939819458, "loss": 3.2206, "theoretical_loss": 3.9011821265489823, "tokens_seen": 516557824 }, { "epoch": 1.05, "learning_rate": 0.00042598796389167504, "loss": 3.0007, "theoretical_loss": 3.9011291193134685, "tokens_seen": 516623360 }, { "epoch": 1.05, "learning_rate": 0.0004259779338014042, "loss": 3.1974, "theoretical_loss": 3.90107612068424, "tokens_seen": 516688896 }, { "epoch": 1.05, "learning_rate": 0.0004259679037111334, "loss": 3.0991, "theoretical_loss": 3.901023130658808, "tokens_seen": 516754432 }, { "epoch": 1.05, "learning_rate": 0.0004259578736208626, "loss": 3.1165, "theoretical_loss": 3.9009701492346864, "tokens_seen": 516819968 }, { "epoch": 1.05, "learning_rate": 0.0004259478435305918, "loss": 2.9753, "theoretical_loss": 3.9009171764093873, "tokens_seen": 516885504 }, { "epoch": 1.05, "learning_rate": 0.00042593781344032095, "loss": 2.9025, "theoretical_loss": 3.900864212180425, "tokens_seen": 516951040 }, { "epoch": 1.05, "learning_rate": 0.0004259277833500502, "loss": 3.155, "theoretical_loss": 3.9008112565453157, "tokens_seen": 517016576 }, { "epoch": 1.05, "learning_rate": 0.0004259177532597793, "loss": 2.8603, "theoretical_loss": 3.900758309501576, "tokens_seen": 517082112 }, { "epoch": 1.05, "learning_rate": 0.00042590772316950854, "loss": 3.0373, "theoretical_loss": 3.9007053710467225, "tokens_seen": 517147648 }, { "epoch": 1.05, "learning_rate": 0.0004258976930792377, "loss": 3.2007, "theoretical_loss": 3.9006524411782744, "tokens_seen": 517213184 }, { "epoch": 1.05, "learning_rate": 0.0004258876629889669, "loss": 3.0838, "theoretical_loss": 3.9005995198937518, "tokens_seen": 517278720 }, { "epoch": 1.05, "learning_rate": 0.0004258776328986961, "loss": 3.0268, "theoretical_loss": 3.900546607190674, "tokens_seen": 517344256 }, { "epoch": 1.05, "learning_rate": 0.00042586760280842527, "loss": 3.1747, "theoretical_loss": 3.900493703066564, "tokens_seen": 517409792 }, { "epoch": 1.05, "learning_rate": 0.00042585757271815445, "loss": 3.0901, "theoretical_loss": 3.900440807518944, "tokens_seen": 517475328 }, { "epoch": 1.05, "learning_rate": 0.0004258475426278837, "loss": 3.0037, "theoretical_loss": 3.900387920545337, "tokens_seen": 517540864 }, { "epoch": 1.05, "learning_rate": 0.0004258375125376128, "loss": 3.1782, "theoretical_loss": 3.9003350421432694, "tokens_seen": 517606400 }, { "epoch": 1.05, "learning_rate": 0.00042582748244734205, "loss": 2.899, "theoretical_loss": 3.9002821723102654, "tokens_seen": 517671936 }, { "epoch": 1.05, "objective/train/docs_used": 848143, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9694607257843018, "objective/train/theoretical_loss": 3.9002425255574726, "objective/train/tokens_used": 538181088, "theoretical_loss": 3.9002425255574726, "tokens_seen": 517721088 }, { "epoch": 1.05, "learning_rate": 0.0004258174523570712, "loss": 3.1181, "theoretical_loss": 3.9002293110438524, "tokens_seen": 517737472 }, { "epoch": 1.05, "learning_rate": 0.0004258074222668004, "loss": 3.0982, "theoretical_loss": 3.900176458341558, "tokens_seen": 517803008 }, { "epoch": 1.05, "learning_rate": 0.0004257973921765296, "loss": 3.0904, "theoretical_loss": 3.9001236142009112, "tokens_seen": 517868544 }, { "epoch": 1.05, "learning_rate": 0.00042578736208625877, "loss": 3.1794, "theoretical_loss": 3.900070778619443, "tokens_seen": 517934080 }, { "epoch": 1.05, "learning_rate": 0.00042577733199598795, "loss": 3.1993, "theoretical_loss": 3.9000179515946822, "tokens_seen": 517999616 }, { "epoch": 1.05, "learning_rate": 0.0004257673019057172, "loss": 3.0491, "theoretical_loss": 3.8999651331241623, "tokens_seen": 518065152 }, { "epoch": 1.05, "learning_rate": 0.0004257572718154463, "loss": 3.1055, "theoretical_loss": 3.8999123232054154, "tokens_seen": 518130688 }, { "epoch": 1.05, "learning_rate": 0.00042574724172517555, "loss": 3.0431, "theoretical_loss": 3.8998595218359764, "tokens_seen": 518196224 }, { "epoch": 1.05, "learning_rate": 0.0004257372116349047, "loss": 3.1341, "theoretical_loss": 3.8998067290133793, "tokens_seen": 518261760 }, { "epoch": 1.05, "learning_rate": 0.0004257271815446339, "loss": 3.0782, "theoretical_loss": 3.899753944735161, "tokens_seen": 518327296 }, { "epoch": 1.05, "learning_rate": 0.0004257171514543631, "loss": 3.1472, "theoretical_loss": 3.8997011689988574, "tokens_seen": 518392832 }, { "epoch": 1.05, "learning_rate": 0.0004257071213640923, "loss": 2.9481, "theoretical_loss": 3.8996484018020077, "tokens_seen": 518458368 }, { "epoch": 1.05, "learning_rate": 0.00042569709127382145, "loss": 3.0229, "theoretical_loss": 3.899595643142151, "tokens_seen": 518523904 }, { "epoch": 1.05, "learning_rate": 0.00042568706118355064, "loss": 2.9795, "theoretical_loss": 3.899542893016826, "tokens_seen": 518589440 }, { "epoch": 1.05, "learning_rate": 0.0004256770310932798, "loss": 2.8902, "theoretical_loss": 3.899490151423575, "tokens_seen": 518654976 }, { "epoch": 1.05, "learning_rate": 0.00042566700100300905, "loss": 3.0833, "theoretical_loss": 3.8994374183599403, "tokens_seen": 518720512 }, { "epoch": 1.05, "learning_rate": 0.00042565697091273823, "loss": 3.021, "theoretical_loss": 3.899384693823465, "tokens_seen": 518786048 }, { "epoch": 1.05, "learning_rate": 0.0004256469408224674, "loss": 3.2574, "theoretical_loss": 3.899331977811693, "tokens_seen": 518851584 }, { "epoch": 1.05, "learning_rate": 0.00042563691073219665, "loss": 3.0542, "theoretical_loss": 3.899279270322169, "tokens_seen": 518917120 }, { "epoch": 1.05, "learning_rate": 0.0004256268806419258, "loss": 3.0468, "theoretical_loss": 3.8992265713524397, "tokens_seen": 518982656 }, { "epoch": 1.05, "learning_rate": 0.000425616850551655, "loss": 3.1966, "theoretical_loss": 3.899173880900053, "tokens_seen": 519048192 }, { "epoch": 1.05, "learning_rate": 0.00042560682046138414, "loss": 3.1936, "theoretical_loss": 3.8991211989625563, "tokens_seen": 519113728 }, { "epoch": 1.05, "learning_rate": 0.0004255967903711134, "loss": 3.0629, "theoretical_loss": 3.899068525537499, "tokens_seen": 519179264 }, { "epoch": 1.05, "learning_rate": 0.00042558676028084256, "loss": 3.0071, "theoretical_loss": 3.8990158606224314, "tokens_seen": 519244800 }, { "epoch": 1.05, "learning_rate": 0.00042557673019057174, "loss": 3.2291, "theoretical_loss": 3.8989632042149047, "tokens_seen": 519310336 }, { "epoch": 1.05, "objective/train/docs_used": 851078, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2192952632904053, "objective/train/theoretical_loss": 3.8989237174908613, "objective/train/tokens_used": 539819488, "theoretical_loss": 3.8989237174908613, "tokens_seen": 519359488 }, { "epoch": 1.05, "learning_rate": 0.0004255667001003009, "loss": 3.2767, "theoretical_loss": 3.8989105563124715, "tokens_seen": 519375872 }, { "epoch": 1.05, "learning_rate": 0.0004255566700100301, "loss": 3.1646, "theoretical_loss": 3.8988579169126854, "tokens_seen": 519441408 }, { "epoch": 1.05, "learning_rate": 0.0004255466399197593, "loss": 3.1198, "theoretical_loss": 3.8988052860130997, "tokens_seen": 519506944 }, { "epoch": 1.05, "learning_rate": 0.0004255366098294885, "loss": 2.9099, "theoretical_loss": 3.898752663611271, "tokens_seen": 519572480 }, { "epoch": 1.05, "learning_rate": 0.00042552657973921764, "loss": 3.0092, "theoretical_loss": 3.898700049704755, "tokens_seen": 519638016 }, { "epoch": 1.05, "learning_rate": 0.0004255165496489469, "loss": 3.0758, "theoretical_loss": 3.8986474442911083, "tokens_seen": 519703552 }, { "epoch": 1.05, "learning_rate": 0.000425506519558676, "loss": 3.1712, "theoretical_loss": 3.8985948473678906, "tokens_seen": 519769088 }, { "epoch": 1.05, "learning_rate": 0.00042549648946840524, "loss": 3.0394, "theoretical_loss": 3.8985422589326606, "tokens_seen": 519834624 }, { "epoch": 1.05, "learning_rate": 0.0004254864593781344, "loss": 3.1631, "theoretical_loss": 3.898489678982979, "tokens_seen": 519900160 }, { "epoch": 1.05, "learning_rate": 0.0004254764292878636, "loss": 3.1442, "theoretical_loss": 3.898437107516407, "tokens_seen": 519965696 }, { "epoch": 1.05, "learning_rate": 0.0004254663991975928, "loss": 3.127, "theoretical_loss": 3.898384544530507, "tokens_seen": 520031232 }, { "epoch": 1.05, "learning_rate": 0.000425456369107322, "loss": 3.1317, "theoretical_loss": 3.8983319900228426, "tokens_seen": 520096768 }, { "epoch": 1.05, "learning_rate": 0.00042544633901705115, "loss": 3.1436, "theoretical_loss": 3.898279443990978, "tokens_seen": 520162304 }, { "epoch": 1.05, "learning_rate": 0.0004254363089267804, "loss": 3.1367, "theoretical_loss": 3.8982269064324786, "tokens_seen": 520227840 }, { "epoch": 1.05, "learning_rate": 0.0004254262788365095, "loss": 3.0456, "theoretical_loss": 3.8981743773449113, "tokens_seen": 520293376 }, { "epoch": 1.05, "learning_rate": 0.00042541624874623874, "loss": 2.9942, "theoretical_loss": 3.8981218567258433, "tokens_seen": 520358912 }, { "epoch": 1.05, "learning_rate": 0.0004254062186559679, "loss": 2.9816, "theoretical_loss": 3.8980693445728427, "tokens_seen": 520424448 }, { "epoch": 1.05, "learning_rate": 0.0004253961885656971, "loss": 2.9017, "theoretical_loss": 3.8980168408834794, "tokens_seen": 520489984 }, { "epoch": 1.05, "learning_rate": 0.0004253861584754263, "loss": 2.9346, "theoretical_loss": 3.897964345655324, "tokens_seen": 520555520 }, { "epoch": 1.05, "learning_rate": 0.00042537612838515547, "loss": 3.0601, "theoretical_loss": 3.897911858885947, "tokens_seen": 520621056 }, { "epoch": 1.05, "learning_rate": 0.00042536609829488465, "loss": 2.9897, "theoretical_loss": 3.897859380572922, "tokens_seen": 520686592 }, { "epoch": 1.05, "learning_rate": 0.0004253560682046139, "loss": 3.1706, "theoretical_loss": 3.897806910713822, "tokens_seen": 520752128 }, { "epoch": 1.05, "learning_rate": 0.000425346038114343, "loss": 3.0223, "theoretical_loss": 3.897754449306221, "tokens_seen": 520817664 }, { "epoch": 1.05, "learning_rate": 0.00042533600802407225, "loss": 2.9758, "theoretical_loss": 3.897701996347695, "tokens_seen": 520883200 }, { "epoch": 1.05, "learning_rate": 0.0004253259779338014, "loss": 3.0596, "theoretical_loss": 3.897649551835821, "tokens_seen": 520948736 }, { "epoch": 1.05, "objective/train/docs_used": 854004, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9294333457946777, "objective/train/theoretical_loss": 3.897610223993573, "objective/train/tokens_used": 541457888, "theoretical_loss": 3.897610223993573, "tokens_seen": 520997888 }, { "epoch": 1.05, "learning_rate": 0.0004253159478435306, "loss": 3.0844, "theoretical_loss": 3.8975971157681757, "tokens_seen": 521014272 }, { "epoch": 1.05, "learning_rate": 0.0004253059177532598, "loss": 2.9775, "theoretical_loss": 3.8975446881423372, "tokens_seen": 521079808 }, { "epoch": 1.05, "learning_rate": 0.00042529588766298897, "loss": 3.1562, "theoretical_loss": 3.8974922689558857, "tokens_seen": 521145344 }, { "epoch": 1.05, "learning_rate": 0.00042528585757271815, "loss": 3.0351, "theoretical_loss": 3.8974398582064014, "tokens_seen": 521210880 }, { "epoch": 1.05, "learning_rate": 0.0004252758274824474, "loss": 3.0897, "theoretical_loss": 3.897387455891466, "tokens_seen": 521276416 }, { "epoch": 1.05, "learning_rate": 0.0004252657973921765, "loss": 3.1102, "theoretical_loss": 3.897335062008662, "tokens_seen": 521341952 }, { "epoch": 1.05, "learning_rate": 0.00042525576730190575, "loss": 3.154, "theoretical_loss": 3.8972826765555726, "tokens_seen": 521407488 }, { "epoch": 1.05, "learning_rate": 0.0004252457372116349, "loss": 3.0228, "theoretical_loss": 3.897230299529782, "tokens_seen": 521473024 }, { "epoch": 1.05, "learning_rate": 0.0004252357071213641, "loss": 3.0112, "theoretical_loss": 3.897177930928876, "tokens_seen": 521538560 }, { "epoch": 1.05, "learning_rate": 0.0004252256770310933, "loss": 3.0596, "theoretical_loss": 3.897125570750441, "tokens_seen": 521604096 }, { "epoch": 1.05, "learning_rate": 0.0004252156469408225, "loss": 3.0784, "theoretical_loss": 3.8970732189920643, "tokens_seen": 521669632 }, { "epoch": 1.05, "learning_rate": 0.00042520561685055165, "loss": 3.0422, "theoretical_loss": 3.897020875651335, "tokens_seen": 521735168 }, { "epoch": 1.05, "learning_rate": 0.00042519558676028084, "loss": 3.1444, "theoretical_loss": 3.896968540725841, "tokens_seen": 521800704 }, { "epoch": 1.05, "learning_rate": 0.00042518555667001, "loss": 3.0701, "theoretical_loss": 3.896916214213175, "tokens_seen": 521866240 }, { "epoch": 1.05, "learning_rate": 0.00042517552657973925, "loss": 3.0213, "theoretical_loss": 3.896863896110926, "tokens_seen": 521931776 }, { "epoch": 1.05, "learning_rate": 0.0004251654964894684, "loss": 3.2383, "theoretical_loss": 3.896811586416688, "tokens_seen": 521997312 }, { "epoch": 1.05, "learning_rate": 0.0004251554663991976, "loss": 3.1074, "theoretical_loss": 3.896759285128054, "tokens_seen": 522062848 }, { "epoch": 1.05, "learning_rate": 0.0004251454363089268, "loss": 3.1048, "theoretical_loss": 3.896706992242618, "tokens_seen": 522128384 }, { "epoch": 1.05, "learning_rate": 0.000425135406218656, "loss": 3.0887, "theoretical_loss": 3.896654707757976, "tokens_seen": 522193920 }, { "epoch": 1.05, "learning_rate": 0.00042512537612838516, "loss": 2.9653, "theoretical_loss": 3.896602431671724, "tokens_seen": 522259456 }, { "epoch": 1.05, "learning_rate": 0.00042511534603811434, "loss": 3.1532, "theoretical_loss": 3.896550163981459, "tokens_seen": 522324992 }, { "epoch": 1.05, "learning_rate": 0.0004251053159478435, "loss": 3.1102, "theoretical_loss": 3.89649790468478, "tokens_seen": 522390528 }, { "epoch": 1.05, "learning_rate": 0.00042509528585757276, "loss": 3.1007, "theoretical_loss": 3.8964456537792858, "tokens_seen": 522456064 }, { "epoch": 1.05, "learning_rate": 0.0004250852557673019, "loss": 3.0061, "theoretical_loss": 3.8963934112625775, "tokens_seen": 522521600 }, { "epoch": 1.05, "learning_rate": 0.0004250752256770311, "loss": 3.1376, "theoretical_loss": 3.8963411771322556, "tokens_seen": 522587136 }, { "epoch": 1.05, "objective/train/docs_used": 856298, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8949711322784424, "objective/train/theoretical_loss": 3.8963020070366383, "objective/train/tokens_used": 543096288, "theoretical_loss": 3.8963020070366383, "tokens_seen": 522636288 }, { "epoch": 1.05, "learning_rate": 0.00042506519558676024, "loss": 3.0754, "theoretical_loss": 3.896288951385923, "tokens_seen": 522652672 }, { "epoch": 1.05, "learning_rate": 0.0004250551654964895, "loss": 2.9716, "theoretical_loss": 3.8962367340211825, "tokens_seen": 522718208 }, { "epoch": 1.05, "learning_rate": 0.00042504513540621866, "loss": 3.0146, "theoretical_loss": 3.8961845250356384, "tokens_seen": 522783744 }, { "epoch": 1.05, "learning_rate": 0.00042503510531594784, "loss": 3.0446, "theoretical_loss": 3.8961323244268957, "tokens_seen": 522849280 }, { "epoch": 1.05, "learning_rate": 0.000425025075225677, "loss": 3.0111, "theoretical_loss": 3.8960801321925618, "tokens_seen": 522914816 }, { "epoch": 1.05, "learning_rate": 0.0004250150451354062, "loss": 2.947, "theoretical_loss": 3.896027948330243, "tokens_seen": 522980352 }, { "epoch": 1.05, "learning_rate": 0.0004250050150451354, "loss": 3.1205, "theoretical_loss": 3.895975772837547, "tokens_seen": 523045888 }, { "epoch": 1.05, "learning_rate": 0.0004249949849548646, "loss": 2.8977, "theoretical_loss": 3.895923605712084, "tokens_seen": 523111424 }, { "epoch": 1.05, "learning_rate": 0.00042498495486459375, "loss": 3.0954, "theoretical_loss": 3.895871446951464, "tokens_seen": 523176960 }, { "epoch": 1.06, "learning_rate": 0.000424974924774323, "loss": 3.0136, "theoretical_loss": 3.895819296553298, "tokens_seen": 523242496 }, { "epoch": 1.06, "learning_rate": 0.00042496489468405216, "loss": 3.0273, "theoretical_loss": 3.8957671545151977, "tokens_seen": 523308032 }, { "epoch": 1.06, "learning_rate": 0.00042495486459378135, "loss": 3.177, "theoretical_loss": 3.8957150208347766, "tokens_seen": 523373568 }, { "epoch": 1.06, "learning_rate": 0.0004249448345035105, "loss": 3.1572, "theoretical_loss": 3.8956628955096493, "tokens_seen": 523439104 }, { "epoch": 1.06, "learning_rate": 0.0004249348044132397, "loss": 3.1608, "theoretical_loss": 3.8956107785374297, "tokens_seen": 523504640 }, { "epoch": 1.06, "learning_rate": 0.00042492477432296894, "loss": 3.1785, "theoretical_loss": 3.8955586699157347, "tokens_seen": 523570176 }, { "epoch": 1.06, "learning_rate": 0.0004249147442326981, "loss": 2.925, "theoretical_loss": 3.8955065696421807, "tokens_seen": 523635712 }, { "epoch": 1.06, "learning_rate": 0.0004249047141424273, "loss": 3.1453, "theoretical_loss": 3.8954544777143867, "tokens_seen": 523701248 }, { "epoch": 1.06, "learning_rate": 0.0004248946840521565, "loss": 3.1976, "theoretical_loss": 3.895402394129971, "tokens_seen": 523766784 }, { "epoch": 1.06, "learning_rate": 0.00042488465396188567, "loss": 2.9847, "theoretical_loss": 3.8953503188865533, "tokens_seen": 523832320 }, { "epoch": 1.06, "learning_rate": 0.00042487462387161485, "loss": 3.1426, "theoretical_loss": 3.895298251981756, "tokens_seen": 523897856 }, { "epoch": 1.06, "learning_rate": 0.0004248645937813441, "loss": 3.1382, "theoretical_loss": 3.8952461934131986, "tokens_seen": 523963392 }, { "epoch": 1.06, "learning_rate": 0.0004248545636910732, "loss": 3.0093, "theoretical_loss": 3.895194143178506, "tokens_seen": 524028928 }, { "epoch": 1.06, "learning_rate": 0.00042484453360080245, "loss": 3.1956, "theoretical_loss": 3.8951421012753014, "tokens_seen": 524094464 }, { "epoch": 1.06, "learning_rate": 0.0004248345035105316, "loss": 2.8608, "theoretical_loss": 3.8950900677012097, "tokens_seen": 524160000 }, { "epoch": 1.06, "learning_rate": 0.0004248244734202608, "loss": 3.1198, "theoretical_loss": 3.895038042453856, "tokens_seen": 524225536 }, { "epoch": 1.06, "objective/train/docs_used": 859220, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1464102268218994, "objective/train/theoretical_loss": 3.894999028981336, "objective/train/tokens_used": 544734688, "theoretical_loss": 3.894999028981336, "tokens_seen": 524274688 }, { "epoch": 1.06, "learning_rate": 0.00042481444332999, "loss": 3.0773, "theoretical_loss": 3.8949860255308684, "tokens_seen": 524291072 }, { "epoch": 1.06, "learning_rate": 0.00042480441323971917, "loss": 3.1309, "theoretical_loss": 3.8949340169298736, "tokens_seen": 524356608 }, { "epoch": 1.06, "learning_rate": 0.00042479438314944835, "loss": 3.0476, "theoretical_loss": 3.8948820166485008, "tokens_seen": 524422144 }, { "epoch": 1.06, "learning_rate": 0.0004247843530591776, "loss": 3.1374, "theoretical_loss": 3.8948300246843797, "tokens_seen": 524487680 }, { "epoch": 1.06, "learning_rate": 0.0004247743229689067, "loss": 2.9754, "theoretical_loss": 3.8947780410351407, "tokens_seen": 524553216 }, { "epoch": 1.06, "learning_rate": 0.00042476429287863595, "loss": 3.12, "theoretical_loss": 3.8947260656984155, "tokens_seen": 524618752 }, { "epoch": 1.06, "learning_rate": 0.0004247542627883651, "loss": 3.0382, "theoretical_loss": 3.894674098671837, "tokens_seen": 524684288 }, { "epoch": 1.06, "learning_rate": 0.0004247442326980943, "loss": 3.0214, "theoretical_loss": 3.8946221399530385, "tokens_seen": 524749824 }, { "epoch": 1.06, "learning_rate": 0.0004247342026078235, "loss": 3.1629, "theoretical_loss": 3.894570189539655, "tokens_seen": 524815360 }, { "epoch": 1.06, "learning_rate": 0.0004247241725175527, "loss": 3.1008, "theoretical_loss": 3.8945182474293216, "tokens_seen": 524880896 }, { "epoch": 1.06, "learning_rate": 0.00042471414242728186, "loss": 3.0164, "theoretical_loss": 3.894466313619675, "tokens_seen": 524946432 }, { "epoch": 1.06, "learning_rate": 0.00042470411233701104, "loss": 3.2193, "theoretical_loss": 3.8944143881083524, "tokens_seen": 525011968 }, { "epoch": 1.06, "learning_rate": 0.0004246940822467402, "loss": 3.0911, "theoretical_loss": 3.8943624708929927, "tokens_seen": 525077504 }, { "epoch": 1.06, "learning_rate": 0.00042468405215646945, "loss": 3.0428, "theoretical_loss": 3.894310561971235, "tokens_seen": 525143040 }, { "epoch": 1.06, "learning_rate": 0.0004246740220661986, "loss": 3.0264, "theoretical_loss": 3.894258661340719, "tokens_seen": 525208576 }, { "epoch": 1.06, "learning_rate": 0.0004246639919759278, "loss": 3.0028, "theoretical_loss": 3.894206768999087, "tokens_seen": 525274112 }, { "epoch": 1.06, "learning_rate": 0.000424653961885657, "loss": 3.0653, "theoretical_loss": 3.894154884943981, "tokens_seen": 525339648 }, { "epoch": 1.06, "learning_rate": 0.0004246439317953862, "loss": 3.0222, "theoretical_loss": 3.8941030091730444, "tokens_seen": 525405184 }, { "epoch": 1.06, "learning_rate": 0.00042463390170511536, "loss": 3.1483, "theoretical_loss": 3.894051141683921, "tokens_seen": 525470720 }, { "epoch": 1.06, "learning_rate": 0.00042462387161484454, "loss": 3.0777, "theoretical_loss": 3.8939992824742564, "tokens_seen": 525536256 }, { "epoch": 1.06, "learning_rate": 0.0004246138415245737, "loss": 3.1082, "theoretical_loss": 3.8939474315416964, "tokens_seen": 525601792 }, { "epoch": 1.06, "learning_rate": 0.00042460381143430296, "loss": 3.1657, "theoretical_loss": 3.893895588883889, "tokens_seen": 525667328 }, { "epoch": 1.06, "learning_rate": 0.0004245937813440321, "loss": 3.0993, "theoretical_loss": 3.8938437544984805, "tokens_seen": 525732864 }, { "epoch": 1.06, "learning_rate": 0.0004245837512537613, "loss": 3.1115, "theoretical_loss": 3.8937919283831217, "tokens_seen": 525798400 }, { "epoch": 1.06, "learning_rate": 0.00042457372116349044, "loss": 3.1612, "theoretical_loss": 3.893740110535462, "tokens_seen": 525863936 }, { "epoch": 1.06, "objective/train/docs_used": 862003, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.844754695892334, "objective/train/theoretical_loss": 3.893701252573981, "objective/train/tokens_used": 546373088, "theoretical_loss": 3.893701252573981, "tokens_seen": 525913088 }, { "epoch": 1.06, "learning_rate": 0.0004245636910732197, "loss": 2.9295, "theoretical_loss": 3.8936883009531513, "tokens_seen": 525929472 }, { "epoch": 1.06, "learning_rate": 0.00042455366098294886, "loss": 3.043, "theoretical_loss": 3.893636499633843, "tokens_seen": 525995008 }, { "epoch": 1.06, "learning_rate": 0.00042454363089267804, "loss": 3.1142, "theoretical_loss": 3.8935847065751896, "tokens_seen": 526060544 }, { "epoch": 1.06, "learning_rate": 0.0004245336008024072, "loss": 3.2038, "theoretical_loss": 3.8935329217748444, "tokens_seen": 526126080 }, { "epoch": 1.06, "learning_rate": 0.0004245235707121364, "loss": 2.974, "theoretical_loss": 3.8934811452304627, "tokens_seen": 526191616 }, { "epoch": 1.06, "learning_rate": 0.0004245135406218656, "loss": 3.0451, "theoretical_loss": 3.8934293769397, "tokens_seen": 526257152 }, { "epoch": 1.06, "learning_rate": 0.0004245035105315948, "loss": 3.2014, "theoretical_loss": 3.8933776169002137, "tokens_seen": 526322688 }, { "epoch": 1.06, "learning_rate": 0.00042449348044132395, "loss": 3.1041, "theoretical_loss": 3.8933258651096603, "tokens_seen": 526388224 }, { "epoch": 1.06, "learning_rate": 0.0004244834503510532, "loss": 2.9844, "theoretical_loss": 3.8932741215656987, "tokens_seen": 526453760 }, { "epoch": 1.06, "learning_rate": 0.00042447342026078236, "loss": 3.043, "theoretical_loss": 3.8932223862659896, "tokens_seen": 526519296 }, { "epoch": 1.06, "learning_rate": 0.00042446339017051155, "loss": 2.9519, "theoretical_loss": 3.893170659208192, "tokens_seen": 526584832 }, { "epoch": 1.06, "learning_rate": 0.0004244533600802407, "loss": 3.1711, "theoretical_loss": 3.893118940389969, "tokens_seen": 526650368 }, { "epoch": 1.06, "learning_rate": 0.0004244433299899699, "loss": 3.2001, "theoretical_loss": 3.893067229808981, "tokens_seen": 526715904 }, { "epoch": 1.06, "learning_rate": 0.0004244332998996991, "loss": 3.132, "theoretical_loss": 3.893015527462893, "tokens_seen": 526781440 }, { "epoch": 1.06, "learning_rate": 0.0004244232698094283, "loss": 3.262, "theoretical_loss": 3.892963833349369, "tokens_seen": 526846976 }, { "epoch": 1.06, "learning_rate": 0.00042441323971915745, "loss": 3.2172, "theoretical_loss": 3.892912147466074, "tokens_seen": 526912512 }, { "epoch": 1.06, "learning_rate": 0.0004244032096288867, "loss": 3.0929, "theoretical_loss": 3.892860469810674, "tokens_seen": 526978048 }, { "epoch": 1.06, "learning_rate": 0.0004243931795386158, "loss": 3.0808, "theoretical_loss": 3.8928088003808377, "tokens_seen": 527043584 }, { "epoch": 1.06, "learning_rate": 0.00042438314944834505, "loss": 3.179, "theoretical_loss": 3.8927571391742313, "tokens_seen": 527109120 }, { "epoch": 1.06, "learning_rate": 0.00042437311935807423, "loss": 2.9928, "theoretical_loss": 3.8927054861885253, "tokens_seen": 527174656 }, { "epoch": 1.06, "learning_rate": 0.0004243630892678034, "loss": 3.1031, "theoretical_loss": 3.8926538414213887, "tokens_seen": 527240192 }, { "epoch": 1.06, "learning_rate": 0.0004243530591775326, "loss": 3.1488, "theoretical_loss": 3.8926022048704936, "tokens_seen": 527305728 }, { "epoch": 1.06, "learning_rate": 0.0004243430290872618, "loss": 3.129, "theoretical_loss": 3.8925505765335116, "tokens_seen": 527371264 }, { "epoch": 1.06, "learning_rate": 0.00042433299899699095, "loss": 3.0928, "theoretical_loss": 3.8924989564081147, "tokens_seen": 527436800 }, { "epoch": 1.06, "learning_rate": 0.0004243229689067202, "loss": 2.9094, "theoretical_loss": 3.892447344491978, "tokens_seen": 527502336 }, { "epoch": 1.06, "objective/train/docs_used": 864669, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.256032943725586, "objective/train/theoretical_loss": 3.8924086409408027, "objective/train/tokens_used": 548011488, "theoretical_loss": 3.8924086409408027, "tokens_seen": 527551488 }, { "epoch": 1.06, "learning_rate": 0.0004243129388164493, "loss": 3.0456, "theoretical_loss": 3.892395740782775, "tokens_seen": 527567872 }, { "epoch": 1.06, "learning_rate": 0.00042430290872617855, "loss": 3.0318, "theoretical_loss": 3.892344145278183, "tokens_seen": 527633408 }, { "epoch": 1.06, "learning_rate": 0.00042429287863590773, "loss": 3.0288, "theoretical_loss": 3.8922925579758783, "tokens_seen": 527698944 }, { "epoch": 1.06, "learning_rate": 0.0004242828485456369, "loss": 2.9809, "theoretical_loss": 3.892240978873537, "tokens_seen": 527764480 }, { "epoch": 1.06, "learning_rate": 0.0004242728184553661, "loss": 3.0587, "theoretical_loss": 3.8921894079688397, "tokens_seen": 527830016 }, { "epoch": 1.06, "learning_rate": 0.0004242627883650953, "loss": 3.2329, "theoretical_loss": 3.892137845259465, "tokens_seen": 527895552 }, { "epoch": 1.06, "learning_rate": 0.00042425275827482446, "loss": 3.0854, "theoretical_loss": 3.8920862907430935, "tokens_seen": 527961088 }, { "epoch": 1.06, "learning_rate": 0.0004242427281845537, "loss": 3.05, "theoretical_loss": 3.8920347444174066, "tokens_seen": 528026624 }, { "epoch": 1.06, "learning_rate": 0.0004242326980942828, "loss": 3.1141, "theoretical_loss": 3.891983206280086, "tokens_seen": 528092160 }, { "epoch": 1.06, "learning_rate": 0.00042422266800401206, "loss": 3.0966, "theoretical_loss": 3.8919316763288165, "tokens_seen": 528157696 }, { "epoch": 1.06, "learning_rate": 0.0004242126379137412, "loss": 3.073, "theoretical_loss": 3.8918801545612816, "tokens_seen": 528223232 }, { "epoch": 1.06, "learning_rate": 0.0004242026078234704, "loss": 3.0978, "theoretical_loss": 3.8918286409751657, "tokens_seen": 528288768 }, { "epoch": 1.06, "learning_rate": 0.0004241925777331996, "loss": 3.0557, "theoretical_loss": 3.891777135568156, "tokens_seen": 528354304 }, { "epoch": 1.06, "learning_rate": 0.0004241825476429288, "loss": 3.2849, "theoretical_loss": 3.891725638337939, "tokens_seen": 528419840 }, { "epoch": 1.06, "learning_rate": 0.000424172517552658, "loss": 3.1267, "theoretical_loss": 3.8916741492822036, "tokens_seen": 528485376 }, { "epoch": 1.06, "learning_rate": 0.0004241624874623872, "loss": 3.0347, "theoretical_loss": 3.8916226683986377, "tokens_seen": 528550912 }, { "epoch": 1.06, "learning_rate": 0.0004241524573721164, "loss": 3.0191, "theoretical_loss": 3.8915711956849317, "tokens_seen": 528616448 }, { "epoch": 1.06, "learning_rate": 0.00042414242728184556, "loss": 3.0641, "theoretical_loss": 3.891519731138776, "tokens_seen": 528681984 }, { "epoch": 1.06, "learning_rate": 0.00042413239719157474, "loss": 3.0, "theoretical_loss": 3.8914682747578633, "tokens_seen": 528747520 }, { "epoch": 1.06, "learning_rate": 0.0004241223671013039, "loss": 2.974, "theoretical_loss": 3.891416826539886, "tokens_seen": 528813056 }, { "epoch": 1.06, "learning_rate": 0.00042411233701103316, "loss": 3.1907, "theoretical_loss": 3.8913653864825366, "tokens_seen": 528878592 }, { "epoch": 1.06, "learning_rate": 0.0004241023069207623, "loss": 3.1123, "theoretical_loss": 3.891313954583511, "tokens_seen": 528944128 }, { "epoch": 1.06, "learning_rate": 0.0004240922768304915, "loss": 3.0394, "theoretical_loss": 3.891262530840504, "tokens_seen": 529009664 }, { "epoch": 1.06, "learning_rate": 0.00042408224674022065, "loss": 3.2146, "theoretical_loss": 3.891211115251213, "tokens_seen": 529075200 }, { "epoch": 1.06, "learning_rate": 0.0004240722166499499, "loss": 2.9911, "theoretical_loss": 3.8911597078133346, "tokens_seen": 529140736 }, { "epoch": 1.06, "objective/train/docs_used": 866091, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2652318477630615, "objective/train/theoretical_loss": 3.891121157582906, "objective/train/tokens_used": 549649888, "theoretical_loss": 3.891121157582906, "tokens_seen": 529189888 }, { "epoch": 1.06, "learning_rate": 0.00042406218655967906, "loss": 3.2006, "theoretical_loss": 3.891108308524567, "tokens_seen": 529206272 }, { "epoch": 1.06, "learning_rate": 0.00042405215646940824, "loss": 3.0565, "theoretical_loss": 3.89105691738261, "tokens_seen": 529271808 }, { "epoch": 1.06, "learning_rate": 0.0004240421263791374, "loss": 3.1039, "theoretical_loss": 3.8910055343851635, "tokens_seen": 529337344 }, { "epoch": 1.06, "learning_rate": 0.0004240320962888666, "loss": 2.9919, "theoretical_loss": 3.890954159529929, "tokens_seen": 529402880 }, { "epoch": 1.06, "learning_rate": 0.0004240220661985958, "loss": 2.944, "theoretical_loss": 3.8909027928146083, "tokens_seen": 529468416 }, { "epoch": 1.06, "learning_rate": 0.000424012036108325, "loss": 3.0479, "theoretical_loss": 3.8908514342369047, "tokens_seen": 529533952 }, { "epoch": 1.06, "learning_rate": 0.00042400200601805415, "loss": 2.9341, "theoretical_loss": 3.890800083794521, "tokens_seen": 529599488 }, { "epoch": 1.06, "learning_rate": 0.0004239919759277834, "loss": 3.0989, "theoretical_loss": 3.8907487414851634, "tokens_seen": 529665024 }, { "epoch": 1.06, "learning_rate": 0.00042398194583751256, "loss": 3.1958, "theoretical_loss": 3.8906974073065372, "tokens_seen": 529730560 }, { "epoch": 1.06, "learning_rate": 0.00042397191574724175, "loss": 3.177, "theoretical_loss": 3.890646081256349, "tokens_seen": 529796096 }, { "epoch": 1.06, "learning_rate": 0.0004239618856569709, "loss": 3.1181, "theoretical_loss": 3.890594763332307, "tokens_seen": 529861632 }, { "epoch": 1.06, "learning_rate": 0.0004239518555667001, "loss": 2.938, "theoretical_loss": 3.8905434535321186, "tokens_seen": 529927168 }, { "epoch": 1.06, "learning_rate": 0.0004239418254764293, "loss": 2.9935, "theoretical_loss": 3.890492151853495, "tokens_seen": 529992704 }, { "epoch": 1.06, "learning_rate": 0.0004239317953861585, "loss": 3.1235, "theoretical_loss": 3.890440858294145, "tokens_seen": 530058240 }, { "epoch": 1.06, "learning_rate": 0.00042392176529588765, "loss": 3.0834, "theoretical_loss": 3.8903895728517814, "tokens_seen": 530123776 }, { "epoch": 1.06, "learning_rate": 0.0004239117352056169, "loss": 2.9165, "theoretical_loss": 3.8903382955241157, "tokens_seen": 530189312 }, { "epoch": 1.06, "learning_rate": 0.000423901705115346, "loss": 2.9996, "theoretical_loss": 3.8902870263088607, "tokens_seen": 530254848 }, { "epoch": 1.06, "learning_rate": 0.00042389167502507525, "loss": 3.0322, "theoretical_loss": 3.890235765203732, "tokens_seen": 530320384 }, { "epoch": 1.06, "learning_rate": 0.00042388164493480443, "loss": 3.0527, "theoretical_loss": 3.8901845122064436, "tokens_seen": 530385920 }, { "epoch": 1.06, "learning_rate": 0.0004238716148445336, "loss": 2.9296, "theoretical_loss": 3.8901332673147113, "tokens_seen": 530451456 }, { "epoch": 1.06, "learning_rate": 0.0004238615847542628, "loss": 3.2916, "theoretical_loss": 3.890082030526253, "tokens_seen": 530516992 }, { "epoch": 1.06, "learning_rate": 0.000423851554663992, "loss": 3.0408, "theoretical_loss": 3.890030801838786, "tokens_seen": 530582528 }, { "epoch": 1.06, "learning_rate": 0.00042384152457372115, "loss": 3.0066, "theoretical_loss": 3.889979581250029, "tokens_seen": 530648064 }, { "epoch": 1.06, "learning_rate": 0.0004238314944834504, "loss": 3.0025, "theoretical_loss": 3.889928368757702, "tokens_seen": 530713600 }, { "epoch": 1.06, "learning_rate": 0.0004238214643931795, "loss": 2.9818, "theoretical_loss": 3.8898771643595254, "tokens_seen": 530779136 }, { "epoch": 1.06, "objective/train/docs_used": 868661, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.094383955001831, "objective/train/theoretical_loss": 3.8898387663713083, "objective/train/tokens_used": 551288288, "theoretical_loss": 3.8898387663713083, "tokens_seen": 530828288 }, { "epoch": 1.06, "learning_rate": 0.00042381143430290875, "loss": 3.0014, "theoretical_loss": 3.889825968053221, "tokens_seen": 530844672 }, { "epoch": 1.06, "learning_rate": 0.00042380140421263793, "loss": 3.0813, "theoretical_loss": 3.8897747798365105, "tokens_seen": 530910208 }, { "epoch": 1.06, "learning_rate": 0.0004237913741223671, "loss": 2.9998, "theoretical_loss": 3.8897235997071187, "tokens_seen": 530975744 }, { "epoch": 1.06, "learning_rate": 0.0004237813440320963, "loss": 3.044, "theoretical_loss": 3.8896724276627683, "tokens_seen": 531041280 }, { "epoch": 1.06, "learning_rate": 0.0004237713139418255, "loss": 2.8894, "theoretical_loss": 3.889621263701186, "tokens_seen": 531106816 }, { "epoch": 1.06, "learning_rate": 0.00042376128385155466, "loss": 3.1702, "theoretical_loss": 3.8895701078200973, "tokens_seen": 531172352 }, { "epoch": 1.06, "learning_rate": 0.0004237512537612839, "loss": 2.7551, "theoretical_loss": 3.8895189600172295, "tokens_seen": 531237888 }, { "epoch": 1.06, "learning_rate": 0.000423741223671013, "loss": 2.9161, "theoretical_loss": 3.88946782029031, "tokens_seen": 531303424 }, { "epoch": 1.06, "learning_rate": 0.00042373119358074226, "loss": 3.0482, "theoretical_loss": 3.889416688637069, "tokens_seen": 531368960 }, { "epoch": 1.06, "learning_rate": 0.0004237211634904714, "loss": 3.059, "theoretical_loss": 3.889365565055235, "tokens_seen": 531434496 }, { "epoch": 1.06, "learning_rate": 0.0004237111334002006, "loss": 3.1417, "theoretical_loss": 3.889314449542539, "tokens_seen": 531500032 }, { "epoch": 1.06, "learning_rate": 0.0004237011033099298, "loss": 3.1147, "theoretical_loss": 3.8892633420967133, "tokens_seen": 531565568 }, { "epoch": 1.06, "learning_rate": 0.000423691073219659, "loss": 3.1322, "theoretical_loss": 3.88921224271549, "tokens_seen": 531631104 }, { "epoch": 1.06, "learning_rate": 0.00042368104312938816, "loss": 3.027, "theoretical_loss": 3.8891611513966033, "tokens_seen": 531696640 }, { "epoch": 1.06, "learning_rate": 0.0004236710130391174, "loss": 2.8295, "theoretical_loss": 3.8891100681377866, "tokens_seen": 531762176 }, { "epoch": 1.06, "learning_rate": 0.0004236609829488465, "loss": 3.2726, "theoretical_loss": 3.889058992936776, "tokens_seen": 531827712 }, { "epoch": 1.06, "learning_rate": 0.00042365095285857576, "loss": 2.9792, "theoretical_loss": 3.889007925791307, "tokens_seen": 531893248 }, { "epoch": 1.06, "learning_rate": 0.0004236409227683049, "loss": 2.997, "theoretical_loss": 3.888956866699118, "tokens_seen": 531958784 }, { "epoch": 1.06, "learning_rate": 0.0004236308926780341, "loss": 3.1235, "theoretical_loss": 3.888905815657946, "tokens_seen": 532024320 }, { "epoch": 1.06, "learning_rate": 0.0004236208625877633, "loss": 3.0846, "theoretical_loss": 3.88885477266553, "tokens_seen": 532089856 }, { "epoch": 1.06, "learning_rate": 0.0004236108324974925, "loss": 3.0191, "theoretical_loss": 3.888803737719611, "tokens_seen": 532155392 }, { "epoch": 1.06, "learning_rate": 0.00042360080240722166, "loss": 3.1293, "theoretical_loss": 3.888752710817929, "tokens_seen": 532220928 }, { "epoch": 1.06, "learning_rate": 0.00042359077231695085, "loss": 3.0712, "theoretical_loss": 3.888701691958226, "tokens_seen": 532286464 }, { "epoch": 1.06, "learning_rate": 0.00042358074222668, "loss": 3.0961, "theoretical_loss": 3.888650681138244, "tokens_seen": 532352000 }, { "epoch": 1.06, "learning_rate": 0.00042357071213640926, "loss": 3.0329, "theoretical_loss": 3.8885996783557273, "tokens_seen": 532417536 }, { "epoch": 1.06, "objective/train/docs_used": 871511, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.043645143508911, "objective/train/theoretical_loss": 3.88856143154207, "objective/train/tokens_used": 552926688, "theoretical_loss": 3.88856143154207, "tokens_seen": 532466688 }, { "epoch": 1.06, "learning_rate": 0.0004235606820461384, "loss": 3.03, "theoretical_loss": 3.8885486836084207, "tokens_seen": 532483072 }, { "epoch": 1.06, "learning_rate": 0.0004235506519558676, "loss": 3.1529, "theoretical_loss": 3.8884976968940683, "tokens_seen": 532548608 }, { "epoch": 1.06, "learning_rate": 0.00042354062186559675, "loss": 3.0244, "theoretical_loss": 3.888446718210418, "tokens_seen": 532614144 }, { "epoch": 1.06, "learning_rate": 0.000423530591775326, "loss": 3.0716, "theoretical_loss": 3.8883957475552156, "tokens_seen": 532679680 }, { "epoch": 1.06, "learning_rate": 0.00042352056168505517, "loss": 2.8541, "theoretical_loss": 3.8883447849262094, "tokens_seen": 532745216 }, { "epoch": 1.06, "learning_rate": 0.00042351053159478435, "loss": 3.0092, "theoretical_loss": 3.8882938303211496, "tokens_seen": 532810752 }, { "epoch": 1.06, "learning_rate": 0.00042350050150451353, "loss": 3.0843, "theoretical_loss": 3.888242883737785, "tokens_seen": 532876288 }, { "epoch": 1.06, "learning_rate": 0.00042349047141424276, "loss": 3.1023, "theoretical_loss": 3.8881919451738662, "tokens_seen": 532941824 }, { "epoch": 1.06, "learning_rate": 0.0004234804413239719, "loss": 3.1739, "theoretical_loss": 3.8881410146271467, "tokens_seen": 533007360 }, { "epoch": 1.06, "learning_rate": 0.00042347041123370113, "loss": 3.166, "theoretical_loss": 3.888090092095377, "tokens_seen": 533072896 }, { "epoch": 1.06, "learning_rate": 0.00042346038114343025, "loss": 2.9887, "theoretical_loss": 3.888039177576312, "tokens_seen": 533138432 }, { "epoch": 1.06, "learning_rate": 0.0004234503510531595, "loss": 3.0967, "theoretical_loss": 3.887988271067706, "tokens_seen": 533203968 }, { "epoch": 1.06, "learning_rate": 0.00042344032096288867, "loss": 3.1821, "theoretical_loss": 3.887937372567314, "tokens_seen": 533269504 }, { "epoch": 1.06, "learning_rate": 0.00042343029087261785, "loss": 2.9899, "theoretical_loss": 3.8878864820728922, "tokens_seen": 533335040 }, { "epoch": 1.06, "learning_rate": 0.0004234202607823471, "loss": 2.9899, "theoretical_loss": 3.887835599582198, "tokens_seen": 533400576 }, { "epoch": 1.06, "learning_rate": 0.0004234102306920762, "loss": 3.192, "theoretical_loss": 3.88778472509299, "tokens_seen": 533466112 }, { "epoch": 1.06, "learning_rate": 0.00042340020060180545, "loss": 3.0955, "theoretical_loss": 3.887733858603027, "tokens_seen": 533531648 }, { "epoch": 1.06, "learning_rate": 0.00042339017051153463, "loss": 3.1135, "theoretical_loss": 3.8876830001100675, "tokens_seen": 533597184 }, { "epoch": 1.06, "learning_rate": 0.0004233801404212638, "loss": 3.1335, "theoretical_loss": 3.887632149611874, "tokens_seen": 533662720 }, { "epoch": 1.06, "learning_rate": 0.000423370110330993, "loss": 3.0503, "theoretical_loss": 3.8875813071062075, "tokens_seen": 533728256 }, { "epoch": 1.06, "learning_rate": 0.0004233600802407222, "loss": 3.1273, "theoretical_loss": 3.8875304725908304, "tokens_seen": 533793792 }, { "epoch": 1.06, "learning_rate": 0.00042335005015045135, "loss": 3.0741, "theoretical_loss": 3.8874796460635066, "tokens_seen": 533859328 }, { "epoch": 1.06, "learning_rate": 0.0004233400200601806, "loss": 3.0363, "theoretical_loss": 3.887428827522001, "tokens_seen": 533924864 }, { "epoch": 1.06, "learning_rate": 0.0004233299899699097, "loss": 3.1111, "theoretical_loss": 3.8873780169640773, "tokens_seen": 533990400 }, { "epoch": 1.06, "learning_rate": 0.00042331995987963895, "loss": 3.0187, "theoretical_loss": 3.8873272143875033, "tokens_seen": 534055936 }, { "epoch": 1.06, "objective/train/docs_used": 874112, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1278533935546875, "objective/train/theoretical_loss": 3.8872891176914894, "objective/train/tokens_used": 554565088, "theoretical_loss": 3.8872891176914894, "tokens_seen": 534105088 }, { "epoch": 1.06, "learning_rate": 0.00042330992978936813, "loss": 3.2169, "theoretical_loss": 3.8872764197900453, "tokens_seen": 534121472 }, { "epoch": 1.06, "learning_rate": 0.0004232998996990973, "loss": 2.8838, "theoretical_loss": 3.8872256331694715, "tokens_seen": 534187008 }, { "epoch": 1.06, "learning_rate": 0.0004232898696088265, "loss": 3.0513, "theoretical_loss": 3.8871748545235505, "tokens_seen": 534252544 }, { "epoch": 1.06, "learning_rate": 0.0004232798395185557, "loss": 2.9958, "theoretical_loss": 3.8871240838500523, "tokens_seen": 534318080 }, { "epoch": 1.06, "learning_rate": 0.00042326980942828486, "loss": 3.0153, "theoretical_loss": 3.8870733211467483, "tokens_seen": 534383616 }, { "epoch": 1.06, "learning_rate": 0.0004232597793380141, "loss": 3.1903, "theoretical_loss": 3.8870225664114084, "tokens_seen": 534449152 }, { "epoch": 1.06, "learning_rate": 0.0004232497492477432, "loss": 3.0256, "theoretical_loss": 3.886971819641807, "tokens_seen": 534514688 }, { "epoch": 1.06, "learning_rate": 0.00042323971915747246, "loss": 3.0252, "theoretical_loss": 3.886921080835716, "tokens_seen": 534580224 }, { "epoch": 1.06, "learning_rate": 0.0004232296890672016, "loss": 3.0305, "theoretical_loss": 3.88687034999091, "tokens_seen": 534645760 }, { "epoch": 1.06, "learning_rate": 0.0004232196589769308, "loss": 3.1591, "theoretical_loss": 3.8868196271051643, "tokens_seen": 534711296 }, { "epoch": 1.06, "learning_rate": 0.00042320962888666, "loss": 3.1067, "theoretical_loss": 3.8867689121762554, "tokens_seen": 534776832 }, { "epoch": 1.06, "learning_rate": 0.0004231995987963892, "loss": 2.8953, "theoretical_loss": 3.88671820520196, "tokens_seen": 534842368 }, { "epoch": 1.06, "learning_rate": 0.00042318956870611836, "loss": 3.0309, "theoretical_loss": 3.886667506180056, "tokens_seen": 534907904 }, { "epoch": 1.06, "learning_rate": 0.0004231795386158476, "loss": 3.2016, "theoretical_loss": 3.8866168151083214, "tokens_seen": 534973440 }, { "epoch": 1.06, "learning_rate": 0.0004231695085255767, "loss": 3.0882, "theoretical_loss": 3.886566131984537, "tokens_seen": 535038976 }, { "epoch": 1.06, "learning_rate": 0.00042315947843530596, "loss": 3.0441, "theoretical_loss": 3.8865154568064826, "tokens_seen": 535104512 }, { "epoch": 1.06, "learning_rate": 0.0004231494483450351, "loss": 2.9953, "theoretical_loss": 3.8864647895719395, "tokens_seen": 535170048 }, { "epoch": 1.06, "learning_rate": 0.0004231394182547643, "loss": 3.1442, "theoretical_loss": 3.8864141302786903, "tokens_seen": 535235584 }, { "epoch": 1.06, "learning_rate": 0.0004231293881644935, "loss": 3.1332, "theoretical_loss": 3.886363478924518, "tokens_seen": 535301120 }, { "epoch": 1.06, "learning_rate": 0.0004231193580742227, "loss": 3.0233, "theoretical_loss": 3.8863128355072076, "tokens_seen": 535366656 }, { "epoch": 1.06, "learning_rate": 0.00042310932798395186, "loss": 3.1448, "theoretical_loss": 3.886262200024543, "tokens_seen": 535432192 }, { "epoch": 1.06, "learning_rate": 0.00042309929789368105, "loss": 3.052, "theoretical_loss": 3.88621157247431, "tokens_seen": 535497728 }, { "epoch": 1.06, "learning_rate": 0.0004230892678034102, "loss": 2.9851, "theoretical_loss": 3.8861609528542957, "tokens_seen": 535563264 }, { "epoch": 1.06, "learning_rate": 0.00042307923771313946, "loss": 3.1232, "theoretical_loss": 3.886110341162288, "tokens_seen": 535628800 }, { "epoch": 1.06, "learning_rate": 0.0004230692076228686, "loss": 3.145, "theoretical_loss": 3.8860597373960752, "tokens_seen": 535694336 }, { "epoch": 1.06, "objective/train/docs_used": 877004, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.85671067237854, "objective/train/theoretical_loss": 3.886021789771389, "objective/train/tokens_used": 556203488, "theoretical_loss": 3.886021789771389, "tokens_seen": 535743488 }, { "epoch": 1.06, "learning_rate": 0.0004230591775325978, "loss": 3.1264, "theoretical_loss": 3.886009141553447, "tokens_seen": 535759872 }, { "epoch": 1.06, "learning_rate": 0.00042304914744232695, "loss": 2.8345, "theoretical_loss": 3.885958553632193, "tokens_seen": 535825408 }, { "epoch": 1.06, "learning_rate": 0.0004230391173520562, "loss": 3.0807, "theoretical_loss": 3.885907973630105, "tokens_seen": 535890944 }, { "epoch": 1.06, "learning_rate": 0.00042302908726178537, "loss": 3.07, "theoretical_loss": 3.885857401544975, "tokens_seen": 535956480 }, { "epoch": 1.06, "learning_rate": 0.00042301905717151455, "loss": 3.2314, "theoretical_loss": 3.885806837374596, "tokens_seen": 536022016 }, { "epoch": 1.06, "learning_rate": 0.00042300902708124373, "loss": 3.1235, "theoretical_loss": 3.8857562811167616, "tokens_seen": 536087552 }, { "epoch": 1.06, "learning_rate": 0.00042299899699097297, "loss": 3.1864, "theoretical_loss": 3.8857057327692672, "tokens_seen": 536153088 }, { "epoch": 1.06, "learning_rate": 0.0004229889669007021, "loss": 3.2015, "theoretical_loss": 3.8856551923299074, "tokens_seen": 536218624 }, { "epoch": 1.06, "learning_rate": 0.00042297893681043133, "loss": 3.2125, "theoretical_loss": 3.8856046597964786, "tokens_seen": 536284160 }, { "epoch": 1.06, "learning_rate": 0.00042296890672016045, "loss": 3.086, "theoretical_loss": 3.885554135166779, "tokens_seen": 536349696 }, { "epoch": 1.06, "learning_rate": 0.0004229588766298897, "loss": 3.1535, "theoretical_loss": 3.885503618438607, "tokens_seen": 536415232 }, { "epoch": 1.06, "learning_rate": 0.00042294884653961887, "loss": 2.926, "theoretical_loss": 3.8854531096097613, "tokens_seen": 536480768 }, { "epoch": 1.06, "learning_rate": 0.00042293881644934805, "loss": 3.2238, "theoretical_loss": 3.885402608678042, "tokens_seen": 536546304 }, { "epoch": 1.06, "learning_rate": 0.00042292878635907723, "loss": 3.1048, "theoretical_loss": 3.88535211564125, "tokens_seen": 536611840 }, { "epoch": 1.06, "learning_rate": 0.0004229187562688064, "loss": 3.0263, "theoretical_loss": 3.885301630497187, "tokens_seen": 536677376 }, { "epoch": 1.06, "learning_rate": 0.0004229087261785356, "loss": 3.0271, "theoretical_loss": 3.885251153243656, "tokens_seen": 536742912 }, { "epoch": 1.06, "learning_rate": 0.00042289869608826483, "loss": 3.1228, "theoretical_loss": 3.88520068387846, "tokens_seen": 536808448 }, { "epoch": 1.06, "learning_rate": 0.00042288866599799396, "loss": 3.0232, "theoretical_loss": 3.885150222399404, "tokens_seen": 536873984 }, { "epoch": 1.06, "learning_rate": 0.0004228786359077232, "loss": 3.0849, "theoretical_loss": 3.885099768804293, "tokens_seen": 536939520 }, { "epoch": 1.06, "learning_rate": 0.0004228686058174523, "loss": 2.9835, "theoretical_loss": 3.885049323090933, "tokens_seen": 537005056 }, { "epoch": 1.06, "learning_rate": 0.00042285857572718155, "loss": 3.1185, "theoretical_loss": 3.884998885257132, "tokens_seen": 537070592 }, { "epoch": 1.06, "learning_rate": 0.00042284854563691074, "loss": 3.16, "theoretical_loss": 3.8849484553006968, "tokens_seen": 537136128 }, { "epoch": 1.06, "learning_rate": 0.0004228385155466399, "loss": 3.0289, "theoretical_loss": 3.8848980332194367, "tokens_seen": 537201664 }, { "epoch": 1.06, "learning_rate": 0.0004228284854563691, "loss": 2.8052, "theoretical_loss": 3.884847619011161, "tokens_seen": 537267200 }, { "epoch": 1.06, "learning_rate": 0.00042281845536609833, "loss": 3.099, "theoretical_loss": 3.8847972126736816, "tokens_seen": 537332736 }, { "epoch": 1.06, "objective/train/docs_used": 879748, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2657930850982666, "objective/train/theoretical_loss": 3.8847594130844643, "objective/train/tokens_used": 557841888, "theoretical_loss": 3.8847594130844643, "tokens_seen": 537381888 }, { "epoch": 1.06, "learning_rate": 0.00042280842527582746, "loss": 3.1233, "theoretical_loss": 3.8847468142048083, "tokens_seen": 537398272 }, { "epoch": 1.06, "learning_rate": 0.0004227983951855567, "loss": 3.1184, "theoretical_loss": 3.884696423602355, "tokens_seen": 537463808 }, { "epoch": 1.06, "learning_rate": 0.0004227883650952858, "loss": 2.8755, "theoretical_loss": 3.8846460408641335, "tokens_seen": 537529344 }, { "epoch": 1.06, "learning_rate": 0.00042277833500501506, "loss": 2.9908, "theoretical_loss": 3.884595665987958, "tokens_seen": 537594880 }, { "epoch": 1.06, "learning_rate": 0.00042276830491474424, "loss": 2.9439, "theoretical_loss": 3.884545298971644, "tokens_seen": 537660416 }, { "epoch": 1.06, "learning_rate": 0.0004227582748244734, "loss": 3.0397, "theoretical_loss": 3.884494939813008, "tokens_seen": 537725952 }, { "epoch": 1.06, "learning_rate": 0.0004227482447342026, "loss": 3.0687, "theoretical_loss": 3.884444588509865, "tokens_seen": 537791488 }, { "epoch": 1.06, "learning_rate": 0.0004227382146439318, "loss": 3.1784, "theoretical_loss": 3.884394245060034, "tokens_seen": 537857024 }, { "epoch": 1.06, "learning_rate": 0.00042272818455366096, "loss": 2.8869, "theoretical_loss": 3.8843439094613323, "tokens_seen": 537922560 }, { "epoch": 1.06, "learning_rate": 0.0004227181544633902, "loss": 3.0832, "theoretical_loss": 3.8842935817115807, "tokens_seen": 537988096 }, { "epoch": 1.06, "learning_rate": 0.0004227081243731193, "loss": 3.1589, "theoretical_loss": 3.884243261808598, "tokens_seen": 538053632 }, { "epoch": 1.06, "learning_rate": 0.00042269809428284856, "loss": 3.0466, "theoretical_loss": 3.884192949750206, "tokens_seen": 538119168 }, { "epoch": 1.06, "learning_rate": 0.0004226880641925777, "loss": 3.0664, "theoretical_loss": 3.8841426455342263, "tokens_seen": 538184704 }, { "epoch": 1.06, "learning_rate": 0.0004226780341023069, "loss": 3.0821, "theoretical_loss": 3.8840923491584816, "tokens_seen": 538250240 }, { "epoch": 1.06, "learning_rate": 0.00042266800401203616, "loss": 3.0921, "theoretical_loss": 3.8840420606207955, "tokens_seen": 538315776 }, { "epoch": 1.06, "learning_rate": 0.0004226579739217653, "loss": 3.0251, "theoretical_loss": 3.8839917799189934, "tokens_seen": 538381312 }, { "epoch": 1.06, "learning_rate": 0.0004226479438314945, "loss": 3.0307, "theoretical_loss": 3.8839415070509, "tokens_seen": 538446848 }, { "epoch": 1.06, "learning_rate": 0.0004226379137412237, "loss": 2.8923, "theoretical_loss": 3.883891242014341, "tokens_seen": 538512384 }, { "epoch": 1.06, "learning_rate": 0.0004226278836509529, "loss": 3.1328, "theoretical_loss": 3.8838409848071453, "tokens_seen": 538577920 }, { "epoch": 1.06, "learning_rate": 0.00042261785356068206, "loss": 3.1386, "theoretical_loss": 3.883790735427139, "tokens_seen": 538643456 }, { "epoch": 1.06, "learning_rate": 0.00042260782347041125, "loss": 3.1013, "theoretical_loss": 3.8837404938721516, "tokens_seen": 538708992 }, { "epoch": 1.06, "learning_rate": 0.0004225977933801404, "loss": 2.9568, "theoretical_loss": 3.8836902601400136, "tokens_seen": 538774528 }, { "epoch": 1.06, "learning_rate": 0.00042258776328986966, "loss": 3.0766, "theoretical_loss": 3.883640034228555, "tokens_seen": 538840064 }, { "epoch": 1.06, "learning_rate": 0.0004225777331995988, "loss": 3.0785, "theoretical_loss": 3.883589816135607, "tokens_seen": 538905600 }, { "epoch": 1.06, "learning_rate": 0.000422567703109328, "loss": 2.9871, "theoretical_loss": 3.883539605859002, "tokens_seen": 538971136 }, { "epoch": 1.06, "objective/train/docs_used": 882573, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7831521034240723, "objective/train/theoretical_loss": 3.88350195327972, "objective/train/tokens_used": 559480288, "theoretical_loss": 3.88350195327972, "tokens_seen": 539020288 }, { "epoch": 1.06, "learning_rate": 0.00042255767301905715, "loss": 3.0169, "theoretical_loss": 3.883489403396574, "tokens_seen": 539036672 }, { "epoch": 1.06, "learning_rate": 0.0004225476429287864, "loss": 2.9763, "theoretical_loss": 3.8834392087461556, "tokens_seen": 539102208 }, { "epoch": 1.06, "learning_rate": 0.00042253761283851557, "loss": 3.1432, "theoretical_loss": 3.8833890219055833, "tokens_seen": 539167744 }, { "epoch": 1.06, "learning_rate": 0.00042252758274824475, "loss": 2.9656, "theoretical_loss": 3.8833388428726923, "tokens_seen": 539233280 }, { "epoch": 1.06, "learning_rate": 0.00042251755265797393, "loss": 3.0572, "theoretical_loss": 3.8832886716453188, "tokens_seen": 539298816 }, { "epoch": 1.06, "learning_rate": 0.00042250752256770317, "loss": 3.0498, "theoretical_loss": 3.883238508221301, "tokens_seen": 539364352 }, { "epoch": 1.06, "learning_rate": 0.0004224974924774323, "loss": 3.0303, "theoretical_loss": 3.8831883525984763, "tokens_seen": 539429888 }, { "epoch": 1.06, "learning_rate": 0.00042248746238716153, "loss": 3.1154, "theoretical_loss": 3.883138204774685, "tokens_seen": 539495424 }, { "epoch": 1.06, "learning_rate": 0.00042247743229689065, "loss": 3.0497, "theoretical_loss": 3.8830880647477666, "tokens_seen": 539560960 }, { "epoch": 1.06, "learning_rate": 0.0004224674022066199, "loss": 3.1623, "theoretical_loss": 3.8830379325155624, "tokens_seen": 539626496 }, { "epoch": 1.06, "learning_rate": 0.00042245737211634907, "loss": 3.1364, "theoretical_loss": 3.8829878080759137, "tokens_seen": 539692032 }, { "epoch": 1.06, "learning_rate": 0.00042244734202607825, "loss": 3.1934, "theoretical_loss": 3.882937691426664, "tokens_seen": 539757568 }, { "epoch": 1.06, "learning_rate": 0.00042243731193580743, "loss": 3.2146, "theoretical_loss": 3.8828875825656564, "tokens_seen": 539823104 }, { "epoch": 1.06, "learning_rate": 0.0004224272818455366, "loss": 2.8501, "theoretical_loss": 3.8828374814907347, "tokens_seen": 539888640 }, { "epoch": 1.06, "learning_rate": 0.0004224172517552658, "loss": 3.1932, "theoretical_loss": 3.8827873881997457, "tokens_seen": 539954176 }, { "epoch": 1.06, "learning_rate": 0.00042240722166499503, "loss": 3.188, "theoretical_loss": 3.8827373026905336, "tokens_seen": 540019712 }, { "epoch": 1.06, "learning_rate": 0.00042239719157472416, "loss": 3.041, "theoretical_loss": 3.8826872249609465, "tokens_seen": 540085248 }, { "epoch": 1.06, "learning_rate": 0.0004223871614844534, "loss": 3.0899, "theoretical_loss": 3.882637155008832, "tokens_seen": 540150784 }, { "epoch": 1.06, "learning_rate": 0.0004223771313941825, "loss": 3.2732, "theoretical_loss": 3.882587092832039, "tokens_seen": 540216320 }, { "epoch": 1.06, "learning_rate": 0.00042236710130391176, "loss": 2.9741, "theoretical_loss": 3.8825370384284166, "tokens_seen": 540281856 }, { "epoch": 1.06, "learning_rate": 0.00042235707121364094, "loss": 3.0483, "theoretical_loss": 3.8824869917958154, "tokens_seen": 540347392 }, { "epoch": 1.06, "learning_rate": 0.0004223470411233701, "loss": 3.3234, "theoretical_loss": 3.8824369529320872, "tokens_seen": 540412928 }, { "epoch": 1.06, "learning_rate": 0.0004223370110330993, "loss": 3.2732, "theoretical_loss": 3.882386921835083, "tokens_seen": 540478464 }, { "epoch": 1.06, "learning_rate": 0.00042232698094282853, "loss": 3.1458, "theoretical_loss": 3.8823368985026567, "tokens_seen": 540544000 }, { "epoch": 1.06, "learning_rate": 0.00042231695085255766, "loss": 3.0079, "theoretical_loss": 3.8822868829326618, "tokens_seen": 540609536 }, { "epoch": 1.06, "objective/train/docs_used": 885364, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.225994110107422, "objective/train/theoretical_loss": 3.8822493763479704, "objective/train/tokens_used": 561118688, "theoretical_loss": 3.8822493763479704, "tokens_seen": 540658688 }, { "epoch": 1.06, "learning_rate": 0.0004223069207622869, "loss": 3.0154, "theoretical_loss": 3.8822368751229526, "tokens_seen": 540675072 }, { "epoch": 1.06, "learning_rate": 0.000422296890672016, "loss": 3.1326, "theoretical_loss": 3.882186875071385, "tokens_seen": 540740608 }, { "epoch": 1.06, "learning_rate": 0.00042228686058174526, "loss": 3.0896, "theoretical_loss": 3.882136882775815, "tokens_seen": 540806144 }, { "epoch": 1.06, "learning_rate": 0.00042227683049147444, "loss": 2.9498, "theoretical_loss": 3.8820868982341006, "tokens_seen": 540871680 }, { "epoch": 1.06, "learning_rate": 0.0004222668004012036, "loss": 3.1846, "theoretical_loss": 3.8820369214440995, "tokens_seen": 540937216 }, { "epoch": 1.06, "learning_rate": 0.0004222567703109328, "loss": 3.0129, "theoretical_loss": 3.88198695240367, "tokens_seen": 541002752 }, { "epoch": 1.06, "learning_rate": 0.000422246740220662, "loss": 3.15, "theoretical_loss": 3.881936991110673, "tokens_seen": 541068288 }, { "epoch": 1.06, "learning_rate": 0.00042223671013039116, "loss": 3.1765, "theoretical_loss": 3.881887037562968, "tokens_seen": 541133824 }, { "epoch": 1.06, "learning_rate": 0.0004222266800401204, "loss": 3.1909, "theoretical_loss": 3.8818370917584177, "tokens_seen": 541199360 }, { "epoch": 1.06, "learning_rate": 0.0004222166499498495, "loss": 2.9573, "theoretical_loss": 3.881787153694883, "tokens_seen": 541264896 }, { "epoch": 1.06, "learning_rate": 0.00042220661985957876, "loss": 3.0614, "theoretical_loss": 3.8817372233702283, "tokens_seen": 541330432 }, { "epoch": 1.06, "learning_rate": 0.0004221965897693079, "loss": 3.1054, "theoretical_loss": 3.8816873007823167, "tokens_seen": 541395968 }, { "epoch": 1.06, "learning_rate": 0.0004221865596790371, "loss": 2.8949, "theoretical_loss": 3.881637385929014, "tokens_seen": 541461504 }, { "epoch": 1.06, "learning_rate": 0.0004221765295887663, "loss": 3.1013, "theoretical_loss": 3.881587478808185, "tokens_seen": 541527040 }, { "epoch": 1.06, "learning_rate": 0.0004221664994984955, "loss": 3.0932, "theoretical_loss": 3.8815375794176967, "tokens_seen": 541592576 }, { "epoch": 1.06, "learning_rate": 0.00042215646940822467, "loss": 3.0526, "theoretical_loss": 3.881487687755417, "tokens_seen": 541658112 }, { "epoch": 1.06, "learning_rate": 0.0004221464393179539, "loss": 3.1263, "theoretical_loss": 3.881437803819213, "tokens_seen": 541723648 }, { "epoch": 1.06, "learning_rate": 0.00042213640922768303, "loss": 3.1716, "theoretical_loss": 3.881387927606955, "tokens_seen": 541789184 }, { "epoch": 1.06, "learning_rate": 0.00042212637913741226, "loss": 3.1583, "theoretical_loss": 3.881338059116512, "tokens_seen": 541854720 }, { "epoch": 1.06, "learning_rate": 0.0004221163490471414, "loss": 3.202, "theoretical_loss": 3.8812881983457554, "tokens_seen": 541920256 }, { "epoch": 1.06, "learning_rate": 0.0004221063189568706, "loss": 3.101, "theoretical_loss": 3.8812383452925565, "tokens_seen": 541985792 }, { "epoch": 1.06, "learning_rate": 0.0004220962888665998, "loss": 3.0861, "theoretical_loss": 3.8811884999547877, "tokens_seen": 542051328 }, { "epoch": 1.06, "learning_rate": 0.000422086258776329, "loss": 3.0908, "theoretical_loss": 3.881138662330323, "tokens_seen": 542116864 }, { "epoch": 1.06, "learning_rate": 0.00042207622868605817, "loss": 3.0339, "theoretical_loss": 3.881088832417036, "tokens_seen": 542182400 }, { "epoch": 1.06, "learning_rate": 0.00042206619859578735, "loss": 3.2825, "theoretical_loss": 3.8810390102128016, "tokens_seen": 542247936 }, { "epoch": 1.06, "objective/train/docs_used": 886740, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9755074977874756, "objective/train/theoretical_loss": 3.8810016486174135, "objective/train/tokens_used": 562757088, "theoretical_loss": 3.8810016486174135, "tokens_seen": 542297088 }, { "epoch": 1.06, "learning_rate": 0.00042205616850551653, "loss": 3.1196, "theoretical_loss": 3.880989195715496, "tokens_seen": 542313472 }, { "epoch": 1.06, "learning_rate": 0.00042204613841524577, "loss": 2.9544, "theoretical_loss": 3.8809393889229957, "tokens_seen": 542379008 }, { "epoch": 1.06, "learning_rate": 0.0004220361083249749, "loss": 3.118, "theoretical_loss": 3.880889589833178, "tokens_seen": 542444544 }, { "epoch": 1.06, "learning_rate": 0.00042202607823470413, "loss": 2.9317, "theoretical_loss": 3.8808397984439225, "tokens_seen": 542510080 }, { "epoch": 1.06, "learning_rate": 0.0004220160481444333, "loss": 3.0324, "theoretical_loss": 3.8807900147531065, "tokens_seen": 542575616 }, { "epoch": 1.06, "learning_rate": 0.0004220060180541625, "loss": 3.0037, "theoretical_loss": 3.8807402387586114, "tokens_seen": 542641152 }, { "epoch": 1.06, "learning_rate": 0.0004219959879638917, "loss": 3.1715, "theoretical_loss": 3.880690470458318, "tokens_seen": 542706688 }, { "epoch": 1.06, "learning_rate": 0.00042198595787362085, "loss": 2.8161, "theoretical_loss": 3.8806407098501072, "tokens_seen": 542772224 }, { "epoch": 1.06, "learning_rate": 0.00042197592778335004, "loss": 3.1488, "theoretical_loss": 3.8805909569318633, "tokens_seen": 542837760 }, { "epoch": 1.06, "learning_rate": 0.00042196589769307927, "loss": 2.8964, "theoretical_loss": 3.880541211701468, "tokens_seen": 542903296 }, { "epoch": 1.06, "learning_rate": 0.0004219558676028084, "loss": 2.9677, "theoretical_loss": 3.880491474156806, "tokens_seen": 542968832 }, { "epoch": 1.06, "learning_rate": 0.00042194583751253763, "loss": 3.0613, "theoretical_loss": 3.8804417442957626, "tokens_seen": 543034368 }, { "epoch": 1.06, "learning_rate": 0.00042193580742226676, "loss": 3.0955, "theoretical_loss": 3.8803920221162236, "tokens_seen": 543099904 }, { "epoch": 1.06, "learning_rate": 0.000421925777331996, "loss": 3.0273, "theoretical_loss": 3.880342307616076, "tokens_seen": 543165440 }, { "epoch": 1.06, "learning_rate": 0.00042191574724172523, "loss": 3.1791, "theoretical_loss": 3.880292600793207, "tokens_seen": 543230976 }, { "epoch": 1.06, "learning_rate": 0.00042190571715145436, "loss": 3.0842, "theoretical_loss": 3.880242901645506, "tokens_seen": 543296512 }, { "epoch": 1.06, "learning_rate": 0.0004218956870611836, "loss": 2.9302, "theoretical_loss": 3.8801932101708605, "tokens_seen": 543362048 }, { "epoch": 1.06, "learning_rate": 0.0004218856569709127, "loss": 3.1991, "theoretical_loss": 3.8801435263671626, "tokens_seen": 543427584 }, { "epoch": 1.06, "learning_rate": 0.00042187562688064196, "loss": 3.1406, "theoretical_loss": 3.8800938502323015, "tokens_seen": 543493120 }, { "epoch": 1.06, "learning_rate": 0.00042186559679037114, "loss": 3.0796, "theoretical_loss": 3.88004418176417, "tokens_seen": 543558656 }, { "epoch": 1.06, "learning_rate": 0.0004218555667001003, "loss": 3.0011, "theoretical_loss": 3.8799945209606603, "tokens_seen": 543624192 }, { "epoch": 1.06, "learning_rate": 0.0004218455366098295, "loss": 3.0656, "theoretical_loss": 3.879944867819667, "tokens_seen": 543689728 }, { "epoch": 1.06, "learning_rate": 0.00042183550651955873, "loss": 2.8879, "theoretical_loss": 3.8798952223390826, "tokens_seen": 543755264 }, { "epoch": 1.06, "learning_rate": 0.00042182547642928786, "loss": 2.8506, "theoretical_loss": 3.879845584516803, "tokens_seen": 543820800 }, { "epoch": 1.06, "learning_rate": 0.0004218154463390171, "loss": 3.0336, "theoretical_loss": 3.8797959543507243, "tokens_seen": 543886336 }, { "epoch": 1.06, "objective/train/docs_used": 889671, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8488991260528564, "objective/train/theoretical_loss": 3.879758736749282, "objective/train/tokens_used": 564395488, "theoretical_loss": 3.879758736749282, "tokens_seen": 543935488 }, { "epoch": 1.06, "learning_rate": 0.0004218054162487462, "loss": 2.9709, "theoretical_loss": 3.879746331838743, "tokens_seen": 543951872 }, { "epoch": 1.06, "learning_rate": 0.00042179538615847546, "loss": 3.0436, "theoretical_loss": 3.879696716978757, "tokens_seen": 544017408 }, { "epoch": 1.06, "learning_rate": 0.00042178535606820464, "loss": 3.124, "theoretical_loss": 3.8796471097686642, "tokens_seen": 544082944 }, { "epoch": 1.06, "learning_rate": 0.0004217753259779338, "loss": 2.9839, "theoretical_loss": 3.8795975102063647, "tokens_seen": 544148480 }, { "epoch": 1.06, "learning_rate": 0.000421765295887663, "loss": 3.0964, "theoretical_loss": 3.8795479182897576, "tokens_seen": 544214016 }, { "epoch": 1.06, "learning_rate": 0.0004217552657973922, "loss": 2.9558, "theoretical_loss": 3.879498334016745, "tokens_seen": 544279552 }, { "epoch": 1.06, "learning_rate": 0.00042174523570712136, "loss": 3.0717, "theoretical_loss": 3.8794487573852274, "tokens_seen": 544345088 }, { "epoch": 1.06, "learning_rate": 0.0004217352056168506, "loss": 2.9468, "theoretical_loss": 3.8793991883931085, "tokens_seen": 544410624 }, { "epoch": 1.06, "learning_rate": 0.0004217251755265797, "loss": 3.1256, "theoretical_loss": 3.8793496270382906, "tokens_seen": 544476160 }, { "epoch": 1.06, "learning_rate": 0.00042171514543630896, "loss": 3.0944, "theoretical_loss": 3.879300073318679, "tokens_seen": 544541696 }, { "epoch": 1.06, "learning_rate": 0.0004217051153460381, "loss": 3.0262, "theoretical_loss": 3.879250527232178, "tokens_seen": 544607232 }, { "epoch": 1.06, "learning_rate": 0.0004216950852557673, "loss": 2.8766, "theoretical_loss": 3.8792009887766934, "tokens_seen": 544672768 }, { "epoch": 1.06, "learning_rate": 0.0004216850551654965, "loss": 2.9041, "theoretical_loss": 3.8791514579501323, "tokens_seen": 544738304 }, { "epoch": 1.06, "learning_rate": 0.0004216750250752257, "loss": 3.0769, "theoretical_loss": 3.8791019347504028, "tokens_seen": 544803840 }, { "epoch": 1.06, "learning_rate": 0.00042166499498495487, "loss": 3.1264, "theoretical_loss": 3.8790524191754123, "tokens_seen": 544869376 }, { "epoch": 1.06, "learning_rate": 0.0004216549648946841, "loss": 3.1043, "theoretical_loss": 3.879002911223071, "tokens_seen": 544934912 }, { "epoch": 1.06, "learning_rate": 0.00042164493480441323, "loss": 3.0902, "theoretical_loss": 3.8789534108912873, "tokens_seen": 545000448 }, { "epoch": 1.06, "learning_rate": 0.00042163490471414246, "loss": 2.9686, "theoretical_loss": 3.878903918177974, "tokens_seen": 545065984 }, { "epoch": 1.06, "learning_rate": 0.0004216248746238716, "loss": 3.0774, "theoretical_loss": 3.8788544330810417, "tokens_seen": 545131520 }, { "epoch": 1.06, "learning_rate": 0.0004216148445336008, "loss": 3.0787, "theoretical_loss": 3.8788049555984023, "tokens_seen": 545197056 }, { "epoch": 1.06, "learning_rate": 0.00042160481444333, "loss": 2.8684, "theoretical_loss": 3.8787554857279702, "tokens_seen": 545262592 }, { "epoch": 1.06, "learning_rate": 0.0004215947843530592, "loss": 3.0225, "theoretical_loss": 3.878706023467659, "tokens_seen": 545328128 }, { "epoch": 1.06, "learning_rate": 0.00042158475426278837, "loss": 2.9689, "theoretical_loss": 3.8786565688153845, "tokens_seen": 545393664 }, { "epoch": 1.06, "learning_rate": 0.00042157472417251755, "loss": 3.0129, "theoretical_loss": 3.8786071217690608, "tokens_seen": 545459200 }, { "epoch": 1.06, "learning_rate": 0.00042156469408224673, "loss": 3.1857, "theoretical_loss": 3.8785576823266057, "tokens_seen": 545524736 }, { "epoch": 1.06, "objective/train/docs_used": 892417, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.865858316421509, "objective/train/theoretical_loss": 3.8785206077335506, "objective/train/tokens_used": 566033888, "theoretical_loss": 3.8785206077335506, "tokens_seen": 545573888 }, { "epoch": 1.06, "learning_rate": 0.00042155466399197597, "loss": 2.9322, "theoretical_loss": 3.878508250485937, "tokens_seen": 545590272 }, { "epoch": 1.06, "learning_rate": 0.0004215446339017051, "loss": 2.9925, "theoretical_loss": 3.878458826244972, "tokens_seen": 545655808 }, { "epoch": 1.06, "learning_rate": 0.00042153460381143433, "loss": 3.041, "theoretical_loss": 3.87840940960163, "tokens_seen": 545721344 }, { "epoch": 1.06, "learning_rate": 0.0004215245737211635, "loss": 3.053, "theoretical_loss": 3.8783600005538306, "tokens_seen": 545786880 }, { "epoch": 1.06, "learning_rate": 0.0004215145436308927, "loss": 3.0529, "theoretical_loss": 3.8783105990994957, "tokens_seen": 545852416 }, { "epoch": 1.06, "learning_rate": 0.0004215045135406219, "loss": 3.0225, "theoretical_loss": 3.8782612052365457, "tokens_seen": 545917952 }, { "epoch": 1.06, "learning_rate": 0.00042149448345035105, "loss": 2.8827, "theoretical_loss": 3.878211818962903, "tokens_seen": 545983488 }, { "epoch": 1.06, "learning_rate": 0.00042148445336008024, "loss": 3.1247, "theoretical_loss": 3.878162440276491, "tokens_seen": 546049024 }, { "epoch": 1.06, "learning_rate": 0.00042147442326980947, "loss": 3.0786, "theoretical_loss": 3.878113069175234, "tokens_seen": 546114560 }, { "epoch": 1.06, "learning_rate": 0.0004214643931795386, "loss": 3.1009, "theoretical_loss": 3.8780637056570564, "tokens_seen": 546180096 }, { "epoch": 1.06, "learning_rate": 0.00042145436308926783, "loss": 3.0389, "theoretical_loss": 3.8780143497198836, "tokens_seen": 546245632 }, { "epoch": 1.06, "learning_rate": 0.00042144433299899696, "loss": 3.1286, "theoretical_loss": 3.8779650013616425, "tokens_seen": 546311168 }, { "epoch": 1.06, "learning_rate": 0.0004214343029087262, "loss": 3.0992, "theoretical_loss": 3.8779156605802596, "tokens_seen": 546376704 }, { "epoch": 1.06, "learning_rate": 0.0004214242728184554, "loss": 3.0883, "theoretical_loss": 3.877866327373664, "tokens_seen": 546442240 }, { "epoch": 1.06, "learning_rate": 0.00042141424272818456, "loss": 2.9595, "theoretical_loss": 3.877817001739784, "tokens_seen": 546507776 }, { "epoch": 1.06, "learning_rate": 0.00042140421263791374, "loss": 3.2732, "theoretical_loss": 3.8777676836765496, "tokens_seen": 546573312 }, { "epoch": 1.06, "learning_rate": 0.0004213941825476429, "loss": 3.0664, "theoretical_loss": 3.8777183731818905, "tokens_seen": 546638848 }, { "epoch": 1.06, "learning_rate": 0.0004213841524573721, "loss": 3.0759, "theoretical_loss": 3.877669070253739, "tokens_seen": 546704384 }, { "epoch": 1.06, "learning_rate": 0.00042137412236710134, "loss": 3.0783, "theoretical_loss": 3.877619774890026, "tokens_seen": 546769920 }, { "epoch": 1.06, "learning_rate": 0.00042136409227683046, "loss": 3.0779, "theoretical_loss": 3.877570487088686, "tokens_seen": 546835456 }, { "epoch": 1.06, "learning_rate": 0.0004213540621865597, "loss": 3.1379, "theoretical_loss": 3.8775212068476517, "tokens_seen": 546900992 }, { "epoch": 1.06, "learning_rate": 0.0004213440320962889, "loss": 3.0822, "theoretical_loss": 3.8774719341648574, "tokens_seen": 546966528 }, { "epoch": 1.06, "learning_rate": 0.00042133400200601806, "loss": 3.0078, "theoretical_loss": 3.87742266903824, "tokens_seen": 547032064 }, { "epoch": 1.06, "learning_rate": 0.00042132397191574724, "loss": 3.1122, "theoretical_loss": 3.877373411465734, "tokens_seen": 547097600 }, { "epoch": 1.06, "learning_rate": 0.0004213139418254764, "loss": 3.1183, "theoretical_loss": 3.8773241614452774, "tokens_seen": 547163136 }, { "epoch": 1.06, "objective/train/docs_used": 895323, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.988391399383545, "objective/train/theoretical_loss": 3.877287228884726, "objective/train/tokens_used": 567672288, "theoretical_loss": 3.877287228884726, "tokens_seen": 547212288 }, { "epoch": 1.06, "learning_rate": 0.0004213039117352056, "loss": 3.024, "theoretical_loss": 3.8772749189748072, "tokens_seen": 547228672 }, { "epoch": 1.06, "learning_rate": 0.00042129388164493484, "loss": 3.0932, "theoretical_loss": 3.877225684052263, "tokens_seen": 547294208 }, { "epoch": 1.06, "learning_rate": 0.00042128385155466397, "loss": 2.9863, "theoretical_loss": 3.877176456675583, "tokens_seen": 547359744 }, { "epoch": 1.06, "learning_rate": 0.0004212738214643932, "loss": 2.9386, "theoretical_loss": 3.8771272368427088, "tokens_seen": 547425280 }, { "epoch": 1.06, "learning_rate": 0.00042126379137412233, "loss": 3.1828, "theoretical_loss": 3.877078024551581, "tokens_seen": 547490816 }, { "epoch": 1.06, "learning_rate": 0.00042125376128385156, "loss": 3.0967, "theoretical_loss": 3.877028819800141, "tokens_seen": 547556352 }, { "epoch": 1.06, "learning_rate": 0.00042124373119358075, "loss": 3.1747, "theoretical_loss": 3.8769796225863313, "tokens_seen": 547621888 }, { "epoch": 1.06, "learning_rate": 0.0004212337011033099, "loss": 3.0302, "theoretical_loss": 3.876930432908096, "tokens_seen": 547687424 }, { "epoch": 1.06, "learning_rate": 0.0004212236710130391, "loss": 3.0778, "theoretical_loss": 3.876881250763379, "tokens_seen": 547752960 }, { "epoch": 1.06, "learning_rate": 0.0004212136409227683, "loss": 2.9484, "theoretical_loss": 3.876832076150126, "tokens_seen": 547818496 }, { "epoch": 1.06, "learning_rate": 0.00042120361083249747, "loss": 3.0772, "theoretical_loss": 3.876782909066282, "tokens_seen": 547884032 }, { "epoch": 1.06, "learning_rate": 0.0004211935807422267, "loss": 3.0683, "theoretical_loss": 3.8767337495097944, "tokens_seen": 547949568 }, { "epoch": 1.06, "learning_rate": 0.00042118355065195583, "loss": 2.8821, "theoretical_loss": 3.87668459747861, "tokens_seen": 548015104 }, { "epoch": 1.06, "learning_rate": 0.00042117352056168507, "loss": 3.1384, "theoretical_loss": 3.876635452970678, "tokens_seen": 548080640 }, { "epoch": 1.06, "learning_rate": 0.0004211634904714143, "loss": 3.1654, "theoretical_loss": 3.876586315983947, "tokens_seen": 548146176 }, { "epoch": 1.06, "learning_rate": 0.00042115346038114343, "loss": 3.0964, "theoretical_loss": 3.8765371865163667, "tokens_seen": 548211712 }, { "epoch": 1.06, "learning_rate": 0.00042114343029087266, "loss": 3.2314, "theoretical_loss": 3.876488064565888, "tokens_seen": 548277248 }, { "epoch": 1.06, "learning_rate": 0.0004211334002006018, "loss": 2.9482, "theoretical_loss": 3.8764389501304626, "tokens_seen": 548342784 }, { "epoch": 1.06, "learning_rate": 0.00042112337011033103, "loss": 3.0774, "theoretical_loss": 3.8763898432080426, "tokens_seen": 548408320 }, { "epoch": 1.06, "learning_rate": 0.0004211133400200602, "loss": 3.0081, "theoretical_loss": 3.8763407437965816, "tokens_seen": 548473856 }, { "epoch": 1.06, "learning_rate": 0.0004211033099297894, "loss": 3.0195, "theoretical_loss": 3.8762916518940327, "tokens_seen": 548539392 }, { "epoch": 1.06, "learning_rate": 0.00042109327983951857, "loss": 2.9536, "theoretical_loss": 3.876242567498351, "tokens_seen": 548604928 }, { "epoch": 1.06, "learning_rate": 0.00042108324974924775, "loss": 3.146, "theoretical_loss": 3.876193490607492, "tokens_seen": 548670464 }, { "epoch": 1.06, "learning_rate": 0.00042107321965897693, "loss": 3.0228, "theoretical_loss": 3.876144421219413, "tokens_seen": 548736000 }, { "epoch": 1.06, "learning_rate": 0.00042106318956870617, "loss": 3.0176, "theoretical_loss": 3.8760953593320693, "tokens_seen": 548801536 }, { "epoch": 1.06, "objective/train/docs_used": 897788, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1531662940979004, "objective/train/theoretical_loss": 3.876058567837692, "objective/train/tokens_used": 569310688, "theoretical_loss": 3.876058567837692, "tokens_seen": 548850688 }, { "epoch": 1.06, "learning_rate": 0.0004210531594784353, "loss": 3.0986, "theoretical_loss": 3.8760463049434204, "tokens_seen": 548867072 }, { "epoch": 1.06, "learning_rate": 0.00042104312938816453, "loss": 3.0265, "theoretical_loss": 3.8759972580514246, "tokens_seen": 548932608 }, { "epoch": 1.06, "learning_rate": 0.0004210330992978937, "loss": 3.0919, "theoretical_loss": 3.875948218654041, "tokens_seen": 548998144 }, { "epoch": 1.06, "learning_rate": 0.0004210230692076229, "loss": 3.254, "theoretical_loss": 3.87589918674923, "tokens_seen": 549063680 }, { "epoch": 1.06, "learning_rate": 0.0004210130391173521, "loss": 3.1945, "theoretical_loss": 3.8758501623349533, "tokens_seen": 549129216 }, { "epoch": 1.06, "learning_rate": 0.00042100300902708125, "loss": 3.0232, "theoretical_loss": 3.875801145409172, "tokens_seen": 549194752 }, { "epoch": 1.06, "learning_rate": 0.00042099297893681044, "loss": 3.1923, "theoretical_loss": 3.875752135969849, "tokens_seen": 549260288 }, { "epoch": 1.06, "learning_rate": 0.00042098294884653967, "loss": 2.96, "theoretical_loss": 3.875703134014949, "tokens_seen": 549325824 }, { "epoch": 1.06, "learning_rate": 0.0004209729187562688, "loss": 2.8784, "theoretical_loss": 3.875654139542435, "tokens_seen": 549391360 }, { "epoch": 1.06, "learning_rate": 0.00042096288866599803, "loss": 3.0646, "theoretical_loss": 3.875605152550272, "tokens_seen": 549456896 }, { "epoch": 1.06, "learning_rate": 0.00042095285857572716, "loss": 3.0271, "theoretical_loss": 3.8755561730364274, "tokens_seen": 549522432 }, { "epoch": 1.06, "learning_rate": 0.0004209428284854564, "loss": 3.0764, "theoretical_loss": 3.8755072009988663, "tokens_seen": 549587968 }, { "epoch": 1.06, "learning_rate": 0.0004209327983951856, "loss": 3.1887, "theoretical_loss": 3.875458236435557, "tokens_seen": 549653504 }, { "epoch": 1.06, "learning_rate": 0.00042092276830491476, "loss": 3.0783, "theoretical_loss": 3.875409279344468, "tokens_seen": 549719040 }, { "epoch": 1.06, "learning_rate": 0.00042091273821464394, "loss": 3.0381, "theoretical_loss": 3.8753603297235673, "tokens_seen": 549784576 }, { "epoch": 1.06, "learning_rate": 0.0004209027081243731, "loss": 3.2375, "theoretical_loss": 3.875311387570826, "tokens_seen": 549850112 }, { "epoch": 1.06, "learning_rate": 0.0004208926780341023, "loss": 3.0343, "theoretical_loss": 3.875262452884215, "tokens_seen": 549915648 }, { "epoch": 1.06, "learning_rate": 0.00042088264794383154, "loss": 3.1059, "theoretical_loss": 3.875213525661704, "tokens_seen": 549981184 }, { "epoch": 1.06, "learning_rate": 0.00042087261785356066, "loss": 2.994, "theoretical_loss": 3.8751646059012663, "tokens_seen": 550046720 }, { "epoch": 1.06, "learning_rate": 0.0004208625877632899, "loss": 3.3013, "theoretical_loss": 3.8751156936008755, "tokens_seen": 550112256 }, { "epoch": 1.06, "learning_rate": 0.0004208525576730191, "loss": 3.0667, "theoretical_loss": 3.875066788758505, "tokens_seen": 550177792 }, { "epoch": 1.06, "learning_rate": 0.00042084252758274826, "loss": 3.0831, "theoretical_loss": 3.8750178913721287, "tokens_seen": 550243328 }, { "epoch": 1.06, "learning_rate": 0.00042083249749247744, "loss": 3.0751, "theoretical_loss": 3.874969001439723, "tokens_seen": 550308864 }, { "epoch": 1.06, "learning_rate": 0.0004208224674022066, "loss": 3.1908, "theoretical_loss": 3.8749201189592646, "tokens_seen": 550374400 }, { "epoch": 1.06, "learning_rate": 0.0004208124373119358, "loss": 2.9459, "theoretical_loss": 3.8748712439287294, "tokens_seen": 550439936 }, { "epoch": 1.06, "objective/train/docs_used": 900596, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.950566530227661, "objective/train/theoretical_loss": 3.8748345925436234, "objective/train/tokens_used": 570949088, "theoretical_loss": 3.8748345925436234, "tokens_seen": 550489088 }, { "epoch": 1.06, "learning_rate": 0.00042080240722166504, "loss": 3.1197, "theoretical_loss": 3.8748223763460947, "tokens_seen": 550505472 }, { "epoch": 1.06, "learning_rate": 0.00042079237713139417, "loss": 3.1552, "theoretical_loss": 3.8747735162093404, "tokens_seen": 550571008 }, { "epoch": 1.06, "learning_rate": 0.0004207823470411234, "loss": 3.0461, "theoretical_loss": 3.874724663516446, "tokens_seen": 550636544 }, { "epoch": 1.06, "learning_rate": 0.00042077231695085253, "loss": 3.059, "theoretical_loss": 3.8746758182653904, "tokens_seen": 550702080 }, { "epoch": 1.06, "learning_rate": 0.00042076228686058176, "loss": 3.0056, "theoretical_loss": 3.8746269804541558, "tokens_seen": 550767616 }, { "epoch": 1.06, "learning_rate": 0.00042075225677031095, "loss": 3.0306, "theoretical_loss": 3.8745781500807235, "tokens_seen": 550833152 }, { "epoch": 1.06, "learning_rate": 0.0004207422266800401, "loss": 2.9473, "theoretical_loss": 3.874529327143075, "tokens_seen": 550898688 }, { "epoch": 1.06, "learning_rate": 0.0004207321965897693, "loss": 3.0964, "theoretical_loss": 3.874480511639195, "tokens_seen": 550964224 }, { "epoch": 1.06, "learning_rate": 0.0004207221664994985, "loss": 3.1651, "theoretical_loss": 3.8744317035670672, "tokens_seen": 551029760 }, { "epoch": 1.06, "learning_rate": 0.00042071213640922767, "loss": 3.217, "theoretical_loss": 3.874382902924677, "tokens_seen": 551095296 }, { "epoch": 1.06, "learning_rate": 0.0004207021063189569, "loss": 2.9918, "theoretical_loss": 3.874334109710009, "tokens_seen": 551160832 }, { "epoch": 1.06, "learning_rate": 0.00042069207622868603, "loss": 3.1336, "theoretical_loss": 3.87428532392105, "tokens_seen": 551226368 }, { "epoch": 1.06, "learning_rate": 0.00042068204613841527, "loss": 2.9412, "theoretical_loss": 3.874236545555788, "tokens_seen": 551291904 }, { "epoch": 1.06, "learning_rate": 0.00042067201604814445, "loss": 3.0432, "theoretical_loss": 3.87418777461221, "tokens_seen": 551357440 }, { "epoch": 1.06, "learning_rate": 0.00042066198595787363, "loss": 2.9105, "theoretical_loss": 3.8741390110883054, "tokens_seen": 551422976 }, { "epoch": 1.06, "learning_rate": 0.0004206519558676028, "loss": 2.9937, "theoretical_loss": 3.8740902549820637, "tokens_seen": 551488512 }, { "epoch": 1.06, "learning_rate": 0.000420641925777332, "loss": 2.9712, "theoretical_loss": 3.8740415062914755, "tokens_seen": 551554048 }, { "epoch": 1.06, "learning_rate": 0.0004206318956870612, "loss": 3.1424, "theoretical_loss": 3.8739927650145316, "tokens_seen": 551619584 }, { "epoch": 1.06, "learning_rate": 0.0004206218655967904, "loss": 3.0338, "theoretical_loss": 3.8739440311492244, "tokens_seen": 551685120 }, { "epoch": 1.06, "learning_rate": 0.00042061183550651954, "loss": 3.0467, "theoretical_loss": 3.873895304693546, "tokens_seen": 551750656 }, { "epoch": 1.06, "learning_rate": 0.00042060180541624877, "loss": 3.0022, "theoretical_loss": 3.8738465856454907, "tokens_seen": 551816192 }, { "epoch": 1.06, "learning_rate": 0.0004205917753259779, "loss": 3.1432, "theoretical_loss": 3.873797874003052, "tokens_seen": 551881728 }, { "epoch": 1.06, "learning_rate": 0.00042058174523570713, "loss": 3.0256, "theoretical_loss": 3.873749169764225, "tokens_seen": 551947264 }, { "epoch": 1.06, "learning_rate": 0.0004205717151454363, "loss": 3.1105, "theoretical_loss": 3.8737004729270064, "tokens_seen": 552012800 }, { "epoch": 1.06, "learning_rate": 0.0004205616850551655, "loss": 3.2012, "theoretical_loss": 3.873651783489392, "tokens_seen": 552078336 }, { "epoch": 1.06, "objective/train/docs_used": 903502, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.110152006149292, "objective/train/theoretical_loss": 3.8736152712659675, "objective/train/tokens_used": 572587488, "theoretical_loss": 3.8736152712659675, "tokens_seen": 552127488 }, { "epoch": 1.06, "learning_rate": 0.0004205516549648947, "loss": 3.1199, "theoretical_loss": 3.87360310144938, "tokens_seen": 552143872 }, { "epoch": 1.06, "learning_rate": 0.0004205416248746239, "loss": 3.0729, "theoretical_loss": 3.873554426804968, "tokens_seen": 552209408 }, { "epoch": 1.06, "learning_rate": 0.00042053159478435304, "loss": 2.9663, "theoretical_loss": 3.8735057595541553, "tokens_seen": 552274944 }, { "epoch": 1.06, "learning_rate": 0.0004205215646940823, "loss": 3.2278, "theoretical_loss": 3.8734570996949413, "tokens_seen": 552340480 }, { "epoch": 1.06, "learning_rate": 0.0004205115346038114, "loss": 3.0647, "theoretical_loss": 3.8734084472253274, "tokens_seen": 552406016 }, { "epoch": 1.06, "learning_rate": 0.00042050150451354064, "loss": 3.1989, "theoretical_loss": 3.8733598021433133, "tokens_seen": 552471552 }, { "epoch": 1.06, "learning_rate": 0.0004204914744232698, "loss": 2.9714, "theoretical_loss": 3.8733111644469025, "tokens_seen": 552537088 }, { "epoch": 1.06, "learning_rate": 0.000420481444332999, "loss": 3.1111, "theoretical_loss": 3.8732625341340974, "tokens_seen": 552602624 }, { "epoch": 1.06, "learning_rate": 0.0004204714142427282, "loss": 3.0929, "theoretical_loss": 3.8732139112029014, "tokens_seen": 552668160 }, { "epoch": 1.06, "learning_rate": 0.00042046138415245736, "loss": 2.9943, "theoretical_loss": 3.8731652956513196, "tokens_seen": 552733696 }, { "epoch": 1.06, "learning_rate": 0.00042045135406218654, "loss": 3.019, "theoretical_loss": 3.873116687477357, "tokens_seen": 552799232 }, { "epoch": 1.06, "learning_rate": 0.0004204413239719158, "loss": 3.0235, "theoretical_loss": 3.873068086679019, "tokens_seen": 552864768 }, { "epoch": 1.06, "learning_rate": 0.0004204312938816449, "loss": 3.1328, "theoretical_loss": 3.8730194932543127, "tokens_seen": 552930304 }, { "epoch": 1.06, "learning_rate": 0.00042042126379137414, "loss": 3.0201, "theoretical_loss": 3.8729709072012457, "tokens_seen": 552995840 }, { "epoch": 1.06, "learning_rate": 0.0004204112337011033, "loss": 2.9769, "theoretical_loss": 3.8729223285178263, "tokens_seen": 553061376 }, { "epoch": 1.06, "learning_rate": 0.0004204012036108325, "loss": 2.8936, "theoretical_loss": 3.872873757202064, "tokens_seen": 553126912 }, { "epoch": 1.06, "learning_rate": 0.00042039117352056174, "loss": 3.1603, "theoretical_loss": 3.872825193251967, "tokens_seen": 553192448 }, { "epoch": 1.06, "learning_rate": 0.00042038114343029086, "loss": 3.1422, "theoretical_loss": 3.872776636665548, "tokens_seen": 553257984 }, { "epoch": 1.06, "learning_rate": 0.0004203711133400201, "loss": 3.0202, "theoretical_loss": 3.872728087440817, "tokens_seen": 553323520 }, { "epoch": 1.06, "learning_rate": 0.0004203610832497493, "loss": 3.0695, "theoretical_loss": 3.8726795455757874, "tokens_seen": 553389056 }, { "epoch": 1.06, "learning_rate": 0.00042035105315947846, "loss": 3.1224, "theoretical_loss": 3.8726310110684707, "tokens_seen": 553454592 }, { "epoch": 1.06, "learning_rate": 0.00042034102306920764, "loss": 3.1974, "theoretical_loss": 3.8725824839168816, "tokens_seen": 553520128 }, { "epoch": 1.06, "learning_rate": 0.0004203309929789368, "loss": 2.9931, "theoretical_loss": 3.872533964119034, "tokens_seen": 553585664 }, { "epoch": 1.06, "learning_rate": 0.000420320962888666, "loss": 3.1968, "theoretical_loss": 3.8724854516729437, "tokens_seen": 553651200 }, { "epoch": 1.06, "learning_rate": 0.00042031093279839524, "loss": 3.1071, "theoretical_loss": 3.8724369465766264, "tokens_seen": 553716736 }, { "epoch": 1.06, "objective/train/docs_used": 904895, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.053105115890503, "objective/train/theoretical_loss": 3.872400572576484, "objective/train/tokens_used": 574225888, "theoretical_loss": 3.872400572576484, "tokens_seen": 553765888 }, { "epoch": 1.06, "learning_rate": 0.00042030090270812437, "loss": 3.2419, "theoretical_loss": 3.872388448828099, "tokens_seen": 553782272 }, { "epoch": 1.06, "learning_rate": 0.0004202908726178536, "loss": 3.0733, "theoretical_loss": 3.872339958425379, "tokens_seen": 553847808 }, { "epoch": 1.06, "learning_rate": 0.00042028084252758273, "loss": 2.9502, "theoretical_loss": 3.8722914753664845, "tokens_seen": 553913344 }, { "epoch": 1.06, "learning_rate": 0.00042027081243731196, "loss": 2.9895, "theoretical_loss": 3.8722429996494347, "tokens_seen": 553978880 }, { "epoch": 1.06, "learning_rate": 0.00042026078234704115, "loss": 3.0716, "theoretical_loss": 3.87219453127225, "tokens_seen": 554044416 }, { "epoch": 1.06, "learning_rate": 0.0004202507522567703, "loss": 3.2542, "theoretical_loss": 3.8721460702329504, "tokens_seen": 554109952 }, { "epoch": 1.06, "learning_rate": 0.0004202407221664995, "loss": 3.1138, "theoretical_loss": 3.8720976165295578, "tokens_seen": 554175488 }, { "epoch": 1.06, "learning_rate": 0.0004202306920762287, "loss": 3.1949, "theoretical_loss": 3.872049170160094, "tokens_seen": 554241024 }, { "epoch": 1.06, "learning_rate": 0.00042022066198595787, "loss": 3.031, "theoretical_loss": 3.872000731122582, "tokens_seen": 554306560 }, { "epoch": 1.06, "learning_rate": 0.0004202106318956871, "loss": 3.1544, "theoretical_loss": 3.871952299415046, "tokens_seen": 554372096 }, { "epoch": 1.06, "learning_rate": 0.00042020060180541623, "loss": 3.0716, "theoretical_loss": 3.8719038750355095, "tokens_seen": 554437632 }, { "epoch": 1.06, "learning_rate": 0.00042019057171514547, "loss": 3.1271, "theoretical_loss": 3.8718554579819986, "tokens_seen": 554503168 }, { "epoch": 1.06, "learning_rate": 0.00042018054162487465, "loss": 3.114, "theoretical_loss": 3.8718070482525384, "tokens_seen": 554568704 }, { "epoch": 1.06, "learning_rate": 0.00042017051153460383, "loss": 3.2064, "theoretical_loss": 3.8717586458451567, "tokens_seen": 554634240 }, { "epoch": 1.06, "learning_rate": 0.000420160481444333, "loss": 2.9852, "theoretical_loss": 3.8717102507578804, "tokens_seen": 554699776 }, { "epoch": 1.06, "learning_rate": 0.0004201504513540622, "loss": 3.1031, "theoretical_loss": 3.8716618629887387, "tokens_seen": 554765312 }, { "epoch": 1.06, "learning_rate": 0.0004201404212637914, "loss": 3.1639, "theoretical_loss": 3.8716134825357593, "tokens_seen": 554830848 }, { "epoch": 1.06, "learning_rate": 0.0004201303911735206, "loss": 3.0598, "theoretical_loss": 3.8715651093969727, "tokens_seen": 554896384 }, { "epoch": 1.06, "learning_rate": 0.00042012036108324974, "loss": 3.191, "theoretical_loss": 3.8715167435704094, "tokens_seen": 554961920 }, { "epoch": 1.06, "learning_rate": 0.00042011033099297897, "loss": 3.0595, "theoretical_loss": 3.871468385054101, "tokens_seen": 555027456 }, { "epoch": 1.06, "learning_rate": 0.0004201003009027081, "loss": 3.1939, "theoretical_loss": 3.8714200338460794, "tokens_seen": 555092992 }, { "epoch": 1.06, "learning_rate": 0.00042009027081243733, "loss": 3.0229, "theoretical_loss": 3.8713716899443775, "tokens_seen": 555158528 }, { "epoch": 1.06, "learning_rate": 0.0004200802407221665, "loss": 3.0592, "theoretical_loss": 3.8713233533470293, "tokens_seen": 555224064 }, { "epoch": 1.06, "learning_rate": 0.0004200702106318957, "loss": 3.0549, "theoretical_loss": 3.871275024052068, "tokens_seen": 555289600 }, { "epoch": 1.06, "learning_rate": 0.0004200601805416249, "loss": 3.0225, "theoretical_loss": 3.87122670205753, "tokens_seen": 555355136 }, { "epoch": 1.06, "objective/train/docs_used": 907748, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1407899856567383, "objective/train/theoretical_loss": 3.8711904653513476, "objective/train/tokens_used": 575864288, "theoretical_loss": 3.8711904653513476, "tokens_seen": 555404288 }, { "epoch": 1.06, "learning_rate": 0.0004200501504513541, "loss": 3.1204, "theoretical_loss": 3.871178387361451, "tokens_seen": 555420672 }, { "epoch": 1.06, "learning_rate": 0.00042004012036108324, "loss": 3.0325, "theoretical_loss": 3.8711300799618673, "tokens_seen": 555486208 }, { "epoch": 1.06, "learning_rate": 0.0004200300902708125, "loss": 3.1417, "theoretical_loss": 3.8710817798568167, "tokens_seen": 555551744 }, { "epoch": 1.06, "learning_rate": 0.0004200200601805416, "loss": 3.1301, "theoretical_loss": 3.871033487044337, "tokens_seen": 555617280 }, { "epoch": 1.06, "learning_rate": 0.00042001003009027084, "loss": 3.1118, "theoretical_loss": 3.8709852015224673, "tokens_seen": 555682816 }, { "epoch": 1.06, "learning_rate": 0.00042, "loss": 3.119, "theoretical_loss": 3.8709369232892477, "tokens_seen": 555748352 }, { "epoch": 1.06, "learning_rate": 0.0004199899699097292, "loss": 3.0567, "theoretical_loss": 3.8708886523427184, "tokens_seen": 555813888 }, { "epoch": 1.06, "learning_rate": 0.0004199799398194584, "loss": 3.0054, "theoretical_loss": 3.8708403886809206, "tokens_seen": 555879424 }, { "epoch": 1.06, "learning_rate": 0.00041996990972918756, "loss": 3.0751, "theoretical_loss": 3.870792132301896, "tokens_seen": 555944960 }, { "epoch": 1.06, "learning_rate": 0.00041995987963891674, "loss": 2.8609, "theoretical_loss": 3.870743883203688, "tokens_seen": 556010496 }, { "epoch": 1.06, "learning_rate": 0.000419949849548646, "loss": 3.1698, "theoretical_loss": 3.87069564138434, "tokens_seen": 556076032 }, { "epoch": 1.06, "learning_rate": 0.0004199398194583751, "loss": 3.0243, "theoretical_loss": 3.8706474068418952, "tokens_seen": 556141568 }, { "epoch": 1.06, "learning_rate": 0.00041992978936810434, "loss": 3.0808, "theoretical_loss": 3.8705991795743997, "tokens_seen": 556207104 }, { "epoch": 1.07, "learning_rate": 0.00041991975927783347, "loss": 2.9994, "theoretical_loss": 3.8705509595798993, "tokens_seen": 556272640 }, { "epoch": 1.07, "learning_rate": 0.0004199097291875627, "loss": 3.095, "theoretical_loss": 3.87050274685644, "tokens_seen": 556338176 }, { "epoch": 1.07, "learning_rate": 0.0004198996990972919, "loss": 3.0935, "theoretical_loss": 3.8704545414020695, "tokens_seen": 556403712 }, { "epoch": 1.07, "learning_rate": 0.00041988966900702106, "loss": 2.908, "theoretical_loss": 3.8704063432148357, "tokens_seen": 556469248 }, { "epoch": 1.07, "learning_rate": 0.00041987963891675025, "loss": 3.0357, "theoretical_loss": 3.870358152292787, "tokens_seen": 556534784 }, { "epoch": 1.07, "learning_rate": 0.0004198696088264795, "loss": 3.0879, "theoretical_loss": 3.8703099686339737, "tokens_seen": 556600320 }, { "epoch": 1.07, "learning_rate": 0.0004198595787362086, "loss": 3.2235, "theoretical_loss": 3.870261792236445, "tokens_seen": 556665856 }, { "epoch": 1.07, "learning_rate": 0.00041984954864593784, "loss": 3.2071, "theoretical_loss": 3.870213623098253, "tokens_seen": 556731392 }, { "epoch": 1.07, "learning_rate": 0.00041983951855566697, "loss": 3.1247, "theoretical_loss": 3.870165461217449, "tokens_seen": 556796928 }, { "epoch": 1.07, "learning_rate": 0.0004198294884653962, "loss": 2.9581, "theoretical_loss": 3.8701173065920864, "tokens_seen": 556862464 }, { "epoch": 1.07, "learning_rate": 0.0004198194583751254, "loss": 2.9152, "theoretical_loss": 3.8700691592202165, "tokens_seen": 556928000 }, { "epoch": 1.07, "learning_rate": 0.00041980942828485457, "loss": 3.1415, "theoretical_loss": 3.8700210190998954, "tokens_seen": 556993536 }, { "epoch": 1.07, "objective/train/docs_used": 910258, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.218034267425537, "objective/train/theoretical_loss": 3.8699849187673134, "objective/train/tokens_used": 577502688, "theoretical_loss": 3.8699849187673134, "tokens_seen": 557042688 }, { "epoch": 1.07, "learning_rate": 0.00041979939819458375, "loss": 3.1162, "theoretical_loss": 3.8699728862291773, "tokens_seen": 557059072 }, { "epoch": 1.07, "learning_rate": 0.00041978936810431293, "loss": 2.9925, "theoretical_loss": 3.8699247606061173, "tokens_seen": 557124608 }, { "epoch": 1.07, "learning_rate": 0.0004197793380140421, "loss": 3.0782, "theoretical_loss": 3.869876642228772, "tokens_seen": 557190144 }, { "epoch": 1.07, "learning_rate": 0.00041976930792377135, "loss": 3.1405, "theoretical_loss": 3.869828531095199, "tokens_seen": 557255680 }, { "epoch": 1.07, "learning_rate": 0.00041975927783350047, "loss": 3.2227, "theoretical_loss": 3.8697804272034553, "tokens_seen": 557321216 }, { "epoch": 1.07, "learning_rate": 0.0004197492477432297, "loss": 2.9989, "theoretical_loss": 3.8697323305515994, "tokens_seen": 557386752 }, { "epoch": 1.07, "learning_rate": 0.00041973921765295883, "loss": 2.9631, "theoretical_loss": 3.869684241137691, "tokens_seen": 557452288 }, { "epoch": 1.07, "learning_rate": 0.00041972918756268807, "loss": 2.9963, "theoretical_loss": 3.86963615895979, "tokens_seen": 557517824 }, { "epoch": 1.07, "learning_rate": 0.00041971915747241725, "loss": 3.1076, "theoretical_loss": 3.8695880840159576, "tokens_seen": 557583360 }, { "epoch": 1.07, "learning_rate": 0.00041970912738214643, "loss": 3.1831, "theoretical_loss": 3.8695400163042546, "tokens_seen": 557648896 }, { "epoch": 1.07, "learning_rate": 0.0004196990972918756, "loss": 3.1993, "theoretical_loss": 3.8694919558227445, "tokens_seen": 557714432 }, { "epoch": 1.07, "learning_rate": 0.00041968906720160485, "loss": 2.9954, "theoretical_loss": 3.869443902569489, "tokens_seen": 557779968 }, { "epoch": 1.07, "learning_rate": 0.000419679037111334, "loss": 2.8246, "theoretical_loss": 3.8693958565425524, "tokens_seen": 557845504 }, { "epoch": 1.07, "learning_rate": 0.0004196690070210632, "loss": 3.0258, "theoretical_loss": 3.8693478177399996, "tokens_seen": 557911040 }, { "epoch": 1.07, "learning_rate": 0.0004196589769307924, "loss": 3.0278, "theoretical_loss": 3.8692997861598952, "tokens_seen": 557976576 }, { "epoch": 1.07, "learning_rate": 0.0004196489468405216, "loss": 3.0497, "theoretical_loss": 3.8692517618003057, "tokens_seen": 558042112 }, { "epoch": 1.07, "learning_rate": 0.0004196389167502508, "loss": 2.8657, "theoretical_loss": 3.8692037446592984, "tokens_seen": 558107648 }, { "epoch": 1.07, "learning_rate": 0.00041962888665997994, "loss": 3.189, "theoretical_loss": 3.8691557347349397, "tokens_seen": 558173184 }, { "epoch": 1.07, "learning_rate": 0.00041961885656970917, "loss": 3.0753, "theoretical_loss": 3.8691077320252982, "tokens_seen": 558238720 }, { "epoch": 1.07, "learning_rate": 0.0004196088264794383, "loss": 3.0481, "theoretical_loss": 3.8690597365284436, "tokens_seen": 558304256 }, { "epoch": 1.07, "learning_rate": 0.00041959879638916753, "loss": 2.9852, "theoretical_loss": 3.8690117482424444, "tokens_seen": 558369792 }, { "epoch": 1.07, "learning_rate": 0.0004195887662988967, "loss": 3.012, "theoretical_loss": 3.868963767165372, "tokens_seen": 558435328 }, { "epoch": 1.07, "learning_rate": 0.0004195787362086259, "loss": 3.1133, "theoretical_loss": 3.868915793295298, "tokens_seen": 558500864 }, { "epoch": 1.07, "learning_rate": 0.0004195687061183551, "loss": 3.0561, "theoretical_loss": 3.868867826630293, "tokens_seen": 558566400 }, { "epoch": 1.07, "learning_rate": 0.0004195586760280843, "loss": 3.1521, "theoretical_loss": 3.868819867168431, "tokens_seen": 558631936 }, { "epoch": 1.07, "objective/train/docs_used": 913164, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.098371982574463, "objective/train/theoretical_loss": 3.8687839022979373, "objective/train/tokens_used": 579141088, "theoretical_loss": 3.8687839022979373, "tokens_seen": 558681088 }, { "epoch": 1.07, "learning_rate": 0.00041954864593781344, "loss": 2.9992, "theoretical_loss": 3.868771914907785, "tokens_seen": 558697472 }, { "epoch": 1.07, "learning_rate": 0.0004195386158475427, "loss": 3.027, "theoretical_loss": 3.868723969846429, "tokens_seen": 558763008 }, { "epoch": 1.07, "learning_rate": 0.0004195285857572718, "loss": 2.9774, "theoretical_loss": 3.868676031982438, "tokens_seen": 558828544 }, { "epoch": 1.07, "learning_rate": 0.00041951855566700104, "loss": 3.0572, "theoretical_loss": 3.868628101313888, "tokens_seen": 558894080 }, { "epoch": 1.07, "learning_rate": 0.0004195085255767302, "loss": 3.0979, "theoretical_loss": 3.8685801778388553, "tokens_seen": 558959616 }, { "epoch": 1.07, "learning_rate": 0.0004194984954864594, "loss": 3.0926, "theoretical_loss": 3.8685322615554165, "tokens_seen": 559025152 }, { "epoch": 1.07, "learning_rate": 0.0004194884653961886, "loss": 3.1734, "theoretical_loss": 3.8684843524616506, "tokens_seen": 559090688 }, { "epoch": 1.07, "learning_rate": 0.00041947843530591776, "loss": 3.0586, "theoretical_loss": 3.868436450555635, "tokens_seen": 559156224 }, { "epoch": 1.07, "learning_rate": 0.00041946840521564694, "loss": 3.0724, "theoretical_loss": 3.86838855583545, "tokens_seen": 559221760 }, { "epoch": 1.07, "learning_rate": 0.0004194583751253762, "loss": 3.1432, "theoretical_loss": 3.8683406682991754, "tokens_seen": 559287296 }, { "epoch": 1.07, "learning_rate": 0.0004194483450351053, "loss": 2.981, "theoretical_loss": 3.868292787944892, "tokens_seen": 559352832 }, { "epoch": 1.07, "learning_rate": 0.00041943831494483454, "loss": 2.8891, "theoretical_loss": 3.8682449147706817, "tokens_seen": 559418368 }, { "epoch": 1.07, "learning_rate": 0.00041942828485456367, "loss": 2.9884, "theoretical_loss": 3.8681970487746264, "tokens_seen": 559483904 }, { "epoch": 1.07, "learning_rate": 0.0004194182547642929, "loss": 3.0442, "theoretical_loss": 3.8681491899548095, "tokens_seen": 559549440 }, { "epoch": 1.07, "learning_rate": 0.0004194082246740221, "loss": 3.0615, "theoretical_loss": 3.868101338309314, "tokens_seen": 559614976 }, { "epoch": 1.07, "learning_rate": 0.00041939819458375126, "loss": 3.1067, "theoretical_loss": 3.868053493836226, "tokens_seen": 559680512 }, { "epoch": 1.07, "learning_rate": 0.00041938816449348045, "loss": 3.0868, "theoretical_loss": 3.8680056565336294, "tokens_seen": 559746048 }, { "epoch": 1.07, "learning_rate": 0.0004193781344032097, "loss": 2.9669, "theoretical_loss": 3.8679578263996106, "tokens_seen": 559811584 }, { "epoch": 1.07, "learning_rate": 0.0004193681043129388, "loss": 2.959, "theoretical_loss": 3.8679100034322564, "tokens_seen": 559877120 }, { "epoch": 1.07, "learning_rate": 0.00041935807422266804, "loss": 2.9818, "theoretical_loss": 3.8678621876296546, "tokens_seen": 559942656 }, { "epoch": 1.07, "learning_rate": 0.00041934804413239717, "loss": 3.0663, "theoretical_loss": 3.867814378989893, "tokens_seen": 560008192 }, { "epoch": 1.07, "learning_rate": 0.0004193380140421264, "loss": 2.9388, "theoretical_loss": 3.8677665775110603, "tokens_seen": 560073728 }, { "epoch": 1.07, "learning_rate": 0.0004193279839518556, "loss": 2.9641, "theoretical_loss": 3.867718783191247, "tokens_seen": 560139264 }, { "epoch": 1.07, "learning_rate": 0.00041931795386158477, "loss": 3.1623, "theoretical_loss": 3.8676709960285427, "tokens_seen": 560204800 }, { "epoch": 1.07, "learning_rate": 0.00041930792377131395, "loss": 3.117, "theoretical_loss": 3.8676232160210384, "tokens_seen": 560270336 }, { "epoch": 1.07, "objective/train/docs_used": 915957, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8462605476379395, "objective/train/theoretical_loss": 3.867587385709863, "objective/train/tokens_used": 580779488, "theoretical_loss": 3.867587385709863, "tokens_seen": 560319488 }, { "epoch": 1.07, "learning_rate": 0.00041929789368104313, "loss": 3.0052, "theoretical_loss": 3.867575443166827, "tokens_seen": 560335872 }, { "epoch": 1.07, "learning_rate": 0.0004192878635907723, "loss": 3.1316, "theoretical_loss": 3.8675276774640004, "tokens_seen": 560401408 }, { "epoch": 1.07, "learning_rate": 0.00041927783350050155, "loss": 3.1309, "theoretical_loss": 3.867479918910653, "tokens_seen": 560466944 }, { "epoch": 1.07, "learning_rate": 0.00041926780341023067, "loss": 3.0956, "theoretical_loss": 3.8674321675048766, "tokens_seen": 560532480 }, { "epoch": 1.07, "learning_rate": 0.0004192577733199599, "loss": 3.0863, "theoretical_loss": 3.867384423244768, "tokens_seen": 560598016 }, { "epoch": 1.07, "learning_rate": 0.00041924774322968904, "loss": 3.1535, "theoretical_loss": 3.8673366861284224, "tokens_seen": 560663552 }, { "epoch": 1.07, "learning_rate": 0.00041923771313941827, "loss": 3.1087, "theoretical_loss": 3.8672889561539354, "tokens_seen": 560729088 }, { "epoch": 1.07, "learning_rate": 0.00041922768304914745, "loss": 3.0, "theoretical_loss": 3.867241233319404, "tokens_seen": 560794624 }, { "epoch": 1.07, "learning_rate": 0.00041921765295887663, "loss": 3.0742, "theoretical_loss": 3.867193517622927, "tokens_seen": 560860160 }, { "epoch": 1.07, "learning_rate": 0.0004192076228686058, "loss": 2.9824, "theoretical_loss": 3.8671458090626016, "tokens_seen": 560925696 }, { "epoch": 1.07, "learning_rate": 0.00041919759277833505, "loss": 2.9691, "theoretical_loss": 3.867098107636528, "tokens_seen": 560991232 }, { "epoch": 1.07, "learning_rate": 0.0004191875626880642, "loss": 2.9958, "theoretical_loss": 3.867050413342805, "tokens_seen": 561056768 }, { "epoch": 1.07, "learning_rate": 0.0004191775325977934, "loss": 3.1716, "theoretical_loss": 3.8670027261795346, "tokens_seen": 561122304 }, { "epoch": 1.07, "learning_rate": 0.00041916750250752254, "loss": 2.9563, "theoretical_loss": 3.8669550461448168, "tokens_seen": 561187840 }, { "epoch": 1.07, "learning_rate": 0.0004191574724172518, "loss": 3.0134, "theoretical_loss": 3.8669073732367547, "tokens_seen": 561253376 }, { "epoch": 1.07, "learning_rate": 0.00041914744232698095, "loss": 2.9264, "theoretical_loss": 3.8668597074534508, "tokens_seen": 561318912 }, { "epoch": 1.07, "learning_rate": 0.00041913741223671014, "loss": 2.9587, "theoretical_loss": 3.866812048793008, "tokens_seen": 561384448 }, { "epoch": 1.07, "learning_rate": 0.0004191273821464393, "loss": 2.977, "theoretical_loss": 3.8667643972535313, "tokens_seen": 561449984 }, { "epoch": 1.07, "learning_rate": 0.0004191173520561685, "loss": 3.0526, "theoretical_loss": 3.866716752833126, "tokens_seen": 561515520 }, { "epoch": 1.07, "learning_rate": 0.0004191073219658977, "loss": 3.0931, "theoretical_loss": 3.866669115529897, "tokens_seen": 561581056 }, { "epoch": 1.07, "learning_rate": 0.0004190972918756269, "loss": 3.0176, "theoretical_loss": 3.866621485341952, "tokens_seen": 561646592 }, { "epoch": 1.07, "learning_rate": 0.00041908726178535604, "loss": 3.0408, "theoretical_loss": 3.866573862267396, "tokens_seen": 561712128 }, { "epoch": 1.07, "learning_rate": 0.0004190772316950853, "loss": 3.1087, "theoretical_loss": 3.8665262463043386, "tokens_seen": 561777664 }, { "epoch": 1.07, "learning_rate": 0.0004190672016048144, "loss": 3.2231, "theoretical_loss": 3.866478637450888, "tokens_seen": 561843200 }, { "epoch": 1.07, "learning_rate": 0.00041905717151454364, "loss": 2.9905, "theoretical_loss": 3.866431035705154, "tokens_seen": 561908736 }, { "epoch": 1.07, "objective/train/docs_used": 918788, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9304358959198, "objective/train/theoretical_loss": 3.8663953390591543, "objective/train/tokens_used": 582417888, "theoretical_loss": 3.8663953390591543, "tokens_seen": 561957888 }, { "epoch": 1.07, "learning_rate": 0.0004190471414242728, "loss": 3.1118, "theoretical_loss": 3.8663834410652456, "tokens_seen": 561974272 }, { "epoch": 1.07, "learning_rate": 0.000419037111334002, "loss": 3.1486, "theoretical_loss": 3.8663358535292742, "tokens_seen": 562039808 }, { "epoch": 1.07, "learning_rate": 0.0004190270812437312, "loss": 3.1075, "theoretical_loss": 3.8662882730953516, "tokens_seen": 562105344 }, { "epoch": 1.07, "learning_rate": 0.0004190170511534604, "loss": 3.066, "theoretical_loss": 3.8662406997615895, "tokens_seen": 562170880 }, { "epoch": 1.07, "learning_rate": 0.00041900702106318954, "loss": 2.955, "theoretical_loss": 3.8661931335261013, "tokens_seen": 562236416 }, { "epoch": 1.07, "learning_rate": 0.0004189969909729188, "loss": 3.0782, "theoretical_loss": 3.866145574387, "tokens_seen": 562301952 }, { "epoch": 1.07, "learning_rate": 0.0004189869608826479, "loss": 2.8966, "theoretical_loss": 3.866098022342401, "tokens_seen": 562367488 }, { "epoch": 1.07, "learning_rate": 0.00041897693079237714, "loss": 3.0748, "theoretical_loss": 3.8660504773904183, "tokens_seen": 562433024 }, { "epoch": 1.07, "learning_rate": 0.0004189669007021063, "loss": 3.2306, "theoretical_loss": 3.866002939529169, "tokens_seen": 562498560 }, { "epoch": 1.07, "learning_rate": 0.0004189568706118355, "loss": 2.9273, "theoretical_loss": 3.8659554087567685, "tokens_seen": 562564096 }, { "epoch": 1.07, "learning_rate": 0.0004189468405215647, "loss": 3.0273, "theoretical_loss": 3.8659078850713353, "tokens_seen": 562629632 }, { "epoch": 1.07, "learning_rate": 0.00041893681043129387, "loss": 3.1146, "theoretical_loss": 3.865860368470986, "tokens_seen": 562695168 }, { "epoch": 1.07, "learning_rate": 0.00041892678034102305, "loss": 2.9949, "theoretical_loss": 3.8658128589538396, "tokens_seen": 562760704 }, { "epoch": 1.07, "learning_rate": 0.0004189167502507523, "loss": 3.1473, "theoretical_loss": 3.8657653565180166, "tokens_seen": 562826240 }, { "epoch": 1.07, "learning_rate": 0.00041890672016048146, "loss": 2.9784, "theoretical_loss": 3.8657178611616363, "tokens_seen": 562891776 }, { "epoch": 1.07, "learning_rate": 0.00041889669007021065, "loss": 2.9775, "theoretical_loss": 3.8656703728828194, "tokens_seen": 562957312 }, { "epoch": 1.07, "learning_rate": 0.0004188866599799399, "loss": 2.9703, "theoretical_loss": 3.8656228916796884, "tokens_seen": 563022848 }, { "epoch": 1.07, "learning_rate": 0.000418876629889669, "loss": 3.0645, "theoretical_loss": 3.8655754175503647, "tokens_seen": 563088384 }, { "epoch": 1.07, "learning_rate": 0.00041886659979939824, "loss": 3.2591, "theoretical_loss": 3.8655279504929716, "tokens_seen": 563153920 }, { "epoch": 1.07, "learning_rate": 0.00041885656970912737, "loss": 3.0357, "theoretical_loss": 3.8654804905056332, "tokens_seen": 563219456 }, { "epoch": 1.07, "learning_rate": 0.0004188465396188566, "loss": 3.0547, "theoretical_loss": 3.865433037586473, "tokens_seen": 563284992 }, { "epoch": 1.07, "learning_rate": 0.0004188365095285858, "loss": 3.023, "theoretical_loss": 3.8653855917336175, "tokens_seen": 563350528 }, { "epoch": 1.07, "learning_rate": 0.00041882647943831497, "loss": 2.9968, "theoretical_loss": 3.865338152945191, "tokens_seen": 563416064 }, { "epoch": 1.07, "learning_rate": 0.00041881644934804415, "loss": 3.1304, "theoretical_loss": 3.865290721219322, "tokens_seen": 563481600 }, { "epoch": 1.07, "learning_rate": 0.00041880641925777333, "loss": 3.209, "theoretical_loss": 3.8652432965541363, "tokens_seen": 563547136 }, { "epoch": 1.07, "objective/train/docs_used": 921621, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8007009029388428, "objective/train/theoretical_loss": 3.865207732687695, "objective/train/tokens_used": 584056288, "theoretical_loss": 3.865207732687695, "tokens_seen": 563596288 }, { "epoch": 1.07, "learning_rate": 0.0004187963891675025, "loss": 2.9846, "theoretical_loss": 3.8651958789477625, "tokens_seen": 563612672 }, { "epoch": 1.07, "learning_rate": 0.00041878635907723175, "loss": 3.0048, "theoretical_loss": 3.8651484683983295, "tokens_seen": 563678208 }, { "epoch": 1.07, "learning_rate": 0.0004187763289869609, "loss": 2.9828, "theoretical_loss": 3.8651010649039663, "tokens_seen": 563743744 }, { "epoch": 1.07, "learning_rate": 0.0004187662988966901, "loss": 3.064, "theoretical_loss": 3.8650536684628034, "tokens_seen": 563809280 }, { "epoch": 1.07, "learning_rate": 0.00041875626880641924, "loss": 3.1642, "theoretical_loss": 3.8650062790729716, "tokens_seen": 563874816 }, { "epoch": 1.07, "learning_rate": 0.00041874623871614847, "loss": 3.1135, "theoretical_loss": 3.8649588967326025, "tokens_seen": 563940352 }, { "epoch": 1.07, "learning_rate": 0.00041873620862587765, "loss": 2.9939, "theoretical_loss": 3.8649115214398284, "tokens_seen": 564005888 }, { "epoch": 1.07, "learning_rate": 0.00041872617853560683, "loss": 2.9238, "theoretical_loss": 3.864864153192782, "tokens_seen": 564071424 }, { "epoch": 1.07, "learning_rate": 0.000418716148445336, "loss": 3.1779, "theoretical_loss": 3.8648167919895977, "tokens_seen": 564136960 }, { "epoch": 1.07, "learning_rate": 0.00041870611835506525, "loss": 3.0276, "theoretical_loss": 3.86476943782841, "tokens_seen": 564202496 }, { "epoch": 1.07, "learning_rate": 0.0004186960882647944, "loss": 3.0878, "theoretical_loss": 3.864722090707353, "tokens_seen": 564268032 }, { "epoch": 1.07, "learning_rate": 0.0004186860581745236, "loss": 3.1442, "theoretical_loss": 3.8646747506245633, "tokens_seen": 564333568 }, { "epoch": 1.07, "learning_rate": 0.00041867602808425274, "loss": 3.1864, "theoretical_loss": 3.8646274175781774, "tokens_seen": 564399104 }, { "epoch": 1.07, "learning_rate": 0.000418665997993982, "loss": 3.0756, "theoretical_loss": 3.8645800915663324, "tokens_seen": 564464640 }, { "epoch": 1.07, "learning_rate": 0.00041865596790371115, "loss": 3.0197, "theoretical_loss": 3.864532772587167, "tokens_seen": 564530176 }, { "epoch": 1.07, "learning_rate": 0.00041864593781344034, "loss": 3.0314, "theoretical_loss": 3.864485460638819, "tokens_seen": 564595712 }, { "epoch": 1.07, "learning_rate": 0.0004186359077231695, "loss": 3.1048, "theoretical_loss": 3.8644381557194283, "tokens_seen": 564661248 }, { "epoch": 1.07, "learning_rate": 0.0004186258776328987, "loss": 3.2502, "theoretical_loss": 3.864390857827135, "tokens_seen": 564726784 }, { "epoch": 1.07, "learning_rate": 0.0004186158475426279, "loss": 3.0547, "theoretical_loss": 3.864343566960079, "tokens_seen": 564792320 }, { "epoch": 1.07, "learning_rate": 0.0004186058174523571, "loss": 2.8118, "theoretical_loss": 3.8642962831164036, "tokens_seen": 564857856 }, { "epoch": 1.07, "learning_rate": 0.00041859578736208624, "loss": 3.0026, "theoretical_loss": 3.8642490062942496, "tokens_seen": 564923392 }, { "epoch": 1.07, "learning_rate": 0.0004185857572718155, "loss": 3.0664, "theoretical_loss": 3.8642017364917605, "tokens_seen": 564988928 }, { "epoch": 1.07, "learning_rate": 0.0004185757271815446, "loss": 2.971, "theoretical_loss": 3.86415447370708, "tokens_seen": 565054464 }, { "epoch": 1.07, "learning_rate": 0.00041856569709127384, "loss": 3.0335, "theoretical_loss": 3.8641072179383524, "tokens_seen": 565120000 }, { "epoch": 1.07, "learning_rate": 0.000418555667001003, "loss": 2.9213, "theoretical_loss": 3.8640599691837227, "tokens_seen": 565185536 }, { "epoch": 1.07, "objective/train/docs_used": 922982, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7244105339050293, "objective/train/theoretical_loss": 3.8640245372196365, "objective/train/tokens_used": 585694688, "theoretical_loss": 3.8640245372196365, "tokens_seen": 565234688 }, { "epoch": 1.07, "learning_rate": 0.0004185456369107322, "loss": 3.0304, "theoretical_loss": 3.8640127274413363, "tokens_seen": 565251072 }, { "epoch": 1.07, "learning_rate": 0.0004185356068204614, "loss": 3.1864, "theoretical_loss": 3.8639654927093403, "tokens_seen": 565316608 }, { "epoch": 1.07, "learning_rate": 0.0004185255767301906, "loss": 3.0431, "theoretical_loss": 3.8639182649858816, "tokens_seen": 565382144 }, { "epoch": 1.07, "learning_rate": 0.00041851554663991974, "loss": 2.9967, "theoretical_loss": 3.863871044269108, "tokens_seen": 565447680 }, { "epoch": 1.07, "learning_rate": 0.000418505516549649, "loss": 2.8831, "theoretical_loss": 3.863823830557169, "tokens_seen": 565513216 }, { "epoch": 1.07, "learning_rate": 0.0004184954864593781, "loss": 2.8157, "theoretical_loss": 3.8637766238482123, "tokens_seen": 565578752 }, { "epoch": 1.07, "learning_rate": 0.00041848545636910734, "loss": 3.0156, "theoretical_loss": 3.863729424140389, "tokens_seen": 565644288 }, { "epoch": 1.07, "learning_rate": 0.0004184754262788365, "loss": 3.2306, "theoretical_loss": 3.863682231431849, "tokens_seen": 565709824 }, { "epoch": 1.07, "learning_rate": 0.0004184653961885657, "loss": 2.9482, "theoretical_loss": 3.863635045720745, "tokens_seen": 565775360 }, { "epoch": 1.07, "learning_rate": 0.0004184553660982949, "loss": 3.0042, "theoretical_loss": 3.863587867005228, "tokens_seen": 565840896 }, { "epoch": 1.07, "learning_rate": 0.00041844533600802407, "loss": 3.0363, "theoretical_loss": 3.863540695283451, "tokens_seen": 565906432 }, { "epoch": 1.07, "learning_rate": 0.00041843530591775325, "loss": 2.9543, "theoretical_loss": 3.8634935305535674, "tokens_seen": 565971968 }, { "epoch": 1.07, "learning_rate": 0.0004184252758274825, "loss": 3.0166, "theoretical_loss": 3.8634463728137316, "tokens_seen": 566037504 }, { "epoch": 1.07, "learning_rate": 0.0004184152457372116, "loss": 2.9732, "theoretical_loss": 3.863399222062099, "tokens_seen": 566103040 }, { "epoch": 1.07, "learning_rate": 0.00041840521564694085, "loss": 3.064, "theoretical_loss": 3.863352078296825, "tokens_seen": 566168576 }, { "epoch": 1.07, "learning_rate": 0.00041839518555666997, "loss": 3.0468, "theoretical_loss": 3.863304941516065, "tokens_seen": 566234112 }, { "epoch": 1.07, "learning_rate": 0.0004183851554663992, "loss": 3.1371, "theoretical_loss": 3.863257811717977, "tokens_seen": 566299648 }, { "epoch": 1.07, "learning_rate": 0.0004183751253761284, "loss": 3.0724, "theoretical_loss": 3.8632106889007183, "tokens_seen": 566365184 }, { "epoch": 1.07, "learning_rate": 0.00041836509528585757, "loss": 3.0768, "theoretical_loss": 3.8631635730624474, "tokens_seen": 566430720 }, { "epoch": 1.07, "learning_rate": 0.00041835506519558675, "loss": 3.0122, "theoretical_loss": 3.8631164642013234, "tokens_seen": 566496256 }, { "epoch": 1.07, "learning_rate": 0.000418345035105316, "loss": 2.9863, "theoretical_loss": 3.863069362315506, "tokens_seen": 566561792 }, { "epoch": 1.07, "learning_rate": 0.0004183350050150451, "loss": 3.1466, "theoretical_loss": 3.863022267403156, "tokens_seen": 566627328 }, { "epoch": 1.07, "learning_rate": 0.00041832497492477435, "loss": 3.0215, "theoretical_loss": 3.8629751794624343, "tokens_seen": 566692864 }, { "epoch": 1.07, "learning_rate": 0.0004183149448345035, "loss": 2.9137, "theoretical_loss": 3.862928098491503, "tokens_seen": 566758400 }, { "epoch": 1.07, "learning_rate": 0.0004183149448345035, "loss": 2.9783, "theoretical_loss": 3.8628810244885248, "tokens_seen": 566823936 }, { "epoch": 1.07, "objective/train/docs_used": 925571, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.29038667678833, "objective/train/theoretical_loss": 3.8628457235579052, "objective/train/tokens_used": 587333088, "theoretical_loss": 3.8628457235579052, "tokens_seen": 566873088 }, { "epoch": 1.07, "learning_rate": 0.0004183049147442327, "loss": 3.0337, "theoretical_loss": 3.862833957451662, "tokens_seen": 566889472 }, { "epoch": 1.07, "learning_rate": 0.0004182948846539619, "loss": 3.1984, "theoretical_loss": 3.86278689737908, "tokens_seen": 566955008 }, { "epoch": 1.07, "learning_rate": 0.0004182848545636911, "loss": 3.0884, "theoretical_loss": 3.862739844268943, "tokens_seen": 567020544 }, { "epoch": 1.07, "learning_rate": 0.00041827482447342025, "loss": 2.9281, "theoretical_loss": 3.8626927981194163, "tokens_seen": 567086080 }, { "epoch": 1.07, "learning_rate": 0.00041826479438314944, "loss": 3.0582, "theoretical_loss": 3.862645758928666, "tokens_seen": 567151616 }, { "epoch": 1.07, "learning_rate": 0.0004182547642928786, "loss": 3.1633, "theoretical_loss": 3.8625987266948583, "tokens_seen": 567217152 }, { "epoch": 1.07, "learning_rate": 0.00041824473420260785, "loss": 2.9532, "theoretical_loss": 3.8625517014161614, "tokens_seen": 567282688 }, { "epoch": 1.07, "learning_rate": 0.000418234704112337, "loss": 3.1051, "theoretical_loss": 3.8625046830907435, "tokens_seen": 567348224 }, { "epoch": 1.07, "learning_rate": 0.0004182246740220662, "loss": 2.9047, "theoretical_loss": 3.862457671716773, "tokens_seen": 567413760 }, { "epoch": 1.07, "learning_rate": 0.0004182146439317954, "loss": 2.9671, "theoretical_loss": 3.862410667292419, "tokens_seen": 567479296 }, { "epoch": 1.07, "learning_rate": 0.0004182046138415246, "loss": 3.1717, "theoretical_loss": 3.8623636698158537, "tokens_seen": 567544832 }, { "epoch": 1.07, "learning_rate": 0.00041819458375125376, "loss": 3.1384, "theoretical_loss": 3.8623166792852457, "tokens_seen": 567610368 }, { "epoch": 1.07, "learning_rate": 0.00041819458375125376, "loss": 3.2455, "theoretical_loss": 3.8622696956987683, "tokens_seen": 567675904 }, { "epoch": 1.07, "learning_rate": 0.00041819458375125376, "loss": 3.2281, "theoretical_loss": 3.862222719054593, "tokens_seen": 567741440 }, { "epoch": 1.07, "learning_rate": 0.00041818455366098294, "loss": 3.2938, "theoretical_loss": 3.862175749350893, "tokens_seen": 567806976 }, { "epoch": 1.07, "learning_rate": 0.0004181745235707121, "loss": 3.0717, "theoretical_loss": 3.862128786585841, "tokens_seen": 567872512 }, { "epoch": 1.07, "learning_rate": 0.00041816449348044136, "loss": 3.0516, "theoretical_loss": 3.8620818307576137, "tokens_seen": 567938048 }, { "epoch": 1.07, "learning_rate": 0.00041815446339017054, "loss": 3.085, "theoretical_loss": 3.8620348818643846, "tokens_seen": 568003584 }, { "epoch": 1.07, "learning_rate": 0.0004181444332998997, "loss": 2.9988, "theoretical_loss": 3.8619879399043295, "tokens_seen": 568069120 }, { "epoch": 1.07, "learning_rate": 0.0004181344032096289, "loss": 3.2645, "theoretical_loss": 3.861941004875625, "tokens_seen": 568134656 }, { "epoch": 1.07, "learning_rate": 0.0004181243731193581, "loss": 3.2515, "theoretical_loss": 3.8618940767764487, "tokens_seen": 568200192 }, { "epoch": 1.07, "learning_rate": 0.0004181143430290873, "loss": 3.1586, "theoretical_loss": 3.861847155604978, "tokens_seen": 568265728 }, { "epoch": 1.07, "learning_rate": 0.00041810431293881644, "loss": 3.3966, "theoretical_loss": 3.8618002413593917, "tokens_seen": 568331264 }, { "epoch": 1.07, "learning_rate": 0.0004180942828485457, "loss": 3.2893, "theoretical_loss": 3.8617533340378687, "tokens_seen": 568396800 }, { "epoch": 1.07, "learning_rate": 0.0004180842527582748, "loss": 3.2113, "theoretical_loss": 3.861706433638589, "tokens_seen": 568462336 }, { "epoch": 1.07, "objective/train/docs_used": 928352, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1367526054382324, "objective/train/theoretical_loss": 3.8616712628807575, "objective/train/tokens_used": 588971488, "theoretical_loss": 3.8616712628807575, "tokens_seen": 568511488 }, { "epoch": 1.07, "learning_rate": 0.00041807422266800404, "loss": 3.1479, "theoretical_loss": 3.8616595401597333, "tokens_seen": 568527872 }, { "epoch": 1.07, "learning_rate": 0.0004180641925777332, "loss": 3.1153, "theoretical_loss": 3.861612653599483, "tokens_seen": 568593408 }, { "epoch": 1.07, "learning_rate": 0.0004180541624874624, "loss": 3.2049, "theoretical_loss": 3.8615657739560203, "tokens_seen": 568658944 }, { "epoch": 1.07, "learning_rate": 0.0004180441323971916, "loss": 3.0191, "theoretical_loss": 3.861518901227527, "tokens_seen": 568724480 }, { "epoch": 1.07, "learning_rate": 0.0004180341023069208, "loss": 3.2064, "theoretical_loss": 3.861472035412187, "tokens_seen": 568790016 }, { "epoch": 1.07, "learning_rate": 0.00041802407221664994, "loss": 3.188, "theoretical_loss": 3.8614251765081846, "tokens_seen": 568855552 }, { "epoch": 1.07, "learning_rate": 0.0004180140421263792, "loss": 3.0554, "theoretical_loss": 3.861378324513704, "tokens_seen": 568921088 }, { "epoch": 1.07, "learning_rate": 0.0004180040120361083, "loss": 3.104, "theoretical_loss": 3.861331479426931, "tokens_seen": 568986624 }, { "epoch": 1.07, "learning_rate": 0.00041799398194583754, "loss": 3.018, "theoretical_loss": 3.8612846412460513, "tokens_seen": 569052160 }, { "epoch": 1.07, "learning_rate": 0.0004179839518555667, "loss": 3.0484, "theoretical_loss": 3.8612378099692526, "tokens_seen": 569117696 }, { "epoch": 1.07, "learning_rate": 0.0004179739217652959, "loss": 3.082, "theoretical_loss": 3.861190985594721, "tokens_seen": 569183232 }, { "epoch": 1.07, "learning_rate": 0.0004179638916750251, "loss": 2.9255, "theoretical_loss": 3.8611441681206453, "tokens_seen": 569248768 }, { "epoch": 1.07, "learning_rate": 0.00041795386158475427, "loss": 3.1586, "theoretical_loss": 3.8610973575452148, "tokens_seen": 569314304 }, { "epoch": 1.07, "learning_rate": 0.00041794383149448345, "loss": 2.9654, "theoretical_loss": 3.861050553866618, "tokens_seen": 569379840 }, { "epoch": 1.07, "learning_rate": 0.0004179338014042127, "loss": 3.0263, "theoretical_loss": 3.861003757083046, "tokens_seen": 569445376 }, { "epoch": 1.07, "learning_rate": 0.0004179237713139418, "loss": 3.1978, "theoretical_loss": 3.8609569671926898, "tokens_seen": 569510912 }, { "epoch": 1.07, "learning_rate": 0.00041791374122367105, "loss": 3.0941, "theoretical_loss": 3.86091018419374, "tokens_seen": 569576448 }, { "epoch": 1.07, "learning_rate": 0.00041790371113340017, "loss": 3.0444, "theoretical_loss": 3.8608634080843895, "tokens_seen": 569641984 }, { "epoch": 1.07, "learning_rate": 0.0004178936810431294, "loss": 2.8911, "theoretical_loss": 3.8608166388628313, "tokens_seen": 569707520 }, { "epoch": 1.07, "learning_rate": 0.0004178836509528586, "loss": 3.0163, "theoretical_loss": 3.8607698765272582, "tokens_seen": 569773056 }, { "epoch": 1.07, "learning_rate": 0.00041787362086258777, "loss": 3.2161, "theoretical_loss": 3.860723121075866, "tokens_seen": 569838592 }, { "epoch": 1.07, "learning_rate": 0.00041786359077231695, "loss": 3.1637, "theoretical_loss": 3.8606763725068483, "tokens_seen": 569904128 }, { "epoch": 1.07, "learning_rate": 0.0004178535606820462, "loss": 3.0753, "theoretical_loss": 3.8606296308184014, "tokens_seen": 569969664 }, { "epoch": 1.07, "learning_rate": 0.0004178435305917753, "loss": 3.0533, "theoretical_loss": 3.860582896008721, "tokens_seen": 570035200 }, { "epoch": 1.07, "learning_rate": 0.00041783350050150455, "loss": 3.0993, "theoretical_loss": 3.8605361680760053, "tokens_seen": 570100736 }, { "epoch": 1.07, "objective/train/docs_used": 931197, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0234122276306152, "objective/train/theoretical_loss": 3.8605011266383915, "objective/train/tokens_used": 590609888, "theoretical_loss": 3.8605011266383915, "tokens_seen": 570149888 }, { "epoch": 1.07, "learning_rate": 0.0004178234704112337, "loss": 3.1098, "theoretical_loss": 3.860489447018451, "tokens_seen": 570166272 }, { "epoch": 1.07, "learning_rate": 0.0004178134403209629, "loss": 3.1058, "theoretical_loss": 3.8604427328342563, "tokens_seen": 570231808 }, { "epoch": 1.07, "learning_rate": 0.0004178034102306921, "loss": 2.9803, "theoretical_loss": 3.8603960255216214, "tokens_seen": 570297344 }, { "epoch": 1.07, "learning_rate": 0.0004177933801404213, "loss": 3.0796, "theoretical_loss": 3.860349325078745, "tokens_seen": 570362880 }, { "epoch": 1.07, "learning_rate": 0.00041778335005015045, "loss": 3.0688, "theoretical_loss": 3.8603026315038282, "tokens_seen": 570428416 }, { "epoch": 1.07, "learning_rate": 0.00041777331995987964, "loss": 3.0803, "theoretical_loss": 3.8602559447950715, "tokens_seen": 570493952 }, { "epoch": 1.07, "learning_rate": 0.0004177632898696088, "loss": 3.0355, "theoretical_loss": 3.860209264950677, "tokens_seen": 570559488 }, { "epoch": 1.07, "learning_rate": 0.00041775325977933805, "loss": 3.199, "theoretical_loss": 3.860162591968847, "tokens_seen": 570625024 }, { "epoch": 1.07, "learning_rate": 0.0004177432296890672, "loss": 3.0683, "theoretical_loss": 3.8601159258477846, "tokens_seen": 570690560 }, { "epoch": 1.07, "learning_rate": 0.0004177331995987964, "loss": 3.099, "theoretical_loss": 3.860069266585694, "tokens_seen": 570756096 }, { "epoch": 1.07, "learning_rate": 0.0004177231695085256, "loss": 2.9898, "theoretical_loss": 3.86002261418078, "tokens_seen": 570821632 }, { "epoch": 1.07, "learning_rate": 0.0004177131394182548, "loss": 3.0133, "theoretical_loss": 3.859975968631246, "tokens_seen": 570887168 }, { "epoch": 1.07, "learning_rate": 0.00041770310932798396, "loss": 3.0664, "theoretical_loss": 3.8599293299352992, "tokens_seen": 570952704 }, { "epoch": 1.07, "learning_rate": 0.00041769307923771314, "loss": 3.0748, "theoretical_loss": 3.859882698091146, "tokens_seen": 571018240 }, { "epoch": 1.07, "learning_rate": 0.0004176830491474423, "loss": 2.9989, "theoretical_loss": 3.859836073096994, "tokens_seen": 571083776 }, { "epoch": 1.07, "learning_rate": 0.00041767301905717156, "loss": 3.0807, "theoretical_loss": 3.8597894549510494, "tokens_seen": 571149312 }, { "epoch": 1.07, "learning_rate": 0.0004176629889669007, "loss": 3.089, "theoretical_loss": 3.8597428436515226, "tokens_seen": 571214848 }, { "epoch": 1.07, "learning_rate": 0.0004176529588766299, "loss": 3.1774, "theoretical_loss": 3.859696239196621, "tokens_seen": 571280384 }, { "epoch": 1.07, "learning_rate": 0.00041764292878635904, "loss": 3.147, "theoretical_loss": 3.8596496415845567, "tokens_seen": 571345920 }, { "epoch": 1.07, "learning_rate": 0.0004176328986960883, "loss": 3.1328, "theoretical_loss": 3.859603050813539, "tokens_seen": 571411456 }, { "epoch": 1.07, "learning_rate": 0.00041762286860581746, "loss": 2.9897, "theoretical_loss": 3.8595564668817786, "tokens_seen": 571476992 }, { "epoch": 1.07, "learning_rate": 0.00041761283851554664, "loss": 3.0755, "theoretical_loss": 3.8595098897874878, "tokens_seen": 571542528 }, { "epoch": 1.07, "learning_rate": 0.0004176028084252758, "loss": 2.9833, "theoretical_loss": 3.8594633195288797, "tokens_seen": 571608064 }, { "epoch": 1.07, "learning_rate": 0.000417592778335005, "loss": 2.9699, "theoretical_loss": 3.8594167561041672, "tokens_seen": 571673600 }, { "epoch": 1.07, "learning_rate": 0.0004175827482447342, "loss": 3.1647, "theoretical_loss": 3.859370199511564, "tokens_seen": 571739136 }, { "epoch": 1.07, "objective/train/docs_used": 933924, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.89699125289917, "objective/train/theoretical_loss": 3.8593352865496096, "objective/train/tokens_used": 592248288, "theoretical_loss": 3.8593352865496096, "tokens_seen": 571788288 }, { "epoch": 1.07, "learning_rate": 0.0004175727181544634, "loss": 2.9696, "theoretical_loss": 3.8593236497492853, "tokens_seen": 571804672 }, { "epoch": 1.07, "learning_rate": 0.00041756268806419255, "loss": 3.1458, "theoretical_loss": 3.8592771068155454, "tokens_seen": 571870208 }, { "epoch": 1.07, "learning_rate": 0.0004175526579739218, "loss": 2.9859, "theoretical_loss": 3.8592305707085606, "tokens_seen": 571935744 }, { "epoch": 1.07, "learning_rate": 0.00041754262788365096, "loss": 2.9829, "theoretical_loss": 3.859184041426548, "tokens_seen": 572001280 }, { "epoch": 1.07, "learning_rate": 0.00041753259779338015, "loss": 3.0538, "theoretical_loss": 3.8591375189677244, "tokens_seen": 572066816 }, { "epoch": 1.07, "learning_rate": 0.0004175225677031093, "loss": 2.9123, "theoretical_loss": 3.859091003330308, "tokens_seen": 572132352 }, { "epoch": 1.07, "learning_rate": 0.0004175125376128385, "loss": 3.0434, "theoretical_loss": 3.859044494512517, "tokens_seen": 572197888 }, { "epoch": 1.07, "learning_rate": 0.0004175025075225677, "loss": 3.0711, "theoretical_loss": 3.8589979925125704, "tokens_seen": 572263424 }, { "epoch": 1.07, "learning_rate": 0.0004174924774322969, "loss": 3.0541, "theoretical_loss": 3.8589514973286896, "tokens_seen": 572328960 }, { "epoch": 1.07, "learning_rate": 0.00041748244734202605, "loss": 2.8878, "theoretical_loss": 3.858905008959094, "tokens_seen": 572394496 }, { "epoch": 1.07, "learning_rate": 0.0004174724172517553, "loss": 3.0929, "theoretical_loss": 3.8588585274020044, "tokens_seen": 572460032 }, { "epoch": 1.07, "learning_rate": 0.0004174623871614844, "loss": 3.0201, "theoretical_loss": 3.8588120526556446, "tokens_seen": 572525568 }, { "epoch": 1.07, "learning_rate": 0.00041745235707121365, "loss": 2.9875, "theoretical_loss": 3.8587655847182356, "tokens_seen": 572591104 }, { "epoch": 1.07, "learning_rate": 0.00041744232698094283, "loss": 3.0448, "theoretical_loss": 3.858719123588001, "tokens_seen": 572656640 }, { "epoch": 1.07, "learning_rate": 0.000417432296890672, "loss": 3.1494, "theoretical_loss": 3.858672669263165, "tokens_seen": 572722176 }, { "epoch": 1.07, "learning_rate": 0.0004174222668004012, "loss": 2.9598, "theoretical_loss": 3.858626221741952, "tokens_seen": 572787712 }, { "epoch": 1.07, "learning_rate": 0.00041741223671013037, "loss": 2.9747, "theoretical_loss": 3.858579781022588, "tokens_seen": 572853248 }, { "epoch": 1.07, "learning_rate": 0.0004174022066198596, "loss": 3.169, "theoretical_loss": 3.858533347103298, "tokens_seen": 572918784 }, { "epoch": 1.07, "learning_rate": 0.0004173921765295888, "loss": 2.9715, "theoretical_loss": 3.858486919982309, "tokens_seen": 572984320 }, { "epoch": 1.07, "learning_rate": 0.00041738214643931797, "loss": 3.299, "theoretical_loss": 3.8584404996578483, "tokens_seen": 573049856 }, { "epoch": 1.07, "learning_rate": 0.00041737211634904715, "loss": 3.1404, "theoretical_loss": 3.858394086128144, "tokens_seen": 573115392 }, { "epoch": 1.07, "learning_rate": 0.0004173620862587764, "loss": 3.0684, "theoretical_loss": 3.858347679391424, "tokens_seen": 573180928 }, { "epoch": 1.07, "learning_rate": 0.0004173520561685055, "loss": 3.098, "theoretical_loss": 3.8583012794459184, "tokens_seen": 573246464 }, { "epoch": 1.07, "learning_rate": 0.00041734202607823475, "loss": 3.0515, "theoretical_loss": 3.858254886289857, "tokens_seen": 573312000 }, { "epoch": 1.07, "learning_rate": 0.0004173319959879639, "loss": 3.1166, "theoretical_loss": 3.85820849992147, "tokens_seen": 573377536 }, { "epoch": 1.07, "objective/train/docs_used": 936713, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0786449909210205, "objective/train/theoretical_loss": 3.858173714598527, "objective/train/tokens_used": 593886688, "theoretical_loss": 3.858173714598527, "tokens_seen": 573426688 }, { "epoch": 1.07, "learning_rate": 0.0004173219658976931, "loss": 3.2198, "theoretical_loss": 3.8581621203389886, "tokens_seen": 573443072 }, { "epoch": 1.07, "learning_rate": 0.0004173119358074223, "loss": 3.0539, "theoretical_loss": 3.858115747540645, "tokens_seen": 573508608 }, { "epoch": 1.07, "learning_rate": 0.0004173019057171515, "loss": 2.9222, "theoretical_loss": 3.8580693815246727, "tokens_seen": 573574144 }, { "epoch": 1.07, "learning_rate": 0.00041729187562688065, "loss": 3.2926, "theoretical_loss": 3.8580230222893035, "tokens_seen": 573639680 }, { "epoch": 1.07, "learning_rate": 0.00041728184553660984, "loss": 3.0577, "theoretical_loss": 3.8579766698327718, "tokens_seen": 573705216 }, { "epoch": 1.07, "learning_rate": 0.000417271815446339, "loss": 3.0786, "theoretical_loss": 3.8579303241533123, "tokens_seen": 573770752 }, { "epoch": 1.07, "learning_rate": 0.00041726178535606825, "loss": 3.1275, "theoretical_loss": 3.8578839852491598, "tokens_seen": 573836288 }, { "epoch": 1.07, "learning_rate": 0.0004172517552657974, "loss": 3.1624, "theoretical_loss": 3.857837653118551, "tokens_seen": 573901824 }, { "epoch": 1.07, "learning_rate": 0.0004172417251755266, "loss": 3.0857, "theoretical_loss": 3.857791327759722, "tokens_seen": 573967360 }, { "epoch": 1.07, "learning_rate": 0.0004172316950852558, "loss": 3.1236, "theoretical_loss": 3.85774500917091, "tokens_seen": 574032896 }, { "epoch": 1.07, "learning_rate": 0.000417221664994985, "loss": 2.9694, "theoretical_loss": 3.8576986973503526, "tokens_seen": 574098432 }, { "epoch": 1.07, "learning_rate": 0.00041721163490471416, "loss": 3.0117, "theoretical_loss": 3.857652392296289, "tokens_seen": 574163968 }, { "epoch": 1.07, "learning_rate": 0.00041720160481444334, "loss": 3.1241, "theoretical_loss": 3.8576060940069583, "tokens_seen": 574229504 }, { "epoch": 1.07, "learning_rate": 0.0004171915747241725, "loss": 2.852, "theoretical_loss": 3.8575598024805995, "tokens_seen": 574295040 }, { "epoch": 1.07, "learning_rate": 0.00041718154463390176, "loss": 3.1341, "theoretical_loss": 3.857513517715454, "tokens_seen": 574360576 }, { "epoch": 1.07, "learning_rate": 0.0004171715145436309, "loss": 3.1072, "theoretical_loss": 3.857467239709763, "tokens_seen": 574426112 }, { "epoch": 1.07, "learning_rate": 0.0004171614844533601, "loss": 2.8798, "theoretical_loss": 3.8574209684617675, "tokens_seen": 574491648 }, { "epoch": 1.07, "learning_rate": 0.00041715145436308924, "loss": 3.0467, "theoretical_loss": 3.857374703969711, "tokens_seen": 574557184 }, { "epoch": 1.07, "learning_rate": 0.0004171414242728185, "loss": 2.976, "theoretical_loss": 3.857328446231836, "tokens_seen": 574622720 }, { "epoch": 1.07, "learning_rate": 0.00041713139418254766, "loss": 3.0959, "theoretical_loss": 3.857282195246386, "tokens_seen": 574688256 }, { "epoch": 1.07, "learning_rate": 0.00041712136409227684, "loss": 3.0696, "theoretical_loss": 3.8572359510116065, "tokens_seen": 574753792 }, { "epoch": 1.07, "learning_rate": 0.000417111334002006, "loss": 3.044, "theoretical_loss": 3.8571897135257416, "tokens_seen": 574819328 }, { "epoch": 1.07, "learning_rate": 0.0004171013039117352, "loss": 3.0525, "theoretical_loss": 3.857143482787038, "tokens_seen": 574884864 }, { "epoch": 1.07, "learning_rate": 0.0004170912738214644, "loss": 3.0382, "theoretical_loss": 3.8570972587937415, "tokens_seen": 574950400 }, { "epoch": 1.07, "learning_rate": 0.0004170812437311936, "loss": 3.1083, "theoretical_loss": 3.857051041544099, "tokens_seen": 575015936 }, { "epoch": 1.07, "objective/train/docs_used": 939080, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.252971887588501, "objective/train/theoretical_loss": 3.8570163830313366, "objective/train/tokens_used": 595525088, "theoretical_loss": 3.8570163830313366, "tokens_seen": 575065088 }, { "epoch": 1.07, "learning_rate": 0.00041707121364092275, "loss": 3.1385, "theoretical_loss": 3.8570048310363596, "tokens_seen": 575081472 }, { "epoch": 1.07, "learning_rate": 0.000417061183550652, "loss": 2.8215, "theoretical_loss": 3.8569586272687695, "tokens_seen": 575147008 }, { "epoch": 1.07, "learning_rate": 0.00041705115346038116, "loss": 2.8696, "theoretical_loss": 3.8569124302395794, "tokens_seen": 575212544 }, { "epoch": 1.07, "learning_rate": 0.00041704112337011035, "loss": 3.0732, "theoretical_loss": 3.8568662399470384, "tokens_seen": 575278080 }, { "epoch": 1.07, "learning_rate": 0.0004170310932798395, "loss": 3.1911, "theoretical_loss": 3.856820056389397, "tokens_seen": 575343616 }, { "epoch": 1.07, "learning_rate": 0.0004170210631895687, "loss": 2.9956, "theoretical_loss": 3.8567738795649062, "tokens_seen": 575409152 }, { "epoch": 1.07, "learning_rate": 0.0004170110330992979, "loss": 3.1875, "theoretical_loss": 3.8567277094718175, "tokens_seen": 575474688 }, { "epoch": 1.07, "learning_rate": 0.0004170010030090271, "loss": 3.0419, "theoretical_loss": 3.8566815461083843, "tokens_seen": 575540224 }, { "epoch": 1.07, "learning_rate": 0.00041699097291875625, "loss": 3.1125, "theoretical_loss": 3.8566353894728578, "tokens_seen": 575605760 }, { "epoch": 1.07, "learning_rate": 0.0004169809428284855, "loss": 3.1766, "theoretical_loss": 3.856589239563492, "tokens_seen": 575671296 }, { "epoch": 1.07, "learning_rate": 0.0004169709127382146, "loss": 3.0556, "theoretical_loss": 3.8565430963785428, "tokens_seen": 575736832 }, { "epoch": 1.07, "learning_rate": 0.00041696088264794385, "loss": 2.8484, "theoretical_loss": 3.8564969599162633, "tokens_seen": 575802368 }, { "epoch": 1.07, "learning_rate": 0.00041695085255767303, "loss": 3.085, "theoretical_loss": 3.85645083017491, "tokens_seen": 575867904 }, { "epoch": 1.07, "learning_rate": 0.0004169408224674022, "loss": 2.9332, "theoretical_loss": 3.8564047071527385, "tokens_seen": 575933440 }, { "epoch": 1.07, "learning_rate": 0.0004169307923771314, "loss": 3.2234, "theoretical_loss": 3.8563585908480063, "tokens_seen": 575998976 }, { "epoch": 1.07, "learning_rate": 0.0004169207622868606, "loss": 3.0407, "theoretical_loss": 3.8563124812589704, "tokens_seen": 576064512 }, { "epoch": 1.07, "learning_rate": 0.00041691073219658975, "loss": 3.1511, "theoretical_loss": 3.85626637838389, "tokens_seen": 576130048 }, { "epoch": 1.07, "learning_rate": 0.000416900702106319, "loss": 3.0015, "theoretical_loss": 3.856220282221023, "tokens_seen": 576195584 }, { "epoch": 1.07, "learning_rate": 0.0004168906720160481, "loss": 2.9732, "theoretical_loss": 3.856174192768629, "tokens_seen": 576261120 }, { "epoch": 1.07, "learning_rate": 0.00041688064192577735, "loss": 2.8552, "theoretical_loss": 3.8561281100249674, "tokens_seen": 576326656 }, { "epoch": 1.07, "learning_rate": 0.00041687061183550653, "loss": 3.0349, "theoretical_loss": 3.856082033988301, "tokens_seen": 576392192 }, { "epoch": 1.07, "learning_rate": 0.0004168605817452357, "loss": 2.9748, "theoretical_loss": 3.856035964656889, "tokens_seen": 576457728 }, { "epoch": 1.07, "learning_rate": 0.0004168505516549649, "loss": 2.8007, "theoretical_loss": 3.8559899020289947, "tokens_seen": 576523264 }, { "epoch": 1.07, "learning_rate": 0.0004168405215646941, "loss": 3.0223, "theoretical_loss": 3.855943846102881, "tokens_seen": 576588800 }, { "epoch": 1.07, "learning_rate": 0.00041683049147442326, "loss": 3.0399, "theoretical_loss": 3.8558977968768104, "tokens_seen": 576654336 }, { "epoch": 1.07, "objective/train/docs_used": 941938, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.173206090927124, "objective/train/theoretical_loss": 3.8558632643531174, "objective/train/tokens_used": 597163488, "theoretical_loss": 3.8558632643531174, "tokens_seen": 576703488 }, { "epoch": 1.07, "learning_rate": 0.0004168204613841525, "loss": 3.0319, "theoretical_loss": 3.8558517543490485, "tokens_seen": 576719872 }, { "epoch": 1.07, "learning_rate": 0.0004168104312938816, "loss": 3.0506, "theoretical_loss": 3.855805718517858, "tokens_seen": 576785408 }, { "epoch": 1.07, "learning_rate": 0.00041680040120361085, "loss": 2.9208, "theoretical_loss": 3.8557596893815047, "tokens_seen": 576850944 }, { "epoch": 1.07, "learning_rate": 0.00041679037111334, "loss": 2.992, "theoretical_loss": 3.8557136669382555, "tokens_seen": 576916480 }, { "epoch": 1.07, "learning_rate": 0.0004167803410230692, "loss": 3.0244, "theoretical_loss": 3.8556676511863763, "tokens_seen": 576982016 }, { "epoch": 1.07, "learning_rate": 0.0004167703109327984, "loss": 3.0195, "theoretical_loss": 3.8556216421241345, "tokens_seen": 577047552 }, { "epoch": 1.07, "learning_rate": 0.0004167602808425276, "loss": 3.0816, "theoretical_loss": 3.855575639749798, "tokens_seen": 577113088 }, { "epoch": 1.07, "learning_rate": 0.00041675025075225676, "loss": 2.925, "theoretical_loss": 3.8555296440616353, "tokens_seen": 577178624 }, { "epoch": 1.07, "learning_rate": 0.000416740220661986, "loss": 3.0472, "theoretical_loss": 3.855483655057916, "tokens_seen": 577244160 }, { "epoch": 1.07, "learning_rate": 0.0004167301905717151, "loss": 3.0943, "theoretical_loss": 3.855437672736909, "tokens_seen": 577309696 }, { "epoch": 1.07, "learning_rate": 0.00041672016048144436, "loss": 3.029, "theoretical_loss": 3.8553916970968856, "tokens_seen": 577375232 }, { "epoch": 1.07, "learning_rate": 0.0004167101303911735, "loss": 3.0965, "theoretical_loss": 3.8553457281361165, "tokens_seen": 577440768 }, { "epoch": 1.07, "learning_rate": 0.0004167001003009027, "loss": 2.9581, "theoretical_loss": 3.8552997658528736, "tokens_seen": 577506304 }, { "epoch": 1.07, "learning_rate": 0.0004166900702106319, "loss": 3.1122, "theoretical_loss": 3.8552538102454292, "tokens_seen": 577571840 }, { "epoch": 1.07, "learning_rate": 0.0004166800401203611, "loss": 3.16, "theoretical_loss": 3.8552078613120564, "tokens_seen": 577637376 }, { "epoch": 1.07, "learning_rate": 0.00041667001003009026, "loss": 3.0175, "theoretical_loss": 3.855161919051029, "tokens_seen": 577702912 }, { "epoch": 1.07, "learning_rate": 0.00041665997993981944, "loss": 3.2477, "theoretical_loss": 3.8551159834606215, "tokens_seen": 577768448 }, { "epoch": 1.07, "learning_rate": 0.0004166499498495487, "loss": 3.0437, "theoretical_loss": 3.855070054539109, "tokens_seen": 577833984 }, { "epoch": 1.07, "learning_rate": 0.00041663991975927786, "loss": 2.8906, "theoretical_loss": 3.8550241322847656, "tokens_seen": 577899520 }, { "epoch": 1.07, "learning_rate": 0.00041662988966900704, "loss": 3.0335, "theoretical_loss": 3.8549782166958693, "tokens_seen": 577965056 }, { "epoch": 1.07, "learning_rate": 0.0004166198595787362, "loss": 3.1814, "theoretical_loss": 3.8549323077706967, "tokens_seen": 578030592 }, { "epoch": 1.07, "learning_rate": 0.0004166098294884654, "loss": 3.1698, "theoretical_loss": 3.854886405507525, "tokens_seen": 578096128 }, { "epoch": 1.07, "learning_rate": 0.0004165997993981946, "loss": 2.9717, "theoretical_loss": 3.854840509904632, "tokens_seen": 578161664 }, { "epoch": 1.07, "learning_rate": 0.0004165897693079238, "loss": 2.9047, "theoretical_loss": 3.8547946209602975, "tokens_seen": 578227200 }, { "epoch": 1.07, "learning_rate": 0.00041657973921765295, "loss": 2.9566, "theoretical_loss": 3.8547487386728, "tokens_seen": 578292736 }, { "epoch": 1.07, "objective/train/docs_used": 943112, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3337647914886475, "objective/train/theoretical_loss": 3.8547143313246917, "objective/train/tokens_used": 598801888, "theoretical_loss": 3.8547143313246917, "tokens_seen": 578341888 }, { "epoch": 1.07, "learning_rate": 0.0004165697091273822, "loss": 3.0215, "theoretical_loss": 3.85470286304042, "tokens_seen": 578358272 }, { "epoch": 1.07, "learning_rate": 0.00041655967903711136, "loss": 2.8625, "theoretical_loss": 3.8546569940614384, "tokens_seen": 578423808 }, { "epoch": 1.07, "learning_rate": 0.00041654964894684055, "loss": 3.0311, "theoretical_loss": 3.8546111317341363, "tokens_seen": 578489344 }, { "epoch": 1.07, "learning_rate": 0.0004165396188565697, "loss": 3.0326, "theoretical_loss": 3.8545652760567957, "tokens_seen": 578554880 }, { "epoch": 1.07, "learning_rate": 0.0004165295887662989, "loss": 3.0217, "theoretical_loss": 3.8545194270276997, "tokens_seen": 578620416 }, { "epoch": 1.07, "learning_rate": 0.0004165195586760281, "loss": 3.1005, "theoretical_loss": 3.854473584645131, "tokens_seen": 578685952 }, { "epoch": 1.07, "learning_rate": 0.0004165095285857573, "loss": 3.0439, "theoretical_loss": 3.854427748907374, "tokens_seen": 578751488 }, { "epoch": 1.07, "learning_rate": 0.00041649949849548645, "loss": 3.093, "theoretical_loss": 3.854381919812713, "tokens_seen": 578817024 }, { "epoch": 1.07, "learning_rate": 0.0004164894684052157, "loss": 3.1127, "theoretical_loss": 3.8543360973594334, "tokens_seen": 578882560 }, { "epoch": 1.07, "learning_rate": 0.0004164794383149448, "loss": 2.9208, "theoretical_loss": 3.854290281545821, "tokens_seen": 578948096 }, { "epoch": 1.07, "learning_rate": 0.00041646940822467405, "loss": 3.132, "theoretical_loss": 3.854244472370162, "tokens_seen": 579013632 }, { "epoch": 1.07, "learning_rate": 0.00041645937813440323, "loss": 2.9503, "theoretical_loss": 3.854198669830744, "tokens_seen": 579079168 }, { "epoch": 1.07, "learning_rate": 0.0004164493480441324, "loss": 3.171, "theoretical_loss": 3.854152873925854, "tokens_seen": 579144704 }, { "epoch": 1.07, "learning_rate": 0.0004164393179538616, "loss": 3.0267, "theoretical_loss": 3.854107084653781, "tokens_seen": 579210240 }, { "epoch": 1.07, "learning_rate": 0.0004164292878635908, "loss": 2.7608, "theoretical_loss": 3.854061302012814, "tokens_seen": 579275776 }, { "epoch": 1.07, "learning_rate": 0.00041641925777331995, "loss": 3.0809, "theoretical_loss": 3.854015526001242, "tokens_seen": 579341312 }, { "epoch": 1.07, "learning_rate": 0.0004164092276830492, "loss": 3.0612, "theoretical_loss": 3.8539697566173565, "tokens_seen": 579406848 }, { "epoch": 1.07, "learning_rate": 0.0004163991975927783, "loss": 3.0709, "theoretical_loss": 3.8539239938594476, "tokens_seen": 579472384 }, { "epoch": 1.07, "learning_rate": 0.00041638916750250755, "loss": 2.7993, "theoretical_loss": 3.853878237725807, "tokens_seen": 579537920 }, { "epoch": 1.07, "learning_rate": 0.00041637913741223673, "loss": 2.9068, "theoretical_loss": 3.8538324882147266, "tokens_seen": 579603456 }, { "epoch": 1.07, "learning_rate": 0.0004163691073219659, "loss": 3.1013, "theoretical_loss": 3.8537867453244994, "tokens_seen": 579668992 }, { "epoch": 1.07, "learning_rate": 0.0004163590772316951, "loss": 3.0103, "theoretical_loss": 3.8537410090534197, "tokens_seen": 579734528 }, { "epoch": 1.07, "learning_rate": 0.0004163490471414243, "loss": 3.0316, "theoretical_loss": 3.8536952793997803, "tokens_seen": 579800064 }, { "epoch": 1.07, "learning_rate": 0.00041633901705115346, "loss": 3.0083, "theoretical_loss": 3.8536495563618764, "tokens_seen": 579865600 }, { "epoch": 1.07, "learning_rate": 0.0004163289869608827, "loss": 3.152, "theoretical_loss": 3.8536038399380033, "tokens_seen": 579931136 }, { "epoch": 1.07, "objective/train/docs_used": 945877, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0961520671844482, "objective/train/theoretical_loss": 3.8535695569595316, "objective/train/tokens_used": 600440288, "theoretical_loss": 3.8535695569595316, "tokens_seen": 579980288 }, { "epoch": 1.07, "learning_rate": 0.0004163189568706118, "loss": 2.9694, "theoretical_loss": 3.853558130126457, "tokens_seen": 579996672 }, { "epoch": 1.07, "learning_rate": 0.00041630892678034105, "loss": 2.9324, "theoretical_loss": 3.853512426925535, "tokens_seen": 580062208 }, { "epoch": 1.07, "learning_rate": 0.0004162988966900702, "loss": 2.9475, "theoretical_loss": 3.8534667303335333, "tokens_seen": 580127744 }, { "epoch": 1.07, "learning_rate": 0.0004162888665997994, "loss": 3.0114, "theoretical_loss": 3.85342104034875, "tokens_seen": 580193280 }, { "epoch": 1.07, "learning_rate": 0.0004162788365095286, "loss": 3.0964, "theoretical_loss": 3.853375356969485, "tokens_seen": 580258816 }, { "epoch": 1.07, "learning_rate": 0.0004162688064192578, "loss": 3.0572, "theoretical_loss": 3.853329680194035, "tokens_seen": 580324352 }, { "epoch": 1.07, "learning_rate": 0.00041625877632898696, "loss": 3.0859, "theoretical_loss": 3.8532840100207015, "tokens_seen": 580389888 }, { "epoch": 1.07, "learning_rate": 0.0004162487462387162, "loss": 2.8931, "theoretical_loss": 3.8532383464477844, "tokens_seen": 580455424 }, { "epoch": 1.07, "learning_rate": 0.0004162387161484453, "loss": 3.0411, "theoretical_loss": 3.853192689473585, "tokens_seen": 580520960 }, { "epoch": 1.07, "learning_rate": 0.00041622868605817456, "loss": 2.8831, "theoretical_loss": 3.8531470390964047, "tokens_seen": 580586496 }, { "epoch": 1.07, "learning_rate": 0.0004162186559679037, "loss": 3.0082, "theoretical_loss": 3.853101395314546, "tokens_seen": 580652032 }, { "epoch": 1.07, "learning_rate": 0.0004162086258776329, "loss": 3.055, "theoretical_loss": 3.8530557581263114, "tokens_seen": 580717568 }, { "epoch": 1.07, "learning_rate": 0.0004161985957873621, "loss": 3.0527, "theoretical_loss": 3.8530101275300046, "tokens_seen": 580783104 }, { "epoch": 1.07, "learning_rate": 0.0004161885656970913, "loss": 3.0376, "theoretical_loss": 3.85296450352393, "tokens_seen": 580848640 }, { "epoch": 1.07, "learning_rate": 0.00041617853560682046, "loss": 3.0482, "theoretical_loss": 3.8529188861063925, "tokens_seen": 580914176 }, { "epoch": 1.07, "learning_rate": 0.00041616850551654964, "loss": 3.0714, "theoretical_loss": 3.852873275275697, "tokens_seen": 580979712 }, { "epoch": 1.07, "learning_rate": 0.0004161584754262788, "loss": 3.0091, "theoretical_loss": 3.85282767103015, "tokens_seen": 581045248 }, { "epoch": 1.07, "learning_rate": 0.00041614844533600806, "loss": 3.0573, "theoretical_loss": 3.8527820733680587, "tokens_seen": 581110784 }, { "epoch": 1.07, "learning_rate": 0.0004161384152457372, "loss": 3.0535, "theoretical_loss": 3.852736482287729, "tokens_seen": 581176320 }, { "epoch": 1.07, "learning_rate": 0.0004161283851554664, "loss": 3.0247, "theoretical_loss": 3.852690897787469, "tokens_seen": 581241856 }, { "epoch": 1.07, "learning_rate": 0.00041611835506519555, "loss": 2.9742, "theoretical_loss": 3.852645319865589, "tokens_seen": 581307392 }, { "epoch": 1.07, "learning_rate": 0.0004161083249749248, "loss": 2.9773, "theoretical_loss": 3.8525997485203964, "tokens_seen": 581372928 }, { "epoch": 1.07, "learning_rate": 0.00041609829488465397, "loss": 3.0085, "theoretical_loss": 3.852554183750202, "tokens_seen": 581438464 }, { "epoch": 1.07, "learning_rate": 0.00041608826479438315, "loss": 2.9882, "theoretical_loss": 3.852508625553316, "tokens_seen": 581504000 }, { "epoch": 1.07, "learning_rate": 0.00041607823470411233, "loss": 2.8375, "theoretical_loss": 3.8524630739280488, "tokens_seen": 581569536 }, { "epoch": 1.07, "objective/train/docs_used": 948876, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.237966537475586, "objective/train/theoretical_loss": 3.852428914520708, "objective/train/tokens_used": 602078688, "theoretical_loss": 3.852428914520708, "tokens_seen": 581618688 }, { "epoch": 1.07, "learning_rate": 0.00041606820461384156, "loss": 3.2142, "theoretical_loss": 3.8524175288727127, "tokens_seen": 581635072 }, { "epoch": 1.07, "learning_rate": 0.0004160581745235707, "loss": 3.1183, "theoretical_loss": 3.8523719903856204, "tokens_seen": 581700608 }, { "epoch": 1.07, "learning_rate": 0.0004160481444332999, "loss": 3.1154, "theoretical_loss": 3.8523264584650834, "tokens_seen": 581766144 }, { "epoch": 1.07, "learning_rate": 0.00041603811434302905, "loss": 3.0083, "theoretical_loss": 3.8522809331094168, "tokens_seen": 581831680 }, { "epoch": 1.07, "learning_rate": 0.0004160280842527583, "loss": 2.9816, "theoretical_loss": 3.852235414316934, "tokens_seen": 581897216 }, { "epoch": 1.07, "learning_rate": 0.00041601805416248747, "loss": 2.966, "theoretical_loss": 3.85218990208595, "tokens_seen": 581962752 }, { "epoch": 1.07, "learning_rate": 0.00041600802407221665, "loss": 3.0794, "theoretical_loss": 3.85214439641478, "tokens_seen": 582028288 }, { "epoch": 1.07, "learning_rate": 0.00041599799398194583, "loss": 3.1105, "theoretical_loss": 3.852098897301741, "tokens_seen": 582093824 }, { "epoch": 1.07, "learning_rate": 0.000415987963891675, "loss": 2.9438, "theoretical_loss": 3.8520534047451482, "tokens_seen": 582159360 }, { "epoch": 1.07, "learning_rate": 0.0004159779338014042, "loss": 2.9537, "theoretical_loss": 3.85200791874332, "tokens_seen": 582224896 }, { "epoch": 1.07, "learning_rate": 0.00041596790371113343, "loss": 2.9401, "theoretical_loss": 3.8519624392945735, "tokens_seen": 582290432 }, { "epoch": 1.07, "learning_rate": 0.00041595787362086256, "loss": 2.8717, "theoretical_loss": 3.8519169663972277, "tokens_seen": 582355968 }, { "epoch": 1.07, "learning_rate": 0.0004159478435305918, "loss": 3.145, "theoretical_loss": 3.8518715000496018, "tokens_seen": 582421504 }, { "epoch": 1.07, "learning_rate": 0.0004159378134403209, "loss": 3.0273, "theoretical_loss": 3.8518260402500157, "tokens_seen": 582487040 }, { "epoch": 1.07, "learning_rate": 0.00041592778335005015, "loss": 2.9348, "theoretical_loss": 3.8517805869967887, "tokens_seen": 582552576 }, { "epoch": 1.07, "learning_rate": 0.00041591775325977934, "loss": 2.8752, "theoretical_loss": 3.8517351402882434, "tokens_seen": 582618112 }, { "epoch": 1.07, "learning_rate": 0.0004159077231695085, "loss": 2.8528, "theoretical_loss": 3.8516897001227006, "tokens_seen": 582683648 }, { "epoch": 1.07, "learning_rate": 0.00041589769307923775, "loss": 3.1077, "theoretical_loss": 3.8516442664984822, "tokens_seen": 582749184 }, { "epoch": 1.07, "learning_rate": 0.00041588766298896693, "loss": 3.0298, "theoretical_loss": 3.851598839413912, "tokens_seen": 582814720 }, { "epoch": 1.07, "learning_rate": 0.0004158776328986961, "loss": 3.0777, "theoretical_loss": 3.8515534188673124, "tokens_seen": 582880256 }, { "epoch": 1.07, "learning_rate": 0.0004158676028084253, "loss": 3.0383, "theoretical_loss": 3.851508004857008, "tokens_seen": 582945792 }, { "epoch": 1.07, "learning_rate": 0.0004158575727181545, "loss": 2.9348, "theoretical_loss": 3.8514625973813246, "tokens_seen": 583011328 }, { "epoch": 1.07, "learning_rate": 0.00041584754262788366, "loss": 3.0522, "theoretical_loss": 3.8514171964385855, "tokens_seen": 583076864 }, { "epoch": 1.07, "learning_rate": 0.0004158375125376129, "loss": 2.9511, "theoretical_loss": 3.851371802027118, "tokens_seen": 583142400 }, { "epoch": 1.07, "learning_rate": 0.000415827482447342, "loss": 2.9091, "theoretical_loss": 3.8513264141452486, "tokens_seen": 583207936 }, { "epoch": 1.07, "objective/train/docs_used": 951353, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.021881103515625, "objective/train/theoretical_loss": 3.8512923775178884, "objective/train/tokens_used": 603717088, "theoretical_loss": 3.8512923775178884, "tokens_seen": 583257088 }, { "epoch": 1.07, "learning_rate": 0.00041581745235707126, "loss": 2.968, "theoretical_loss": 3.851281032791304, "tokens_seen": 583273472 }, { "epoch": 1.07, "learning_rate": 0.0004158074222668004, "loss": 3.0031, "theoretical_loss": 3.8512356579636124, "tokens_seen": 583339008 }, { "epoch": 1.07, "learning_rate": 0.0004157973921765296, "loss": 3.1121, "theoretical_loss": 3.8511902896605017, "tokens_seen": 583404544 }, { "epoch": 1.07, "learning_rate": 0.0004157873620862588, "loss": 2.9165, "theoretical_loss": 3.8511449278803016, "tokens_seen": 583470080 }, { "epoch": 1.07, "learning_rate": 0.000415777331995988, "loss": 3.0133, "theoretical_loss": 3.851099572621342, "tokens_seen": 583535616 }, { "epoch": 1.07, "learning_rate": 0.00041576730190571716, "loss": 3.1895, "theoretical_loss": 3.8510542238819516, "tokens_seen": 583601152 }, { "epoch": 1.07, "learning_rate": 0.0004157572718154464, "loss": 3.0714, "theoretical_loss": 3.851008881660463, "tokens_seen": 583666688 }, { "epoch": 1.07, "learning_rate": 0.0004157472417251755, "loss": 2.8749, "theoretical_loss": 3.850963545955207, "tokens_seen": 583732224 }, { "epoch": 1.07, "learning_rate": 0.00041573721163490476, "loss": 3.03, "theoretical_loss": 3.8509182167645157, "tokens_seen": 583797760 }, { "epoch": 1.07, "learning_rate": 0.0004157271815446339, "loss": 2.9317, "theoretical_loss": 3.850872894086722, "tokens_seen": 583863296 }, { "epoch": 1.07, "learning_rate": 0.0004157171514543631, "loss": 3.0621, "theoretical_loss": 3.8508275779201586, "tokens_seen": 583928832 }, { "epoch": 1.07, "learning_rate": 0.0004157071213640923, "loss": 2.7941, "theoretical_loss": 3.85078226826316, "tokens_seen": 583994368 }, { "epoch": 1.07, "learning_rate": 0.0004156970912738215, "loss": 3.0778, "theoretical_loss": 3.850736965114061, "tokens_seen": 584059904 }, { "epoch": 1.07, "learning_rate": 0.00041568706118355066, "loss": 3.0284, "theoretical_loss": 3.850691668471197, "tokens_seen": 584125440 }, { "epoch": 1.07, "learning_rate": 0.00041567703109327984, "loss": 3.0341, "theoretical_loss": 3.8506463783329026, "tokens_seen": 584190976 }, { "epoch": 1.07, "learning_rate": 0.000415667001003009, "loss": 3.0007, "theoretical_loss": 3.850601094697515, "tokens_seen": 584256512 }, { "epoch": 1.07, "learning_rate": 0.00041565697091273826, "loss": 3.0365, "theoretical_loss": 3.850555817563371, "tokens_seen": 584322048 }, { "epoch": 1.07, "learning_rate": 0.0004156469408224674, "loss": 3.0491, "theoretical_loss": 3.850510546928809, "tokens_seen": 584387584 }, { "epoch": 1.07, "learning_rate": 0.0004156369107321966, "loss": 2.9697, "theoretical_loss": 3.8504652827921664, "tokens_seen": 584453120 }, { "epoch": 1.07, "learning_rate": 0.00041562688064192575, "loss": 2.8511, "theoretical_loss": 3.8504200251517817, "tokens_seen": 584518656 }, { "epoch": 1.07, "learning_rate": 0.000415616850551655, "loss": 2.9883, "theoretical_loss": 3.8503747740059957, "tokens_seen": 584584192 }, { "epoch": 1.07, "learning_rate": 0.00041560682046138417, "loss": 3.0768, "theoretical_loss": 3.850329529353147, "tokens_seen": 584649728 }, { "epoch": 1.07, "learning_rate": 0.00041559679037111335, "loss": 3.0188, "theoretical_loss": 3.850284291191577, "tokens_seen": 584715264 }, { "epoch": 1.07, "learning_rate": 0.00041558676028084253, "loss": 2.8732, "theoretical_loss": 3.8502390595196276, "tokens_seen": 584780800 }, { "epoch": 1.07, "learning_rate": 0.00041557673019057176, "loss": 3.0032, "theoretical_loss": 3.8501938343356397, "tokens_seen": 584846336 }, { "epoch": 1.07, "objective/train/docs_used": 954355, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.140191078186035, "objective/train/theoretical_loss": 3.8501599197043763, "objective/train/tokens_used": 605355488, "theoretical_loss": 3.8501599197043763, "tokens_seen": 584895488 }, { "epoch": 1.07, "learning_rate": 0.0004155667001003009, "loss": 2.9967, "theoretical_loss": 3.850148615637956, "tokens_seen": 584911872 }, { "epoch": 1.07, "learning_rate": 0.0004155566700100301, "loss": 2.9824, "theoretical_loss": 3.85010340342492, "tokens_seen": 584977408 }, { "epoch": 1.07, "learning_rate": 0.00041554663991975925, "loss": 3.0262, "theoretical_loss": 3.8500581976948753, "tokens_seen": 585042944 }, { "epoch": 1.07, "learning_rate": 0.0004155366098294885, "loss": 2.9541, "theoretical_loss": 3.850012998446166, "tokens_seen": 585108480 }, { "epoch": 1.07, "learning_rate": 0.00041552657973921767, "loss": 2.9393, "theoretical_loss": 3.849967805677137, "tokens_seen": 585174016 }, { "epoch": 1.07, "learning_rate": 0.00041551654964894685, "loss": 2.9338, "theoretical_loss": 3.8499226193861347, "tokens_seen": 585239552 }, { "epoch": 1.07, "learning_rate": 0.00041550651955867603, "loss": 3.1074, "theoretical_loss": 3.8498774395715043, "tokens_seen": 585305088 }, { "epoch": 1.07, "learning_rate": 0.0004154964894684052, "loss": 2.9756, "theoretical_loss": 3.849832266231593, "tokens_seen": 585370624 }, { "epoch": 1.07, "learning_rate": 0.0004154864593781344, "loss": 2.9856, "theoretical_loss": 3.8497870993647476, "tokens_seen": 585436160 }, { "epoch": 1.07, "learning_rate": 0.00041547642928786363, "loss": 2.8294, "theoretical_loss": 3.8497419389693173, "tokens_seen": 585501696 }, { "epoch": 1.07, "learning_rate": 0.00041546639919759276, "loss": 3.0453, "theoretical_loss": 3.8496967850436494, "tokens_seen": 585567232 }, { "epoch": 1.07, "learning_rate": 0.000415456369107322, "loss": 2.9416, "theoretical_loss": 3.8496516375860943, "tokens_seen": 585632768 }, { "epoch": 1.07, "learning_rate": 0.0004154463390170511, "loss": 3.0449, "theoretical_loss": 3.8496064965950008, "tokens_seen": 585698304 }, { "epoch": 1.07, "learning_rate": 0.00041543630892678035, "loss": 3.029, "theoretical_loss": 3.8495613620687195, "tokens_seen": 585763840 }, { "epoch": 1.07, "learning_rate": 0.00041542627883650954, "loss": 3.0634, "theoretical_loss": 3.849516234005602, "tokens_seen": 585829376 }, { "epoch": 1.07, "learning_rate": 0.0004154162487462387, "loss": 2.9103, "theoretical_loss": 3.8494711124039993, "tokens_seen": 585894912 }, { "epoch": 1.07, "learning_rate": 0.0004154062186559679, "loss": 3.0524, "theoretical_loss": 3.849425997262264, "tokens_seen": 585960448 }, { "epoch": 1.07, "learning_rate": 0.00041539618856569713, "loss": 3.1277, "theoretical_loss": 3.8493808885787484, "tokens_seen": 586025984 }, { "epoch": 1.07, "learning_rate": 0.00041538615847542626, "loss": 3.0188, "theoretical_loss": 3.8493357863518067, "tokens_seen": 586091520 }, { "epoch": 1.07, "learning_rate": 0.0004153761283851555, "loss": 3.0588, "theoretical_loss": 3.849290690579792, "tokens_seen": 586157056 }, { "epoch": 1.07, "learning_rate": 0.0004153660982948846, "loss": 2.9002, "theoretical_loss": 3.84924560126106, "tokens_seen": 586222592 }, { "epoch": 1.07, "learning_rate": 0.00041535606820461386, "loss": 2.9449, "theoretical_loss": 3.849200518393965, "tokens_seen": 586288128 }, { "epoch": 1.07, "learning_rate": 0.00041534603811434304, "loss": 2.8635, "theoretical_loss": 3.849155441976863, "tokens_seen": 586353664 }, { "epoch": 1.07, "learning_rate": 0.0004153360080240722, "loss": 2.8673, "theoretical_loss": 3.8491103720081115, "tokens_seen": 586419200 }, { "epoch": 1.07, "learning_rate": 0.0004153259779338014, "loss": 3.0069, "theoretical_loss": 3.8490653084860664, "tokens_seen": 586484736 }, { "epoch": 1.07, "objective/train/docs_used": 957235, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8581111431121826, "objective/train/theoretical_loss": 3.849031515074196, "objective/train/tokens_used": 606993888, "theoretical_loss": 3.849031515074196, "tokens_seen": 586533888 }, { "epoch": 1.07, "learning_rate": 0.0004153159478435306, "loss": 2.986, "theoretical_loss": 3.8490202514090854, "tokens_seen": 586550272 }, { "epoch": 1.07, "learning_rate": 0.00041530591775325976, "loss": 2.9909, "theoretical_loss": 3.848975200775527, "tokens_seen": 586615808 }, { "epoch": 1.07, "learning_rate": 0.000415295887662989, "loss": 3.1951, "theoretical_loss": 3.8489301565837506, "tokens_seen": 586681344 }, { "epoch": 1.07, "learning_rate": 0.0004152858575727181, "loss": 2.9114, "theoretical_loss": 3.848885118832115, "tokens_seen": 586746880 }, { "epoch": 1.07, "learning_rate": 0.00041527582748244736, "loss": 2.9465, "theoretical_loss": 3.8488400875189805, "tokens_seen": 586812416 }, { "epoch": 1.07, "learning_rate": 0.0004152657973921765, "loss": 3.0498, "theoretical_loss": 3.848795062642708, "tokens_seen": 586877952 }, { "epoch": 1.07, "learning_rate": 0.0004152557673019057, "loss": 3.1394, "theoretical_loss": 3.8487500442016582, "tokens_seen": 586943488 }, { "epoch": 1.07, "learning_rate": 0.0004152457372116349, "loss": 2.9982, "theoretical_loss": 3.848705032194193, "tokens_seen": 587009024 }, { "epoch": 1.07, "learning_rate": 0.0004152357071213641, "loss": 2.8507, "theoretical_loss": 3.848660026618675, "tokens_seen": 587074560 }, { "epoch": 1.07, "learning_rate": 0.00041522567703109327, "loss": 2.991, "theoretical_loss": 3.8486150274734676, "tokens_seen": 587140096 }, { "epoch": 1.07, "learning_rate": 0.0004152156469408225, "loss": 3.1797, "theoretical_loss": 3.848570034756934, "tokens_seen": 587205632 }, { "epoch": 1.07, "learning_rate": 0.00041520561685055163, "loss": 2.9033, "theoretical_loss": 3.8485250484674385, "tokens_seen": 587271168 }, { "epoch": 1.07, "learning_rate": 0.00041519558676028086, "loss": 2.9506, "theoretical_loss": 3.8484800686033465, "tokens_seen": 587336704 }, { "epoch": 1.07, "learning_rate": 0.00041518555667001, "loss": 3.0067, "theoretical_loss": 3.8484350951630226, "tokens_seen": 587402240 }, { "epoch": 1.07, "learning_rate": 0.0004151755265797392, "loss": 3.0571, "theoretical_loss": 3.8483901281448336, "tokens_seen": 587467776 }, { "epoch": 1.07, "learning_rate": 0.0004151654964894684, "loss": 3.0465, "theoretical_loss": 3.848345167547146, "tokens_seen": 587533312 }, { "epoch": 1.07, "learning_rate": 0.0004151554663991976, "loss": 3.0961, "theoretical_loss": 3.8483002133683266, "tokens_seen": 587598848 }, { "epoch": 1.07, "learning_rate": 0.0004151454363089268, "loss": 3.198, "theoretical_loss": 3.8482552656067437, "tokens_seen": 587664384 }, { "epoch": 1.07, "learning_rate": 0.00041513540621865595, "loss": 3.1445, "theoretical_loss": 3.8482103242607657, "tokens_seen": 587729920 }, { "epoch": 1.07, "learning_rate": 0.0004151253761283852, "loss": 2.9468, "theoretical_loss": 3.848165389328762, "tokens_seen": 587795456 }, { "epoch": 1.07, "learning_rate": 0.00041511534603811437, "loss": 3.1075, "theoretical_loss": 3.848120460809101, "tokens_seen": 587860992 }, { "epoch": 1.07, "learning_rate": 0.00041510531594784355, "loss": 2.9515, "theoretical_loss": 3.848075538700154, "tokens_seen": 587926528 }, { "epoch": 1.07, "learning_rate": 0.00041509528585757273, "loss": 3.048, "theoretical_loss": 3.848030623000291, "tokens_seen": 587992064 }, { "epoch": 1.07, "learning_rate": 0.00041508525576730196, "loss": 3.1562, "theoretical_loss": 3.8479857137078843, "tokens_seen": 588057600 }, { "epoch": 1.07, "learning_rate": 0.0004150752256770311, "loss": 3.0963, "theoretical_loss": 3.8479408108213065, "tokens_seen": 588123136 }, { "epoch": 1.07, "objective/train/docs_used": 958723, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.3181047439575195, "objective/train/theoretical_loss": 3.8479071378592176, "objective/train/tokens_used": 608632288, "theoretical_loss": 3.8479071378592176, "tokens_seen": 588172288 }, { "epoch": 1.07, "learning_rate": 0.0004150651955867603, "loss": 3.095, "theoretical_loss": 3.847895914338928, "tokens_seen": 588188672 }, { "epoch": 1.07, "learning_rate": 0.00041505516549648945, "loss": 2.9407, "theoretical_loss": 3.847851024259124, "tokens_seen": 588254208 }, { "epoch": 1.07, "learning_rate": 0.0004150451354062187, "loss": 3.0835, "theoretical_loss": 3.847806140580267, "tokens_seen": 588319744 }, { "epoch": 1.07, "learning_rate": 0.00041503510531594787, "loss": 2.9736, "theoretical_loss": 3.8477612633007325, "tokens_seen": 588385280 }, { "epoch": 1.07, "learning_rate": 0.00041502507522567705, "loss": 2.8675, "theoretical_loss": 3.847716392418895, "tokens_seen": 588450816 }, { "epoch": 1.07, "learning_rate": 0.00041501504513540623, "loss": 3.0837, "theoretical_loss": 3.8476715279331297, "tokens_seen": 588516352 }, { "epoch": 1.07, "learning_rate": 0.0004150050150451354, "loss": 2.8706, "theoretical_loss": 3.847626669841813, "tokens_seen": 588581888 }, { "epoch": 1.07, "learning_rate": 0.0004149949849548646, "loss": 3.2091, "theoretical_loss": 3.847581818143323, "tokens_seen": 588647424 }, { "epoch": 1.07, "learning_rate": 0.00041498495486459383, "loss": 2.9938, "theoretical_loss": 3.8475369728360347, "tokens_seen": 588712960 }, { "epoch": 1.07, "learning_rate": 0.00041497492477432296, "loss": 3.0339, "theoretical_loss": 3.847492133918327, "tokens_seen": 588778496 }, { "epoch": 1.07, "learning_rate": 0.0004149648946840522, "loss": 3.1078, "theoretical_loss": 3.8474473013885797, "tokens_seen": 588844032 }, { "epoch": 1.07, "learning_rate": 0.0004149548645937813, "loss": 2.8667, "theoretical_loss": 3.84740247524517, "tokens_seen": 588909568 }, { "epoch": 1.07, "learning_rate": 0.00041494483450351055, "loss": 3.1634, "theoretical_loss": 3.847357655486479, "tokens_seen": 588975104 }, { "epoch": 1.07, "learning_rate": 0.00041493480441323974, "loss": 3.1007, "theoretical_loss": 3.8473128421108864, "tokens_seen": 589040640 }, { "epoch": 1.07, "learning_rate": 0.0004149247743229689, "loss": 3.1217, "theoretical_loss": 3.8472680351167736, "tokens_seen": 589106176 }, { "epoch": 1.07, "learning_rate": 0.0004149147442326981, "loss": 3.0625, "theoretical_loss": 3.847223234502522, "tokens_seen": 589171712 }, { "epoch": 1.08, "learning_rate": 0.00041490471414242733, "loss": 3.0466, "theoretical_loss": 3.847178440266513, "tokens_seen": 589237248 }, { "epoch": 1.08, "learning_rate": 0.00041489468405215646, "loss": 2.9699, "theoretical_loss": 3.8471336524071296, "tokens_seen": 589302784 }, { "epoch": 1.08, "learning_rate": 0.0004148846539618857, "loss": 2.7748, "theoretical_loss": 3.8470888709227555, "tokens_seen": 589368320 }, { "epoch": 1.08, "learning_rate": 0.0004148746238716148, "loss": 2.9218, "theoretical_loss": 3.8470440958117744, "tokens_seen": 589433856 }, { "epoch": 1.08, "learning_rate": 0.00041486459378134406, "loss": 2.9517, "theoretical_loss": 3.84699932707257, "tokens_seen": 589499392 }, { "epoch": 1.08, "learning_rate": 0.00041485456369107324, "loss": 2.9134, "theoretical_loss": 3.846954564703529, "tokens_seen": 589564928 }, { "epoch": 1.08, "learning_rate": 0.0004148445336008024, "loss": 3.0698, "theoretical_loss": 3.846909808703036, "tokens_seen": 589630464 }, { "epoch": 1.08, "learning_rate": 0.0004148345035105316, "loss": 3.0302, "theoretical_loss": 3.8468650590694766, "tokens_seen": 589696000 }, { "epoch": 1.08, "learning_rate": 0.0004148244734202608, "loss": 2.912, "theoretical_loss": 3.8468203158012386, "tokens_seen": 589761536 }, { "epoch": 1.08, "objective/train/docs_used": 961549, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9096224308013916, "objective/train/theoretical_loss": 3.8467867625263317, "objective/train/tokens_used": 610270688, "theoretical_loss": 3.8467867625263317, "tokens_seen": 589810688 }, { "epoch": 1.08, "learning_rate": 0.00041481444332998996, "loss": 3.0963, "theoretical_loss": 3.8467755788967084, "tokens_seen": 589827072 }, { "epoch": 1.08, "learning_rate": 0.0004148044132397192, "loss": 3.0862, "theoretical_loss": 3.8467308483542757, "tokens_seen": 589892608 }, { "epoch": 1.08, "learning_rate": 0.0004147943831494483, "loss": 2.987, "theoretical_loss": 3.846686124172327, "tokens_seen": 589958144 }, { "epoch": 1.08, "learning_rate": 0.00041478435305917756, "loss": 2.9371, "theoretical_loss": 3.846641406349253, "tokens_seen": 590023680 }, { "epoch": 1.08, "learning_rate": 0.0004147743229689067, "loss": 3.0919, "theoretical_loss": 3.846596694883443, "tokens_seen": 590089216 }, { "epoch": 1.08, "learning_rate": 0.0004147642928786359, "loss": 3.125, "theoretical_loss": 3.846551989773287, "tokens_seen": 590154752 }, { "epoch": 1.08, "learning_rate": 0.0004147542627883651, "loss": 2.9802, "theoretical_loss": 3.8465072910171765, "tokens_seen": 590220288 }, { "epoch": 1.08, "learning_rate": 0.0004147442326980943, "loss": 2.9763, "theoretical_loss": 3.846462598613502, "tokens_seen": 590285824 }, { "epoch": 1.08, "learning_rate": 0.00041473420260782347, "loss": 3.0232, "theoretical_loss": 3.846417912560657, "tokens_seen": 590351360 }, { "epoch": 1.08, "learning_rate": 0.0004147241725175527, "loss": 2.8142, "theoretical_loss": 3.846373232857033, "tokens_seen": 590416896 }, { "epoch": 1.08, "learning_rate": 0.00041471414242728183, "loss": 3.0639, "theoretical_loss": 3.846328559501024, "tokens_seen": 590482432 }, { "epoch": 1.08, "learning_rate": 0.00041470411233701106, "loss": 3.0846, "theoretical_loss": 3.846283892491023, "tokens_seen": 590547968 }, { "epoch": 1.08, "learning_rate": 0.0004146940822467402, "loss": 3.1255, "theoretical_loss": 3.8462392318254253, "tokens_seen": 590613504 }, { "epoch": 1.08, "learning_rate": 0.0004146840521564694, "loss": 3.1832, "theoretical_loss": 3.846194577502626, "tokens_seen": 590679040 }, { "epoch": 1.08, "learning_rate": 0.0004146740220661986, "loss": 2.8562, "theoretical_loss": 3.8461499295210198, "tokens_seen": 590744576 }, { "epoch": 1.08, "learning_rate": 0.0004146639919759278, "loss": 3.1114, "theoretical_loss": 3.846105287879003, "tokens_seen": 590810112 }, { "epoch": 1.08, "learning_rate": 0.00041465396188565697, "loss": 3.0078, "theoretical_loss": 3.846060652574973, "tokens_seen": 590875648 }, { "epoch": 1.08, "learning_rate": 0.00041464393179538615, "loss": 3.05, "theoretical_loss": 3.8460160236073273, "tokens_seen": 590941184 }, { "epoch": 1.08, "learning_rate": 0.00041463390170511533, "loss": 2.8787, "theoretical_loss": 3.845971400974463, "tokens_seen": 591006720 }, { "epoch": 1.08, "learning_rate": 0.00041462387161484457, "loss": 2.9897, "theoretical_loss": 3.845926784674779, "tokens_seen": 591072256 }, { "epoch": 1.08, "learning_rate": 0.0004146138415245737, "loss": 2.9609, "theoretical_loss": 3.8458821747066745, "tokens_seen": 591137792 }, { "epoch": 1.08, "learning_rate": 0.00041460381143430293, "loss": 3.0493, "theoretical_loss": 3.845837571068549, "tokens_seen": 591203328 }, { "epoch": 1.08, "learning_rate": 0.0004145937813440321, "loss": 3.0266, "theoretical_loss": 3.845792973758803, "tokens_seen": 591268864 }, { "epoch": 1.08, "learning_rate": 0.0004145837512537613, "loss": 2.8183, "theoretical_loss": 3.845748382775837, "tokens_seen": 591334400 }, { "epoch": 1.08, "learning_rate": 0.0004145737211634905, "loss": 3.0229, "theoretical_loss": 3.845703798118053, "tokens_seen": 591399936 }, { "epoch": 1.08, "objective/train/docs_used": 964467, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.628354549407959, "objective/train/theoretical_loss": 3.8456703637746537, "objective/train/tokens_used": 611909088, "theoretical_loss": 3.8456703637746537, "tokens_seen": 591449088 }, { "epoch": 1.08, "learning_rate": 0.00041456369107321965, "loss": 2.9265, "theoretical_loss": 3.845659219783852, "tokens_seen": 591465472 }, { "epoch": 1.08, "learning_rate": 0.00041455366098294884, "loss": 2.9652, "theoretical_loss": 3.8456146477716375, "tokens_seen": 591531008 }, { "epoch": 1.08, "learning_rate": 0.00041454363089267807, "loss": 3.0489, "theoretical_loss": 3.8455700820798127, "tokens_seen": 591596544 }, { "epoch": 1.08, "learning_rate": 0.0004145336008024072, "loss": 2.9133, "theoretical_loss": 3.8455255227067804, "tokens_seen": 591662080 }, { "epoch": 1.08, "learning_rate": 0.00041452357071213643, "loss": 3.1377, "theoretical_loss": 3.845480969650946, "tokens_seen": 591727616 }, { "epoch": 1.08, "learning_rate": 0.00041451354062186556, "loss": 2.9707, "theoretical_loss": 3.845436422910714, "tokens_seen": 591793152 }, { "epoch": 1.08, "learning_rate": 0.0004145035105315948, "loss": 3.1144, "theoretical_loss": 3.8453918824844893, "tokens_seen": 591858688 }, { "epoch": 1.08, "learning_rate": 0.000414493480441324, "loss": 2.9699, "theoretical_loss": 3.845347348370679, "tokens_seen": 591924224 }, { "epoch": 1.08, "learning_rate": 0.00041448345035105316, "loss": 2.9364, "theoretical_loss": 3.8453028205676887, "tokens_seen": 591989760 }, { "epoch": 1.08, "learning_rate": 0.00041447342026078234, "loss": 3.1315, "theoretical_loss": 3.845258299073927, "tokens_seen": 592055296 }, { "epoch": 1.08, "learning_rate": 0.0004144633901705115, "loss": 3.1873, "theoretical_loss": 3.8452137838878, "tokens_seen": 592120832 }, { "epoch": 1.08, "learning_rate": 0.0004144533600802407, "loss": 2.9048, "theoretical_loss": 3.8451692750077173, "tokens_seen": 592186368 }, { "epoch": 1.08, "learning_rate": 0.00041444332998996994, "loss": 3.017, "theoretical_loss": 3.8451247724320874, "tokens_seen": 592251904 }, { "epoch": 1.08, "learning_rate": 0.00041443329989969906, "loss": 2.9442, "theoretical_loss": 3.8450802761593197, "tokens_seen": 592317440 }, { "epoch": 1.08, "learning_rate": 0.0004144232698094283, "loss": 2.8059, "theoretical_loss": 3.8450357861878253, "tokens_seen": 592382976 }, { "epoch": 1.08, "learning_rate": 0.00041441323971915753, "loss": 3.0403, "theoretical_loss": 3.8449913025160134, "tokens_seen": 592448512 }, { "epoch": 1.08, "learning_rate": 0.00041440320962888666, "loss": 3.1356, "theoretical_loss": 3.844946825142296, "tokens_seen": 592514048 }, { "epoch": 1.08, "learning_rate": 0.0004143931795386159, "loss": 2.9808, "theoretical_loss": 3.844902354065085, "tokens_seen": 592579584 }, { "epoch": 1.08, "learning_rate": 0.000414383149448345, "loss": 2.9598, "theoretical_loss": 3.844857889282793, "tokens_seen": 592645120 }, { "epoch": 1.08, "learning_rate": 0.00041437311935807426, "loss": 3.0146, "theoretical_loss": 3.844813430793832, "tokens_seen": 592710656 }, { "epoch": 1.08, "learning_rate": 0.00041436308926780344, "loss": 3.0383, "theoretical_loss": 3.8447689785966173, "tokens_seen": 592776192 }, { "epoch": 1.08, "learning_rate": 0.0004143530591775326, "loss": 2.9807, "theoretical_loss": 3.844724532689561, "tokens_seen": 592841728 }, { "epoch": 1.08, "learning_rate": 0.0004143430290872618, "loss": 2.9064, "theoretical_loss": 3.8446800930710796, "tokens_seen": 592907264 }, { "epoch": 1.08, "learning_rate": 0.000414332998996991, "loss": 3.0815, "theoretical_loss": 3.8446356597395877, "tokens_seen": 592972800 }, { "epoch": 1.08, "learning_rate": 0.00041432296890672016, "loss": 3.0937, "theoretical_loss": 3.8445912326935003, "tokens_seen": 593038336 }, { "epoch": 1.08, "objective/train/docs_used": 966466, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.829028367996216, "objective/train/theoretical_loss": 3.84455791653278, "objective/train/tokens_used": 613547488, "theoretical_loss": 3.84455791653278, "tokens_seen": 593087488 }, { "epoch": 1.08, "learning_rate": 0.0004143129388164494, "loss": 2.9547, "theoretical_loss": 3.8445468119312354, "tokens_seen": 593103872 }, { "epoch": 1.08, "learning_rate": 0.0004143029087261785, "loss": 2.8795, "theoretical_loss": 3.844502397451209, "tokens_seen": 593169408 }, { "epoch": 1.08, "learning_rate": 0.00041429287863590776, "loss": 2.9105, "theoretical_loss": 3.844457989251839, "tokens_seen": 593234944 }, { "epoch": 1.08, "learning_rate": 0.0004142828485456369, "loss": 3.0476, "theoretical_loss": 3.8444135873315433, "tokens_seen": 593300480 }, { "epoch": 1.08, "learning_rate": 0.0004142728184553661, "loss": 2.9904, "theoretical_loss": 3.844369191688741, "tokens_seen": 593366016 }, { "epoch": 1.08, "learning_rate": 0.0004142627883650953, "loss": 2.9918, "theoretical_loss": 3.844324802321851, "tokens_seen": 593431552 }, { "epoch": 1.08, "learning_rate": 0.0004142527582748245, "loss": 3.0568, "theoretical_loss": 3.8442804192292934, "tokens_seen": 593497088 }, { "epoch": 1.08, "learning_rate": 0.00041424272818455367, "loss": 3.0078, "theoretical_loss": 3.844236042409489, "tokens_seen": 593562624 }, { "epoch": 1.08, "learning_rate": 0.0004142326980942829, "loss": 3.0421, "theoretical_loss": 3.8441916718608584, "tokens_seen": 593628160 }, { "epoch": 1.08, "learning_rate": 0.00041422266800401203, "loss": 3.0289, "theoretical_loss": 3.844147307581823, "tokens_seen": 593693696 }, { "epoch": 1.08, "learning_rate": 0.00041421263791374126, "loss": 3.0189, "theoretical_loss": 3.8441029495708054, "tokens_seen": 593759232 }, { "epoch": 1.08, "learning_rate": 0.0004142026078234704, "loss": 2.9036, "theoretical_loss": 3.844058597826228, "tokens_seen": 593824768 }, { "epoch": 1.08, "learning_rate": 0.0004141925777331996, "loss": 2.8378, "theoretical_loss": 3.844014252346515, "tokens_seen": 593890304 }, { "epoch": 1.08, "learning_rate": 0.0004141825476429288, "loss": 3.0419, "theoretical_loss": 3.8439699131300893, "tokens_seen": 593955840 }, { "epoch": 1.08, "learning_rate": 0.000414172517552658, "loss": 2.9058, "theoretical_loss": 3.843925580175375, "tokens_seen": 594021376 }, { "epoch": 1.08, "learning_rate": 0.00041416248746238717, "loss": 2.9199, "theoretical_loss": 3.8438812534807987, "tokens_seen": 594086912 }, { "epoch": 1.08, "learning_rate": 0.00041415245737211635, "loss": 3.16, "theoretical_loss": 3.8438369330447846, "tokens_seen": 594152448 }, { "epoch": 1.08, "learning_rate": 0.00041414242728184553, "loss": 3.0149, "theoretical_loss": 3.843792618865759, "tokens_seen": 594217984 }, { "epoch": 1.08, "learning_rate": 0.00041413239719157477, "loss": 3.1248, "theoretical_loss": 3.843748310942149, "tokens_seen": 594283520 }, { "epoch": 1.08, "learning_rate": 0.0004141223671013039, "loss": 3.1863, "theoretical_loss": 3.843704009272382, "tokens_seen": 594349056 }, { "epoch": 1.08, "learning_rate": 0.00041411233701103313, "loss": 2.976, "theoretical_loss": 3.843659713854886, "tokens_seen": 594414592 }, { "epoch": 1.08, "learning_rate": 0.0004141023069207623, "loss": 3.0826, "theoretical_loss": 3.843615424688089, "tokens_seen": 594480128 }, { "epoch": 1.08, "learning_rate": 0.0004140922768304915, "loss": 2.973, "theoretical_loss": 3.8435711417704193, "tokens_seen": 594545664 }, { "epoch": 1.08, "learning_rate": 0.0004140822467402207, "loss": 3.0657, "theoretical_loss": 3.843526865100308, "tokens_seen": 594611200 }, { "epoch": 1.08, "learning_rate": 0.00041407221664994985, "loss": 3.0077, "theoretical_loss": 3.8434825946761846, "tokens_seen": 594676736 }, { "epoch": 1.08, "objective/train/docs_used": 969373, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1606059074401855, "objective/train/theoretical_loss": 3.8434493959560774, "objective/train/tokens_used": 615185888, "theoretical_loss": 3.8434493959560774, "tokens_seen": 594725888 }, { "epoch": 1.08, "learning_rate": 0.00041406218655967904, "loss": 3.0531, "theoretical_loss": 3.84343833049648, "tokens_seen": 594742272 }, { "epoch": 1.08, "learning_rate": 0.00041405215646940827, "loss": 2.9104, "theoretical_loss": 3.8433940725596245, "tokens_seen": 594807808 }, { "epoch": 1.08, "learning_rate": 0.0004140421263791374, "loss": 3.0022, "theoretical_loss": 3.843349820864051, "tokens_seen": 594873344 }, { "epoch": 1.08, "learning_rate": 0.00041403209628886663, "loss": 3.0439, "theoretical_loss": 3.8433055754081913, "tokens_seen": 594938880 }, { "epoch": 1.08, "learning_rate": 0.00041402206619859576, "loss": 3.0576, "theoretical_loss": 3.843261336190479, "tokens_seen": 595004416 }, { "epoch": 1.08, "learning_rate": 0.000414012036108325, "loss": 3.0111, "theoretical_loss": 3.843217103209347, "tokens_seen": 595069952 }, { "epoch": 1.08, "learning_rate": 0.0004140020060180542, "loss": 3.1224, "theoretical_loss": 3.84317287646323, "tokens_seen": 595135488 }, { "epoch": 1.08, "learning_rate": 0.00041399197592778336, "loss": 3.1375, "theoretical_loss": 3.843128655950562, "tokens_seen": 595201024 }, { "epoch": 1.08, "learning_rate": 0.00041398194583751254, "loss": 3.1543, "theoretical_loss": 3.843084441669779, "tokens_seen": 595266560 }, { "epoch": 1.08, "learning_rate": 0.0004139719157472417, "loss": 2.7876, "theoretical_loss": 3.843040233619316, "tokens_seen": 595332096 }, { "epoch": 1.08, "learning_rate": 0.0004139618856569709, "loss": 2.9947, "theoretical_loss": 3.84299603179761, "tokens_seen": 595397632 }, { "epoch": 1.08, "learning_rate": 0.00041395185556670014, "loss": 3.0023, "theoretical_loss": 3.842951836203097, "tokens_seen": 595463168 }, { "epoch": 1.08, "learning_rate": 0.00041394182547642926, "loss": 3.123, "theoretical_loss": 3.842907646834216, "tokens_seen": 595528704 }, { "epoch": 1.08, "learning_rate": 0.0004139317953861585, "loss": 2.9437, "theoretical_loss": 3.842863463689404, "tokens_seen": 595594240 }, { "epoch": 1.08, "learning_rate": 0.0004139217652958877, "loss": 3.0039, "theoretical_loss": 3.8428192867671, "tokens_seen": 595659776 }, { "epoch": 1.08, "learning_rate": 0.00041391173520561686, "loss": 3.0051, "theoretical_loss": 3.8427751160657424, "tokens_seen": 595725312 }, { "epoch": 1.08, "learning_rate": 0.00041390170511534604, "loss": 3.0169, "theoretical_loss": 3.842730951583772, "tokens_seen": 595790848 }, { "epoch": 1.08, "learning_rate": 0.0004138916750250752, "loss": 2.9203, "theoretical_loss": 3.842686793319629, "tokens_seen": 595856384 }, { "epoch": 1.08, "learning_rate": 0.0004138816449348044, "loss": 2.8922, "theoretical_loss": 3.8426426412717536, "tokens_seen": 595921920 }, { "epoch": 1.08, "learning_rate": 0.00041387161484453364, "loss": 3.0852, "theoretical_loss": 3.8425984954385877, "tokens_seen": 595987456 }, { "epoch": 1.08, "learning_rate": 0.00041386158475426277, "loss": 2.9141, "theoretical_loss": 3.8425543558185735, "tokens_seen": 596052992 }, { "epoch": 1.08, "learning_rate": 0.000413851554663992, "loss": 2.9775, "theoretical_loss": 3.8425102224101533, "tokens_seen": 596118528 }, { "epoch": 1.08, "learning_rate": 0.00041384152457372113, "loss": 2.9199, "theoretical_loss": 3.8424660952117695, "tokens_seen": 596184064 }, { "epoch": 1.08, "learning_rate": 0.00041383149448345036, "loss": 3.029, "theoretical_loss": 3.8424219742218666, "tokens_seen": 596249600 }, { "epoch": 1.08, "learning_rate": 0.00041382146439317954, "loss": 2.9163, "theoretical_loss": 3.842377859438889, "tokens_seen": 596315136 }, { "epoch": 1.08, "objective/train/docs_used": 972229, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.531693696975708, "objective/train/theoretical_loss": 3.8423447774240147, "objective/train/tokens_used": 616824288, "theoretical_loss": 3.8423447774240147, "tokens_seen": 596364288 }, { "epoch": 1.08, "learning_rate": 0.0004138114343029087, "loss": 2.8743, "theoretical_loss": 3.842333750861281, "tokens_seen": 596380672 }, { "epoch": 1.08, "learning_rate": 0.0004138014042126379, "loss": 2.994, "theoretical_loss": 3.8422896484874887, "tokens_seen": 596446208 }, { "epoch": 1.08, "learning_rate": 0.0004137913741223671, "loss": 3.0503, "theoretical_loss": 3.842245552315957, "tokens_seen": 596511744 }, { "epoch": 1.08, "learning_rate": 0.00041378134403209627, "loss": 2.9374, "theoretical_loss": 3.842201462345133, "tokens_seen": 596577280 }, { "epoch": 1.08, "learning_rate": 0.0004137713139418255, "loss": 2.9398, "theoretical_loss": 3.842157378573464, "tokens_seen": 596642816 }, { "epoch": 1.08, "learning_rate": 0.00041376128385155463, "loss": 3.034, "theoretical_loss": 3.842113300999397, "tokens_seen": 596708352 }, { "epoch": 1.08, "learning_rate": 0.00041375125376128387, "loss": 3.0679, "theoretical_loss": 3.84206922962138, "tokens_seen": 596773888 }, { "epoch": 1.08, "learning_rate": 0.00041374122367101305, "loss": 3.0156, "theoretical_loss": 3.842025164437863, "tokens_seen": 596839424 }, { "epoch": 1.08, "learning_rate": 0.00041373119358074223, "loss": 2.8145, "theoretical_loss": 3.8419811054472937, "tokens_seen": 596904960 }, { "epoch": 1.08, "learning_rate": 0.0004137211634904714, "loss": 2.9123, "theoretical_loss": 3.841937052648123, "tokens_seen": 596970496 }, { "epoch": 1.08, "learning_rate": 0.0004137111334002006, "loss": 2.9975, "theoretical_loss": 3.8418930060388004, "tokens_seen": 597036032 }, { "epoch": 1.08, "learning_rate": 0.00041370110330992977, "loss": 2.9825, "theoretical_loss": 3.8418489656177783, "tokens_seen": 597101568 }, { "epoch": 1.08, "learning_rate": 0.000413691073219659, "loss": 3.0467, "theoretical_loss": 3.8418049313835065, "tokens_seen": 597167104 }, { "epoch": 1.08, "learning_rate": 0.00041368104312938813, "loss": 3.009, "theoretical_loss": 3.841760903334438, "tokens_seen": 597232640 }, { "epoch": 1.08, "learning_rate": 0.00041367101303911737, "loss": 2.9662, "theoretical_loss": 3.841716881469026, "tokens_seen": 597298176 }, { "epoch": 1.08, "learning_rate": 0.00041366098294884655, "loss": 3.1459, "theoretical_loss": 3.8416728657857226, "tokens_seen": 597363712 }, { "epoch": 1.08, "learning_rate": 0.00041365095285857573, "loss": 2.9416, "theoretical_loss": 3.841628856282982, "tokens_seen": 597429248 }, { "epoch": 1.08, "learning_rate": 0.00041364092276830497, "loss": 3.011, "theoretical_loss": 3.841584852959258, "tokens_seen": 597494784 }, { "epoch": 1.08, "learning_rate": 0.0004136308926780341, "loss": 3.0927, "theoretical_loss": 3.8415408558130064, "tokens_seen": 597560320 }, { "epoch": 1.08, "learning_rate": 0.00041362086258776333, "loss": 3.1662, "theoretical_loss": 3.841496864842682, "tokens_seen": 597625856 }, { "epoch": 1.08, "learning_rate": 0.0004136108324974925, "loss": 2.8741, "theoretical_loss": 3.8414528800467407, "tokens_seen": 597691392 }, { "epoch": 1.08, "learning_rate": 0.0004136008024072217, "loss": 2.9422, "theoretical_loss": 3.8414089014236397, "tokens_seen": 597756928 }, { "epoch": 1.08, "learning_rate": 0.0004135907723169509, "loss": 2.9909, "theoretical_loss": 3.841364928971835, "tokens_seen": 597822464 }, { "epoch": 1.08, "learning_rate": 0.00041358074222668005, "loss": 3.0376, "theoretical_loss": 3.8413209626897853, "tokens_seen": 597888000 }, { "epoch": 1.08, "learning_rate": 0.00041357071213640924, "loss": 3.0943, "theoretical_loss": 3.8412770025759477, "tokens_seen": 597953536 }, { "epoch": 1.08, "objective/train/docs_used": 974998, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9554717540740967, "objective/train/theoretical_loss": 3.8412440365375327, "objective/train/tokens_used": 618462688, "theoretical_loss": 3.8412440365375327, "tokens_seen": 598002688 }, { "epoch": 1.08, "learning_rate": 0.00041356068204613847, "loss": 3.0006, "theoretical_loss": 3.841233048628782, "tokens_seen": 598019072 }, { "epoch": 1.08, "learning_rate": 0.0004135506519558676, "loss": 2.9439, "theoretical_loss": 3.841189100846747, "tokens_seen": 598084608 }, { "epoch": 1.08, "learning_rate": 0.00041354062186559683, "loss": 3.0829, "theoretical_loss": 3.8411451592283026, "tokens_seen": 598150144 }, { "epoch": 1.08, "learning_rate": 0.00041353059177532596, "loss": 3.0618, "theoretical_loss": 3.8411012237719087, "tokens_seen": 598215680 }, { "epoch": 1.08, "learning_rate": 0.0004135205616850552, "loss": 3.065, "theoretical_loss": 3.841057294476027, "tokens_seen": 598281216 }, { "epoch": 1.08, "learning_rate": 0.0004135105315947844, "loss": 2.9544, "theoretical_loss": 3.8410133713391184, "tokens_seen": 598346752 }, { "epoch": 1.08, "learning_rate": 0.00041350050150451356, "loss": 2.9127, "theoretical_loss": 3.8409694543596458, "tokens_seen": 598412288 }, { "epoch": 1.08, "learning_rate": 0.00041349047141424274, "loss": 3.1626, "theoretical_loss": 3.8409255435360707, "tokens_seen": 598477824 }, { "epoch": 1.08, "learning_rate": 0.0004134804413239719, "loss": 3.0523, "theoretical_loss": 3.8408816388668576, "tokens_seen": 598543360 }, { "epoch": 1.08, "learning_rate": 0.0004134704112337011, "loss": 2.8816, "theoretical_loss": 3.8408377403504694, "tokens_seen": 598608896 }, { "epoch": 1.08, "learning_rate": 0.00041346038114343034, "loss": 2.8957, "theoretical_loss": 3.8407938479853696, "tokens_seen": 598674432 }, { "epoch": 1.08, "learning_rate": 0.00041345035105315946, "loss": 2.8038, "theoretical_loss": 3.8407499617700247, "tokens_seen": 598739968 }, { "epoch": 1.08, "learning_rate": 0.0004134403209628887, "loss": 3.1056, "theoretical_loss": 3.840706081702899, "tokens_seen": 598805504 }, { "epoch": 1.08, "learning_rate": 0.0004134302908726179, "loss": 3.045, "theoretical_loss": 3.8406622077824584, "tokens_seen": 598871040 }, { "epoch": 1.08, "learning_rate": 0.00041342026078234706, "loss": 3.0595, "theoretical_loss": 3.84061834000717, "tokens_seen": 598936576 }, { "epoch": 1.08, "learning_rate": 0.00041341023069207624, "loss": 2.9185, "theoretical_loss": 3.8405744783755003, "tokens_seen": 599002112 }, { "epoch": 1.08, "learning_rate": 0.0004134002006018054, "loss": 3.1443, "theoretical_loss": 3.8405306228859164, "tokens_seen": 599067648 }, { "epoch": 1.08, "learning_rate": 0.0004133901705115346, "loss": 3.1166, "theoretical_loss": 3.8404867735368877, "tokens_seen": 599133184 }, { "epoch": 1.08, "learning_rate": 0.00041338014042126384, "loss": 2.8575, "theoretical_loss": 3.8404429303268826, "tokens_seen": 599198720 }, { "epoch": 1.08, "learning_rate": 0.00041337011033099297, "loss": 3.0181, "theoretical_loss": 3.8403990932543692, "tokens_seen": 599264256 }, { "epoch": 1.08, "learning_rate": 0.0004133600802407222, "loss": 2.994, "theoretical_loss": 3.840355262317818, "tokens_seen": 599329792 }, { "epoch": 1.08, "learning_rate": 0.00041335005015045133, "loss": 2.9337, "theoretical_loss": 3.8403114375156995, "tokens_seen": 599395328 }, { "epoch": 1.08, "learning_rate": 0.00041334002006018056, "loss": 3.021, "theoretical_loss": 3.8402676188464846, "tokens_seen": 599460864 }, { "epoch": 1.08, "learning_rate": 0.00041332998996990975, "loss": 3.0127, "theoretical_loss": 3.840223806308644, "tokens_seen": 599526400 }, { "epoch": 1.08, "learning_rate": 0.0004133199598796389, "loss": 3.0445, "theoretical_loss": 3.8401799999006507, "tokens_seen": 599591936 }, { "epoch": 1.08, "objective/train/docs_used": 977873, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0075316429138184, "objective/train/theoretical_loss": 3.8401471491164485, "objective/train/tokens_used": 620101088, "theoretical_loss": 3.8401471491164485, "tokens_seen": 599641088 }, { "epoch": 1.08, "learning_rate": 0.0004133099297893681, "loss": 2.9956, "theoretical_loss": 3.840136199620976, "tokens_seen": 599657472 }, { "epoch": 1.08, "learning_rate": 0.0004132998996990973, "loss": 2.8846, "theoretical_loss": 3.840092405468094, "tokens_seen": 599723008 }, { "epoch": 1.08, "learning_rate": 0.00041328986960882647, "loss": 3.023, "theoretical_loss": 3.840048617440478, "tokens_seen": 599788544 }, { "epoch": 1.08, "learning_rate": 0.0004132798395185557, "loss": 3.0752, "theoretical_loss": 3.840004835536602, "tokens_seen": 599854080 }, { "epoch": 1.08, "learning_rate": 0.00041326980942828483, "loss": 3.03, "theoretical_loss": 3.8399610597549407, "tokens_seen": 599919616 }, { "epoch": 1.08, "learning_rate": 0.00041325977933801407, "loss": 3.0413, "theoretical_loss": 3.8399172900939695, "tokens_seen": 599985152 }, { "epoch": 1.08, "learning_rate": 0.00041324974924774325, "loss": 3.0624, "theoretical_loss": 3.8398735265521644, "tokens_seen": 600050688 }, { "epoch": 1.08, "learning_rate": 0.00041323971915747243, "loss": 3.0143, "theoretical_loss": 3.8398297691280012, "tokens_seen": 600116224 }, { "epoch": 1.08, "learning_rate": 0.0004132296890672016, "loss": 2.9281, "theoretical_loss": 3.839786017819957, "tokens_seen": 600181760 }, { "epoch": 1.08, "learning_rate": 0.0004132196589769308, "loss": 2.9148, "theoretical_loss": 3.839742272626509, "tokens_seen": 600247296 }, { "epoch": 1.08, "learning_rate": 0.00041320962888665997, "loss": 3.1451, "theoretical_loss": 3.8396985335461356, "tokens_seen": 600312832 }, { "epoch": 1.08, "learning_rate": 0.0004131995987963892, "loss": 2.9516, "theoretical_loss": 3.839654800577316, "tokens_seen": 600378368 }, { "epoch": 1.08, "learning_rate": 0.00041318956870611833, "loss": 2.9274, "theoretical_loss": 3.839611073718527, "tokens_seen": 600443904 }, { "epoch": 1.08, "learning_rate": 0.00041317953861584757, "loss": 3.1498, "theoretical_loss": 3.8395673529682504, "tokens_seen": 600509440 }, { "epoch": 1.08, "learning_rate": 0.0004131695085255767, "loss": 2.9056, "theoretical_loss": 3.8395236383249656, "tokens_seen": 600574976 }, { "epoch": 1.08, "learning_rate": 0.00041315947843530593, "loss": 2.8957, "theoretical_loss": 3.839479929787153, "tokens_seen": 600640512 }, { "epoch": 1.08, "learning_rate": 0.0004131494483450351, "loss": 3.0552, "theoretical_loss": 3.8394362273532945, "tokens_seen": 600706048 }, { "epoch": 1.08, "learning_rate": 0.0004131394182547643, "loss": 3.0741, "theoretical_loss": 3.839392531021871, "tokens_seen": 600771584 }, { "epoch": 1.08, "learning_rate": 0.0004131293881644935, "loss": 2.9772, "theoretical_loss": 3.8393488407913656, "tokens_seen": 600837120 }, { "epoch": 1.08, "learning_rate": 0.0004131193580742227, "loss": 3.1266, "theoretical_loss": 3.839305156660261, "tokens_seen": 600902656 }, { "epoch": 1.08, "learning_rate": 0.00041310932798395184, "loss": 2.9866, "theoretical_loss": 3.8392614786270407, "tokens_seen": 600968192 }, { "epoch": 1.08, "learning_rate": 0.0004130992978936811, "loss": 3.2371, "theoretical_loss": 3.839217806690188, "tokens_seen": 601033728 }, { "epoch": 1.08, "learning_rate": 0.0004130892678034102, "loss": 2.8171, "theoretical_loss": 3.839174140848188, "tokens_seen": 601099264 }, { "epoch": 1.08, "learning_rate": 0.00041307923771313944, "loss": 2.9517, "theoretical_loss": 3.839130481099526, "tokens_seen": 601164800 }, { "epoch": 1.08, "learning_rate": 0.0004130692076228686, "loss": 2.7925, "theoretical_loss": 3.839086827442687, "tokens_seen": 601230336 }, { "epoch": 1.08, "objective/train/docs_used": 980509, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.948784112930298, "objective/train/theoretical_loss": 3.839054091196906, "objective/train/tokens_used": 621739488, "theoretical_loss": 3.839054091196906, "tokens_seen": 601279488 }, { "epoch": 1.08, "learning_rate": 0.0004130591775325978, "loss": 3.129, "theoretical_loss": 3.839043179876157, "tokens_seen": 601295872 }, { "epoch": 1.08, "learning_rate": 0.000413049147442327, "loss": 3.1765, "theoretical_loss": 3.838999538398423, "tokens_seen": 601361408 }, { "epoch": 1.08, "learning_rate": 0.00041303911735205616, "loss": 3.1653, "theoretical_loss": 3.8389559030079723, "tokens_seen": 601426944 }, { "epoch": 1.08, "learning_rate": 0.00041302908726178534, "loss": 3.0857, "theoretical_loss": 3.8389122737032926, "tokens_seen": 601492480 }, { "epoch": 1.08, "learning_rate": 0.0004130190571715146, "loss": 3.0379, "theoretical_loss": 3.838868650482872, "tokens_seen": 601558016 }, { "epoch": 1.08, "learning_rate": 0.0004130090270812437, "loss": 2.9683, "theoretical_loss": 3.8388250333451994, "tokens_seen": 601623552 }, { "epoch": 1.08, "learning_rate": 0.00041299899699097294, "loss": 3.0644, "theoretical_loss": 3.8387814222887644, "tokens_seen": 601689088 }, { "epoch": 1.08, "learning_rate": 0.00041298896690070207, "loss": 2.947, "theoretical_loss": 3.8387378173120563, "tokens_seen": 601754624 }, { "epoch": 1.08, "learning_rate": 0.0004129789368104313, "loss": 2.9997, "theoretical_loss": 3.838694218413566, "tokens_seen": 601820160 }, { "epoch": 1.08, "learning_rate": 0.0004129689067201605, "loss": 2.9866, "theoretical_loss": 3.838650625591785, "tokens_seen": 601885696 }, { "epoch": 1.08, "learning_rate": 0.00041295887662988966, "loss": 3.0935, "theoretical_loss": 3.838607038845204, "tokens_seen": 601951232 }, { "epoch": 1.08, "learning_rate": 0.00041294884653961884, "loss": 2.9493, "theoretical_loss": 3.838563458172315, "tokens_seen": 602016768 }, { "epoch": 1.08, "learning_rate": 0.0004129388164493481, "loss": 3.0542, "theoretical_loss": 3.8385198835716112, "tokens_seen": 602082304 }, { "epoch": 1.08, "learning_rate": 0.0004129287863590772, "loss": 3.0988, "theoretical_loss": 3.8384763150415853, "tokens_seen": 602147840 }, { "epoch": 1.08, "learning_rate": 0.00041291875626880644, "loss": 3.1007, "theoretical_loss": 3.838432752580731, "tokens_seen": 602213376 }, { "epoch": 1.08, "learning_rate": 0.0004129087261785356, "loss": 3.0663, "theoretical_loss": 3.838389196187543, "tokens_seen": 602278912 }, { "epoch": 1.08, "learning_rate": 0.0004128986960882648, "loss": 2.7674, "theoretical_loss": 3.838345645860515, "tokens_seen": 602344448 }, { "epoch": 1.08, "learning_rate": 0.00041288866599799404, "loss": 2.9889, "theoretical_loss": 3.8383021015981433, "tokens_seen": 602409984 }, { "epoch": 1.08, "learning_rate": 0.00041287863590772317, "loss": 2.8449, "theoretical_loss": 3.8382585633989237, "tokens_seen": 602475520 }, { "epoch": 1.08, "learning_rate": 0.0004128686058174524, "loss": 3.1095, "theoretical_loss": 3.838215031261352, "tokens_seen": 602541056 }, { "epoch": 1.08, "learning_rate": 0.00041285857572718153, "loss": 2.9041, "theoretical_loss": 3.8381715051839254, "tokens_seen": 602606592 }, { "epoch": 1.08, "learning_rate": 0.00041284854563691076, "loss": 3.0581, "theoretical_loss": 3.8381279851651415, "tokens_seen": 602672128 }, { "epoch": 1.08, "learning_rate": 0.00041283851554663995, "loss": 3.0675, "theoretical_loss": 3.838084471203498, "tokens_seen": 602737664 }, { "epoch": 1.08, "learning_rate": 0.0004128284854563691, "loss": 3.0358, "theoretical_loss": 3.8380409632974932, "tokens_seen": 602803200 }, { "epoch": 1.08, "learning_rate": 0.0004128184553660983, "loss": 3.0123, "theoretical_loss": 3.8379974614456263, "tokens_seen": 602868736 }, { "epoch": 1.08, "objective/train/docs_used": 983455, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.243675708770752, "objective/train/theoretical_loss": 3.837964839028852, "objective/train/tokens_used": 623377888, "theoretical_loss": 3.837964839028852, "tokens_seen": 602917888 }, { "epoch": 1.08, "learning_rate": 0.0004128084252758275, "loss": 3.1172, "theoretical_loss": 3.8379539656463972, "tokens_seen": 602934272 }, { "epoch": 1.08, "learning_rate": 0.00041279839518555667, "loss": 2.7752, "theoretical_loss": 3.8379104758983056, "tokens_seen": 602999808 }, { "epoch": 1.08, "learning_rate": 0.0004127883650952859, "loss": 3.1054, "theoretical_loss": 3.837866992199853, "tokens_seen": 603065344 }, { "epoch": 1.08, "learning_rate": 0.00041277833500501503, "loss": 3.1073, "theoretical_loss": 3.837823514549539, "tokens_seen": 603130880 }, { "epoch": 1.08, "learning_rate": 0.00041276830491474427, "loss": 2.9292, "theoretical_loss": 3.837780042945867, "tokens_seen": 603196416 }, { "epoch": 1.08, "learning_rate": 0.00041275827482447345, "loss": 3.0044, "theoretical_loss": 3.8377365773873384, "tokens_seen": 603261952 }, { "epoch": 1.08, "learning_rate": 0.00041274824473420263, "loss": 2.9341, "theoretical_loss": 3.8376931178724556, "tokens_seen": 603327488 }, { "epoch": 1.08, "learning_rate": 0.0004127382146439318, "loss": 2.9277, "theoretical_loss": 3.8376496643997227, "tokens_seen": 603393024 }, { "epoch": 1.08, "learning_rate": 0.000412728184553661, "loss": 2.8349, "theoretical_loss": 3.837606216967643, "tokens_seen": 603458560 }, { "epoch": 1.08, "learning_rate": 0.00041271815446339017, "loss": 3.1076, "theoretical_loss": 3.837562775574721, "tokens_seen": 603524096 }, { "epoch": 1.08, "learning_rate": 0.0004127081243731194, "loss": 3.1541, "theoretical_loss": 3.837519340219462, "tokens_seen": 603589632 }, { "epoch": 1.08, "learning_rate": 0.00041269809428284854, "loss": 3.1454, "theoretical_loss": 3.837475910900371, "tokens_seen": 603655168 }, { "epoch": 1.08, "learning_rate": 0.00041268806419257777, "loss": 3.083, "theoretical_loss": 3.837432487615954, "tokens_seen": 603720704 }, { "epoch": 1.08, "learning_rate": 0.0004126780341023069, "loss": 2.9437, "theoretical_loss": 3.8373890703647175, "tokens_seen": 603786240 }, { "epoch": 1.08, "learning_rate": 0.00041266800401203613, "loss": 2.9074, "theoretical_loss": 3.8373456591451696, "tokens_seen": 603851776 }, { "epoch": 1.08, "learning_rate": 0.0004126579739217653, "loss": 2.9308, "theoretical_loss": 3.837302253955816, "tokens_seen": 603917312 }, { "epoch": 1.08, "learning_rate": 0.0004126479438314945, "loss": 2.8634, "theoretical_loss": 3.8372588547951665, "tokens_seen": 603982848 }, { "epoch": 1.08, "learning_rate": 0.0004126379137412237, "loss": 3.0471, "theoretical_loss": 3.8372154616617284, "tokens_seen": 604048384 }, { "epoch": 1.08, "learning_rate": 0.0004126278836509529, "loss": 3.0681, "theoretical_loss": 3.8371720745540117, "tokens_seen": 604113920 }, { "epoch": 1.08, "learning_rate": 0.00041261785356068204, "loss": 3.0021, "theoretical_loss": 3.837128693470526, "tokens_seen": 604179456 }, { "epoch": 1.08, "learning_rate": 0.0004126078234704113, "loss": 2.8026, "theoretical_loss": 3.8370853184097813, "tokens_seen": 604244992 }, { "epoch": 1.08, "learning_rate": 0.0004125977933801404, "loss": 3.0854, "theoretical_loss": 3.8370419493702883, "tokens_seen": 604310528 }, { "epoch": 1.08, "learning_rate": 0.00041258776328986964, "loss": 3.062, "theoretical_loss": 3.8369985863505587, "tokens_seen": 604376064 }, { "epoch": 1.08, "learning_rate": 0.0004125777331995988, "loss": 3.0455, "theoretical_loss": 3.836955229349104, "tokens_seen": 604441600 }, { "epoch": 1.08, "learning_rate": 0.000412567703109328, "loss": 3.1106, "theoretical_loss": 3.836911878364436, "tokens_seen": 604507136 }, { "epoch": 1.08, "objective/train/docs_used": 984842, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5849223136901855, "objective/train/theoretical_loss": 3.836879369073557, "objective/train/tokens_used": 625016288, "theoretical_loss": 3.836879369073557, "tokens_seen": 604556288 }, { "epoch": 1.08, "learning_rate": 0.0004125576730190572, "loss": 2.9401, "theoretical_loss": 3.8368685333950685, "tokens_seen": 604572672 }, { "epoch": 1.08, "learning_rate": 0.00041254764292878636, "loss": 3.0465, "theoretical_loss": 3.836825194439515, "tokens_seen": 604638208 }, { "epoch": 1.08, "learning_rate": 0.00041253761283851554, "loss": 2.8365, "theoretical_loss": 3.8367818614962887, "tokens_seen": 604703744 }, { "epoch": 1.08, "learning_rate": 0.0004125275827482448, "loss": 3.1373, "theoretical_loss": 3.8367385345639042, "tokens_seen": 604769280 }, { "epoch": 1.08, "learning_rate": 0.0004125175526579739, "loss": 2.9531, "theoretical_loss": 3.8366952136408767, "tokens_seen": 604834816 }, { "epoch": 1.08, "learning_rate": 0.00041250752256770314, "loss": 3.1611, "theoretical_loss": 3.836651898725722, "tokens_seen": 604900352 }, { "epoch": 1.08, "learning_rate": 0.00041249749247743227, "loss": 2.9067, "theoretical_loss": 3.8366085898169553, "tokens_seen": 604965888 }, { "epoch": 1.08, "learning_rate": 0.0004124874623871615, "loss": 3.0193, "theoretical_loss": 3.836565286913094, "tokens_seen": 605031424 }, { "epoch": 1.08, "learning_rate": 0.0004124774322968907, "loss": 3.0338, "theoretical_loss": 3.836521990012655, "tokens_seen": 605096960 }, { "epoch": 1.08, "learning_rate": 0.00041246740220661986, "loss": 3.1343, "theoretical_loss": 3.8364786991141555, "tokens_seen": 605162496 }, { "epoch": 1.08, "learning_rate": 0.00041245737211634904, "loss": 2.9918, "theoretical_loss": 3.8364354142161137, "tokens_seen": 605228032 }, { "epoch": 1.08, "learning_rate": 0.0004124473420260783, "loss": 3.1025, "theoretical_loss": 3.8363921353170487, "tokens_seen": 605293568 }, { "epoch": 1.08, "learning_rate": 0.0004124373119358074, "loss": 3.056, "theoretical_loss": 3.8363488624154796, "tokens_seen": 605359104 }, { "epoch": 1.08, "learning_rate": 0.00041242728184553664, "loss": 2.9463, "theoretical_loss": 3.836305595509926, "tokens_seen": 605424640 }, { "epoch": 1.08, "learning_rate": 0.00041241725175526577, "loss": 3.0179, "theoretical_loss": 3.8362623345989086, "tokens_seen": 605490176 }, { "epoch": 1.08, "learning_rate": 0.000412407221664995, "loss": 3.0001, "theoretical_loss": 3.8362190796809474, "tokens_seen": 605555712 }, { "epoch": 1.08, "learning_rate": 0.0004123971915747242, "loss": 2.9639, "theoretical_loss": 3.8361758307545637, "tokens_seen": 605621248 }, { "epoch": 1.08, "learning_rate": 0.00041238716148445337, "loss": 2.9835, "theoretical_loss": 3.8361325878182804, "tokens_seen": 605686784 }, { "epoch": 1.08, "learning_rate": 0.00041237713139418255, "loss": 2.9885, "theoretical_loss": 3.836089350870619, "tokens_seen": 605752320 }, { "epoch": 1.08, "learning_rate": 0.00041236710130391173, "loss": 2.9806, "theoretical_loss": 3.836046119910103, "tokens_seen": 605817856 }, { "epoch": 1.08, "learning_rate": 0.0004123570712136409, "loss": 3.0292, "theoretical_loss": 3.8360028949352545, "tokens_seen": 605883392 }, { "epoch": 1.08, "learning_rate": 0.00041234704112337015, "loss": 3.0589, "theoretical_loss": 3.835959675944598, "tokens_seen": 605948928 }, { "epoch": 1.08, "learning_rate": 0.00041233701103309927, "loss": 3.0173, "theoretical_loss": 3.835916462936659, "tokens_seen": 606014464 }, { "epoch": 1.08, "learning_rate": 0.0004123269809428285, "loss": 3.095, "theoretical_loss": 3.8358732559099615, "tokens_seen": 606080000 }, { "epoch": 1.08, "learning_rate": 0.00041231695085255763, "loss": 2.7876, "theoretical_loss": 3.835830054863031, "tokens_seen": 606145536 }, { "epoch": 1.08, "objective/train/docs_used": 987508, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.097412347793579, "objective/train/theoretical_loss": 3.835797658001169, "objective/train/tokens_used": 626654688, "theoretical_loss": 3.835797658001169, "tokens_seen": 606194688 }, { "epoch": 1.08, "learning_rate": 0.00041230692076228687, "loss": 3.0459, "theoretical_loss": 3.835786859794394, "tokens_seen": 606211072 }, { "epoch": 1.08, "learning_rate": 0.00041229689067201605, "loss": 2.9452, "theoretical_loss": 3.8357436707025765, "tokens_seen": 606276608 }, { "epoch": 1.08, "learning_rate": 0.00041228686058174523, "loss": 3.0336, "theoretical_loss": 3.835700487586106, "tokens_seen": 606342144 }, { "epoch": 1.08, "learning_rate": 0.0004122768304914744, "loss": 3.113, "theoretical_loss": 3.8356573104435094, "tokens_seen": 606407680 }, { "epoch": 1.08, "learning_rate": 0.00041226680040120365, "loss": 3.0329, "theoretical_loss": 3.835614139273316, "tokens_seen": 606473216 }, { "epoch": 1.08, "learning_rate": 0.0004122567703109328, "loss": 3.0966, "theoretical_loss": 3.835570974074053, "tokens_seen": 606538752 }, { "epoch": 1.08, "learning_rate": 0.000412246740220662, "loss": 3.1502, "theoretical_loss": 3.835527814844251, "tokens_seen": 606604288 }, { "epoch": 1.08, "learning_rate": 0.00041223671013039114, "loss": 2.9877, "theoretical_loss": 3.8354846615824387, "tokens_seen": 606669824 }, { "epoch": 1.08, "learning_rate": 0.0004122266800401204, "loss": 2.9438, "theoretical_loss": 3.8354415142871465, "tokens_seen": 606735360 }, { "epoch": 1.08, "learning_rate": 0.00041221664994984955, "loss": 3.0598, "theoretical_loss": 3.835398372956905, "tokens_seen": 606800896 }, { "epoch": 1.08, "learning_rate": 0.00041220661985957874, "loss": 2.9752, "theoretical_loss": 3.835355237590246, "tokens_seen": 606866432 }, { "epoch": 1.08, "learning_rate": 0.0004121965897693079, "loss": 3.119, "theoretical_loss": 3.8353121081857005, "tokens_seen": 606931968 }, { "epoch": 1.08, "learning_rate": 0.0004121865596790371, "loss": 2.9547, "theoretical_loss": 3.835268984741801, "tokens_seen": 606997504 }, { "epoch": 1.08, "learning_rate": 0.0004121765295887663, "loss": 3.0207, "theoretical_loss": 3.8352258672570807, "tokens_seen": 607063040 }, { "epoch": 1.08, "learning_rate": 0.0004121664994984955, "loss": 3.0622, "theoretical_loss": 3.835182755730072, "tokens_seen": 607128576 }, { "epoch": 1.08, "learning_rate": 0.0004121564694082247, "loss": 3.0437, "theoretical_loss": 3.8351396501593102, "tokens_seen": 607194112 }, { "epoch": 1.08, "learning_rate": 0.0004121464393179539, "loss": 3.1014, "theoretical_loss": 3.8350965505433283, "tokens_seen": 607259648 }, { "epoch": 1.08, "learning_rate": 0.0004121364092276831, "loss": 2.9261, "theoretical_loss": 3.8350534568806616, "tokens_seen": 607325184 }, { "epoch": 1.08, "learning_rate": 0.00041212637913741224, "loss": 2.9804, "theoretical_loss": 3.8350103691698463, "tokens_seen": 607390720 }, { "epoch": 1.08, "learning_rate": 0.0004121163490471415, "loss": 3.0878, "theoretical_loss": 3.834967287409417, "tokens_seen": 607456256 }, { "epoch": 1.08, "learning_rate": 0.0004121063189568706, "loss": 3.0588, "theoretical_loss": 3.8349242115979107, "tokens_seen": 607521792 }, { "epoch": 1.08, "learning_rate": 0.00041209628886659984, "loss": 3.0702, "theoretical_loss": 3.8348811417338644, "tokens_seen": 607587328 }, { "epoch": 1.08, "learning_rate": 0.000412086258776329, "loss": 2.9821, "theoretical_loss": 3.8348380778158155, "tokens_seen": 607652864 }, { "epoch": 1.08, "learning_rate": 0.0004120762286860582, "loss": 2.977, "theoretical_loss": 3.8347950198423018, "tokens_seen": 607718400 }, { "epoch": 1.08, "learning_rate": 0.0004120661985957874, "loss": 3.1154, "theoretical_loss": 3.8347519678118624, "tokens_seen": 607783936 }, { "epoch": 1.08, "objective/train/docs_used": 990422, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7720189094543457, "objective/train/theoretical_loss": 3.834719682688296, "objective/train/tokens_used": 628293088, "theoretical_loss": 3.834719682688296, "tokens_seen": 607833088 }, { "epoch": 1.08, "learning_rate": 0.00041205616850551656, "loss": 2.9985, "theoretical_loss": 3.8347089217230357, "tokens_seen": 607849472 }, { "epoch": 1.08, "learning_rate": 0.00041204613841524574, "loss": 3.0772, "theoretical_loss": 3.8346658815743613, "tokens_seen": 607915008 }, { "epoch": 1.08, "learning_rate": 0.000412036108324975, "loss": 3.1575, "theoretical_loss": 3.834622847364379, "tokens_seen": 607980544 }, { "epoch": 1.08, "learning_rate": 0.0004120260782347041, "loss": 2.9617, "theoretical_loss": 3.8345798190916307, "tokens_seen": 608046080 }, { "epoch": 1.08, "learning_rate": 0.00041201604814443334, "loss": 2.9963, "theoretical_loss": 3.834536796754656, "tokens_seen": 608111616 }, { "epoch": 1.08, "learning_rate": 0.00041200601805416247, "loss": 3.0091, "theoretical_loss": 3.834493780351997, "tokens_seen": 608177152 }, { "epoch": 1.08, "learning_rate": 0.0004119959879638917, "loss": 2.9634, "theoretical_loss": 3.8344507698821957, "tokens_seen": 608242688 }, { "epoch": 1.08, "learning_rate": 0.0004119859578736209, "loss": 3.1934, "theoretical_loss": 3.8344077653437947, "tokens_seen": 608308224 }, { "epoch": 1.08, "learning_rate": 0.00041197592778335006, "loss": 3.0448, "theoretical_loss": 3.8343647667353373, "tokens_seen": 608373760 }, { "epoch": 1.08, "learning_rate": 0.00041196589769307924, "loss": 3.0849, "theoretical_loss": 3.8343217740553674, "tokens_seen": 608439296 }, { "epoch": 1.08, "learning_rate": 0.0004119558676028085, "loss": 3.0618, "theoretical_loss": 3.834278787302429, "tokens_seen": 608504832 }, { "epoch": 1.08, "learning_rate": 0.0004119458375125376, "loss": 3.1835, "theoretical_loss": 3.834235806475066, "tokens_seen": 608570368 }, { "epoch": 1.08, "learning_rate": 0.00041193580742226684, "loss": 3.0149, "theoretical_loss": 3.8341928315718246, "tokens_seen": 608635904 }, { "epoch": 1.08, "learning_rate": 0.00041192577733199597, "loss": 2.9959, "theoretical_loss": 3.83414986259125, "tokens_seen": 608701440 }, { "epoch": 1.08, "learning_rate": 0.0004119157472417252, "loss": 2.9359, "theoretical_loss": 3.8341068995318888, "tokens_seen": 608766976 }, { "epoch": 1.08, "learning_rate": 0.0004119057171514544, "loss": 3.0126, "theoretical_loss": 3.8340639423922878, "tokens_seen": 608832512 }, { "epoch": 1.08, "learning_rate": 0.00041189568706118357, "loss": 3.1116, "theoretical_loss": 3.8340209911709935, "tokens_seen": 608898048 }, { "epoch": 1.08, "learning_rate": 0.00041188565697091275, "loss": 2.8184, "theoretical_loss": 3.833978045866554, "tokens_seen": 608963584 }, { "epoch": 1.08, "learning_rate": 0.00041187562688064193, "loss": 2.8807, "theoretical_loss": 3.8339351064775182, "tokens_seen": 609029120 }, { "epoch": 1.08, "learning_rate": 0.0004118655967903711, "loss": 3.0612, "theoretical_loss": 3.8338921730024342, "tokens_seen": 609094656 }, { "epoch": 1.08, "learning_rate": 0.00041185556670010035, "loss": 3.0729, "theoretical_loss": 3.833849245439852, "tokens_seen": 609160192 }, { "epoch": 1.08, "learning_rate": 0.00041184553660982947, "loss": 3.0661, "theoretical_loss": 3.8338063237883198, "tokens_seen": 609225728 }, { "epoch": 1.08, "learning_rate": 0.0004118355065195587, "loss": 3.0357, "theoretical_loss": 3.83376340804639, "tokens_seen": 609291264 }, { "epoch": 1.08, "learning_rate": 0.00041182547642928783, "loss": 3.0956, "theoretical_loss": 3.833720498212612, "tokens_seen": 609356800 }, { "epoch": 1.08, "learning_rate": 0.00041181544633901707, "loss": 3.007, "theoretical_loss": 3.833677594285538, "tokens_seen": 609422336 }, { "epoch": 1.08, "objective/train/docs_used": 992867, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7006452083587646, "objective/train/theoretical_loss": 3.8336454202156354, "objective/train/tokens_used": 629931488, "theoretical_loss": 3.8336454202156354, "tokens_seen": 609471488 }, { "epoch": 1.08, "learning_rate": 0.00041180541624874625, "loss": 3.0599, "theoretical_loss": 3.8336346962637187, "tokens_seen": 609487872 }, { "epoch": 1.08, "learning_rate": 0.00041179538615847543, "loss": 3.0939, "theoretical_loss": 3.833591804145708, "tokens_seen": 609553408 }, { "epoch": 1.08, "learning_rate": 0.0004117853560682046, "loss": 3.0591, "theoretical_loss": 3.8335489179300577, "tokens_seen": 609618944 }, { "epoch": 1.08, "learning_rate": 0.00041177532597793385, "loss": 2.8298, "theoretical_loss": 3.8335060376153214, "tokens_seen": 609684480 }, { "epoch": 1.08, "learning_rate": 0.000411765295887663, "loss": 2.9253, "theoretical_loss": 3.8334631632000535, "tokens_seen": 609750016 }, { "epoch": 1.08, "learning_rate": 0.0004117552657973922, "loss": 3.0437, "theoretical_loss": 3.8334202946828073, "tokens_seen": 609815552 }, { "epoch": 1.08, "learning_rate": 0.00041174523570712134, "loss": 2.9279, "theoretical_loss": 3.8333774320621394, "tokens_seen": 609881088 }, { "epoch": 1.08, "learning_rate": 0.0004117352056168506, "loss": 3.0614, "theoretical_loss": 3.8333345753366035, "tokens_seen": 609946624 }, { "epoch": 1.08, "learning_rate": 0.00041172517552657975, "loss": 2.8304, "theoretical_loss": 3.8332917245047566, "tokens_seen": 610012160 }, { "epoch": 1.08, "learning_rate": 0.00041171514543630894, "loss": 2.8745, "theoretical_loss": 3.8332488795651543, "tokens_seen": 610077696 }, { "epoch": 1.08, "learning_rate": 0.0004117051153460381, "loss": 2.9148, "theoretical_loss": 3.8332060405163544, "tokens_seen": 610143232 }, { "epoch": 1.08, "learning_rate": 0.0004116950852557673, "loss": 2.8431, "theoretical_loss": 3.8331632073569146, "tokens_seen": 610208768 }, { "epoch": 1.08, "learning_rate": 0.0004116850551654965, "loss": 3.0779, "theoretical_loss": 3.8331203800853917, "tokens_seen": 610274304 }, { "epoch": 1.08, "learning_rate": 0.0004116750250752257, "loss": 3.0198, "theoretical_loss": 3.833077558700345, "tokens_seen": 610339840 }, { "epoch": 1.08, "learning_rate": 0.00041166499498495484, "loss": 2.9239, "theoretical_loss": 3.833034743200333, "tokens_seen": 610405376 }, { "epoch": 1.08, "learning_rate": 0.0004116549648946841, "loss": 2.9177, "theoretical_loss": 3.8329919335839158, "tokens_seen": 610470912 }, { "epoch": 1.08, "learning_rate": 0.0004116449348044132, "loss": 2.9864, "theoretical_loss": 3.8329491298496525, "tokens_seen": 610536448 }, { "epoch": 1.08, "learning_rate": 0.00041163490471414244, "loss": 3.0256, "theoretical_loss": 3.8329063319961048, "tokens_seen": 610601984 }, { "epoch": 1.08, "learning_rate": 0.0004116248746238716, "loss": 3.0092, "theoretical_loss": 3.8328635400218327, "tokens_seen": 610667520 }, { "epoch": 1.08, "learning_rate": 0.0004116148445336008, "loss": 2.9987, "theoretical_loss": 3.8328207539253984, "tokens_seen": 610733056 }, { "epoch": 1.08, "learning_rate": 0.00041160481444333, "loss": 2.9422, "theoretical_loss": 3.8327779737053636, "tokens_seen": 610798592 }, { "epoch": 1.08, "learning_rate": 0.0004115947843530592, "loss": 2.8715, "theoretical_loss": 3.8327351993602905, "tokens_seen": 610864128 }, { "epoch": 1.08, "learning_rate": 0.00041158475426278834, "loss": 3.0056, "theoretical_loss": 3.8326924308887427, "tokens_seen": 610929664 }, { "epoch": 1.08, "learning_rate": 0.0004115747241725176, "loss": 2.9416, "theoretical_loss": 3.8326496682892834, "tokens_seen": 610995200 }, { "epoch": 1.08, "learning_rate": 0.0004115646940822467, "loss": 3.0062, "theoretical_loss": 3.832606911560477, "tokens_seen": 611060736 }, { "epoch": 1.08, "objective/train/docs_used": 995797, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1241135597229004, "objective/train/theoretical_loss": 3.832574847865624, "objective/train/tokens_used": 631569888, "theoretical_loss": 3.832574847865624, "tokens_seen": 611109888 }, { "epoch": 1.08, "learning_rate": 0.00041155466399197594, "loss": 3.0248, "theoretical_loss": 3.832564160700888, "tokens_seen": 611126272 }, { "epoch": 1.08, "learning_rate": 0.0004115446339017051, "loss": 2.8736, "theoretical_loss": 3.8325214157090803, "tokens_seen": 611191808 }, { "epoch": 1.08, "learning_rate": 0.0004115346038114343, "loss": 2.8971, "theoretical_loss": 3.832478676583622, "tokens_seen": 611257344 }, { "epoch": 1.08, "learning_rate": 0.0004115245737211635, "loss": 2.9994, "theoretical_loss": 3.832435943323077, "tokens_seen": 611322880 }, { "epoch": 1.08, "learning_rate": 0.00041151454363089267, "loss": 2.953, "theoretical_loss": 3.832393215926012, "tokens_seen": 611388416 }, { "epoch": 1.08, "learning_rate": 0.00041150451354062185, "loss": 2.982, "theoretical_loss": 3.8323504943909947, "tokens_seen": 611453952 }, { "epoch": 1.08, "learning_rate": 0.0004114944834503511, "loss": 3.1192, "theoretical_loss": 3.832307778716593, "tokens_seen": 611519488 }, { "epoch": 1.08, "learning_rate": 0.0004114844533600802, "loss": 2.9853, "theoretical_loss": 3.832265068901374, "tokens_seen": 611585024 }, { "epoch": 1.08, "learning_rate": 0.00041147442326980944, "loss": 2.8296, "theoretical_loss": 3.8322223649439073, "tokens_seen": 611650560 }, { "epoch": 1.08, "learning_rate": 0.00041146439317953857, "loss": 2.982, "theoretical_loss": 3.8321796668427615, "tokens_seen": 611716096 }, { "epoch": 1.08, "learning_rate": 0.0004114543630892678, "loss": 3.0302, "theoretical_loss": 3.832136974596506, "tokens_seen": 611781632 }, { "epoch": 1.08, "learning_rate": 0.000411444332998997, "loss": 3.1954, "theoretical_loss": 3.8320942882037112, "tokens_seen": 611847168 }, { "epoch": 1.08, "learning_rate": 0.00041143430290872617, "loss": 3.1512, "theoretical_loss": 3.8320516076629474, "tokens_seen": 611912704 }, { "epoch": 1.08, "learning_rate": 0.00041142427281845535, "loss": 3.1806, "theoretical_loss": 3.832008932972786, "tokens_seen": 611978240 }, { "epoch": 1.08, "learning_rate": 0.0004114142427281846, "loss": 3.1045, "theoretical_loss": 3.8319662641317986, "tokens_seen": 612043776 }, { "epoch": 1.08, "learning_rate": 0.00041140421263791377, "loss": 2.9717, "theoretical_loss": 3.8319236011385573, "tokens_seen": 612109312 }, { "epoch": 1.08, "learning_rate": 0.00041139418254764295, "loss": 2.9591, "theoretical_loss": 3.831880943991634, "tokens_seen": 612174848 }, { "epoch": 1.08, "learning_rate": 0.00041138415245737213, "loss": 3.0042, "theoretical_loss": 3.8318382926896035, "tokens_seen": 612240384 }, { "epoch": 1.08, "learning_rate": 0.0004113741223671013, "loss": 3.0721, "theoretical_loss": 3.8317956472310373, "tokens_seen": 612305920 }, { "epoch": 1.08, "learning_rate": 0.00041136409227683055, "loss": 3.0823, "theoretical_loss": 3.8317530076145108, "tokens_seen": 612371456 }, { "epoch": 1.08, "learning_rate": 0.00041135406218655967, "loss": 3.2318, "theoretical_loss": 3.8317103738385985, "tokens_seen": 612436992 }, { "epoch": 1.08, "learning_rate": 0.0004113440320962889, "loss": 3.0024, "theoretical_loss": 3.8316677459018758, "tokens_seen": 612502528 }, { "epoch": 1.08, "learning_rate": 0.00041133400200601803, "loss": 3.0536, "theoretical_loss": 3.831625123802917, "tokens_seen": 612568064 }, { "epoch": 1.08, "learning_rate": 0.00041132397191574727, "loss": 2.9363, "theoretical_loss": 3.8315825075402996, "tokens_seen": 612633600 }, { "epoch": 1.08, "learning_rate": 0.00041131394182547645, "loss": 3.0235, "theoretical_loss": 3.831539897112599, "tokens_seen": 612699136 }, { "epoch": 1.08, "objective/train/docs_used": 998554, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0679996013641357, "objective/train/theoretical_loss": 3.8315079431201324, "objective/train/tokens_used": 633208288, "theoretical_loss": 3.8315079431201324, "tokens_seen": 612748288 }, { "epoch": 1.08, "learning_rate": 0.00041130391173520563, "loss": 3.2123, "theoretical_loss": 3.831497292518393, "tokens_seen": 612764672 }, { "epoch": 1.08, "learning_rate": 0.0004112938816449348, "loss": 2.954, "theoretical_loss": 3.83145469375626, "tokens_seen": 612830208 }, { "epoch": 1.08, "learning_rate": 0.00041128385155466405, "loss": 3.0527, "theoretical_loss": 3.831412100824777, "tokens_seen": 612895744 }, { "epoch": 1.08, "learning_rate": 0.0004112738214643932, "loss": 2.9411, "theoretical_loss": 3.8313695137225223, "tokens_seen": 612961280 }, { "epoch": 1.08, "learning_rate": 0.0004112637913741224, "loss": 3.1527, "theoretical_loss": 3.831326932448076, "tokens_seen": 613026816 }, { "epoch": 1.08, "learning_rate": 0.00041125376128385154, "loss": 2.9398, "theoretical_loss": 3.8312843570000172, "tokens_seen": 613092352 }, { "epoch": 1.08, "learning_rate": 0.0004112437311935808, "loss": 3.0135, "theoretical_loss": 3.8312417873769262, "tokens_seen": 613157888 }, { "epoch": 1.08, "learning_rate": 0.00041123370110330995, "loss": 2.9179, "theoretical_loss": 3.831199223577383, "tokens_seen": 613223424 }, { "epoch": 1.08, "learning_rate": 0.00041122367101303914, "loss": 3.1811, "theoretical_loss": 3.83115666559997, "tokens_seen": 613288960 }, { "epoch": 1.08, "learning_rate": 0.0004112136409227683, "loss": 3.0847, "theoretical_loss": 3.8311141134432676, "tokens_seen": 613354496 }, { "epoch": 1.08, "learning_rate": 0.0004112036108324975, "loss": 2.8481, "theoretical_loss": 3.831071567105858, "tokens_seen": 613420032 }, { "epoch": 1.08, "learning_rate": 0.0004111935807422267, "loss": 3.0701, "theoretical_loss": 3.831029026586324, "tokens_seen": 613485568 }, { "epoch": 1.08, "learning_rate": 0.0004111835506519559, "loss": 2.9969, "theoretical_loss": 3.830986491883249, "tokens_seen": 613551104 }, { "epoch": 1.08, "learning_rate": 0.00041117352056168504, "loss": 3.0477, "theoretical_loss": 3.8309439629952156, "tokens_seen": 613616640 }, { "epoch": 1.08, "learning_rate": 0.0004111634904714143, "loss": 3.0848, "theoretical_loss": 3.8309014399208094, "tokens_seen": 613682176 }, { "epoch": 1.08, "learning_rate": 0.0004111534603811434, "loss": 3.0525, "theoretical_loss": 3.8308589226586136, "tokens_seen": 613747712 }, { "epoch": 1.08, "learning_rate": 0.00041114343029087264, "loss": 2.914, "theoretical_loss": 3.8308164112072145, "tokens_seen": 613813248 }, { "epoch": 1.08, "learning_rate": 0.0004111334002006018, "loss": 3.0428, "theoretical_loss": 3.830773905565196, "tokens_seen": 613878784 }, { "epoch": 1.08, "learning_rate": 0.000411123370110331, "loss": 2.9112, "theoretical_loss": 3.8307314057311457, "tokens_seen": 613944320 }, { "epoch": 1.08, "learning_rate": 0.0004111133400200602, "loss": 3.1208, "theoretical_loss": 3.830688911703649, "tokens_seen": 614009856 }, { "epoch": 1.08, "learning_rate": 0.0004111033099297894, "loss": 2.9971, "theoretical_loss": 3.8306464234812942, "tokens_seen": 614075392 }, { "epoch": 1.08, "learning_rate": 0.00041109327983951854, "loss": 3.0436, "theoretical_loss": 3.8306039410626678, "tokens_seen": 614140928 }, { "epoch": 1.08, "learning_rate": 0.0004110832497492478, "loss": 3.0165, "theoretical_loss": 3.830561464446358, "tokens_seen": 614206464 }, { "epoch": 1.08, "learning_rate": 0.0004110732196589769, "loss": 3.1361, "theoretical_loss": 3.830518993630954, "tokens_seen": 614272000 }, { "epoch": 1.08, "learning_rate": 0.00041106318956870614, "loss": 2.8655, "theoretical_loss": 3.8304765286150437, "tokens_seen": 614337536 }, { "epoch": 1.08, "objective/train/docs_used": 1001400, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2102270126342773, "objective/train/theoretical_loss": 3.8304446836581807, "objective/train/tokens_used": 634846688, "theoretical_loss": 3.8304446836581807, "tokens_seen": 614386688 }, { "epoch": 1.08, "learning_rate": 0.0004110531594784353, "loss": 3.0868, "theoretical_loss": 3.8304340693972176, "tokens_seen": 614403072 }, { "epoch": 1.08, "learning_rate": 0.0004110431293881645, "loss": 2.8922, "theoretical_loss": 3.8303916159760654, "tokens_seen": 614468608 }, { "epoch": 1.08, "learning_rate": 0.0004110330992978937, "loss": 3.037, "theoretical_loss": 3.8303491683501774, "tokens_seen": 614534144 }, { "epoch": 1.08, "learning_rate": 0.00041102306920762287, "loss": 3.0316, "theoretical_loss": 3.8303067265181445, "tokens_seen": 614599680 }, { "epoch": 1.08, "learning_rate": 0.00041101303911735205, "loss": 2.8574, "theoretical_loss": 3.830264290478559, "tokens_seen": 614665216 }, { "epoch": 1.08, "learning_rate": 0.0004110030090270813, "loss": 2.981, "theoretical_loss": 3.830221860230012, "tokens_seen": 614730752 }, { "epoch": 1.08, "learning_rate": 0.0004109929789368104, "loss": 3.0782, "theoretical_loss": 3.8301794357710963, "tokens_seen": 614796288 }, { "epoch": 1.08, "learning_rate": 0.00041098294884653965, "loss": 3.0054, "theoretical_loss": 3.8301370171004048, "tokens_seen": 614861824 }, { "epoch": 1.08, "learning_rate": 0.00041097291875626877, "loss": 2.971, "theoretical_loss": 3.830094604216531, "tokens_seen": 614927360 }, { "epoch": 1.08, "learning_rate": 0.000410962888665998, "loss": 3.2376, "theoretical_loss": 3.8300521971180688, "tokens_seen": 614992896 }, { "epoch": 1.08, "learning_rate": 0.0004109528585757272, "loss": 3.0909, "theoretical_loss": 3.830009795803613, "tokens_seen": 615058432 }, { "epoch": 1.08, "learning_rate": 0.00041094282848545637, "loss": 2.9965, "theoretical_loss": 3.829967400271758, "tokens_seen": 615123968 }, { "epoch": 1.08, "learning_rate": 0.00041093279839518555, "loss": 2.7709, "theoretical_loss": 3.8299250105211, "tokens_seen": 615189504 }, { "epoch": 1.08, "learning_rate": 0.0004109227683049148, "loss": 2.8791, "theoretical_loss": 3.8298826265502335, "tokens_seen": 615255040 }, { "epoch": 1.08, "learning_rate": 0.0004109127382146439, "loss": 3.0696, "theoretical_loss": 3.829840248357756, "tokens_seen": 615320576 }, { "epoch": 1.08, "learning_rate": 0.00041090270812437315, "loss": 2.973, "theoretical_loss": 3.8297978759422646, "tokens_seen": 615386112 }, { "epoch": 1.08, "learning_rate": 0.0004108926780341023, "loss": 2.9799, "theoretical_loss": 3.8297555093023554, "tokens_seen": 615451648 }, { "epoch": 1.08, "learning_rate": 0.0004108826479438315, "loss": 3.1559, "theoretical_loss": 3.8297131484366282, "tokens_seen": 615517184 }, { "epoch": 1.08, "learning_rate": 0.0004108726178535607, "loss": 3.111, "theoretical_loss": 3.8296707933436798, "tokens_seen": 615582720 }, { "epoch": 1.08, "learning_rate": 0.00041086258776328987, "loss": 2.898, "theoretical_loss": 3.8296284440221093, "tokens_seen": 615648256 }, { "epoch": 1.08, "learning_rate": 0.00041085255767301905, "loss": 3.0835, "theoretical_loss": 3.8295861004705163, "tokens_seen": 615713792 }, { "epoch": 1.08, "learning_rate": 0.00041084252758274823, "loss": 3.0262, "theoretical_loss": 3.829543762687501, "tokens_seen": 615779328 }, { "epoch": 1.08, "learning_rate": 0.0004108324974924774, "loss": 2.8286, "theoretical_loss": 3.829501430671663, "tokens_seen": 615844864 }, { "epoch": 1.08, "learning_rate": 0.00041082246740220665, "loss": 3.0483, "theoretical_loss": 3.8294591044216038, "tokens_seen": 615910400 }, { "epoch": 1.08, "learning_rate": 0.0004108124373119358, "loss": 3.0415, "theoretical_loss": 3.829416783935924, "tokens_seen": 615975936 }, { "epoch": 1.08, "objective/train/docs_used": 1002886, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.124361276626587, "objective/train/theoretical_loss": 3.8293850473536972, "objective/train/tokens_used": 636485088, "theoretical_loss": 3.8293850473536972, "tokens_seen": 616025088 }, { "epoch": 1.08, "learning_rate": 0.000410802407221665, "loss": 3.0373, "theoretical_loss": 3.8293744692132257, "tokens_seen": 616041472 }, { "epoch": 1.08, "learning_rate": 0.0004107923771313942, "loss": 2.853, "theoretical_loss": 3.829332160252111, "tokens_seen": 616107008 }, { "epoch": 1.08, "learning_rate": 0.0004107823470411234, "loss": 2.9192, "theoretical_loss": 3.829289857051183, "tokens_seen": 616172544 }, { "epoch": 1.08, "learning_rate": 0.00041077231695085256, "loss": 3.0171, "theoretical_loss": 3.8292475596090454, "tokens_seen": 616238080 }, { "epoch": 1.08, "learning_rate": 0.00041076228686058174, "loss": 2.9861, "theoretical_loss": 3.8292052679243005, "tokens_seen": 616303616 }, { "epoch": 1.08, "learning_rate": 0.0004107522567703109, "loss": 3.0717, "theoretical_loss": 3.8291629819955544, "tokens_seen": 616369152 }, { "epoch": 1.08, "learning_rate": 0.00041074222668004015, "loss": 3.0753, "theoretical_loss": 3.82912070182141, "tokens_seen": 616434688 }, { "epoch": 1.08, "learning_rate": 0.0004107321965897693, "loss": 3.0542, "theoretical_loss": 3.829078427400473, "tokens_seen": 616500224 }, { "epoch": 1.08, "learning_rate": 0.0004107221664994985, "loss": 3.0512, "theoretical_loss": 3.82903615873135, "tokens_seen": 616565760 }, { "epoch": 1.08, "learning_rate": 0.00041071213640922764, "loss": 2.712, "theoretical_loss": 3.828993895812646, "tokens_seen": 616631296 }, { "epoch": 1.08, "learning_rate": 0.0004107021063189569, "loss": 2.9465, "theoretical_loss": 3.8289516386429683, "tokens_seen": 616696832 }, { "epoch": 1.08, "learning_rate": 0.00041069207622868606, "loss": 3.069, "theoretical_loss": 3.828909387220924, "tokens_seen": 616762368 }, { "epoch": 1.08, "learning_rate": 0.00041068204613841524, "loss": 2.8824, "theoretical_loss": 3.8288671415451208, "tokens_seen": 616827904 }, { "epoch": 1.08, "learning_rate": 0.0004106720160481444, "loss": 2.8709, "theoretical_loss": 3.8288249016141664, "tokens_seen": 616893440 }, { "epoch": 1.08, "learning_rate": 0.0004106619859578736, "loss": 3.0162, "theoretical_loss": 3.82878266742667, "tokens_seen": 616958976 }, { "epoch": 1.08, "learning_rate": 0.00041065195586760284, "loss": 3.1618, "theoretical_loss": 3.8287404389812396, "tokens_seen": 617024512 }, { "epoch": 1.08, "learning_rate": 0.000410641925777332, "loss": 2.9993, "theoretical_loss": 3.8286982162764858, "tokens_seen": 617090048 }, { "epoch": 1.08, "learning_rate": 0.0004106318956870612, "loss": 2.9877, "theoretical_loss": 3.8286559993110183, "tokens_seen": 617155584 }, { "epoch": 1.08, "learning_rate": 0.0004106218655967904, "loss": 2.8469, "theoretical_loss": 3.828613788083448, "tokens_seen": 617221120 }, { "epoch": 1.08, "learning_rate": 0.0004106118355065196, "loss": 3.0004, "theoretical_loss": 3.828571582592385, "tokens_seen": 617286656 }, { "epoch": 1.08, "learning_rate": 0.00041060180541624874, "loss": 2.9295, "theoretical_loss": 3.828529382836442, "tokens_seen": 617352192 }, { "epoch": 1.08, "learning_rate": 0.000410591775325978, "loss": 2.9992, "theoretical_loss": 3.82848718881423, "tokens_seen": 617417728 }, { "epoch": 1.08, "learning_rate": 0.0004105817452357071, "loss": 2.9969, "theoretical_loss": 3.8284450005243618, "tokens_seen": 617483264 }, { "epoch": 1.08, "learning_rate": 0.00041057171514543634, "loss": 3.1488, "theoretical_loss": 3.8284028179654506, "tokens_seen": 617548800 }, { "epoch": 1.08, "learning_rate": 0.0004105616850551655, "loss": 3.1883, "theoretical_loss": 3.8283606411361095, "tokens_seen": 617614336 }, { "epoch": 1.08, "objective/train/docs_used": 1005576, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1800708770751953, "objective/train/theoretical_loss": 3.8283290122733, "objective/train/tokens_used": 638123488, "theoretical_loss": 3.8283290122733, "tokens_seen": 617663488 }, { "epoch": 1.08, "learning_rate": 0.0004105516549648947, "loss": 3.234, "theoretical_loss": 3.828318470034952, "tokens_seen": 617679872 }, { "epoch": 1.08, "learning_rate": 0.0004105416248746239, "loss": 3.0968, "theoretical_loss": 3.8282763046605934, "tokens_seen": 617745408 }, { "epoch": 1.08, "learning_rate": 0.00041053159478435307, "loss": 2.8808, "theoretical_loss": 3.828234145011648, "tokens_seen": 617810944 }, { "epoch": 1.08, "learning_rate": 0.00041052156469408225, "loss": 3.0236, "theoretical_loss": 3.828191991086732, "tokens_seen": 617876480 }, { "epoch": 1.08, "learning_rate": 0.0004105115346038115, "loss": 2.9639, "theoretical_loss": 3.8281498428844603, "tokens_seen": 617942016 }, { "epoch": 1.08, "learning_rate": 0.0004105015045135406, "loss": 2.8579, "theoretical_loss": 3.828107700403449, "tokens_seen": 618007552 }, { "epoch": 1.08, "learning_rate": 0.00041049147442326985, "loss": 2.9958, "theoretical_loss": 3.8280655636423164, "tokens_seen": 618073088 }, { "epoch": 1.08, "learning_rate": 0.00041048144433299897, "loss": 2.9254, "theoretical_loss": 3.828023432599678, "tokens_seen": 618138624 }, { "epoch": 1.08, "learning_rate": 0.0004104714142427282, "loss": 3.0386, "theoretical_loss": 3.827981307274152, "tokens_seen": 618204160 }, { "epoch": 1.08, "learning_rate": 0.0004104613841524574, "loss": 3.0198, "theoretical_loss": 3.827939187664358, "tokens_seen": 618269696 }, { "epoch": 1.08, "learning_rate": 0.00041045135406218657, "loss": 3.0829, "theoretical_loss": 3.8278970737689137, "tokens_seen": 618335232 }, { "epoch": 1.08, "learning_rate": 0.00041044132397191575, "loss": 3.0352, "theoretical_loss": 3.827854965586438, "tokens_seen": 618400768 }, { "epoch": 1.08, "learning_rate": 0.000410431293881645, "loss": 3.1588, "theoretical_loss": 3.827812863115551, "tokens_seen": 618466304 }, { "epoch": 1.08, "learning_rate": 0.0004104212637913741, "loss": 2.9531, "theoretical_loss": 3.8277707663548726, "tokens_seen": 618531840 }, { "epoch": 1.08, "learning_rate": 0.00041041123370110335, "loss": 3.1338, "theoretical_loss": 3.8277286753030237, "tokens_seen": 618597376 }, { "epoch": 1.08, "learning_rate": 0.0004104012036108325, "loss": 2.9585, "theoretical_loss": 3.827686589958626, "tokens_seen": 618662912 }, { "epoch": 1.08, "learning_rate": 0.0004103911735205617, "loss": 2.9445, "theoretical_loss": 3.8276445103203, "tokens_seen": 618728448 }, { "epoch": 1.08, "learning_rate": 0.0004103811434302909, "loss": 2.8558, "theoretical_loss": 3.827602436386668, "tokens_seen": 618793984 }, { "epoch": 1.08, "learning_rate": 0.00041037111334002007, "loss": 2.9445, "theoretical_loss": 3.8275603681563535, "tokens_seen": 618859520 }, { "epoch": 1.08, "learning_rate": 0.00041036108324974925, "loss": 2.9239, "theoretical_loss": 3.8275183056279785, "tokens_seen": 618925056 }, { "epoch": 1.08, "learning_rate": 0.00041035105315947844, "loss": 2.8138, "theoretical_loss": 3.827476248800167, "tokens_seen": 618990592 }, { "epoch": 1.08, "learning_rate": 0.0004103410230692076, "loss": 2.9098, "theoretical_loss": 3.827434197671543, "tokens_seen": 619056128 }, { "epoch": 1.08, "learning_rate": 0.00041033099297893685, "loss": 3.2294, "theoretical_loss": 3.8273921522407304, "tokens_seen": 619121664 }, { "epoch": 1.08, "learning_rate": 0.000410320962888666, "loss": 2.938, "theoretical_loss": 3.8273501125063554, "tokens_seen": 619187200 }, { "epoch": 1.08, "learning_rate": 0.0004103109327983952, "loss": 2.9689, "theoretical_loss": 3.827308078467042, "tokens_seen": 619252736 }, { "epoch": 1.08, "objective/train/docs_used": 1007992, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1217892169952393, "objective/train/theoretical_loss": 3.827276556674115, "objective/train/tokens_used": 639761888, "theoretical_loss": 3.827276556674115, "tokens_seen": 619301888 }, { "epoch": 1.08, "learning_rate": 0.0004103009027081244, "loss": 2.9319, "theoretical_loss": 3.827266050121417, "tokens_seen": 619318272 }, { "epoch": 1.08, "learning_rate": 0.0004102908726178536, "loss": 2.9598, "theoretical_loss": 3.8272240274681066, "tokens_seen": 619383808 }, { "epoch": 1.08, "learning_rate": 0.00041028084252758276, "loss": 3.1932, "theoretical_loss": 3.8271820105057377, "tokens_seen": 619449344 }, { "epoch": 1.08, "learning_rate": 0.00041027081243731194, "loss": 3.2006, "theoretical_loss": 3.8271399992329376, "tokens_seen": 619514880 }, { "epoch": 1.08, "learning_rate": 0.0004102607823470411, "loss": 2.9889, "theoretical_loss": 3.827097993648334, "tokens_seen": 619580416 }, { "epoch": 1.08, "learning_rate": 0.00041025075225677035, "loss": 2.9911, "theoretical_loss": 3.8270559937505553, "tokens_seen": 619645952 }, { "epoch": 1.08, "learning_rate": 0.0004102407221664995, "loss": 3.1222, "theoretical_loss": 3.82701399953823, "tokens_seen": 619711488 }, { "epoch": 1.08, "learning_rate": 0.0004102306920762287, "loss": 2.9872, "theoretical_loss": 3.8269720110099876, "tokens_seen": 619777024 }, { "epoch": 1.08, "learning_rate": 0.00041022066198595784, "loss": 3.0382, "theoretical_loss": 3.8269300281644583, "tokens_seen": 619842560 }, { "epoch": 1.08, "learning_rate": 0.0004102106318956871, "loss": 2.9589, "theoretical_loss": 3.826888051000271, "tokens_seen": 619908096 }, { "epoch": 1.08, "learning_rate": 0.00041020060180541626, "loss": 3.0321, "theoretical_loss": 3.8268460795160575, "tokens_seen": 619973632 }, { "epoch": 1.08, "learning_rate": 0.00041019057171514544, "loss": 2.8923, "theoretical_loss": 3.8268041137104487, "tokens_seen": 620039168 }, { "epoch": 1.08, "learning_rate": 0.0004101805416248746, "loss": 3.0394, "theoretical_loss": 3.826762153582076, "tokens_seen": 620104704 }, { "epoch": 1.08, "learning_rate": 0.0004101705115346038, "loss": 2.817, "theoretical_loss": 3.8267201991295714, "tokens_seen": 620170240 }, { "epoch": 1.08, "learning_rate": 0.000410160481444333, "loss": 2.8574, "theoretical_loss": 3.826678250351568, "tokens_seen": 620235776 }, { "epoch": 1.08, "learning_rate": 0.0004101504513540622, "loss": 3.0186, "theoretical_loss": 3.826636307246698, "tokens_seen": 620301312 }, { "epoch": 1.08, "learning_rate": 0.00041014042126379135, "loss": 3.0513, "theoretical_loss": 3.8265943698135962, "tokens_seen": 620366848 }, { "epoch": 1.08, "learning_rate": 0.0004101303911735206, "loss": 3.0538, "theoretical_loss": 3.8265524380508955, "tokens_seen": 620432384 }, { "epoch": 1.08, "learning_rate": 0.00041012036108324976, "loss": 3.0698, "theoretical_loss": 3.8265105119572302, "tokens_seen": 620497920 }, { "epoch": 1.08, "learning_rate": 0.00041011033099297894, "loss": 3.0985, "theoretical_loss": 3.826468591531236, "tokens_seen": 620563456 }, { "epoch": 1.08, "learning_rate": 0.0004101003009027081, "loss": 2.9912, "theoretical_loss": 3.826426676771548, "tokens_seen": 620628992 }, { "epoch": 1.08, "learning_rate": 0.0004100902708124373, "loss": 3.1061, "theoretical_loss": 3.826384767676802, "tokens_seen": 620694528 }, { "epoch": 1.08, "learning_rate": 0.0004100802407221665, "loss": 3.0996, "theoretical_loss": 3.8263428642456345, "tokens_seen": 620760064 }, { "epoch": 1.08, "learning_rate": 0.0004100702106318957, "loss": 2.9576, "theoretical_loss": 3.8263009664766825, "tokens_seen": 620825600 }, { "epoch": 1.08, "learning_rate": 0.00041006018054162485, "loss": 3.0389, "theoretical_loss": 3.8262590743685827, "tokens_seen": 620891136 }, { "epoch": 1.08, "objective/train/docs_used": 1010658, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.05426025390625, "objective/train/theoretical_loss": 3.826227659001623, "objective/train/tokens_used": 641400288, "theoretical_loss": 3.826227659001623, "tokens_seen": 620940288 }, { "epoch": 1.08, "learning_rate": 0.0004100501504513541, "loss": 3.0352, "theoretical_loss": 3.826217187919973, "tokens_seen": 620956672 }, { "epoch": 1.08, "learning_rate": 0.0004100401203610832, "loss": 2.902, "theoretical_loss": 3.826175307129492, "tokens_seen": 621022208 }, { "epoch": 1.08, "learning_rate": 0.00041003009027081245, "loss": 3.0729, "theoretical_loss": 3.8261334319957787, "tokens_seen": 621087744 }, { "epoch": 1.08, "learning_rate": 0.00041002006018054163, "loss": 2.9242, "theoretical_loss": 3.826091562517471, "tokens_seen": 621153280 }, { "epoch": 1.08, "learning_rate": 0.0004100100300902708, "loss": 2.9989, "theoretical_loss": 3.82604969869321, "tokens_seen": 621218816 }, { "epoch": 1.08, "learning_rate": 0.00041, "loss": 3.0472, "theoretical_loss": 3.826007840521635, "tokens_seen": 621284352 }, { "epoch": 1.08, "learning_rate": 0.00040998996990972917, "loss": 2.9475, "theoretical_loss": 3.825965988001387, "tokens_seen": 621349888 }, { "epoch": 1.08, "learning_rate": 0.00040997993981945835, "loss": 3.2126, "theoretical_loss": 3.8259241411311065, "tokens_seen": 621415424 }, { "epoch": 1.08, "learning_rate": 0.0004099699097291876, "loss": 3.0976, "theoretical_loss": 3.8258822999094355, "tokens_seen": 621480960 }, { "epoch": 1.08, "learning_rate": 0.0004099598796389167, "loss": 3.1121, "theoretical_loss": 3.825840464335016, "tokens_seen": 621546496 }, { "epoch": 1.08, "learning_rate": 0.00040994984954864595, "loss": 3.0086, "theoretical_loss": 3.8257986344064903, "tokens_seen": 621612032 }, { "epoch": 1.08, "learning_rate": 0.00040993981945837513, "loss": 3.0185, "theoretical_loss": 3.8257568101225012, "tokens_seen": 621677568 }, { "epoch": 1.08, "learning_rate": 0.0004099297893681043, "loss": 2.9012, "theoretical_loss": 3.825714991481693, "tokens_seen": 621743104 }, { "epoch": 1.08, "learning_rate": 0.0004099197592778335, "loss": 3.0209, "theoretical_loss": 3.825673178482708, "tokens_seen": 621808640 }, { "epoch": 1.08, "learning_rate": 0.0004099097291875627, "loss": 3.0095, "theoretical_loss": 3.8256313711241914, "tokens_seen": 621874176 }, { "epoch": 1.08, "learning_rate": 0.0004098996990972919, "loss": 2.9423, "theoretical_loss": 3.825589569404789, "tokens_seen": 621939712 }, { "epoch": 1.08, "learning_rate": 0.0004098896690070211, "loss": 2.9384, "theoretical_loss": 3.8255477733231444, "tokens_seen": 622005248 }, { "epoch": 1.08, "learning_rate": 0.0004098796389167503, "loss": 2.9885, "theoretical_loss": 3.825505982877904, "tokens_seen": 622070784 }, { "epoch": 1.08, "learning_rate": 0.00040986960882647945, "loss": 3.06, "theoretical_loss": 3.8254641980677144, "tokens_seen": 622136320 }, { "epoch": 1.08, "learning_rate": 0.00040985957873620864, "loss": 3.0799, "theoretical_loss": 3.8254224188912223, "tokens_seen": 622201856 }, { "epoch": 1.09, "learning_rate": 0.0004098495486459378, "loss": 2.8832, "theoretical_loss": 3.8253806453470744, "tokens_seen": 622267392 }, { "epoch": 1.09, "learning_rate": 0.00040983951855566705, "loss": 2.9291, "theoretical_loss": 3.825338877433918, "tokens_seen": 622332928 }, { "epoch": 1.09, "learning_rate": 0.0004098294884653962, "loss": 3.0278, "theoretical_loss": 3.825297115150402, "tokens_seen": 622398464 }, { "epoch": 1.09, "learning_rate": 0.0004098194583751254, "loss": 2.9737, "theoretical_loss": 3.8252553584951743, "tokens_seen": 622464000 }, { "epoch": 1.09, "learning_rate": 0.0004098094282848546, "loss": 2.8907, "theoretical_loss": 3.8252136074668845, "tokens_seen": 622529536 }, { "epoch": 1.09, "objective/train/docs_used": 1013555, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.193575382232666, "objective/train/theoretical_loss": 3.8251822978875327, "objective/train/tokens_used": 643038688, "theoretical_loss": 3.8251822978875327, "tokens_seen": 622578688 }, { "epoch": 1.09, "learning_rate": 0.0004097993981945838, "loss": 3.0588, "theoretical_loss": 3.825171862064182, "tokens_seen": 622595072 }, { "epoch": 1.09, "learning_rate": 0.00040978936810431296, "loss": 2.9535, "theoretical_loss": 3.8251301222857164, "tokens_seen": 622660608 }, { "epoch": 1.09, "learning_rate": 0.00040977933801404214, "loss": 2.9696, "theoretical_loss": 3.8250883881301387, "tokens_seen": 622726144 }, { "epoch": 1.09, "learning_rate": 0.0004097693079237713, "loss": 2.9712, "theoretical_loss": 3.8250466595960986, "tokens_seen": 622791680 }, { "epoch": 1.09, "learning_rate": 0.00040975927783350055, "loss": 3.0238, "theoretical_loss": 3.825004936682249, "tokens_seen": 622857216 }, { "epoch": 1.09, "learning_rate": 0.0004097492477432297, "loss": 2.9292, "theoretical_loss": 3.8249632193872403, "tokens_seen": 622922752 }, { "epoch": 1.09, "learning_rate": 0.0004097392176529589, "loss": 2.9835, "theoretical_loss": 3.8249215077097265, "tokens_seen": 622988288 }, { "epoch": 1.09, "learning_rate": 0.00040972918756268804, "loss": 2.8399, "theoretical_loss": 3.8248798016483585, "tokens_seen": 623053824 }, { "epoch": 1.09, "learning_rate": 0.0004097191574724173, "loss": 3.0192, "theoretical_loss": 3.8248381012017907, "tokens_seen": 623119360 }, { "epoch": 1.09, "learning_rate": 0.00040970912738214646, "loss": 2.9527, "theoretical_loss": 3.8247964063686757, "tokens_seen": 623184896 }, { "epoch": 1.09, "learning_rate": 0.00040969909729187564, "loss": 2.8504, "theoretical_loss": 3.8247547171476692, "tokens_seen": 623250432 }, { "epoch": 1.09, "learning_rate": 0.0004096890672016048, "loss": 2.923, "theoretical_loss": 3.8247130335374244, "tokens_seen": 623315968 }, { "epoch": 1.09, "learning_rate": 0.000409679037111334, "loss": 2.9846, "theoretical_loss": 3.824671355536597, "tokens_seen": 623381504 }, { "epoch": 1.09, "learning_rate": 0.0004096690070210632, "loss": 2.8477, "theoretical_loss": 3.8246296831438427, "tokens_seen": 623447040 }, { "epoch": 1.09, "learning_rate": 0.0004096589769307924, "loss": 3.0156, "theoretical_loss": 3.824588016357817, "tokens_seen": 623512576 }, { "epoch": 1.09, "learning_rate": 0.00040964894684052155, "loss": 3.1196, "theoretical_loss": 3.8245463551771772, "tokens_seen": 623578112 }, { "epoch": 1.09, "learning_rate": 0.0004096389167502508, "loss": 3.1172, "theoretical_loss": 3.8245046996005794, "tokens_seen": 623643648 }, { "epoch": 1.09, "learning_rate": 0.00040962888665997996, "loss": 2.9341, "theoretical_loss": 3.824463049626681, "tokens_seen": 623709184 }, { "epoch": 1.09, "learning_rate": 0.00040961885656970914, "loss": 3.0444, "theoretical_loss": 3.82442140525414, "tokens_seen": 623774720 }, { "epoch": 1.09, "learning_rate": 0.0004096088264794383, "loss": 2.884, "theoretical_loss": 3.824379766481615, "tokens_seen": 623840256 }, { "epoch": 1.09, "learning_rate": 0.0004095987963891675, "loss": 3.1341, "theoretical_loss": 3.824338133307765, "tokens_seen": 623905792 }, { "epoch": 1.09, "learning_rate": 0.0004095887662988967, "loss": 3.1119, "theoretical_loss": 3.8242965057312484, "tokens_seen": 623971328 }, { "epoch": 1.09, "learning_rate": 0.0004095787362086259, "loss": 3.0338, "theoretical_loss": 3.8242548837507253, "tokens_seen": 624036864 }, { "epoch": 1.09, "learning_rate": 0.00040956870611835505, "loss": 3.0976, "theoretical_loss": 3.824213267364856, "tokens_seen": 624102400 }, { "epoch": 1.09, "learning_rate": 0.0004095586760280843, "loss": 3.0212, "theoretical_loss": 3.8241716565723003, "tokens_seen": 624167936 }, { "epoch": 1.09, "objective/train/docs_used": 1016336, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.002312183380127, "objective/train/theoretical_loss": 3.824140452147691, "objective/train/tokens_used": 644677088, "theoretical_loss": 3.824140452147691, "tokens_seen": 624217088 }, { "epoch": 1.09, "learning_rate": 0.0004095486459378134, "loss": 2.954, "theoretical_loss": 3.824130051371721, "tokens_seen": 624233472 }, { "epoch": 1.09, "learning_rate": 0.00040953861584754265, "loss": 2.8653, "theoretical_loss": 3.824088451761778, "tokens_seen": 624299008 }, { "epoch": 1.09, "learning_rate": 0.00040952858575727183, "loss": 2.8787, "theoretical_loss": 3.824046857741134, "tokens_seen": 624364544 }, { "epoch": 1.09, "learning_rate": 0.000409518555667001, "loss": 3.064, "theoretical_loss": 3.8240052693084516, "tokens_seen": 624430080 }, { "epoch": 1.09, "learning_rate": 0.0004095085255767302, "loss": 3.0077, "theoretical_loss": 3.8239636864623936, "tokens_seen": 624495616 }, { "epoch": 1.09, "learning_rate": 0.00040949849548645937, "loss": 2.7877, "theoretical_loss": 3.8239221092016233, "tokens_seen": 624561152 }, { "epoch": 1.09, "learning_rate": 0.00040948846539618855, "loss": 3.0346, "theoretical_loss": 3.823880537524804, "tokens_seen": 624626688 }, { "epoch": 1.09, "learning_rate": 0.0004094784353059178, "loss": 3.0365, "theoretical_loss": 3.823838971430601, "tokens_seen": 624692224 }, { "epoch": 1.09, "learning_rate": 0.0004094684052156469, "loss": 2.8997, "theoretical_loss": 3.8237974109176793, "tokens_seen": 624757760 }, { "epoch": 1.09, "learning_rate": 0.00040945837512537615, "loss": 2.9792, "theoretical_loss": 3.823755855984703, "tokens_seen": 624823296 }, { "epoch": 1.09, "learning_rate": 0.00040944834503510533, "loss": 3.0363, "theoretical_loss": 3.823714306630338, "tokens_seen": 624888832 }, { "epoch": 1.09, "learning_rate": 0.0004094383149448345, "loss": 3.0099, "theoretical_loss": 3.8236727628532505, "tokens_seen": 624954368 }, { "epoch": 1.09, "learning_rate": 0.0004094282848545637, "loss": 3.0224, "theoretical_loss": 3.8236312246521074, "tokens_seen": 625019904 }, { "epoch": 1.09, "learning_rate": 0.0004094182547642929, "loss": 3.0963, "theoretical_loss": 3.8235896920255756, "tokens_seen": 625085440 }, { "epoch": 1.09, "learning_rate": 0.00040940822467402206, "loss": 2.9338, "theoretical_loss": 3.823548164972323, "tokens_seen": 625150976 }, { "epoch": 1.09, "learning_rate": 0.0004093981945837513, "loss": 2.889, "theoretical_loss": 3.823506643491017, "tokens_seen": 625216512 }, { "epoch": 1.09, "learning_rate": 0.0004093881644934804, "loss": 3.0064, "theoretical_loss": 3.8234651275803264, "tokens_seen": 625282048 }, { "epoch": 1.09, "learning_rate": 0.00040937813440320965, "loss": 2.9904, "theoretical_loss": 3.8234236172389204, "tokens_seen": 625347584 }, { "epoch": 1.09, "learning_rate": 0.0004093681043129388, "loss": 2.8866, "theoretical_loss": 3.8233821124654677, "tokens_seen": 625413120 }, { "epoch": 1.09, "learning_rate": 0.000409358074222668, "loss": 3.0059, "theoretical_loss": 3.8233406132586376, "tokens_seen": 625478656 }, { "epoch": 1.09, "learning_rate": 0.0004093480441323972, "loss": 2.8815, "theoretical_loss": 3.823299119617102, "tokens_seen": 625544192 }, { "epoch": 1.09, "learning_rate": 0.0004093380140421264, "loss": 2.91, "theoretical_loss": 3.8232576315395304, "tokens_seen": 625609728 }, { "epoch": 1.09, "learning_rate": 0.00040932798395185556, "loss": 2.7257, "theoretical_loss": 3.823216149024594, "tokens_seen": 625675264 }, { "epoch": 1.09, "learning_rate": 0.0004093179538615848, "loss": 2.9883, "theoretical_loss": 3.823174672070965, "tokens_seen": 625740800 }, { "epoch": 1.09, "learning_rate": 0.0004093079237713139, "loss": 2.8426, "theoretical_loss": 3.823133200677316, "tokens_seen": 625806336 }, { "epoch": 1.09, "objective/train/docs_used": 1019051, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8987340927124023, "objective/train/theoretical_loss": 3.823102100780016, "objective/train/tokens_used": 646315488, "theoretical_loss": 3.823102100780016, "tokens_seen": 625855488 }, { "epoch": 1.09, "learning_rate": 0.00040929789368104316, "loss": 3.016, "theoretical_loss": 3.8230917348423175, "tokens_seen": 625871872 }, { "epoch": 1.09, "learning_rate": 0.0004092878635907723, "loss": 2.926, "theoretical_loss": 3.8230502745646446, "tokens_seen": 625937408 }, { "epoch": 1.09, "learning_rate": 0.0004092778335005015, "loss": 2.861, "theoretical_loss": 3.8230088198429697, "tokens_seen": 626002944 }, { "epoch": 1.09, "learning_rate": 0.0004092678034102307, "loss": 2.9678, "theoretical_loss": 3.8229673706759666, "tokens_seen": 626068480 }, { "epoch": 1.09, "learning_rate": 0.0004092577733199599, "loss": 3.1246, "theoretical_loss": 3.82292592706231, "tokens_seen": 626134016 }, { "epoch": 1.09, "learning_rate": 0.00040924774322968906, "loss": 2.9663, "theoretical_loss": 3.8228844890006757, "tokens_seen": 626199552 }, { "epoch": 1.09, "learning_rate": 0.00040923771313941824, "loss": 3.084, "theoretical_loss": 3.822843056489737, "tokens_seen": 626265088 }, { "epoch": 1.09, "learning_rate": 0.0004092276830491474, "loss": 2.992, "theoretical_loss": 3.8228016295281715, "tokens_seen": 626330624 }, { "epoch": 1.09, "learning_rate": 0.00040921765295887666, "loss": 3.0555, "theoretical_loss": 3.822760208114654, "tokens_seen": 626396160 }, { "epoch": 1.09, "learning_rate": 0.0004092076228686058, "loss": 2.9306, "theoretical_loss": 3.822718792247862, "tokens_seen": 626461696 }, { "epoch": 1.09, "learning_rate": 0.000409197592778335, "loss": 2.8893, "theoretical_loss": 3.8226773819264723, "tokens_seen": 626527232 }, { "epoch": 1.09, "learning_rate": 0.00040918756268806415, "loss": 2.7813, "theoretical_loss": 3.8226359771491625, "tokens_seen": 626592768 }, { "epoch": 1.09, "learning_rate": 0.0004091775325977934, "loss": 3.2389, "theoretical_loss": 3.82259457791461, "tokens_seen": 626658304 }, { "epoch": 1.09, "learning_rate": 0.00040916750250752257, "loss": 3.0976, "theoretical_loss": 3.8225531842214946, "tokens_seen": 626723840 }, { "epoch": 1.09, "learning_rate": 0.00040915747241725175, "loss": 2.9803, "theoretical_loss": 3.8225117960684942, "tokens_seen": 626789376 }, { "epoch": 1.09, "learning_rate": 0.000409147442326981, "loss": 3.0543, "theoretical_loss": 3.8224704134542877, "tokens_seen": 626854912 }, { "epoch": 1.09, "learning_rate": 0.00040913741223671016, "loss": 2.95, "theoretical_loss": 3.8224290363775566, "tokens_seen": 626920448 }, { "epoch": 1.09, "learning_rate": 0.00040912738214643934, "loss": 2.9409, "theoretical_loss": 3.8223876648369792, "tokens_seen": 626985984 }, { "epoch": 1.09, "learning_rate": 0.0004091173520561685, "loss": 3.0001, "theoretical_loss": 3.8223462988312376, "tokens_seen": 627051520 }, { "epoch": 1.09, "learning_rate": 0.0004091073219658977, "loss": 2.9928, "theoretical_loss": 3.8223049383590126, "tokens_seen": 627117056 }, { "epoch": 1.09, "learning_rate": 0.0004090972918756269, "loss": 3.083, "theoretical_loss": 3.8222635834189864, "tokens_seen": 627182592 }, { "epoch": 1.09, "learning_rate": 0.0004090872617853561, "loss": 2.9583, "theoretical_loss": 3.82222223400984, "tokens_seen": 627248128 }, { "epoch": 1.09, "learning_rate": 0.00040907723169508525, "loss": 2.9331, "theoretical_loss": 3.822180890130256, "tokens_seen": 627313664 }, { "epoch": 1.09, "learning_rate": 0.0004090672016048145, "loss": 2.8905, "theoretical_loss": 3.8221395517789185, "tokens_seen": 627379200 }, { "epoch": 1.09, "learning_rate": 0.0004090571715145436, "loss": 3.099, "theoretical_loss": 3.82209821895451, "tokens_seen": 627444736 }, { "epoch": 1.09, "objective/train/docs_used": 1020436, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9886562824249268, "objective/train/theoretical_loss": 3.822067222962459, "objective/train/tokens_used": 647953888, "theoretical_loss": 3.822067222962459, "tokens_seen": 627493888 }, { "epoch": 1.09, "learning_rate": 0.00040904714142427285, "loss": 2.812, "theoretical_loss": 3.8220568916557145, "tokens_seen": 627510272 }, { "epoch": 1.09, "learning_rate": 0.00040903711133400203, "loss": 3.0288, "theoretical_loss": 3.8220155698812164, "tokens_seen": 627575808 }, { "epoch": 1.09, "learning_rate": 0.0004090270812437312, "loss": 3.1363, "theoretical_loss": 3.8219742536297003, "tokens_seen": 627641344 }, { "epoch": 1.09, "learning_rate": 0.0004090170511534604, "loss": 3.0378, "theoretical_loss": 3.821932942899852, "tokens_seen": 627706880 }, { "epoch": 1.09, "learning_rate": 0.00040900702106318957, "loss": 2.8102, "theoretical_loss": 3.821891637690357, "tokens_seen": 627772416 }, { "epoch": 1.09, "learning_rate": 0.00040899699097291875, "loss": 3.0265, "theoretical_loss": 3.8218503379999014, "tokens_seen": 627837952 }, { "epoch": 1.09, "learning_rate": 0.000408986960882648, "loss": 2.8772, "theoretical_loss": 3.821809043827171, "tokens_seen": 627903488 }, { "epoch": 1.09, "learning_rate": 0.0004089769307923771, "loss": 2.9654, "theoretical_loss": 3.8217677551708538, "tokens_seen": 627969024 }, { "epoch": 1.09, "learning_rate": 0.00040896690070210635, "loss": 2.8859, "theoretical_loss": 3.821726472029637, "tokens_seen": 628034560 }, { "epoch": 1.09, "learning_rate": 0.00040895687061183553, "loss": 3.0366, "theoretical_loss": 3.8216851944022086, "tokens_seen": 628100096 }, { "epoch": 1.09, "learning_rate": 0.0004089468405215647, "loss": 2.9101, "theoretical_loss": 3.821643922287257, "tokens_seen": 628165632 }, { "epoch": 1.09, "learning_rate": 0.0004089368104312939, "loss": 3.0855, "theoretical_loss": 3.821602655683471, "tokens_seen": 628231168 }, { "epoch": 1.09, "learning_rate": 0.0004089267803410231, "loss": 2.9058, "theoretical_loss": 3.8215613945895393, "tokens_seen": 628296704 }, { "epoch": 1.09, "learning_rate": 0.00040891675025075226, "loss": 2.9738, "theoretical_loss": 3.8215201390041527, "tokens_seen": 628362240 }, { "epoch": 1.09, "learning_rate": 0.0004089067201604815, "loss": 3.0092, "theoretical_loss": 3.8214788889260007, "tokens_seen": 628427776 }, { "epoch": 1.09, "learning_rate": 0.0004088966900702106, "loss": 2.9664, "theoretical_loss": 3.8214376443537743, "tokens_seen": 628493312 }, { "epoch": 1.09, "learning_rate": 0.00040888665997993985, "loss": 3.0043, "theoretical_loss": 3.8213964052861638, "tokens_seen": 628558848 }, { "epoch": 1.09, "learning_rate": 0.000408876629889669, "loss": 3.2059, "theoretical_loss": 3.8213551717218612, "tokens_seen": 628624384 }, { "epoch": 1.09, "learning_rate": 0.0004088665997993982, "loss": 3.0063, "theoretical_loss": 3.821313943659559, "tokens_seen": 628689920 }, { "epoch": 1.09, "learning_rate": 0.0004088565697091274, "loss": 3.0468, "theoretical_loss": 3.821272721097949, "tokens_seen": 628755456 }, { "epoch": 1.09, "learning_rate": 0.0004088465396188566, "loss": 3.1809, "theoretical_loss": 3.821231504035724, "tokens_seen": 628820992 }, { "epoch": 1.09, "learning_rate": 0.00040883650952858576, "loss": 2.7985, "theoretical_loss": 3.8211902924715777, "tokens_seen": 628886528 }, { "epoch": 1.09, "learning_rate": 0.000408826479438315, "loss": 3.0077, "theoretical_loss": 3.8211490864042035, "tokens_seen": 628952064 }, { "epoch": 1.09, "learning_rate": 0.0004088164493480441, "loss": 3.0364, "theoretical_loss": 3.821107885832296, "tokens_seen": 629017600 }, { "epoch": 1.09, "learning_rate": 0.00040880641925777336, "loss": 3.1328, "theoretical_loss": 3.821066690754549, "tokens_seen": 629083136 }, { "epoch": 1.09, "objective/train/docs_used": 1023249, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.4309182167053223, "objective/train/theoretical_loss": 3.821035798050998, "objective/train/tokens_used": 649592288, "theoretical_loss": 3.821035798050998, "tokens_seen": 629132288 }, { "epoch": 1.09, "learning_rate": 0.0004087963891675025, "loss": 3.0433, "theoretical_loss": 3.821025501169659, "tokens_seen": 629148672 }, { "epoch": 1.09, "learning_rate": 0.0004087863590772317, "loss": 3.1107, "theoretical_loss": 3.8209843170763205, "tokens_seen": 629214208 }, { "epoch": 1.09, "learning_rate": 0.0004087763289869609, "loss": 3.1054, "theoretical_loss": 3.82094313847323, "tokens_seen": 629279744 }, { "epoch": 1.09, "learning_rate": 0.0004087662988966901, "loss": 2.919, "theoretical_loss": 3.820901965359083, "tokens_seen": 629345280 }, { "epoch": 1.09, "learning_rate": 0.00040875626880641926, "loss": 3.026, "theoretical_loss": 3.8208607977325775, "tokens_seen": 629410816 }, { "epoch": 1.09, "learning_rate": 0.00040874623871614844, "loss": 2.8452, "theoretical_loss": 3.82081963559241, "tokens_seen": 629476352 }, { "epoch": 1.09, "learning_rate": 0.0004087362086258776, "loss": 2.9181, "theoretical_loss": 3.8207784789372785, "tokens_seen": 629541888 }, { "epoch": 1.09, "learning_rate": 0.00040872617853560686, "loss": 3.1262, "theoretical_loss": 3.820737327765882, "tokens_seen": 629607424 }, { "epoch": 1.09, "learning_rate": 0.000408716148445336, "loss": 2.9961, "theoretical_loss": 3.820696182076918, "tokens_seen": 629672960 }, { "epoch": 1.09, "learning_rate": 0.0004087061183550652, "loss": 2.961, "theoretical_loss": 3.8206550418690863, "tokens_seen": 629738496 }, { "epoch": 1.09, "learning_rate": 0.00040869608826479435, "loss": 3.1116, "theoretical_loss": 3.820613907141086, "tokens_seen": 629804032 }, { "epoch": 1.09, "learning_rate": 0.0004086860581745236, "loss": 3.0589, "theoretical_loss": 3.820572777891617, "tokens_seen": 629869568 }, { "epoch": 1.09, "learning_rate": 0.00040867602808425277, "loss": 3.1101, "theoretical_loss": 3.820531654119381, "tokens_seen": 629935104 }, { "epoch": 1.09, "learning_rate": 0.00040866599799398195, "loss": 3.056, "theoretical_loss": 3.820490535823078, "tokens_seen": 630000640 }, { "epoch": 1.09, "learning_rate": 0.00040865596790371113, "loss": 2.8466, "theoretical_loss": 3.820449423001408, "tokens_seen": 630066176 }, { "epoch": 1.09, "learning_rate": 0.00040864593781344036, "loss": 2.906, "theoretical_loss": 3.820408315653075, "tokens_seen": 630131712 }, { "epoch": 1.09, "learning_rate": 0.0004086359077231695, "loss": 3.0512, "theoretical_loss": 3.8203672137767795, "tokens_seen": 630197248 }, { "epoch": 1.09, "learning_rate": 0.0004086258776328987, "loss": 2.9725, "theoretical_loss": 3.8203261173712253, "tokens_seen": 630262784 }, { "epoch": 1.09, "learning_rate": 0.00040861584754262785, "loss": 2.9571, "theoretical_loss": 3.820285026435115, "tokens_seen": 630328320 }, { "epoch": 1.09, "learning_rate": 0.0004086058174523571, "loss": 2.9134, "theoretical_loss": 3.8202439409671523, "tokens_seen": 630393856 }, { "epoch": 1.09, "learning_rate": 0.00040859578736208627, "loss": 2.9089, "theoretical_loss": 3.820202860966041, "tokens_seen": 630459392 }, { "epoch": 1.09, "learning_rate": 0.00040858575727181545, "loss": 2.9465, "theoretical_loss": 3.8201617864304853, "tokens_seen": 630524928 }, { "epoch": 1.09, "learning_rate": 0.00040857572718154463, "loss": 3.0397, "theoretical_loss": 3.8201207173591905, "tokens_seen": 630590464 }, { "epoch": 1.09, "learning_rate": 0.0004085656970912738, "loss": 2.9591, "theoretical_loss": 3.8200796537508612, "tokens_seen": 630656000 }, { "epoch": 1.09, "learning_rate": 0.000408555667001003, "loss": 2.9389, "theoretical_loss": 3.8200385956042044, "tokens_seen": 630721536 }, { "epoch": 1.09, "objective/train/docs_used": 1026131, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0061473846435547, "objective/train/theoretical_loss": 3.8200078055776556, "objective/train/tokens_used": 651230688, "theoretical_loss": 3.8200078055776556, "tokens_seen": 630770688 }, { "epoch": 1.09, "learning_rate": 0.00040854563691073223, "loss": 3.007, "theoretical_loss": 3.8199975429179256, "tokens_seen": 630787072 }, { "epoch": 1.09, "learning_rate": 0.00040853560682046136, "loss": 3.1183, "theoretical_loss": 3.8199564956907306, "tokens_seen": 630852608 }, { "epoch": 1.09, "learning_rate": 0.0004085255767301906, "loss": 2.9348, "theoretical_loss": 3.819915453921328, "tokens_seen": 630918144 }, { "epoch": 1.09, "learning_rate": 0.0004085155466399197, "loss": 3.0618, "theoretical_loss": 3.819874417608424, "tokens_seen": 630983680 }, { "epoch": 1.09, "learning_rate": 0.00040850551654964895, "loss": 2.928, "theoretical_loss": 3.819833386750727, "tokens_seen": 631049216 }, { "epoch": 1.09, "learning_rate": 0.00040849548645937814, "loss": 2.9647, "theoretical_loss": 3.8197923613469458, "tokens_seen": 631114752 }, { "epoch": 1.09, "learning_rate": 0.0004084854563691073, "loss": 2.9973, "theoretical_loss": 3.8197513413957886, "tokens_seen": 631180288 }, { "epoch": 1.09, "learning_rate": 0.0004084754262788365, "loss": 3.0629, "theoretical_loss": 3.819710326895965, "tokens_seen": 631245824 }, { "epoch": 1.09, "learning_rate": 0.00040846539618856573, "loss": 3.0751, "theoretical_loss": 3.8196693178461847, "tokens_seen": 631311360 }, { "epoch": 1.09, "learning_rate": 0.00040845536609829486, "loss": 3.0768, "theoretical_loss": 3.8196283142451577, "tokens_seen": 631376896 }, { "epoch": 1.09, "learning_rate": 0.0004084453360080241, "loss": 3.0639, "theoretical_loss": 3.819587316091595, "tokens_seen": 631442432 }, { "epoch": 1.09, "learning_rate": 0.0004084353059177532, "loss": 3.0109, "theoretical_loss": 3.8195463233842064, "tokens_seen": 631507968 }, { "epoch": 1.09, "learning_rate": 0.00040842527582748246, "loss": 3.0436, "theoretical_loss": 3.819505336121705, "tokens_seen": 631573504 }, { "epoch": 1.09, "learning_rate": 0.00040841524573721164, "loss": 3.0009, "theoretical_loss": 3.8194643543028013, "tokens_seen": 631639040 }, { "epoch": 1.09, "learning_rate": 0.0004084052156469408, "loss": 3.0772, "theoretical_loss": 3.819423377926209, "tokens_seen": 631704576 }, { "epoch": 1.09, "learning_rate": 0.00040839518555667005, "loss": 2.9622, "theoretical_loss": 3.8193824069906395, "tokens_seen": 631770112 }, { "epoch": 1.09, "learning_rate": 0.0004083851554663992, "loss": 2.9703, "theoretical_loss": 3.8193414414948066, "tokens_seen": 631835648 }, { "epoch": 1.09, "learning_rate": 0.0004083751253761284, "loss": 3.0009, "theoretical_loss": 3.8193004814374243, "tokens_seen": 631901184 }, { "epoch": 1.09, "learning_rate": 0.0004083650952858576, "loss": 2.91, "theoretical_loss": 3.8192595268172065, "tokens_seen": 631966720 }, { "epoch": 1.09, "learning_rate": 0.0004083550651955868, "loss": 2.9564, "theoretical_loss": 3.819218577632868, "tokens_seen": 632032256 }, { "epoch": 1.09, "learning_rate": 0.00040834503510531596, "loss": 2.9325, "theoretical_loss": 3.8191776338831227, "tokens_seen": 632097792 }, { "epoch": 1.09, "learning_rate": 0.0004083350050150452, "loss": 2.9653, "theoretical_loss": 3.8191366955666863, "tokens_seen": 632163328 }, { "epoch": 1.09, "learning_rate": 0.0004083249749247743, "loss": 3.0697, "theoretical_loss": 3.8190957626822755, "tokens_seen": 632228864 }, { "epoch": 1.09, "learning_rate": 0.00040831494483450356, "loss": 3.0581, "theoretical_loss": 3.819054835228606, "tokens_seen": 632294400 }, { "epoch": 1.09, "learning_rate": 0.0004083049147442327, "loss": 2.9804, "theoretical_loss": 3.8190139132043948, "tokens_seen": 632359936 }, { "epoch": 1.09, "objective/train/docs_used": 1029177, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1116693019866943, "objective/train/theoretical_loss": 3.8189832252485463, "objective/train/tokens_used": 652869088, "theoretical_loss": 3.8189832252485463, "tokens_seen": 632409088 }, { "epoch": 1.09, "learning_rate": 0.0004082948846539619, "loss": 3.0862, "theoretical_loss": 3.818972996608358, "tokens_seen": 632425472 }, { "epoch": 1.09, "learning_rate": 0.0004082848545636911, "loss": 3.003, "theoretical_loss": 3.818932085439215, "tokens_seen": 632491008 }, { "epoch": 1.09, "learning_rate": 0.0004082748244734203, "loss": 3.0089, "theoretical_loss": 3.818891179695682, "tokens_seen": 632556544 }, { "epoch": 1.09, "learning_rate": 0.00040826479438314946, "loss": 2.8535, "theoretical_loss": 3.8188502793764787, "tokens_seen": 632622080 }, { "epoch": 1.09, "learning_rate": 0.00040825476429287864, "loss": 3.0688, "theoretical_loss": 3.818809384480324, "tokens_seen": 632687616 }, { "epoch": 1.09, "learning_rate": 0.0004082447342026078, "loss": 2.8728, "theoretical_loss": 3.818768495005936, "tokens_seen": 632753152 }, { "epoch": 1.09, "learning_rate": 0.00040823470411233706, "loss": 3.0386, "theoretical_loss": 3.8187276109520356, "tokens_seen": 632818688 }, { "epoch": 1.09, "learning_rate": 0.0004082246740220662, "loss": 2.9066, "theoretical_loss": 3.8186867323173423, "tokens_seen": 632884224 }, { "epoch": 1.09, "learning_rate": 0.0004082146439317954, "loss": 2.9598, "theoretical_loss": 3.818645859100577, "tokens_seen": 632949760 }, { "epoch": 1.09, "learning_rate": 0.00040820461384152455, "loss": 2.9568, "theoretical_loss": 3.818604991300461, "tokens_seen": 633015296 }, { "epoch": 1.09, "learning_rate": 0.0004081945837512538, "loss": 3.1048, "theoretical_loss": 3.8185641289157153, "tokens_seen": 633080832 }, { "epoch": 1.09, "learning_rate": 0.00040818455366098297, "loss": 3.152, "theoretical_loss": 3.818523271945063, "tokens_seen": 633146368 }, { "epoch": 1.09, "learning_rate": 0.00040817452357071215, "loss": 2.8759, "theoretical_loss": 3.818482420387225, "tokens_seen": 633211904 }, { "epoch": 1.09, "learning_rate": 0.00040816449348044133, "loss": 3.027, "theoretical_loss": 3.818441574240924, "tokens_seen": 633277440 }, { "epoch": 1.09, "learning_rate": 0.00040815446339017056, "loss": 2.8813, "theoretical_loss": 3.818400733504885, "tokens_seen": 633342976 }, { "epoch": 1.09, "learning_rate": 0.0004081444332998997, "loss": 2.863, "theoretical_loss": 3.81835989817783, "tokens_seen": 633408512 }, { "epoch": 1.09, "learning_rate": 0.0004081344032096289, "loss": 2.8875, "theoretical_loss": 3.818319068258484, "tokens_seen": 633474048 }, { "epoch": 1.09, "learning_rate": 0.00040812437311935805, "loss": 2.9705, "theoretical_loss": 3.8182782437455707, "tokens_seen": 633539584 }, { "epoch": 1.09, "learning_rate": 0.0004081143430290873, "loss": 2.9951, "theoretical_loss": 3.8182374246378155, "tokens_seen": 633605120 }, { "epoch": 1.09, "learning_rate": 0.00040810431293881647, "loss": 3.0398, "theoretical_loss": 3.8181966109339447, "tokens_seen": 633670656 }, { "epoch": 1.09, "learning_rate": 0.00040809428284854565, "loss": 3.0208, "theoretical_loss": 3.8181558026326825, "tokens_seen": 633736192 }, { "epoch": 1.09, "learning_rate": 0.00040808425275827483, "loss": 3.0345, "theoretical_loss": 3.8181149997327566, "tokens_seen": 633801728 }, { "epoch": 1.09, "learning_rate": 0.000408074222668004, "loss": 3.0839, "theoretical_loss": 3.818074202232893, "tokens_seen": 633867264 }, { "epoch": 1.09, "learning_rate": 0.0004080641925777332, "loss": 3.0662, "theoretical_loss": 3.818033410131818, "tokens_seen": 633932800 }, { "epoch": 1.09, "learning_rate": 0.00040805416248746243, "loss": 2.8024, "theoretical_loss": 3.8179926234282604, "tokens_seen": 633998336 }, { "epoch": 1.09, "objective/train/docs_used": 1032058, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0397236347198486, "objective/train/theoretical_loss": 3.817962036941948, "objective/train/tokens_used": 654507488, "theoretical_loss": 3.817962036941948, "tokens_seen": 634047488 }, { "epoch": 1.09, "learning_rate": 0.00040804413239719156, "loss": 3.072, "theoretical_loss": 3.817951842120948, "tokens_seen": 634063872 }, { "epoch": 1.09, "learning_rate": 0.0004080341023069208, "loss": 2.9929, "theoretical_loss": 3.817911066208609, "tokens_seen": 634129408 }, { "epoch": 1.09, "learning_rate": 0.0004080240722166499, "loss": 3.0123, "theoretical_loss": 3.817870295689972, "tokens_seen": 634194944 }, { "epoch": 1.09, "learning_rate": 0.00040801404212637915, "loss": 2.9605, "theoretical_loss": 3.817829530563767, "tokens_seen": 634260480 }, { "epoch": 1.09, "learning_rate": 0.00040800401203610834, "loss": 2.8597, "theoretical_loss": 3.817788770828723, "tokens_seen": 634326016 }, { "epoch": 1.09, "learning_rate": 0.0004079939819458375, "loss": 3.0676, "theoretical_loss": 3.81774801648357, "tokens_seen": 634391552 }, { "epoch": 1.09, "learning_rate": 0.0004079839518555667, "loss": 3.1365, "theoretical_loss": 3.81770726752704, "tokens_seen": 634457088 }, { "epoch": 1.09, "learning_rate": 0.00040797392176529593, "loss": 2.98, "theoretical_loss": 3.817666523957862, "tokens_seen": 634522624 }, { "epoch": 1.09, "learning_rate": 0.00040796389167502506, "loss": 3.1103, "theoretical_loss": 3.8176257857747684, "tokens_seen": 634588160 }, { "epoch": 1.09, "learning_rate": 0.0004079538615847543, "loss": 3.0585, "theoretical_loss": 3.817585052976492, "tokens_seen": 634653696 }, { "epoch": 1.09, "learning_rate": 0.0004079438314944834, "loss": 3.0361, "theoretical_loss": 3.817544325561763, "tokens_seen": 634719232 }, { "epoch": 1.09, "learning_rate": 0.00040793380140421266, "loss": 2.9508, "theoretical_loss": 3.8175036035293157, "tokens_seen": 634784768 }, { "epoch": 1.09, "learning_rate": 0.00040792377131394184, "loss": 3.0965, "theoretical_loss": 3.817462886877883, "tokens_seen": 634850304 }, { "epoch": 1.09, "learning_rate": 0.000407913741223671, "loss": 3.1115, "theoretical_loss": 3.817422175606198, "tokens_seen": 634915840 }, { "epoch": 1.09, "learning_rate": 0.0004079037111334002, "loss": 2.811, "theoretical_loss": 3.817381469712995, "tokens_seen": 634981376 }, { "epoch": 1.09, "learning_rate": 0.0004078936810431294, "loss": 2.773, "theoretical_loss": 3.817340769197009, "tokens_seen": 635046912 }, { "epoch": 1.09, "learning_rate": 0.00040788365095285856, "loss": 2.7618, "theoretical_loss": 3.8173000740569734, "tokens_seen": 635112448 }, { "epoch": 1.09, "learning_rate": 0.0004078736208625878, "loss": 2.9246, "theoretical_loss": 3.817259384291625, "tokens_seen": 635177984 }, { "epoch": 1.09, "learning_rate": 0.0004078635907723169, "loss": 3.0038, "theoretical_loss": 3.8172186998996986, "tokens_seen": 635243520 }, { "epoch": 1.09, "learning_rate": 0.00040785356068204616, "loss": 2.9456, "theoretical_loss": 3.8171780208799304, "tokens_seen": 635309056 }, { "epoch": 1.09, "learning_rate": 0.0004078435305917753, "loss": 3.0677, "theoretical_loss": 3.817137347231058, "tokens_seen": 635374592 }, { "epoch": 1.09, "learning_rate": 0.0004078335005015045, "loss": 3.0443, "theoretical_loss": 3.817096678951817, "tokens_seen": 635440128 }, { "epoch": 1.09, "learning_rate": 0.0004078234704112337, "loss": 3.1896, "theoretical_loss": 3.8170560160409455, "tokens_seen": 635505664 }, { "epoch": 1.09, "learning_rate": 0.0004078134403209629, "loss": 2.9682, "theoretical_loss": 3.8170153584971813, "tokens_seen": 635571200 }, { "epoch": 1.09, "learning_rate": 0.00040780341023069207, "loss": 3.1072, "theoretical_loss": 3.8169747063192627, "tokens_seen": 635636736 }, { "epoch": 1.09, "objective/train/docs_used": 1034455, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8656325340270996, "objective/train/theoretical_loss": 3.816944220706401, "objective/train/tokens_used": 656145888, "theoretical_loss": 3.816944220706401, "tokens_seen": 635685888 }, { "epoch": 1.09, "learning_rate": 0.0004077933801404213, "loss": 3.0127, "theoretical_loss": 3.8169340595059285, "tokens_seen": 635702272 }, { "epoch": 1.09, "learning_rate": 0.00040778335005015043, "loss": 2.9831, "theoretical_loss": 3.8168934180559173, "tokens_seen": 635767808 }, { "epoch": 1.09, "learning_rate": 0.00040777331995987966, "loss": 3.271, "theoretical_loss": 3.8168527819679694, "tokens_seen": 635833344 }, { "epoch": 1.09, "learning_rate": 0.0004077632898696088, "loss": 3.0704, "theoretical_loss": 3.816812151240825, "tokens_seen": 635898880 }, { "epoch": 1.09, "learning_rate": 0.000407753259779338, "loss": 3.0034, "theoretical_loss": 3.8167715258732233, "tokens_seen": 635964416 }, { "epoch": 1.09, "learning_rate": 0.0004077432296890672, "loss": 2.9231, "theoretical_loss": 3.8167309058639063, "tokens_seen": 636029952 }, { "epoch": 1.09, "learning_rate": 0.0004077331995987964, "loss": 3.1567, "theoretical_loss": 3.8166902912116143, "tokens_seen": 636095488 }, { "epoch": 1.09, "learning_rate": 0.00040772316950852557, "loss": 2.9678, "theoretical_loss": 3.8166496819150897, "tokens_seen": 636161024 }, { "epoch": 1.09, "learning_rate": 0.00040771313941825475, "loss": 2.9855, "theoretical_loss": 3.816609077973075, "tokens_seen": 636226560 }, { "epoch": 1.09, "learning_rate": 0.00040770310932798393, "loss": 3.0802, "theoretical_loss": 3.816568479384311, "tokens_seen": 636292096 }, { "epoch": 1.09, "learning_rate": 0.00040769307923771317, "loss": 2.9993, "theoretical_loss": 3.8165278861475422, "tokens_seen": 636357632 }, { "epoch": 1.09, "learning_rate": 0.0004076830491474423, "loss": 2.9829, "theoretical_loss": 3.816487298261512, "tokens_seen": 636423168 }, { "epoch": 1.09, "learning_rate": 0.00040767301905717153, "loss": 2.9833, "theoretical_loss": 3.8164467157249633, "tokens_seen": 636488704 }, { "epoch": 1.09, "learning_rate": 0.0004076629889669007, "loss": 2.932, "theoretical_loss": 3.8164061385366415, "tokens_seen": 636554240 }, { "epoch": 1.09, "learning_rate": 0.0004076529588766299, "loss": 2.8551, "theoretical_loss": 3.81636556669529, "tokens_seen": 636619776 }, { "epoch": 1.09, "learning_rate": 0.0004076429287863591, "loss": 3.0436, "theoretical_loss": 3.8163250001996545, "tokens_seen": 636685312 }, { "epoch": 1.09, "learning_rate": 0.00040763289869608825, "loss": 2.9681, "theoretical_loss": 3.8162844390484807, "tokens_seen": 636750848 }, { "epoch": 1.09, "learning_rate": 0.0004076228686058175, "loss": 3.0787, "theoretical_loss": 3.8162438832405146, "tokens_seen": 636816384 }, { "epoch": 1.09, "learning_rate": 0.00040761283851554667, "loss": 3.0371, "theoretical_loss": 3.816203332774502, "tokens_seen": 636881920 }, { "epoch": 1.09, "learning_rate": 0.00040760280842527585, "loss": 2.9378, "theoretical_loss": 3.8161627876491897, "tokens_seen": 636947456 }, { "epoch": 1.09, "learning_rate": 0.00040759277833500503, "loss": 2.8507, "theoretical_loss": 3.816122247863326, "tokens_seen": 637012992 }, { "epoch": 1.09, "learning_rate": 0.0004075827482447342, "loss": 3.0684, "theoretical_loss": 3.816081713415657, "tokens_seen": 637078528 }, { "epoch": 1.09, "learning_rate": 0.0004075727181544634, "loss": 3.1174, "theoretical_loss": 3.8160411843049316, "tokens_seen": 637144064 }, { "epoch": 1.09, "learning_rate": 0.00040756268806419263, "loss": 3.0128, "theoretical_loss": 3.816000660529898, "tokens_seen": 637209600 }, { "epoch": 1.09, "learning_rate": 0.00040755265797392176, "loss": 2.9315, "theoretical_loss": 3.8159601420893052, "tokens_seen": 637275136 }, { "epoch": 1.09, "objective/train/docs_used": 1037232, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7881546020507812, "objective/train/theoretical_loss": 3.8159297567588357, "objective/train/tokens_used": 657784288, "theoretical_loss": 3.8159297567588357, "tokens_seen": 637324288 }, { "epoch": 1.09, "learning_rate": 0.000407542627883651, "loss": 2.8892, "theoretical_loss": 3.815919628981903, "tokens_seen": 637340672 }, { "epoch": 1.09, "learning_rate": 0.0004075325977933801, "loss": 2.9092, "theoretical_loss": 3.8158791212064402, "tokens_seen": 637406208 }, { "epoch": 1.09, "learning_rate": 0.00040752256770310935, "loss": 2.8352, "theoretical_loss": 3.815838618761668, "tokens_seen": 637471744 }, { "epoch": 1.09, "learning_rate": 0.00040751253761283854, "loss": 2.9657, "theoretical_loss": 3.8157981216463357, "tokens_seen": 637537280 }, { "epoch": 1.09, "learning_rate": 0.0004075025075225677, "loss": 2.9422, "theoretical_loss": 3.815757629859195, "tokens_seen": 637602816 }, { "epoch": 1.09, "learning_rate": 0.0004074924774322969, "loss": 2.9548, "theoretical_loss": 3.815717143398998, "tokens_seen": 637668352 }, { "epoch": 1.09, "learning_rate": 0.00040748244734202613, "loss": 3.0112, "theoretical_loss": 3.815676662264495, "tokens_seen": 637733888 }, { "epoch": 1.09, "learning_rate": 0.00040747241725175526, "loss": 3.0284, "theoretical_loss": 3.81563618645444, "tokens_seen": 637799424 }, { "epoch": 1.09, "learning_rate": 0.0004074623871614845, "loss": 2.8731, "theoretical_loss": 3.8155957159675844, "tokens_seen": 637864960 }, { "epoch": 1.09, "learning_rate": 0.0004074523570712136, "loss": 2.9703, "theoretical_loss": 3.8155552508026815, "tokens_seen": 637930496 }, { "epoch": 1.09, "learning_rate": 0.00040744232698094286, "loss": 3.2423, "theoretical_loss": 3.8155147909584857, "tokens_seen": 637996032 }, { "epoch": 1.09, "learning_rate": 0.00040743229689067204, "loss": 3.1201, "theoretical_loss": 3.8154743364337493, "tokens_seen": 638061568 }, { "epoch": 1.09, "learning_rate": 0.0004074222668004012, "loss": 2.926, "theoretical_loss": 3.815433887227228, "tokens_seen": 638127104 }, { "epoch": 1.09, "learning_rate": 0.0004074122367101304, "loss": 3.0367, "theoretical_loss": 3.815393443337677, "tokens_seen": 638192640 }, { "epoch": 1.09, "learning_rate": 0.0004074022066198596, "loss": 2.9313, "theoretical_loss": 3.8153530047638498, "tokens_seen": 638258176 }, { "epoch": 1.09, "learning_rate": 0.00040739217652958876, "loss": 3.0701, "theoretical_loss": 3.8153125715045038, "tokens_seen": 638323712 }, { "epoch": 1.09, "learning_rate": 0.000407382146439318, "loss": 3.027, "theoretical_loss": 3.8152721435583934, "tokens_seen": 638389248 }, { "epoch": 1.09, "learning_rate": 0.0004073721163490471, "loss": 3.0137, "theoretical_loss": 3.815231720924276, "tokens_seen": 638454784 }, { "epoch": 1.09, "learning_rate": 0.00040736208625877636, "loss": 3.001, "theoretical_loss": 3.815191303600909, "tokens_seen": 638520320 }, { "epoch": 1.09, "learning_rate": 0.0004073520561685055, "loss": 2.8777, "theoretical_loss": 3.8151508915870482, "tokens_seen": 638585856 }, { "epoch": 1.09, "learning_rate": 0.0004073420260782347, "loss": 3.059, "theoretical_loss": 3.8151104848814525, "tokens_seen": 638651392 }, { "epoch": 1.09, "learning_rate": 0.0004073319959879639, "loss": 2.9622, "theoretical_loss": 3.8150700834828797, "tokens_seen": 638716928 }, { "epoch": 1.09, "learning_rate": 0.0004073219658976931, "loss": 2.839, "theoretical_loss": 3.8150296873900884, "tokens_seen": 638782464 }, { "epoch": 1.09, "learning_rate": 0.00040731193580742227, "loss": 3.0883, "theoretical_loss": 3.814989296601837, "tokens_seen": 638848000 }, { "epoch": 1.09, "learning_rate": 0.0004073019057171515, "loss": 2.8358, "theoretical_loss": 3.814948911116886, "tokens_seen": 638913536 }, { "epoch": 1.09, "objective/train/docs_used": 1038718, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.071514368057251, "objective/train/theoretical_loss": 3.8149186254827168, "objective/train/tokens_used": 659422688, "theoretical_loss": 3.8149186254827168, "tokens_seen": 638962688 }, { "epoch": 1.09, "learning_rate": 0.00040729187562688063, "loss": 2.9519, "theoretical_loss": 3.8149085309339945, "tokens_seen": 638979072 }, { "epoch": 1.09, "learning_rate": 0.00040728184553660986, "loss": 2.9403, "theoretical_loss": 3.8148681560519226, "tokens_seen": 639044608 }, { "epoch": 1.09, "learning_rate": 0.000407271815446339, "loss": 2.9304, "theoretical_loss": 3.814827786469431, "tokens_seen": 639110144 }, { "epoch": 1.09, "learning_rate": 0.0004072617853560682, "loss": 3.0738, "theoretical_loss": 3.8147874221852813, "tokens_seen": 639175680 }, { "epoch": 1.09, "learning_rate": 0.0004072517552657974, "loss": 2.8413, "theoretical_loss": 3.8147470631982348, "tokens_seen": 639241216 }, { "epoch": 1.09, "learning_rate": 0.0004072417251755266, "loss": 3.005, "theoretical_loss": 3.8147067095070524, "tokens_seen": 639306752 }, { "epoch": 1.09, "learning_rate": 0.00040723169508525577, "loss": 2.8894, "theoretical_loss": 3.814666361110498, "tokens_seen": 639372288 }, { "epoch": 1.09, "learning_rate": 0.00040722166499498495, "loss": 2.9484, "theoretical_loss": 3.814626018007333, "tokens_seen": 639437824 }, { "epoch": 1.09, "learning_rate": 0.00040721163490471413, "loss": 3.0591, "theoretical_loss": 3.8145856801963207, "tokens_seen": 639503360 }, { "epoch": 1.09, "learning_rate": 0.00040720160481444337, "loss": 2.8217, "theoretical_loss": 3.8145453476762254, "tokens_seen": 639568896 }, { "epoch": 1.09, "learning_rate": 0.0004071915747241725, "loss": 2.9894, "theoretical_loss": 3.814505020445811, "tokens_seen": 639634432 }, { "epoch": 1.09, "learning_rate": 0.00040718154463390173, "loss": 3.1587, "theoretical_loss": 3.8144646985038406, "tokens_seen": 639699968 }, { "epoch": 1.09, "learning_rate": 0.0004071715145436309, "loss": 3.0043, "theoretical_loss": 3.814424381849081, "tokens_seen": 639765504 }, { "epoch": 1.09, "learning_rate": 0.0004071614844533601, "loss": 3.0162, "theoretical_loss": 3.814384070480296, "tokens_seen": 639831040 }, { "epoch": 1.09, "learning_rate": 0.00040715145436308927, "loss": 2.9761, "theoretical_loss": 3.8143437643962512, "tokens_seen": 639896576 }, { "epoch": 1.09, "learning_rate": 0.00040714142427281845, "loss": 3.1128, "theoretical_loss": 3.8143034635957127, "tokens_seen": 639962112 }, { "epoch": 1.09, "learning_rate": 0.00040713139418254763, "loss": 2.9097, "theoretical_loss": 3.8142631680774484, "tokens_seen": 640027648 }, { "epoch": 1.09, "learning_rate": 0.00040712136409227687, "loss": 3.1261, "theoretical_loss": 3.8142228778402227, "tokens_seen": 640093184 }, { "epoch": 1.09, "learning_rate": 0.000407111334002006, "loss": 3.0627, "theoretical_loss": 3.8141825928828053, "tokens_seen": 640158720 }, { "epoch": 1.09, "learning_rate": 0.00040710130391173523, "loss": 2.9422, "theoretical_loss": 3.8141423132039622, "tokens_seen": 640224256 }, { "epoch": 1.09, "learning_rate": 0.00040709127382146436, "loss": 3.2201, "theoretical_loss": 3.8141020388024627, "tokens_seen": 640289792 }, { "epoch": 1.09, "learning_rate": 0.0004070812437311936, "loss": 3.0179, "theoretical_loss": 3.814061769677074, "tokens_seen": 640355328 }, { "epoch": 1.09, "learning_rate": 0.0004070712136409228, "loss": 3.1171, "theoretical_loss": 3.8140215058265667, "tokens_seen": 640420864 }, { "epoch": 1.09, "learning_rate": 0.00040706118355065196, "loss": 2.8964, "theoretical_loss": 3.8139812472497088, "tokens_seen": 640486400 }, { "epoch": 1.09, "learning_rate": 0.00040705115346038114, "loss": 3.0932, "theoretical_loss": 3.8139409939452706, "tokens_seen": 640551936 }, { "epoch": 1.09, "objective/train/docs_used": 1041494, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6086478233337402, "objective/train/theoretical_loss": 3.813910807426227, "objective/train/tokens_used": 661061088, "theoretical_loss": 3.813910807426227, "tokens_seen": 640601088 }, { "epoch": 1.09, "learning_rate": 0.0004070411233701103, "loss": 2.8994, "theoretical_loss": 3.8139007459120218, "tokens_seen": 640617472 }, { "epoch": 1.09, "learning_rate": 0.0004070310932798395, "loss": 2.937, "theoretical_loss": 3.8138605031487334, "tokens_seen": 640683008 }, { "epoch": 1.09, "learning_rate": 0.00040702106318956874, "loss": 3.1337, "theoretical_loss": 3.8138202656541766, "tokens_seen": 640748544 }, { "epoch": 1.09, "learning_rate": 0.00040701103309929786, "loss": 2.9305, "theoretical_loss": 3.8137800334271232, "tokens_seen": 640814080 }, { "epoch": 1.09, "learning_rate": 0.0004070010030090271, "loss": 2.9662, "theoretical_loss": 3.813739806466344, "tokens_seen": 640879616 }, { "epoch": 1.09, "learning_rate": 0.0004069909729187563, "loss": 3.0651, "theoretical_loss": 3.813699584770611, "tokens_seen": 640945152 }, { "epoch": 1.09, "learning_rate": 0.00040698094282848546, "loss": 2.9886, "theoretical_loss": 3.8136593683386986, "tokens_seen": 641010688 }, { "epoch": 1.09, "learning_rate": 0.00040697091273821464, "loss": 2.9917, "theoretical_loss": 3.8136191571693785, "tokens_seen": 641076224 }, { "epoch": 1.09, "learning_rate": 0.0004069608826479438, "loss": 2.8565, "theoretical_loss": 3.813578951261424, "tokens_seen": 641141760 }, { "epoch": 1.09, "learning_rate": 0.000406950852557673, "loss": 2.991, "theoretical_loss": 3.81353875061361, "tokens_seen": 641207296 }, { "epoch": 1.09, "learning_rate": 0.00040694082246740224, "loss": 2.8983, "theoretical_loss": 3.81349855522471, "tokens_seen": 641272832 }, { "epoch": 1.09, "learning_rate": 0.00040693079237713137, "loss": 3.0298, "theoretical_loss": 3.8134583650934992, "tokens_seen": 641338368 }, { "epoch": 1.09, "learning_rate": 0.0004069207622868606, "loss": 2.9944, "theoretical_loss": 3.813418180218752, "tokens_seen": 641403904 }, { "epoch": 1.09, "learning_rate": 0.00040691073219658973, "loss": 3.1503, "theoretical_loss": 3.8133780005992444, "tokens_seen": 641469440 }, { "epoch": 1.09, "learning_rate": 0.00040690070210631896, "loss": 3.0291, "theoretical_loss": 3.813337826233753, "tokens_seen": 641534976 }, { "epoch": 1.09, "learning_rate": 0.0004068906720160482, "loss": 3.0169, "theoretical_loss": 3.813297657121053, "tokens_seen": 641600512 }, { "epoch": 1.09, "learning_rate": 0.0004068806419257773, "loss": 2.8441, "theoretical_loss": 3.813257493259922, "tokens_seen": 641666048 }, { "epoch": 1.09, "learning_rate": 0.00040687061183550656, "loss": 3.0256, "theoretical_loss": 3.8132173346491367, "tokens_seen": 641731584 }, { "epoch": 1.09, "learning_rate": 0.0004068605817452357, "loss": 2.9136, "theoretical_loss": 3.8131771812874744, "tokens_seen": 641797120 }, { "epoch": 1.09, "learning_rate": 0.0004068505516549649, "loss": 2.9394, "theoretical_loss": 3.8131370331737138, "tokens_seen": 641862656 }, { "epoch": 1.09, "learning_rate": 0.0004068405215646941, "loss": 2.9959, "theoretical_loss": 3.8130968903066336, "tokens_seen": 641928192 }, { "epoch": 1.09, "learning_rate": 0.0004068304914744233, "loss": 2.8283, "theoretical_loss": 3.8130567526850108, "tokens_seen": 641993728 }, { "epoch": 1.09, "learning_rate": 0.00040682046138415247, "loss": 2.961, "theoretical_loss": 3.813016620307627, "tokens_seen": 642059264 }, { "epoch": 1.09, "learning_rate": 0.0004068104312938817, "loss": 3.0296, "theoretical_loss": 3.8129764931732595, "tokens_seen": 642124800 }, { "epoch": 1.09, "learning_rate": 0.00040680040120361083, "loss": 3.0492, "theoretical_loss": 3.8129363712806903, "tokens_seen": 642190336 }, { "epoch": 1.09, "objective/train/docs_used": 1044237, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.825730562210083, "objective/train/theoretical_loss": 3.812906283300459, "objective/train/tokens_used": 662699488, "theoretical_loss": 3.812906283300459, "tokens_seen": 642239488 }, { "epoch": 1.09, "learning_rate": 0.00040679037111334006, "loss": 2.9162, "theoretical_loss": 3.812896254628699, "tokens_seen": 642255872 }, { "epoch": 1.09, "learning_rate": 0.0004067803410230692, "loss": 3.032, "theoretical_loss": 3.8128561432160657, "tokens_seen": 642321408 }, { "epoch": 1.09, "learning_rate": 0.0004067703109327984, "loss": 2.9042, "theoretical_loss": 3.812816037041573, "tokens_seen": 642386944 }, { "epoch": 1.09, "learning_rate": 0.0004067602808425276, "loss": 3.013, "theoretical_loss": 3.812775936104002, "tokens_seen": 642452480 }, { "epoch": 1.09, "learning_rate": 0.0004067502507522568, "loss": 3.0199, "theoretical_loss": 3.8127358404021345, "tokens_seen": 642518016 }, { "epoch": 1.09, "learning_rate": 0.00040674022066198597, "loss": 2.8735, "theoretical_loss": 3.812695749934753, "tokens_seen": 642583552 }, { "epoch": 1.09, "learning_rate": 0.00040673019057171515, "loss": 2.9644, "theoretical_loss": 3.8126556647006415, "tokens_seen": 642649088 }, { "epoch": 1.09, "learning_rate": 0.00040672016048144433, "loss": 2.7743, "theoretical_loss": 3.812615584698582, "tokens_seen": 642714624 }, { "epoch": 1.09, "learning_rate": 0.00040671013039117357, "loss": 2.7898, "theoretical_loss": 3.812575509927359, "tokens_seen": 642780160 }, { "epoch": 1.09, "learning_rate": 0.0004067001003009027, "loss": 3.0775, "theoretical_loss": 3.812535440385755, "tokens_seen": 642845696 }, { "epoch": 1.09, "learning_rate": 0.00040669007021063193, "loss": 3.1018, "theoretical_loss": 3.8124953760725564, "tokens_seen": 642911232 }, { "epoch": 1.09, "learning_rate": 0.0004066800401203611, "loss": 3.1091, "theoretical_loss": 3.812455316986548, "tokens_seen": 642976768 }, { "epoch": 1.09, "learning_rate": 0.0004066700100300903, "loss": 2.9043, "theoretical_loss": 3.812415263126514, "tokens_seen": 643042304 }, { "epoch": 1.09, "learning_rate": 0.00040665997993981947, "loss": 3.0761, "theoretical_loss": 3.8123752144912406, "tokens_seen": 643107840 }, { "epoch": 1.09, "learning_rate": 0.00040664994984954865, "loss": 2.9938, "theoretical_loss": 3.812335171079514, "tokens_seen": 643173376 }, { "epoch": 1.09, "learning_rate": 0.00040663991975927783, "loss": 3.0561, "theoretical_loss": 3.812295132890122, "tokens_seen": 643238912 }, { "epoch": 1.09, "learning_rate": 0.00040662988966900707, "loss": 2.9937, "theoretical_loss": 3.812255099921849, "tokens_seen": 643304448 }, { "epoch": 1.09, "learning_rate": 0.0004066198595787362, "loss": 2.9181, "theoretical_loss": 3.812215072173484, "tokens_seen": 643369984 }, { "epoch": 1.09, "learning_rate": 0.00040660982948846543, "loss": 2.878, "theoretical_loss": 3.8121750496438147, "tokens_seen": 643435520 }, { "epoch": 1.09, "learning_rate": 0.00040659979939819456, "loss": 2.9148, "theoretical_loss": 3.812135032331629, "tokens_seen": 643501056 }, { "epoch": 1.09, "learning_rate": 0.0004065897693079238, "loss": 3.0233, "theoretical_loss": 3.8120950202357156, "tokens_seen": 643566592 }, { "epoch": 1.09, "learning_rate": 0.000406579739217653, "loss": 3.0254, "theoretical_loss": 3.812055013354863, "tokens_seen": 643632128 }, { "epoch": 1.09, "learning_rate": 0.00040656970912738216, "loss": 2.8435, "theoretical_loss": 3.812015011687861, "tokens_seen": 643697664 }, { "epoch": 1.09, "learning_rate": 0.00040655967903711134, "loss": 3.0016, "theoretical_loss": 3.8119750152335, "tokens_seen": 643763200 }, { "epoch": 1.09, "learning_rate": 0.0004065496489468405, "loss": 3.1828, "theoretical_loss": 3.8119350239905687, "tokens_seen": 643828736 }, { "epoch": 1.09, "objective/train/docs_used": 1046759, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.134971857070923, "objective/train/theoretical_loss": 3.8119050339776446, "objective/train/tokens_used": 664337888, "theoretical_loss": 3.8119050339776446, "tokens_seen": 643877888 }, { "epoch": 1.09, "learning_rate": 0.0004065396188565697, "loss": 3.1164, "theoretical_loss": 3.811895037957859, "tokens_seen": 643894272 }, { "epoch": 1.09, "learning_rate": 0.00040652958876629894, "loss": 2.8614, "theoretical_loss": 3.8118550571341614, "tokens_seen": 643959808 }, { "epoch": 1.09, "learning_rate": 0.00040651955867602806, "loss": 2.9286, "theoretical_loss": 3.811815081518267, "tokens_seen": 644025344 }, { "epoch": 1.09, "learning_rate": 0.0004065095285857573, "loss": 3.0312, "theoretical_loss": 3.811775111108968, "tokens_seen": 644090880 }, { "epoch": 1.09, "learning_rate": 0.0004064994984954865, "loss": 2.8472, "theoretical_loss": 3.811735145905057, "tokens_seen": 644156416 }, { "epoch": 1.09, "learning_rate": 0.00040648946840521566, "loss": 2.8963, "theoretical_loss": 3.8116951859053256, "tokens_seen": 644221952 }, { "epoch": 1.09, "learning_rate": 0.00040647943831494484, "loss": 3.1512, "theoretical_loss": 3.811655231108568, "tokens_seen": 644287488 }, { "epoch": 1.09, "learning_rate": 0.000406469408224674, "loss": 2.9215, "theoretical_loss": 3.8116152815135758, "tokens_seen": 644353024 }, { "epoch": 1.09, "learning_rate": 0.0004064593781344032, "loss": 3.0998, "theoretical_loss": 3.8115753371191445, "tokens_seen": 644418560 }, { "epoch": 1.09, "learning_rate": 0.00040644934804413244, "loss": 2.8898, "theoretical_loss": 3.811535397924068, "tokens_seen": 644484096 }, { "epoch": 1.09, "learning_rate": 0.00040643931795386157, "loss": 2.978, "theoretical_loss": 3.8114954639271406, "tokens_seen": 644549632 }, { "epoch": 1.09, "learning_rate": 0.0004064292878635908, "loss": 3.0741, "theoretical_loss": 3.8114555351271577, "tokens_seen": 644615168 }, { "epoch": 1.09, "learning_rate": 0.00040641925777331993, "loss": 2.9648, "theoretical_loss": 3.811415611522914, "tokens_seen": 644680704 }, { "epoch": 1.09, "learning_rate": 0.00040640922768304916, "loss": 2.9685, "theoretical_loss": 3.811375693113206, "tokens_seen": 644746240 }, { "epoch": 1.09, "learning_rate": 0.00040639919759277834, "loss": 3.1265, "theoretical_loss": 3.8113357798968295, "tokens_seen": 644811776 }, { "epoch": 1.09, "learning_rate": 0.0004063891675025075, "loss": 2.93, "theoretical_loss": 3.8112958718725816, "tokens_seen": 644877312 }, { "epoch": 1.09, "learning_rate": 0.0004063791374122367, "loss": 2.927, "theoretical_loss": 3.811255969039259, "tokens_seen": 644942848 }, { "epoch": 1.09, "learning_rate": 0.0004063691073219659, "loss": 3.138, "theoretical_loss": 3.811216071395659, "tokens_seen": 645008384 }, { "epoch": 1.09, "learning_rate": 0.00040635907723169507, "loss": 3.1036, "theoretical_loss": 3.81117617894058, "tokens_seen": 645073920 }, { "epoch": 1.09, "learning_rate": 0.0004063490471414243, "loss": 2.7702, "theoretical_loss": 3.8111362916728195, "tokens_seen": 645139456 }, { "epoch": 1.09, "learning_rate": 0.00040633901705115343, "loss": 3.1223, "theoretical_loss": 3.811096409591177, "tokens_seen": 645204992 }, { "epoch": 1.09, "learning_rate": 0.00040632898696088267, "loss": 3.0393, "theoretical_loss": 3.81105653269445, "tokens_seen": 645270528 }, { "epoch": 1.09, "learning_rate": 0.00040631895687061185, "loss": 2.9669, "theoretical_loss": 3.8110166609814398, "tokens_seen": 645336064 }, { "epoch": 1.09, "learning_rate": 0.00040630892678034103, "loss": 2.961, "theoretical_loss": 3.810976794450945, "tokens_seen": 645401600 }, { "epoch": 1.09, "learning_rate": 0.0004062988966900702, "loss": 3.0755, "theoretical_loss": 3.8109369331017664, "tokens_seen": 645467136 }, { "epoch": 1.09, "objective/train/docs_used": 1049677, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.88242244720459, "objective/train/theoretical_loss": 3.8109070404893997, "objective/train/tokens_used": 665976288, "theoretical_loss": 3.8109070404893997, "tokens_seen": 645516288 }, { "epoch": 1.09, "learning_rate": 0.0004062888665997994, "loss": 3.0176, "theoretical_loss": 3.8108970769327053, "tokens_seen": 645532672 }, { "epoch": 1.09, "learning_rate": 0.00040627883650952857, "loss": 2.9779, "theoretical_loss": 3.8108572259425606, "tokens_seen": 645598208 }, { "epoch": 1.09, "learning_rate": 0.0004062688064192578, "loss": 2.8165, "theoretical_loss": 3.8108173801301355, "tokens_seen": 645663744 }, { "epoch": 1.09, "learning_rate": 0.00040625877632898693, "loss": 2.8414, "theoretical_loss": 3.8107775394942314, "tokens_seen": 645729280 }, { "epoch": 1.09, "learning_rate": 0.00040624874623871617, "loss": 2.8294, "theoretical_loss": 3.8107377040336505, "tokens_seen": 645794816 }, { "epoch": 1.09, "learning_rate": 0.0004062387161484453, "loss": 3.0216, "theoretical_loss": 3.8106978737471953, "tokens_seen": 645860352 }, { "epoch": 1.09, "learning_rate": 0.00040622868605817453, "loss": 3.0373, "theoretical_loss": 3.8106580486336687, "tokens_seen": 645925888 }, { "epoch": 1.09, "learning_rate": 0.0004062186559679037, "loss": 2.9184, "theoretical_loss": 3.8106182286918746, "tokens_seen": 645991424 }, { "epoch": 1.09, "learning_rate": 0.0004062086258776329, "loss": 3.0589, "theoretical_loss": 3.8105784139206165, "tokens_seen": 646056960 }, { "epoch": 1.09, "learning_rate": 0.0004061985957873621, "loss": 2.8935, "theoretical_loss": 3.810538604318699, "tokens_seen": 646122496 }, { "epoch": 1.09, "learning_rate": 0.00040618856569709126, "loss": 3.0102, "theoretical_loss": 3.8104987998849262, "tokens_seen": 646188032 }, { "epoch": 1.09, "learning_rate": 0.00040617853560682044, "loss": 3.0036, "theoretical_loss": 3.810459000618103, "tokens_seen": 646253568 }, { "epoch": 1.09, "learning_rate": 0.00040616850551654967, "loss": 2.9174, "theoretical_loss": 3.8104192065170355, "tokens_seen": 646319104 }, { "epoch": 1.09, "learning_rate": 0.0004061584754262788, "loss": 2.8796, "theoretical_loss": 3.810379417580529, "tokens_seen": 646384640 }, { "epoch": 1.09, "learning_rate": 0.00040614844533600804, "loss": 2.7785, "theoretical_loss": 3.81033963380739, "tokens_seen": 646450176 }, { "epoch": 1.09, "learning_rate": 0.00040613841524573727, "loss": 2.9548, "theoretical_loss": 3.810299855196425, "tokens_seen": 646515712 }, { "epoch": 1.09, "learning_rate": 0.0004061283851554664, "loss": 3.0661, "theoretical_loss": 3.810260081746441, "tokens_seen": 646581248 }, { "epoch": 1.09, "learning_rate": 0.00040611835506519563, "loss": 3.0886, "theoretical_loss": 3.810220313456245, "tokens_seen": 646646784 }, { "epoch": 1.09, "learning_rate": 0.00040610832497492476, "loss": 2.907, "theoretical_loss": 3.8101805503246453, "tokens_seen": 646712320 }, { "epoch": 1.09, "learning_rate": 0.000406098294884654, "loss": 3.0524, "theoretical_loss": 3.81014079235045, "tokens_seen": 646777856 }, { "epoch": 1.09, "learning_rate": 0.0004060882647943832, "loss": 3.0429, "theoretical_loss": 3.8101010395324675, "tokens_seen": 646843392 }, { "epoch": 1.09, "learning_rate": 0.00040607823470411236, "loss": 2.9021, "theoretical_loss": 3.8100612918695074, "tokens_seen": 646908928 }, { "epoch": 1.09, "learning_rate": 0.00040606820461384154, "loss": 2.9795, "theoretical_loss": 3.810021549360378, "tokens_seen": 646974464 }, { "epoch": 1.09, "learning_rate": 0.0004060581745235707, "loss": 3.192, "theoretical_loss": 3.8099818120038895, "tokens_seen": 647040000 }, { "epoch": 1.09, "learning_rate": 0.0004060481444332999, "loss": 2.8834, "theoretical_loss": 3.8099420797988524, "tokens_seen": 647105536 }, { "epoch": 1.09, "objective/train/docs_used": 1050941, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2310402393341064, "objective/train/theoretical_loss": 3.8099122840249997, "objective/train/tokens_used": 667614688, "theoretical_loss": 3.8099122840249997, "tokens_seen": 647154688 }, { "epoch": 1.09, "learning_rate": 0.00040603811434302914, "loss": 3.1383, "theoretical_loss": 3.8099023527440776, "tokens_seen": 647171072 }, { "epoch": 1.09, "learning_rate": 0.00040602808425275826, "loss": 3.1296, "theoretical_loss": 3.809862630838375, "tokens_seen": 647236608 }, { "epoch": 1.09, "learning_rate": 0.0004060180541624875, "loss": 3.0291, "theoretical_loss": 3.809822914080557, "tokens_seen": 647302144 }, { "epoch": 1.09, "learning_rate": 0.0004060080240722167, "loss": 2.9782, "theoretical_loss": 3.8097832024694345, "tokens_seen": 647367680 }, { "epoch": 1.09, "learning_rate": 0.00040599799398194586, "loss": 2.9953, "theoretical_loss": 3.8097434960038203, "tokens_seen": 647433216 }, { "epoch": 1.09, "learning_rate": 0.00040598796389167504, "loss": 2.9809, "theoretical_loss": 3.8097037946825263, "tokens_seen": 647498752 }, { "epoch": 1.09, "learning_rate": 0.0004059779338014042, "loss": 3.0939, "theoretical_loss": 3.8096640985043657, "tokens_seen": 647564288 }, { "epoch": 1.09, "learning_rate": 0.0004059679037111334, "loss": 3.0531, "theoretical_loss": 3.809624407468152, "tokens_seen": 647629824 }, { "epoch": 1.09, "learning_rate": 0.00040595787362086264, "loss": 3.167, "theoretical_loss": 3.8095847215726995, "tokens_seen": 647695360 }, { "epoch": 1.09, "learning_rate": 0.00040594784353059177, "loss": 3.1055, "theoretical_loss": 3.8095450408168205, "tokens_seen": 647760896 }, { "epoch": 1.09, "learning_rate": 0.000405937813440321, "loss": 2.9623, "theoretical_loss": 3.8095053651993314, "tokens_seen": 647826432 }, { "epoch": 1.09, "learning_rate": 0.00040592778335005013, "loss": 2.9381, "theoretical_loss": 3.8094656947190457, "tokens_seen": 647891968 }, { "epoch": 1.09, "learning_rate": 0.00040591775325977936, "loss": 3.0543, "theoretical_loss": 3.80942602937478, "tokens_seen": 647957504 }, { "epoch": 1.09, "learning_rate": 0.00040590772316950854, "loss": 2.9461, "theoretical_loss": 3.809386369165349, "tokens_seen": 648023040 }, { "epoch": 1.09, "learning_rate": 0.0004058976930792377, "loss": 3.0657, "theoretical_loss": 3.809346714089569, "tokens_seen": 648088576 }, { "epoch": 1.09, "learning_rate": 0.0004058876629889669, "loss": 2.991, "theoretical_loss": 3.8093070641462567, "tokens_seen": 648154112 }, { "epoch": 1.09, "learning_rate": 0.0004058776328986961, "loss": 2.9435, "theoretical_loss": 3.809267419334229, "tokens_seen": 648219648 }, { "epoch": 1.09, "learning_rate": 0.00040586760280842527, "loss": 2.9426, "theoretical_loss": 3.8092277796523026, "tokens_seen": 648285184 }, { "epoch": 1.09, "learning_rate": 0.0004058575727181545, "loss": 2.9382, "theoretical_loss": 3.8091881450992955, "tokens_seen": 648350720 }, { "epoch": 1.09, "learning_rate": 0.00040584754262788363, "loss": 2.9892, "theoretical_loss": 3.809148515674026, "tokens_seen": 648416256 }, { "epoch": 1.09, "learning_rate": 0.00040583751253761287, "loss": 2.8328, "theoretical_loss": 3.8091088913753124, "tokens_seen": 648481792 }, { "epoch": 1.09, "learning_rate": 0.00040582748244734205, "loss": 2.8883, "theoretical_loss": 3.809069272201973, "tokens_seen": 648547328 }, { "epoch": 1.09, "learning_rate": 0.00040581745235707123, "loss": 3.2048, "theoretical_loss": 3.809029658152828, "tokens_seen": 648612864 }, { "epoch": 1.09, "learning_rate": 0.0004058074222668004, "loss": 3.0131, "theoretical_loss": 3.808990049226696, "tokens_seen": 648678400 }, { "epoch": 1.09, "learning_rate": 0.0004057973921765296, "loss": 2.953, "theoretical_loss": 3.808950445422398, "tokens_seen": 648743936 }, { "epoch": 1.09, "objective/train/docs_used": 1053673, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.088578701019287, "objective/train/theoretical_loss": 3.8089207459296675, "objective/train/tokens_used": 669253088, "theoretical_loss": 3.8089207459296675, "tokens_seen": 648793088 }, { "epoch": 1.09, "learning_rate": 0.00040578736208625877, "loss": 3.0184, "theoretical_loss": 3.808910846738753, "tokens_seen": 648809472 }, { "epoch": 1.09, "learning_rate": 0.000405777331995988, "loss": 2.9326, "theoretical_loss": 3.808871253174583, "tokens_seen": 648875008 }, { "epoch": 1.09, "learning_rate": 0.00040576730190571713, "loss": 2.9456, "theoretical_loss": 3.808831664728709, "tokens_seen": 648940544 }, { "epoch": 1.09, "learning_rate": 0.00040575727181544637, "loss": 2.9814, "theoretical_loss": 3.808792081399952, "tokens_seen": 649006080 }, { "epoch": 1.09, "learning_rate": 0.0004057472417251755, "loss": 2.8416, "theoretical_loss": 3.8087525031871348, "tokens_seen": 649071616 }, { "epoch": 1.09, "learning_rate": 0.00040573721163490473, "loss": 3.0279, "theoretical_loss": 3.8087129300890785, "tokens_seen": 649137152 }, { "epoch": 1.09, "learning_rate": 0.0004057271815446339, "loss": 3.0345, "theoretical_loss": 3.8086733621046074, "tokens_seen": 649202688 }, { "epoch": 1.09, "learning_rate": 0.0004057171514543631, "loss": 2.9969, "theoretical_loss": 3.8086337992325436, "tokens_seen": 649268224 }, { "epoch": 1.09, "learning_rate": 0.0004057071213640923, "loss": 2.9985, "theoretical_loss": 3.8085942414717104, "tokens_seen": 649333760 }, { "epoch": 1.09, "learning_rate": 0.00040569709127382146, "loss": 3.1169, "theoretical_loss": 3.8085546888209323, "tokens_seen": 649399296 }, { "epoch": 1.09, "learning_rate": 0.00040568706118355064, "loss": 2.8916, "theoretical_loss": 3.8085151412790332, "tokens_seen": 649464832 }, { "epoch": 1.09, "learning_rate": 0.0004056770310932799, "loss": 2.9887, "theoretical_loss": 3.808475598844838, "tokens_seen": 649530368 }, { "epoch": 1.09, "learning_rate": 0.000405667001003009, "loss": 2.9079, "theoretical_loss": 3.808436061517172, "tokens_seen": 649595904 }, { "epoch": 1.09, "learning_rate": 0.00040565697091273824, "loss": 2.926, "theoretical_loss": 3.8083965292948605, "tokens_seen": 649661440 }, { "epoch": 1.09, "learning_rate": 0.0004056469408224674, "loss": 3.0851, "theoretical_loss": 3.808357002176729, "tokens_seen": 649726976 }, { "epoch": 1.09, "learning_rate": 0.0004056369107321966, "loss": 2.8454, "theoretical_loss": 3.8083174801616044, "tokens_seen": 649792512 }, { "epoch": 1.09, "learning_rate": 0.0004056268806419258, "loss": 2.9762, "theoretical_loss": 3.8082779632483126, "tokens_seen": 649858048 }, { "epoch": 1.09, "learning_rate": 0.00040561685055165496, "loss": 3.046, "theoretical_loss": 3.80823845143568, "tokens_seen": 649923584 }, { "epoch": 1.09, "learning_rate": 0.00040560682046138414, "loss": 2.9372, "theoretical_loss": 3.808198944722536, "tokens_seen": 649989120 }, { "epoch": 1.09, "learning_rate": 0.0004055967903711134, "loss": 2.8361, "theoretical_loss": 3.8081594431077073, "tokens_seen": 650054656 }, { "epoch": 1.09, "learning_rate": 0.0004055867602808425, "loss": 3.0377, "theoretical_loss": 3.808119946590022, "tokens_seen": 650120192 }, { "epoch": 1.09, "learning_rate": 0.00040557673019057174, "loss": 3.0214, "theoretical_loss": 3.808080455168308, "tokens_seen": 650185728 }, { "epoch": 1.09, "learning_rate": 0.00040556670010030087, "loss": 2.9137, "theoretical_loss": 3.8080409688413956, "tokens_seen": 650251264 }, { "epoch": 1.09, "learning_rate": 0.0004055566700100301, "loss": 3.1396, "theoretical_loss": 3.808001487608113, "tokens_seen": 650316800 }, { "epoch": 1.09, "learning_rate": 0.0004055466399197593, "loss": 3.1217, "theoretical_loss": 3.807962011467291, "tokens_seen": 650382336 }, { "epoch": 1.09, "objective/train/docs_used": 1056476, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0001420974731445, "objective/train/theoretical_loss": 3.8079324077028973, "objective/train/tokens_used": 670891488, "theoretical_loss": 3.8079324077028973, "tokens_seen": 650431488 }, { "epoch": 1.09, "learning_rate": 0.00040553660982948846, "loss": 3.0276, "theoretical_loss": 3.807922540417759, "tokens_seen": 650447872 }, { "epoch": 1.09, "learning_rate": 0.00040552657973921764, "loss": 3.1884, "theoretical_loss": 3.8078830744583474, "tokens_seen": 650513408 }, { "epoch": 1.09, "learning_rate": 0.0004055165496489469, "loss": 2.9537, "theoretical_loss": 3.8078436135878873, "tokens_seen": 650578944 }, { "epoch": 1.09, "learning_rate": 0.000405506519558676, "loss": 2.9668, "theoretical_loss": 3.8078041578052098, "tokens_seen": 650644480 }, { "epoch": 1.09, "learning_rate": 0.00040549648946840524, "loss": 2.8147, "theoretical_loss": 3.807764707109147, "tokens_seen": 650710016 }, { "epoch": 1.09, "learning_rate": 0.00040548645937813437, "loss": 2.9179, "theoretical_loss": 3.807725261498531, "tokens_seen": 650775552 }, { "epoch": 1.09, "learning_rate": 0.0004054764292878636, "loss": 3.004, "theoretical_loss": 3.8076858209721935, "tokens_seen": 650841088 }, { "epoch": 1.09, "learning_rate": 0.0004054663991975928, "loss": 3.1476, "theoretical_loss": 3.807646385528968, "tokens_seen": 650906624 }, { "epoch": 1.09, "learning_rate": 0.00040545636910732197, "loss": 2.8716, "theoretical_loss": 3.807606955167687, "tokens_seen": 650972160 }, { "epoch": 1.09, "learning_rate": 0.00040544633901705115, "loss": 3.0432, "theoretical_loss": 3.8075675298871845, "tokens_seen": 651037696 }, { "epoch": 1.09, "learning_rate": 0.00040543630892678033, "loss": 2.9345, "theoretical_loss": 3.8075281096862947, "tokens_seen": 651103232 }, { "epoch": 1.09, "learning_rate": 0.0004054262788365095, "loss": 3.071, "theoretical_loss": 3.8074886945638515, "tokens_seen": 651168768 }, { "epoch": 1.09, "learning_rate": 0.00040541624874623874, "loss": 3.091, "theoretical_loss": 3.80744928451869, "tokens_seen": 651234304 }, { "epoch": 1.09, "learning_rate": 0.00040540621865596787, "loss": 2.9494, "theoretical_loss": 3.8074098795496454, "tokens_seen": 651299840 }, { "epoch": 1.09, "learning_rate": 0.0004053961885656971, "loss": 2.9722, "theoretical_loss": 3.8073704796555523, "tokens_seen": 651365376 }, { "epoch": 1.09, "learning_rate": 0.0004053861584754263, "loss": 3.1102, "theoretical_loss": 3.8073310848352477, "tokens_seen": 651430912 }, { "epoch": 1.09, "learning_rate": 0.00040537612838515547, "loss": 2.9955, "theoretical_loss": 3.8072916950875673, "tokens_seen": 651496448 }, { "epoch": 1.09, "learning_rate": 0.0004053660982948847, "loss": 3.0765, "theoretical_loss": 3.807252310411348, "tokens_seen": 651561984 }, { "epoch": 1.09, "learning_rate": 0.00040535606820461383, "loss": 2.9607, "theoretical_loss": 3.8072129308054263, "tokens_seen": 651627520 }, { "epoch": 1.09, "learning_rate": 0.00040534603811434307, "loss": 3.0289, "theoretical_loss": 3.8071735562686406, "tokens_seen": 651693056 }, { "epoch": 1.09, "learning_rate": 0.00040533600802407225, "loss": 3.0429, "theoretical_loss": 3.807134186799828, "tokens_seen": 651758592 }, { "epoch": 1.09, "learning_rate": 0.00040532597793380143, "loss": 2.9844, "theoretical_loss": 3.8070948223978265, "tokens_seen": 651824128 }, { "epoch": 1.09, "learning_rate": 0.0004053159478435306, "loss": 2.8234, "theoretical_loss": 3.8070554630614755, "tokens_seen": 651889664 }, { "epoch": 1.09, "learning_rate": 0.0004053059177532598, "loss": 2.9873, "theoretical_loss": 3.8070161087896137, "tokens_seen": 651955200 }, { "epoch": 1.09, "learning_rate": 0.00040529588766298897, "loss": 3.1036, "theoretical_loss": 3.8069767595810795, "tokens_seen": 652020736 }, { "epoch": 1.09, "objective/train/docs_used": 1059228, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.822502613067627, "objective/train/theoretical_loss": 3.8069472509967905, "objective/train/tokens_used": 672529888, "theoretical_loss": 3.8069472509967905, "tokens_seen": 652069888 }, { "epoch": 1.09, "learning_rate": 0.0004052858575727182, "loss": 3.1001, "theoretical_loss": 3.8069374154347138, "tokens_seen": 652086272 }, { "epoch": 1.09, "learning_rate": 0.00040527582748244733, "loss": 3.009, "theoretical_loss": 3.806898076349356, "tokens_seen": 652151808 }, { "epoch": 1.09, "learning_rate": 0.00040526579739217657, "loss": 2.9077, "theoretical_loss": 3.806858742323847, "tokens_seen": 652217344 }, { "epoch": 1.09, "learning_rate": 0.0004052557673019057, "loss": 3.0136, "theoretical_loss": 3.8068194133570277, "tokens_seen": 652282880 }, { "epoch": 1.09, "learning_rate": 0.00040524573721163493, "loss": 2.9316, "theoretical_loss": 3.806780089447739, "tokens_seen": 652348416 }, { "epoch": 1.09, "learning_rate": 0.0004052357071213641, "loss": 3.0063, "theoretical_loss": 3.806740770594822, "tokens_seen": 652413952 }, { "epoch": 1.09, "learning_rate": 0.0004052256770310933, "loss": 2.9766, "theoretical_loss": 3.8067014567971205, "tokens_seen": 652479488 }, { "epoch": 1.09, "learning_rate": 0.0004052156469408225, "loss": 2.8465, "theoretical_loss": 3.8066621480534755, "tokens_seen": 652545024 }, { "epoch": 1.09, "learning_rate": 0.00040520561685055166, "loss": 3.0081, "theoretical_loss": 3.80662284436273, "tokens_seen": 652610560 }, { "epoch": 1.09, "learning_rate": 0.00040519558676028084, "loss": 3.1293, "theoretical_loss": 3.8065835457237274, "tokens_seen": 652676096 }, { "epoch": 1.09, "learning_rate": 0.0004051855566700101, "loss": 3.0375, "theoretical_loss": 3.8065442521353106, "tokens_seen": 652741632 }, { "epoch": 1.09, "learning_rate": 0.0004051755265797392, "loss": 3.1008, "theoretical_loss": 3.806504963596324, "tokens_seen": 652807168 }, { "epoch": 1.09, "learning_rate": 0.00040516549648946844, "loss": 2.8881, "theoretical_loss": 3.8064656801056125, "tokens_seen": 652872704 }, { "epoch": 1.09, "learning_rate": 0.0004051554663991976, "loss": 2.8301, "theoretical_loss": 3.80642640166202, "tokens_seen": 652938240 }, { "epoch": 1.09, "learning_rate": 0.0004051454363089268, "loss": 2.9493, "theoretical_loss": 3.8063871282643915, "tokens_seen": 653003776 }, { "epoch": 1.09, "learning_rate": 0.000405135406218656, "loss": 2.9783, "theoretical_loss": 3.806347859911573, "tokens_seen": 653069312 }, { "epoch": 1.09, "learning_rate": 0.00040512537612838516, "loss": 3.1203, "theoretical_loss": 3.8063085966024097, "tokens_seen": 653134848 }, { "epoch": 1.09, "learning_rate": 0.00040511534603811434, "loss": 3.1428, "theoretical_loss": 3.8062693383357487, "tokens_seen": 653200384 }, { "epoch": 1.09, "learning_rate": 0.0004051053159478436, "loss": 3.0224, "theoretical_loss": 3.8062300851104354, "tokens_seen": 653265920 }, { "epoch": 1.09, "learning_rate": 0.0004050952858575727, "loss": 3.0839, "theoretical_loss": 3.806190836925318, "tokens_seen": 653331456 }, { "epoch": 1.09, "learning_rate": 0.00040508525576730194, "loss": 2.9583, "theoretical_loss": 3.8061515937792425, "tokens_seen": 653396992 }, { "epoch": 1.09, "learning_rate": 0.00040507522567703107, "loss": 2.919, "theoretical_loss": 3.806112355671058, "tokens_seen": 653462528 }, { "epoch": 1.09, "learning_rate": 0.0004050651955867603, "loss": 3.1026, "theoretical_loss": 3.806073122599612, "tokens_seen": 653528064 }, { "epoch": 1.09, "learning_rate": 0.0004050551654964895, "loss": 3.0392, "theoretical_loss": 3.806033894563752, "tokens_seen": 653593600 }, { "epoch": 1.09, "learning_rate": 0.00040504513540621866, "loss": 3.1509, "theoretical_loss": 3.8059946715623285, "tokens_seen": 653659136 }, { "epoch": 1.09, "objective/train/docs_used": 1061949, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2213926315307617, "objective/train/theoretical_loss": 3.8059652576144165, "objective/train/tokens_used": 674168288, "theoretical_loss": 3.8059652576144165, "tokens_seen": 653708288 }, { "epoch": 1.09, "learning_rate": 0.00040503510531594784, "loss": 3.0279, "theoretical_loss": 3.8059554535941897, "tokens_seen": 653724672 }, { "epoch": 1.09, "learning_rate": 0.0004050250752256771, "loss": 2.9634, "theoretical_loss": 3.8059162406581857, "tokens_seen": 653790208 }, { "epoch": 1.09, "learning_rate": 0.0004050150451354062, "loss": 3.0779, "theoretical_loss": 3.805877032753166, "tokens_seen": 653855744 }, { "epoch": 1.09, "learning_rate": 0.00040500501504513544, "loss": 3.0972, "theoretical_loss": 3.805837829877982, "tokens_seen": 653921280 }, { "epoch": 1.09, "learning_rate": 0.00040499498495486457, "loss": 2.8372, "theoretical_loss": 3.8057986320314834, "tokens_seen": 653986816 }, { "epoch": 1.09, "learning_rate": 0.0004049849548645938, "loss": 3.246, "theoretical_loss": 3.8057594392125207, "tokens_seen": 654052352 }, { "epoch": 1.09, "learning_rate": 0.000404974924774323, "loss": 2.9524, "theoretical_loss": 3.8057202514199475, "tokens_seen": 654117888 }, { "epoch": 1.09, "learning_rate": 0.00040496489468405217, "loss": 3.0233, "theoretical_loss": 3.805681068652614, "tokens_seen": 654183424 }, { "epoch": 1.09, "learning_rate": 0.00040495486459378135, "loss": 3.0401, "theoretical_loss": 3.8056418909093734, "tokens_seen": 654248960 }, { "epoch": 1.09, "learning_rate": 0.00040494483450351053, "loss": 3.1551, "theoretical_loss": 3.805602718189078, "tokens_seen": 654314496 }, { "epoch": 1.09, "learning_rate": 0.0004049348044132397, "loss": 3.0204, "theoretical_loss": 3.8055635504905805, "tokens_seen": 654380032 }, { "epoch": 1.09, "learning_rate": 0.00040492477432296894, "loss": 2.9629, "theoretical_loss": 3.805524387812734, "tokens_seen": 654445568 }, { "epoch": 1.09, "learning_rate": 0.00040491474423269807, "loss": 3.1363, "theoretical_loss": 3.8054852301543938, "tokens_seen": 654511104 }, { "epoch": 1.09, "learning_rate": 0.0004049047141424273, "loss": 2.9541, "theoretical_loss": 3.8054460775144126, "tokens_seen": 654576640 }, { "epoch": 1.09, "learning_rate": 0.00040489468405215643, "loss": 3.0924, "theoretical_loss": 3.805406929891645, "tokens_seen": 654642176 }, { "epoch": 1.09, "learning_rate": 0.00040488465396188567, "loss": 2.951, "theoretical_loss": 3.805367787284947, "tokens_seen": 654707712 }, { "epoch": 1.09, "learning_rate": 0.00040487462387161485, "loss": 3.0994, "theoretical_loss": 3.8053286496931724, "tokens_seen": 654773248 }, { "epoch": 1.09, "learning_rate": 0.00040486459378134403, "loss": 2.9985, "theoretical_loss": 3.8052895171151784, "tokens_seen": 654838784 }, { "epoch": 1.09, "learning_rate": 0.0004048545636910732, "loss": 3.0575, "theoretical_loss": 3.8052503895498195, "tokens_seen": 654904320 }, { "epoch": 1.09, "learning_rate": 0.00040484453360080245, "loss": 2.9538, "theoretical_loss": 3.805211266995953, "tokens_seen": 654969856 }, { "epoch": 1.09, "learning_rate": 0.0004048345035105316, "loss": 2.9732, "theoretical_loss": 3.805172149452435, "tokens_seen": 655035392 }, { "epoch": 1.09, "learning_rate": 0.0004048244734202608, "loss": 2.9655, "theoretical_loss": 3.8051330369181238, "tokens_seen": 655100928 }, { "epoch": 1.09, "learning_rate": 0.00040481444332998994, "loss": 2.8266, "theoretical_loss": 3.805093929391876, "tokens_seen": 655166464 }, { "epoch": 1.1, "learning_rate": 0.00040480441323971917, "loss": 3.0023, "theoretical_loss": 3.80505482687255, "tokens_seen": 655232000 }, { "epoch": 1.1, "learning_rate": 0.00040479438314944835, "loss": 3.0255, "theoretical_loss": 3.8050157293590035, "tokens_seen": 655297536 }, { "debugging/Self-BLEU-5": 0.5603924306325536, "debugging/distinct-1-grams": 0.7733332453993704, "debugging/distinct-2-grams": 0.9613935928882845, "debugging/entropy-1-grams": 6.3088429650941285, "debugging/entropy-2-grams": 7.383123444866935, "debugging/length": 509.5652173913044, "debugging/num_segments": 23, "epoch": 1.1, "objective/train/docs_used": 1064827, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.017918825149536, "objective/train/theoretical_loss": 3.8049864095082, "objective/train/tokens_used": 675806688, "theoretical_loss": 3.8049864095082, "tokens_seen": 655346688 }, { "epoch": 1.1, "learning_rate": 0.00040478435305917753, "loss": 3.0813, "theoretical_loss": 3.8049766368500952, "tokens_seen": 655363072 }, { "epoch": 1.1, "learning_rate": 0.0004047743229689067, "loss": 3.0438, "theoretical_loss": 3.8049375493446846, "tokens_seen": 655428608 }, { "epoch": 1.1, "learning_rate": 0.0004047642928786359, "loss": 3.0853, "theoretical_loss": 3.8048984668416312, "tokens_seen": 655494144 }, { "epoch": 1.1, "learning_rate": 0.0004047542627883651, "loss": 2.9247, "theoretical_loss": 3.804859389339794, "tokens_seen": 655559680 }, { "epoch": 1.1, "learning_rate": 0.0004047442326980943, "loss": 3.0269, "theoretical_loss": 3.8048203168380335, "tokens_seen": 655625216 }, { "epoch": 1.1, "learning_rate": 0.00040473420260782344, "loss": 2.7771, "theoretical_loss": 3.8047812493352104, "tokens_seen": 655690752 }, { "epoch": 1.1, "learning_rate": 0.0004047241725175527, "loss": 3.0311, "theoretical_loss": 3.804742186830186, "tokens_seen": 655756288 }, { "epoch": 1.1, "learning_rate": 0.0004047141424272818, "loss": 2.9288, "theoretical_loss": 3.8047031293218208, "tokens_seen": 655821824 }, { "epoch": 1.1, "learning_rate": 0.00040470411233701104, "loss": 2.9991, "theoretical_loss": 3.804664076808976, "tokens_seen": 655887360 }, { "epoch": 1.1, "learning_rate": 0.0004046940822467402, "loss": 2.8105, "theoretical_loss": 3.804625029290515, "tokens_seen": 655952896 }, { "epoch": 1.1, "learning_rate": 0.0004046840521564694, "loss": 2.9758, "theoretical_loss": 3.804585986765299, "tokens_seen": 656018432 }, { "epoch": 1.1, "learning_rate": 0.0004046740220661986, "loss": 2.9787, "theoretical_loss": 3.8045469492321917, "tokens_seen": 656083968 }, { "epoch": 1.1, "learning_rate": 0.0004046639919759278, "loss": 2.9035, "theoretical_loss": 3.8045079166900555, "tokens_seen": 656149504 }, { "epoch": 1.1, "learning_rate": 0.000404653961885657, "loss": 2.9772, "theoretical_loss": 3.8044688891377545, "tokens_seen": 656215040 }, { "epoch": 1.1, "learning_rate": 0.0004046439317953862, "loss": 2.9217, "theoretical_loss": 3.804429866574152, "tokens_seen": 656280576 }, { "epoch": 1.1, "learning_rate": 0.00040463390170511536, "loss": 2.7446, "theoretical_loss": 3.8043908489981124, "tokens_seen": 656346112 }, { "epoch": 1.1, "learning_rate": 0.00040462387161484454, "loss": 3.0333, "theoretical_loss": 3.8043518364085003, "tokens_seen": 656411648 }, { "epoch": 1.1, "learning_rate": 0.0004046138415245738, "loss": 3.1289, "theoretical_loss": 3.804312828804181, "tokens_seen": 656477184 }, { "epoch": 1.1, "learning_rate": 0.0004046038114343029, "loss": 3.0478, "theoretical_loss": 3.8042738261840197, "tokens_seen": 656542720 }, { "epoch": 1.1, "learning_rate": 0.00040459378134403214, "loss": 2.8372, "theoretical_loss": 3.804234828546882, "tokens_seen": 656608256 }, { "epoch": 1.1, "learning_rate": 0.00040458375125376127, "loss": 3.024, "theoretical_loss": 3.804195835891634, "tokens_seen": 656673792 }, { "epoch": 1.1, "learning_rate": 0.0004045737211634905, "loss": 3.0777, "theoretical_loss": 3.804156848217142, "tokens_seen": 656739328 }, { "epoch": 1.1, "learning_rate": 0.0004045636910732197, "loss": 3.1686, "theoretical_loss": 3.804117865522273, "tokens_seen": 656804864 }, { "epoch": 1.1, "learning_rate": 0.00040455366098294886, "loss": 2.7593, "theoretical_loss": 3.804078887805895, "tokens_seen": 656870400 }, { "epoch": 1.1, "learning_rate": 0.00040454363089267804, "loss": 3.0643, "theoretical_loss": 3.8040399150668747, "tokens_seen": 656935936 }, { "epoch": 1.1, "objective/train/docs_used": 1066283, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6792080402374268, "objective/train/theoretical_loss": 3.8040106887783196, "objective/train/tokens_used": 677445088, "theoretical_loss": 3.8040106887783196, "tokens_seen": 656985088 }, { "epoch": 1.1, "learning_rate": 0.0004045336008024073, "loss": 2.9104, "theoretical_loss": 3.80400094730408, "tokens_seen": 657001472 }, { "epoch": 1.1, "learning_rate": 0.0004045235707121364, "loss": 2.8878, "theoretical_loss": 3.8039619845163797, "tokens_seen": 657067008 }, { "epoch": 1.1, "learning_rate": 0.00040451354062186564, "loss": 3.0477, "theoretical_loss": 3.803923026702642, "tokens_seen": 657132544 }, { "epoch": 1.1, "learning_rate": 0.00040450351053159477, "loss": 3.104, "theoretical_loss": 3.8038840738617368, "tokens_seen": 657198080 }, { "epoch": 1.1, "learning_rate": 0.000404493480441324, "loss": 3.0382, "theoretical_loss": 3.8038451259925323, "tokens_seen": 657263616 }, { "epoch": 1.1, "learning_rate": 0.0004044834503510532, "loss": 3.0012, "theoretical_loss": 3.8038061830938994, "tokens_seen": 657329152 }, { "epoch": 1.1, "learning_rate": 0.00040447342026078237, "loss": 2.851, "theoretical_loss": 3.8037672451647078, "tokens_seen": 657394688 }, { "epoch": 1.1, "learning_rate": 0.00040446339017051155, "loss": 3.0948, "theoretical_loss": 3.803728312203828, "tokens_seen": 657460224 }, { "epoch": 1.1, "learning_rate": 0.00040445336008024073, "loss": 2.9503, "theoretical_loss": 3.8036893842101316, "tokens_seen": 657525760 }, { "epoch": 1.1, "learning_rate": 0.0004044433299899699, "loss": 2.8645, "theoretical_loss": 3.803650461182489, "tokens_seen": 657591296 }, { "epoch": 1.1, "learning_rate": 0.00040443329989969915, "loss": 2.9406, "theoretical_loss": 3.8036115431197715, "tokens_seen": 657656832 }, { "epoch": 1.1, "learning_rate": 0.00040442326980942827, "loss": 2.9765, "theoretical_loss": 3.8035726300208523, "tokens_seen": 657722368 }, { "epoch": 1.1, "learning_rate": 0.0004044132397191575, "loss": 3.0505, "theoretical_loss": 3.8035337218846035, "tokens_seen": 657787904 }, { "epoch": 1.1, "learning_rate": 0.00040440320962888663, "loss": 3.0355, "theoretical_loss": 3.803494818709897, "tokens_seen": 657853440 }, { "epoch": 1.1, "learning_rate": 0.00040439317953861587, "loss": 3.0406, "theoretical_loss": 3.8034559204956073, "tokens_seen": 657918976 }, { "epoch": 1.1, "learning_rate": 0.00040438314944834505, "loss": 2.9615, "theoretical_loss": 3.8034170272406067, "tokens_seen": 657984512 }, { "epoch": 1.1, "learning_rate": 0.00040437311935807423, "loss": 2.8232, "theoretical_loss": 3.8033781389437697, "tokens_seen": 658050048 }, { "epoch": 1.1, "learning_rate": 0.0004043630892678034, "loss": 2.9585, "theoretical_loss": 3.8033392556039702, "tokens_seen": 658115584 }, { "epoch": 1.1, "learning_rate": 0.00040435305917753265, "loss": 3.0841, "theoretical_loss": 3.8033003772200833, "tokens_seen": 658181120 }, { "epoch": 1.1, "learning_rate": 0.0004043430290872618, "loss": 2.9648, "theoretical_loss": 3.803261503790983, "tokens_seen": 658246656 }, { "epoch": 1.1, "learning_rate": 0.000404332998996991, "loss": 2.9664, "theoretical_loss": 3.8032226353155463, "tokens_seen": 658312192 }, { "epoch": 1.1, "learning_rate": 0.00040432296890672014, "loss": 2.7456, "theoretical_loss": 3.8031837717926473, "tokens_seen": 658377728 }, { "epoch": 1.1, "learning_rate": 0.00040431293881644937, "loss": 2.8611, "theoretical_loss": 3.8031449132211623, "tokens_seen": 658443264 }, { "epoch": 1.1, "learning_rate": 0.00040430290872617855, "loss": 2.9526, "theoretical_loss": 3.803106059599968, "tokens_seen": 658508800 }, { "epoch": 1.1, "learning_rate": 0.00040429287863590773, "loss": 2.9352, "theoretical_loss": 3.803067210927942, "tokens_seen": 658574336 }, { "epoch": 1.1, "objective/train/docs_used": 1069115, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.849724292755127, "objective/train/theoretical_loss": 3.803038077671138, "objective/train/tokens_used": 679083488, "theoretical_loss": 3.803038077671138, "tokens_seen": 658623488 }, { "epoch": 1.1, "learning_rate": 0.0004042828485456369, "loss": 2.9857, "theoretical_loss": 3.803028367203961, "tokens_seen": 658639872 }, { "epoch": 1.1, "learning_rate": 0.0004042728184553661, "loss": 2.9416, "theoretical_loss": 3.802989528426901, "tokens_seen": 658705408 }, { "epoch": 1.1, "learning_rate": 0.0004042627883650953, "loss": 2.9258, "theoretical_loss": 3.802950694595642, "tokens_seen": 658770944 }, { "epoch": 1.1, "learning_rate": 0.0004042527582748245, "loss": 3.1174, "theoretical_loss": 3.8029118657090613, "tokens_seen": 658836480 }, { "epoch": 1.1, "learning_rate": 0.00040424272818455364, "loss": 3.0127, "theoretical_loss": 3.8028730417660377, "tokens_seen": 658902016 }, { "epoch": 1.1, "learning_rate": 0.0004042326980942829, "loss": 3.0829, "theoretical_loss": 3.80283422276545, "tokens_seen": 658967552 }, { "epoch": 1.1, "learning_rate": 0.000404222668004012, "loss": 3.0652, "theoretical_loss": 3.802795408706178, "tokens_seen": 659033088 }, { "epoch": 1.1, "learning_rate": 0.00040421263791374124, "loss": 3.0082, "theoretical_loss": 3.8027565995871004, "tokens_seen": 659098624 }, { "epoch": 1.1, "learning_rate": 0.0004042026078234704, "loss": 2.9521, "theoretical_loss": 3.802717795407099, "tokens_seen": 659164160 }, { "epoch": 1.1, "learning_rate": 0.0004041925777331996, "loss": 2.966, "theoretical_loss": 3.802678996165053, "tokens_seen": 659229696 }, { "epoch": 1.1, "learning_rate": 0.0004041825476429288, "loss": 3.1041, "theoretical_loss": 3.8026402018598437, "tokens_seen": 659295232 }, { "epoch": 1.1, "learning_rate": 0.000404172517552658, "loss": 2.9588, "theoretical_loss": 3.802601412490352, "tokens_seen": 659360768 }, { "epoch": 1.1, "learning_rate": 0.00040416248746238714, "loss": 2.907, "theoretical_loss": 3.8025626280554596, "tokens_seen": 659426304 }, { "epoch": 1.1, "learning_rate": 0.0004041524573721164, "loss": 2.8606, "theoretical_loss": 3.8025238485540482, "tokens_seen": 659491840 }, { "epoch": 1.1, "learning_rate": 0.0004041424272818455, "loss": 3.1068, "theoretical_loss": 3.8024850739850007, "tokens_seen": 659557376 }, { "epoch": 1.1, "learning_rate": 0.00040413239719157474, "loss": 3.0194, "theoretical_loss": 3.802446304347199, "tokens_seen": 659622912 }, { "epoch": 1.1, "learning_rate": 0.0004041223671013039, "loss": 2.9758, "theoretical_loss": 3.802407539639527, "tokens_seen": 659688448 }, { "epoch": 1.1, "learning_rate": 0.0004041123370110331, "loss": 2.7728, "theoretical_loss": 3.802368779860867, "tokens_seen": 659753984 }, { "epoch": 1.1, "learning_rate": 0.0004041023069207623, "loss": 2.9118, "theoretical_loss": 3.8023300250101038, "tokens_seen": 659819520 }, { "epoch": 1.1, "learning_rate": 0.00040409227683049147, "loss": 3.0406, "theoretical_loss": 3.8022912750861204, "tokens_seen": 659885056 }, { "epoch": 1.1, "learning_rate": 0.00040408224674022065, "loss": 2.9922, "theoretical_loss": 3.8022525300878023, "tokens_seen": 659950592 }, { "epoch": 1.1, "learning_rate": 0.0004040722166499499, "loss": 2.821, "theoretical_loss": 3.802213790014034, "tokens_seen": 660016128 }, { "epoch": 1.1, "learning_rate": 0.000404062186559679, "loss": 2.9293, "theoretical_loss": 3.8021750548637003, "tokens_seen": 660081664 }, { "epoch": 1.1, "learning_rate": 0.00040405215646940824, "loss": 2.9389, "theoretical_loss": 3.8021363246356867, "tokens_seen": 660147200 }, { "epoch": 1.1, "learning_rate": 0.00040404212637913737, "loss": 2.9298, "theoretical_loss": 3.8020975993288797, "tokens_seen": 660212736 }, { "epoch": 1.1, "objective/train/docs_used": 1072034, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.208097457885742, "objective/train/theoretical_loss": 3.8020685585776466, "objective/train/tokens_used": 680721888, "theoretical_loss": 3.8020685585776466, "tokens_seen": 660261888 }, { "epoch": 1.1, "learning_rate": 0.0004040320962888666, "loss": 3.0855, "theoretical_loss": 3.802058878942166, "tokens_seen": 660278272 }, { "epoch": 1.1, "learning_rate": 0.0004040220661985958, "loss": 3.0286, "theoretical_loss": 3.8020201634744306, "tokens_seen": 660343808 }, { "epoch": 1.1, "learning_rate": 0.00040401203610832497, "loss": 2.9355, "theoretical_loss": 3.8019814529245624, "tokens_seen": 660409344 }, { "epoch": 1.1, "learning_rate": 0.00040400200601805415, "loss": 3.1011, "theoretical_loss": 3.801942747291447, "tokens_seen": 660474880 }, { "epoch": 1.1, "learning_rate": 0.0004039919759277834, "loss": 3.0283, "theoretical_loss": 3.8019040465739735, "tokens_seen": 660540416 }, { "epoch": 1.1, "learning_rate": 0.0004039819458375125, "loss": 3.0793, "theoretical_loss": 3.8018653507710294, "tokens_seen": 660605952 }, { "epoch": 1.1, "learning_rate": 0.00040397191574724175, "loss": 2.8012, "theoretical_loss": 3.8018266598815034, "tokens_seen": 660671488 }, { "epoch": 1.1, "learning_rate": 0.0004039618856569709, "loss": 2.9946, "theoretical_loss": 3.8017879739042835, "tokens_seen": 660737024 }, { "epoch": 1.1, "learning_rate": 0.0004039518555667001, "loss": 2.9795, "theoretical_loss": 3.8017492928382604, "tokens_seen": 660802560 }, { "epoch": 1.1, "learning_rate": 0.0004039418254764293, "loss": 3.1222, "theoretical_loss": 3.8017106166823225, "tokens_seen": 660868096 }, { "epoch": 1.1, "learning_rate": 0.00040393179538615847, "loss": 2.9015, "theoretical_loss": 3.80167194543536, "tokens_seen": 660933632 }, { "epoch": 1.1, "learning_rate": 0.00040392176529588765, "loss": 3.0079, "theoretical_loss": 3.8016332790962633, "tokens_seen": 660999168 }, { "epoch": 1.1, "learning_rate": 0.00040391173520561683, "loss": 2.9788, "theoretical_loss": 3.8015946176639224, "tokens_seen": 661064704 }, { "epoch": 1.1, "learning_rate": 0.00040390170511534607, "loss": 2.8939, "theoretical_loss": 3.801555961137229, "tokens_seen": 661130240 }, { "epoch": 1.1, "learning_rate": 0.00040389167502507525, "loss": 3.0208, "theoretical_loss": 3.8015173095150745, "tokens_seen": 661195776 }, { "epoch": 1.1, "learning_rate": 0.00040388164493480443, "loss": 2.9979, "theoretical_loss": 3.8014786627963497, "tokens_seen": 661261312 }, { "epoch": 1.1, "learning_rate": 0.0004038716148445336, "loss": 2.9797, "theoretical_loss": 3.801440020979948, "tokens_seen": 661326848 }, { "epoch": 1.1, "learning_rate": 0.00040386158475426285, "loss": 2.86, "theoretical_loss": 3.8014013840647602, "tokens_seen": 661392384 }, { "epoch": 1.1, "learning_rate": 0.000403851554663992, "loss": 3.1812, "theoretical_loss": 3.8013627520496804, "tokens_seen": 661457920 }, { "epoch": 1.1, "learning_rate": 0.0004038415245737212, "loss": 3.081, "theoretical_loss": 3.8013241249336014, "tokens_seen": 661523456 }, { "epoch": 1.1, "learning_rate": 0.00040383149448345034, "loss": 2.967, "theoretical_loss": 3.801285502715417, "tokens_seen": 661588992 }, { "epoch": 1.1, "learning_rate": 0.00040382146439317957, "loss": 2.983, "theoretical_loss": 3.8012468853940202, "tokens_seen": 661654528 }, { "epoch": 1.1, "learning_rate": 0.00040381143430290875, "loss": 3.0319, "theoretical_loss": 3.8012082729683057, "tokens_seen": 661720064 }, { "epoch": 1.1, "learning_rate": 0.00040380140421263794, "loss": 2.9714, "theoretical_loss": 3.8011696654371683, "tokens_seen": 661785600 }, { "epoch": 1.1, "learning_rate": 0.0004037913741223671, "loss": 2.8868, "theoretical_loss": 3.8011310627995027, "tokens_seen": 661851136 }, { "epoch": 1.1, "objective/train/docs_used": 1074885, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6953744888305664, "objective/train/theoretical_loss": 3.80110211403193, "objective/train/tokens_used": 682360288, "theoretical_loss": 3.80110211403193, "tokens_seen": 661900288 }, { "epoch": 1.1, "learning_rate": 0.0004037813440320963, "loss": 2.9468, "theoretical_loss": 3.8010924650542046, "tokens_seen": 661916672 }, { "epoch": 1.1, "learning_rate": 0.0004037713139418255, "loss": 2.972, "theoretical_loss": 3.801053872200169, "tokens_seen": 661982208 }, { "epoch": 1.1, "learning_rate": 0.0004037612838515547, "loss": 2.9876, "theoretical_loss": 3.801015284236292, "tokens_seen": 662047744 }, { "epoch": 1.1, "learning_rate": 0.00040375125376128384, "loss": 2.9486, "theoretical_loss": 3.8009767011614706, "tokens_seen": 662113280 }, { "epoch": 1.1, "learning_rate": 0.0004037412236710131, "loss": 2.9973, "theoretical_loss": 3.8009381229746007, "tokens_seen": 662178816 }, { "epoch": 1.1, "learning_rate": 0.0004037311935807422, "loss": 3.0325, "theoretical_loss": 3.8008995496745803, "tokens_seen": 662244352 }, { "epoch": 1.1, "learning_rate": 0.00040372116349047144, "loss": 2.9697, "theoretical_loss": 3.800860981260306, "tokens_seen": 662309888 }, { "epoch": 1.1, "learning_rate": 0.0004037111334002006, "loss": 3.0201, "theoretical_loss": 3.800822417730676, "tokens_seen": 662375424 }, { "epoch": 1.1, "learning_rate": 0.0004037011033099298, "loss": 3.0098, "theoretical_loss": 3.800783859084589, "tokens_seen": 662440960 }, { "epoch": 1.1, "learning_rate": 0.000403691073219659, "loss": 3.0399, "theoretical_loss": 3.800745305320943, "tokens_seen": 662506496 }, { "epoch": 1.1, "learning_rate": 0.0004036810431293882, "loss": 3.0225, "theoretical_loss": 3.800706756438636, "tokens_seen": 662572032 }, { "epoch": 1.1, "learning_rate": 0.00040367101303911734, "loss": 3.0641, "theoretical_loss": 3.8006682124365687, "tokens_seen": 662637568 }, { "epoch": 1.1, "learning_rate": 0.0004036609829488466, "loss": 3.0658, "theoretical_loss": 3.8006296733136398, "tokens_seen": 662703104 }, { "epoch": 1.1, "learning_rate": 0.0004036509528585757, "loss": 3.0367, "theoretical_loss": 3.8005911390687497, "tokens_seen": 662768640 }, { "epoch": 1.1, "learning_rate": 0.00040364092276830494, "loss": 2.9358, "theoretical_loss": 3.8005526097007984, "tokens_seen": 662834176 }, { "epoch": 1.1, "learning_rate": 0.0004036308926780341, "loss": 3.0273, "theoretical_loss": 3.800514085208687, "tokens_seen": 662899712 }, { "epoch": 1.1, "learning_rate": 0.0004036208625877633, "loss": 2.9236, "theoretical_loss": 3.800475565591316, "tokens_seen": 662965248 }, { "epoch": 1.1, "learning_rate": 0.0004036108324974925, "loss": 2.9538, "theoretical_loss": 3.800437050847587, "tokens_seen": 663030784 }, { "epoch": 1.1, "learning_rate": 0.00040360080240722167, "loss": 2.9746, "theoretical_loss": 3.800398540976402, "tokens_seen": 663096320 }, { "epoch": 1.1, "learning_rate": 0.00040359077231695085, "loss": 3.1649, "theoretical_loss": 3.800360035976663, "tokens_seen": 663161856 }, { "epoch": 1.1, "learning_rate": 0.0004035807422266801, "loss": 3.105, "theoretical_loss": 3.8003215358472717, "tokens_seen": 663227392 }, { "epoch": 1.1, "learning_rate": 0.0004035707121364092, "loss": 2.9401, "theoretical_loss": 3.800283040587132, "tokens_seen": 663292928 }, { "epoch": 1.1, "learning_rate": 0.00040356068204613844, "loss": 3.0257, "theoretical_loss": 3.8002445501951465, "tokens_seen": 663358464 }, { "epoch": 1.1, "learning_rate": 0.00040355065195586757, "loss": 3.0602, "theoretical_loss": 3.8002060646702187, "tokens_seen": 663424000 }, { "epoch": 1.1, "learning_rate": 0.0004035406218655968, "loss": 2.9702, "theoretical_loss": 3.8001675840112528, "tokens_seen": 663489536 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0246901512145996, "objective/train/theoretical_loss": 3.800138726709656, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.800138726709656, "tokens_seen": 663538688 }, { "epoch": 1.1, "learning_rate": 0.000403530591775326, "loss": 2.8559, "theoretical_loss": 3.8001291082171518, "tokens_seen": 663555072 }, { "epoch": 1.1, "learning_rate": 0.00040352056168505517, "loss": 3.0212, "theoretical_loss": 3.8000906372868224, "tokens_seen": 663620608 }, { "epoch": 1.1, "learning_rate": 0.00040351053159478435, "loss": 3.0171, "theoretical_loss": 3.8000521712191677, "tokens_seen": 663686144 }, { "epoch": 1.1, "learning_rate": 0.0004035005015045136, "loss": 3.0837, "theoretical_loss": 3.800013710013094, "tokens_seen": 663751680 }, { "epoch": 1.1, "learning_rate": 0.0004034904714142427, "loss": 2.9793, "theoretical_loss": 3.7999752536675064, "tokens_seen": 663817216 }, { "epoch": 1.1, "learning_rate": 0.00040348044132397195, "loss": 3.09, "theoretical_loss": 3.7999368021813114, "tokens_seen": 663882752 }, { "epoch": 1.1, "learning_rate": 0.0004034704112337011, "loss": 3.0694, "theoretical_loss": 3.7998983555534145, "tokens_seen": 663948288 }, { "epoch": 1.1, "learning_rate": 0.0004034603811434303, "loss": 2.9673, "theoretical_loss": 3.7998599137827234, "tokens_seen": 664013824 }, { "epoch": 1.1, "learning_rate": 0.0004034503510531595, "loss": 3.0497, "theoretical_loss": 3.7998214768681446, "tokens_seen": 664079360 }, { "epoch": 1.1, "learning_rate": 0.00040344032096288867, "loss": 3.0924, "theoretical_loss": 3.799783044808586, "tokens_seen": 664144896 }, { "epoch": 1.1, "learning_rate": 0.00040343029087261785, "loss": 3.0294, "theoretical_loss": 3.799744617602954, "tokens_seen": 664210432 }, { "epoch": 1.1, "learning_rate": 0.00040342026078234703, "loss": 3.0231, "theoretical_loss": 3.7997061952501587, "tokens_seen": 664275968 }, { "epoch": 1.1, "learning_rate": 0.0004034102306920762, "loss": 2.9889, "theoretical_loss": 3.7996677777491072, "tokens_seen": 664341504 }, { "epoch": 1.1, "learning_rate": 0.00040340020060180545, "loss": 2.87, "theoretical_loss": 3.799629365098709, "tokens_seen": 664407040 }, { "epoch": 1.1, "learning_rate": 0.0004033901705115346, "loss": 3.0142, "theoretical_loss": 3.7995909572978723, "tokens_seen": 664472576 }, { "epoch": 1.1, "learning_rate": 0.0004033801404212638, "loss": 2.9075, "theoretical_loss": 3.799552554345508, "tokens_seen": 664538112 }, { "epoch": 1.1, "learning_rate": 0.000403370110330993, "loss": 2.9522, "theoretical_loss": 3.799514156240525, "tokens_seen": 664603648 }, { "epoch": 1.1, "learning_rate": 0.0004033600802407222, "loss": 2.9993, "theoretical_loss": 3.799475762981834, "tokens_seen": 664669184 }, { "epoch": 1.1, "learning_rate": 0.00040335005015045136, "loss": 2.9891, "theoretical_loss": 3.7994373745683454, "tokens_seen": 664734720 }, { "epoch": 1.1, "learning_rate": 0.00040334002006018054, "loss": 3.048, "theoretical_loss": 3.79939899099897, "tokens_seen": 664800256 }, { "epoch": 1.1, "learning_rate": 0.0004033299899699097, "loss": 2.9921, "theoretical_loss": 3.7993606122726193, "tokens_seen": 664865792 }, { "epoch": 1.1, "learning_rate": 0.00040331995987963895, "loss": 3.0883, "theoretical_loss": 3.7993222383882053, "tokens_seen": 664931328 }, { "epoch": 1.1, "learning_rate": 0.0004033099297893681, "loss": 3.0211, "theoretical_loss": 3.7992838693446394, "tokens_seen": 664996864 }, { "epoch": 1.1, "learning_rate": 0.0004032998996990973, "loss": 3.0947, "theoretical_loss": 3.799245505140834, "tokens_seen": 665062400 }, { "epoch": 1.1, "learning_rate": 0.00040328986960882644, "loss": 3.0382, "theoretical_loss": 3.7992071457757017, "tokens_seen": 665127936 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9202823638916016, "objective/train/theoretical_loss": 3.7991783794265785, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.7991783794265785, "tokens_seen": 665177088 }, { "epoch": 1.1, "learning_rate": 0.0004032798395185557, "loss": 3.0427, "theoretical_loss": 3.7991687912481567, "tokens_seen": 665193472 }, { "epoch": 1.1, "learning_rate": 0.00040326980942828486, "loss": 2.8885, "theoretical_loss": 3.7991304415571108, "tokens_seen": 665259008 }, { "epoch": 1.1, "learning_rate": 0.00040325977933801404, "loss": 2.9755, "theoretical_loss": 3.7990920967014787, "tokens_seen": 665324544 }, { "epoch": 1.1, "learning_rate": 0.0004032497492477432, "loss": 3.0355, "theoretical_loss": 3.7990537566801743, "tokens_seen": 665390080 }, { "epoch": 1.1, "learning_rate": 0.0004032397191574724, "loss": 2.9001, "theoretical_loss": 3.799015421492112, "tokens_seen": 665455616 }, { "epoch": 1.1, "learning_rate": 0.0004032296890672016, "loss": 3.1305, "theoretical_loss": 3.798977091136207, "tokens_seen": 665521152 }, { "epoch": 1.1, "learning_rate": 0.0004032196589769308, "loss": 3.0405, "theoretical_loss": 3.798938765611374, "tokens_seen": 665586688 }, { "epoch": 1.1, "learning_rate": 0.00040320962888665995, "loss": 3.0552, "theoretical_loss": 3.7989004449165282, "tokens_seen": 665652224 }, { "epoch": 1.1, "learning_rate": 0.0004031995987963892, "loss": 3.0859, "theoretical_loss": 3.798862129050586, "tokens_seen": 665717760 }, { "epoch": 1.1, "learning_rate": 0.00040318956870611836, "loss": 3.1076, "theoretical_loss": 3.7988238180124636, "tokens_seen": 665783296 }, { "epoch": 1.1, "learning_rate": 0.00040317953861584754, "loss": 3.0245, "theoretical_loss": 3.7987855118010776, "tokens_seen": 665848832 }, { "epoch": 1.1, "learning_rate": 0.0004031695085255767, "loss": 3.0242, "theoretical_loss": 3.798747210415345, "tokens_seen": 665914368 }, { "epoch": 1.1, "learning_rate": 0.0004031594784353059, "loss": 2.9378, "theoretical_loss": 3.798708913854182, "tokens_seen": 665979904 }, { "epoch": 1.1, "learning_rate": 0.00040314944834503514, "loss": 3.117, "theoretical_loss": 3.7986706221165076, "tokens_seen": 666045440 }, { "epoch": 1.1, "learning_rate": 0.0004031394182547643, "loss": 3.1124, "theoretical_loss": 3.7986323352012388, "tokens_seen": 666110976 }, { "epoch": 1.1, "learning_rate": 0.0004031293881644935, "loss": 3.062, "theoretical_loss": 3.798594053107295, "tokens_seen": 666176512 }, { "epoch": 1.1, "learning_rate": 0.0004031193580742227, "loss": 3.024, "theoretical_loss": 3.7985557758335933, "tokens_seen": 666242048 }, { "epoch": 1.1, "learning_rate": 0.00040310932798395187, "loss": 3.0145, "theoretical_loss": 3.798517503379054, "tokens_seen": 666307584 }, { "epoch": 1.1, "learning_rate": 0.00040309929789368105, "loss": 3.0033, "theoretical_loss": 3.7984792357425956, "tokens_seen": 666373120 }, { "epoch": 1.1, "learning_rate": 0.0004030892678034103, "loss": 2.996, "theoretical_loss": 3.7984409729231383, "tokens_seen": 666438656 }, { "epoch": 1.1, "learning_rate": 0.0004030792377131394, "loss": 2.85, "theoretical_loss": 3.7984027149196025, "tokens_seen": 666504192 }, { "epoch": 1.1, "learning_rate": 0.00040306920762286864, "loss": 2.937, "theoretical_loss": 3.7983644617309076, "tokens_seen": 666569728 }, { "epoch": 1.1, "learning_rate": 0.00040305917753259777, "loss": 3.1244, "theoretical_loss": 3.798326213355975, "tokens_seen": 666635264 }, { "epoch": 1.1, "learning_rate": 0.000403049147442327, "loss": 2.8916, "theoretical_loss": 3.7982879697937255, "tokens_seen": 666700800 }, { "epoch": 1.1, "learning_rate": 0.0004030391173520562, "loss": 2.9626, "theoretical_loss": 3.7982497310430814, "tokens_seen": 666766336 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9567999839782715, "objective/train/theoretical_loss": 3.798221055137065, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.798221055137065, "tokens_seen": 666815488 }, { "epoch": 1.1, "learning_rate": 0.00040302908726178537, "loss": 2.9158, "theoretical_loss": 3.7982114971029635, "tokens_seen": 666831872 }, { "epoch": 1.1, "learning_rate": 0.00040301905717151455, "loss": 2.8955, "theoretical_loss": 3.7981732679722944, "tokens_seen": 666897408 }, { "epoch": 1.1, "learning_rate": 0.0004030090270812438, "loss": 2.9283, "theoretical_loss": 3.798135043649996, "tokens_seen": 666962944 }, { "epoch": 1.1, "learning_rate": 0.0004029989969909729, "loss": 2.9842, "theoretical_loss": 3.7980968241349915, "tokens_seen": 667028480 }, { "epoch": 1.1, "learning_rate": 0.00040298896690070215, "loss": 2.913, "theoretical_loss": 3.7980586094262043, "tokens_seen": 667094016 }, { "epoch": 1.1, "learning_rate": 0.0004029789368104313, "loss": 3.2272, "theoretical_loss": 3.798020399522558, "tokens_seen": 667159552 }, { "epoch": 1.1, "learning_rate": 0.0004029689067201605, "loss": 3.0157, "theoretical_loss": 3.7979821944229757, "tokens_seen": 667225088 }, { "epoch": 1.1, "learning_rate": 0.0004029588766298897, "loss": 3.0106, "theoretical_loss": 3.7979439941263826, "tokens_seen": 667290624 }, { "epoch": 1.1, "learning_rate": 0.00040294884653961887, "loss": 3.0358, "theoretical_loss": 3.7979057986317026, "tokens_seen": 667356160 }, { "epoch": 1.1, "learning_rate": 0.00040293881644934805, "loss": 3.0217, "theoretical_loss": 3.797867607937861, "tokens_seen": 667421696 }, { "epoch": 1.1, "learning_rate": 0.00040292878635907723, "loss": 2.925, "theoretical_loss": 3.797829422043783, "tokens_seen": 667487232 }, { "epoch": 1.1, "learning_rate": 0.0004029187562688064, "loss": 3.108, "theoretical_loss": 3.797791240948394, "tokens_seen": 667552768 }, { "epoch": 1.1, "learning_rate": 0.00040290872617853565, "loss": 3.0812, "theoretical_loss": 3.79775306465062, "tokens_seen": 667618304 }, { "epoch": 1.1, "learning_rate": 0.0004028986960882648, "loss": 2.8977, "theoretical_loss": 3.7977148931493874, "tokens_seen": 667683840 }, { "epoch": 1.1, "learning_rate": 0.000402888665997994, "loss": 3.0495, "theoretical_loss": 3.797676726443622, "tokens_seen": 667749376 }, { "epoch": 1.1, "learning_rate": 0.0004028786359077232, "loss": 2.9763, "theoretical_loss": 3.797638564532252, "tokens_seen": 667814912 }, { "epoch": 1.1, "learning_rate": 0.0004028686058174524, "loss": 2.9702, "theoretical_loss": 3.7976004074142047, "tokens_seen": 667880448 }, { "epoch": 1.1, "learning_rate": 0.00040285857572718156, "loss": 3.1216, "theoretical_loss": 3.7975622550884074, "tokens_seen": 667945984 }, { "epoch": 1.1, "learning_rate": 0.00040284854563691074, "loss": 2.9972, "theoretical_loss": 3.797524107553788, "tokens_seen": 668011520 }, { "epoch": 1.1, "learning_rate": 0.0004028385155466399, "loss": 3.0978, "theoretical_loss": 3.7974859648092747, "tokens_seen": 668077056 }, { "epoch": 1.1, "learning_rate": 0.00040282848545636915, "loss": 3.0068, "theoretical_loss": 3.797447826853797, "tokens_seen": 668142592 }, { "epoch": 1.1, "learning_rate": 0.0004028184553660983, "loss": 2.7996, "theoretical_loss": 3.7974096936862827, "tokens_seen": 668208128 }, { "epoch": 1.1, "learning_rate": 0.0004028084252758275, "loss": 3.1167, "theoretical_loss": 3.7973715653056628, "tokens_seen": 668273664 }, { "epoch": 1.1, "learning_rate": 0.00040279839518555664, "loss": 3.0551, "theoretical_loss": 3.7973334417108653, "tokens_seen": 668339200 }, { "epoch": 1.1, "learning_rate": 0.0004027883650952859, "loss": 2.9823, "theoretical_loss": 3.7972953229008217, "tokens_seen": 668404736 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9505157470703125, "objective/train/theoretical_loss": 3.79726673693264, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.79726673693264, "tokens_seen": 668453888 }, { "epoch": 1.1, "learning_rate": 0.00040277833500501506, "loss": 2.977, "theoretical_loss": 3.797257208874462, "tokens_seen": 668470272 }, { "epoch": 1.1, "learning_rate": 0.00040276830491474424, "loss": 2.7917, "theoretical_loss": 3.7972190996307162, "tokens_seen": 668535808 }, { "epoch": 1.1, "learning_rate": 0.0004027582748244734, "loss": 2.9305, "theoretical_loss": 3.7971809951685165, "tokens_seen": 668601344 }, { "epoch": 1.1, "learning_rate": 0.0004027482447342026, "loss": 3.145, "theoretical_loss": 3.797142895486794, "tokens_seen": 668666880 }, { "epoch": 1.1, "learning_rate": 0.0004027382146439318, "loss": 2.993, "theoretical_loss": 3.7971048005844805, "tokens_seen": 668732416 }, { "epoch": 1.1, "learning_rate": 0.000402728184553661, "loss": 2.9905, "theoretical_loss": 3.7970667104605083, "tokens_seen": 668797952 }, { "epoch": 1.1, "learning_rate": 0.00040271815446339015, "loss": 3.0559, "theoretical_loss": 3.7970286251138097, "tokens_seen": 668863488 }, { "epoch": 1.1, "learning_rate": 0.0004027081243731194, "loss": 3.0376, "theoretical_loss": 3.796990544543317, "tokens_seen": 668929024 }, { "epoch": 1.1, "learning_rate": 0.00040269809428284856, "loss": 3.0791, "theoretical_loss": 3.796952468747965, "tokens_seen": 668994560 }, { "epoch": 1.1, "learning_rate": 0.00040268806419257774, "loss": 3.053, "theoretical_loss": 3.7969143977266855, "tokens_seen": 669060096 }, { "epoch": 1.1, "learning_rate": 0.0004026780341023069, "loss": 2.8902, "theoretical_loss": 3.796876331478413, "tokens_seen": 669125632 }, { "epoch": 1.1, "learning_rate": 0.0004026680040120361, "loss": 3.1176, "theoretical_loss": 3.7968382700020813, "tokens_seen": 669191168 }, { "epoch": 1.1, "learning_rate": 0.0004026579739217653, "loss": 3.0453, "theoretical_loss": 3.796800213296626, "tokens_seen": 669256704 }, { "epoch": 1.1, "learning_rate": 0.0004026479438314945, "loss": 2.8505, "theoretical_loss": 3.7967621613609817, "tokens_seen": 669322240 }, { "epoch": 1.1, "learning_rate": 0.00040263791374122365, "loss": 3.085, "theoretical_loss": 3.796724114194083, "tokens_seen": 669387776 }, { "epoch": 1.1, "learning_rate": 0.0004026278836509529, "loss": 2.9303, "theoretical_loss": 3.7966860717948654, "tokens_seen": 669453312 }, { "epoch": 1.1, "learning_rate": 0.000402617853560682, "loss": 3.1743, "theoretical_loss": 3.7966480341622653, "tokens_seen": 669518848 }, { "epoch": 1.1, "learning_rate": 0.00040260782347041125, "loss": 2.9876, "theoretical_loss": 3.7966100012952193, "tokens_seen": 669584384 }, { "epoch": 1.1, "learning_rate": 0.00040259779338014043, "loss": 2.8396, "theoretical_loss": 3.7965719731926635, "tokens_seen": 669649920 }, { "epoch": 1.1, "learning_rate": 0.0004025877632898696, "loss": 2.811, "theoretical_loss": 3.796533949853535, "tokens_seen": 669715456 }, { "epoch": 1.1, "learning_rate": 0.0004025777331995988, "loss": 3.0848, "theoretical_loss": 3.7964959312767705, "tokens_seen": 669780992 }, { "epoch": 1.1, "learning_rate": 0.00040256770310932797, "loss": 3.0487, "theoretical_loss": 3.796457917461309, "tokens_seen": 669846528 }, { "epoch": 1.1, "learning_rate": 0.00040255767301905715, "loss": 3.1179, "theoretical_loss": 3.796419908406087, "tokens_seen": 669912064 }, { "epoch": 1.1, "learning_rate": 0.0004025476429287864, "loss": 2.9617, "theoretical_loss": 3.796381904110044, "tokens_seen": 669977600 }, { "epoch": 1.1, "learning_rate": 0.0004025376128385155, "loss": 3.1272, "theoretical_loss": 3.796343904572117, "tokens_seen": 670043136 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2151005268096924, "objective/train/theoretical_loss": 3.7963154080405483, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.7963154080405483, "tokens_seen": 670092288 }, { "epoch": 1.1, "learning_rate": 0.00040252758274824475, "loss": 3.1287, "theoretical_loss": 3.796305909791247, "tokens_seen": 670108672 }, { "epoch": 1.1, "learning_rate": 0.00040251755265797393, "loss": 3.0029, "theoretical_loss": 3.796267919766372, "tokens_seen": 670174208 }, { "epoch": 1.1, "learning_rate": 0.0004025075225677031, "loss": 2.7903, "theoretical_loss": 3.796229934496432, "tokens_seen": 670239744 }, { "epoch": 1.1, "learning_rate": 0.0004024974924774323, "loss": 2.9964, "theoretical_loss": 3.796191953980367, "tokens_seen": 670305280 }, { "epoch": 1.1, "learning_rate": 0.0004024874623871615, "loss": 3.0367, "theoretical_loss": 3.796153978217118, "tokens_seen": 670370816 }, { "epoch": 1.1, "learning_rate": 0.00040247743229689066, "loss": 2.9946, "theoretical_loss": 3.796116007205624, "tokens_seen": 670436352 }, { "epoch": 1.1, "learning_rate": 0.0004024674022066199, "loss": 3.0284, "theoretical_loss": 3.7960780409448285, "tokens_seen": 670501888 }, { "epoch": 1.1, "learning_rate": 0.000402457372116349, "loss": 2.9603, "theoretical_loss": 3.796040079433671, "tokens_seen": 670567424 }, { "epoch": 1.1, "learning_rate": 0.00040244734202607825, "loss": 2.871, "theoretical_loss": 3.7960021226710934, "tokens_seen": 670632960 }, { "epoch": 1.1, "learning_rate": 0.0004024373119358074, "loss": 3.0646, "theoretical_loss": 3.7959641706560383, "tokens_seen": 670698496 }, { "epoch": 1.1, "learning_rate": 0.0004024272818455366, "loss": 2.954, "theoretical_loss": 3.795926223387448, "tokens_seen": 670764032 }, { "epoch": 1.1, "learning_rate": 0.0004024172517552658, "loss": 3.0196, "theoretical_loss": 3.795888280864264, "tokens_seen": 670829568 }, { "epoch": 1.1, "learning_rate": 0.000402407221664995, "loss": 2.8166, "theoretical_loss": 3.7958503430854313, "tokens_seen": 670895104 }, { "epoch": 1.1, "learning_rate": 0.0004023971915747242, "loss": 2.7983, "theoretical_loss": 3.7958124100498924, "tokens_seen": 670960640 }, { "epoch": 1.1, "learning_rate": 0.0004023871614844534, "loss": 3.0728, "theoretical_loss": 3.795774481756591, "tokens_seen": 671026176 }, { "epoch": 1.1, "learning_rate": 0.0004023771313941826, "loss": 3.0102, "theoretical_loss": 3.795736558204471, "tokens_seen": 671091712 }, { "epoch": 1.1, "learning_rate": 0.00040236710130391176, "loss": 2.9347, "theoretical_loss": 3.7956986393924774, "tokens_seen": 671157248 }, { "epoch": 1.1, "learning_rate": 0.00040235707121364094, "loss": 2.8665, "theoretical_loss": 3.795660725319555, "tokens_seen": 671222784 }, { "epoch": 1.1, "learning_rate": 0.0004023470411233701, "loss": 2.9912, "theoretical_loss": 3.7956228159846477, "tokens_seen": 671288320 }, { "epoch": 1.1, "learning_rate": 0.00040233701103309935, "loss": 3.0794, "theoretical_loss": 3.795584911386702, "tokens_seen": 671353856 }, { "epoch": 1.1, "learning_rate": 0.0004023269809428285, "loss": 3.0247, "theoretical_loss": 3.795547011524664, "tokens_seen": 671419392 }, { "epoch": 1.1, "learning_rate": 0.0004023169508525577, "loss": 3.0519, "theoretical_loss": 3.7955091163974783, "tokens_seen": 671484928 }, { "epoch": 1.1, "learning_rate": 0.00040230692076228684, "loss": 3.0607, "theoretical_loss": 3.795471226004093, "tokens_seen": 671550464 }, { "epoch": 1.1, "learning_rate": 0.0004022968906720161, "loss": 3.1547, "theoretical_loss": 3.795433340343454, "tokens_seen": 671616000 }, { "epoch": 1.1, "learning_rate": 0.00040228686058174526, "loss": 2.8858, "theoretical_loss": 3.7953954594145083, "tokens_seen": 671681536 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.006545066833496, "objective/train/theoretical_loss": 3.79536705182234, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.79536705182234, "tokens_seen": 671730688 }, { "epoch": 1.1, "learning_rate": 0.00040227683049147444, "loss": 2.969, "theoretical_loss": 3.7953575832162034, "tokens_seen": 671747072 }, { "epoch": 1.1, "learning_rate": 0.0004022668004012036, "loss": 2.9623, "theoretical_loss": 3.7953197117474877, "tokens_seen": 671812608 }, { "epoch": 1.1, "learning_rate": 0.0004022567703109328, "loss": 2.8528, "theoretical_loss": 3.7952818450073087, "tokens_seen": 671878144 }, { "epoch": 1.1, "learning_rate": 0.000402246740220662, "loss": 2.8706, "theoretical_loss": 3.795243982994615, "tokens_seen": 671943680 }, { "epoch": 1.1, "learning_rate": 0.0004022367101303912, "loss": 2.913, "theoretical_loss": 3.795206125708356, "tokens_seen": 672009216 }, { "epoch": 1.1, "learning_rate": 0.00040222668004012035, "loss": 2.911, "theoretical_loss": 3.79516827314748, "tokens_seen": 672074752 }, { "epoch": 1.1, "learning_rate": 0.0004022166499498496, "loss": 3.1102, "theoretical_loss": 3.795130425310937, "tokens_seen": 672140288 }, { "epoch": 1.1, "learning_rate": 0.00040220661985957876, "loss": 2.9834, "theoretical_loss": 3.7950925821976766, "tokens_seen": 672205824 }, { "epoch": 1.1, "learning_rate": 0.00040219658976930794, "loss": 2.846, "theoretical_loss": 3.7950547438066486, "tokens_seen": 672271360 }, { "epoch": 1.1, "learning_rate": 0.0004021865596790371, "loss": 2.9934, "theoretical_loss": 3.795016910136804, "tokens_seen": 672336896 }, { "epoch": 1.1, "learning_rate": 0.0004021765295887663, "loss": 2.943, "theoretical_loss": 3.794979081187094, "tokens_seen": 672402432 }, { "epoch": 1.1, "learning_rate": 0.0004021664994984955, "loss": 2.9447, "theoretical_loss": 3.7949412569564682, "tokens_seen": 672467968 }, { "epoch": 1.1, "learning_rate": 0.0004021564694082247, "loss": 2.967, "theoretical_loss": 3.79490343744388, "tokens_seen": 672533504 }, { "epoch": 1.1, "learning_rate": 0.00040214643931795385, "loss": 3.002, "theoretical_loss": 3.79486562264828, "tokens_seen": 672599040 }, { "epoch": 1.1, "learning_rate": 0.0004021364092276831, "loss": 3.0825, "theoretical_loss": 3.7948278125686206, "tokens_seen": 672664576 }, { "epoch": 1.1, "learning_rate": 0.0004021263791374122, "loss": 2.8804, "theoretical_loss": 3.7947900072038547, "tokens_seen": 672730112 }, { "epoch": 1.1, "learning_rate": 0.00040211634904714145, "loss": 3.036, "theoretical_loss": 3.7947522065529347, "tokens_seen": 672795648 }, { "epoch": 1.1, "learning_rate": 0.00040210631895687063, "loss": 2.8976, "theoretical_loss": 3.794714410614813, "tokens_seen": 672861184 }, { "epoch": 1.1, "learning_rate": 0.0004020962888665998, "loss": 2.9148, "theoretical_loss": 3.7946766193884454, "tokens_seen": 672926720 }, { "epoch": 1.1, "learning_rate": 0.000402086258776329, "loss": 3.0232, "theoretical_loss": 3.794638832872783, "tokens_seen": 672992256 }, { "epoch": 1.1, "learning_rate": 0.00040207622868605817, "loss": 2.9898, "theoretical_loss": 3.794601051066782, "tokens_seen": 673057792 }, { "epoch": 1.1, "learning_rate": 0.00040206619859578735, "loss": 3.0968, "theoretical_loss": 3.7945632739693957, "tokens_seen": 673123328 }, { "epoch": 1.1, "learning_rate": 0.0004020561685055166, "loss": 3.0128, "theoretical_loss": 3.79452550157958, "tokens_seen": 673188864 }, { "epoch": 1.1, "learning_rate": 0.0004020461384152457, "loss": 3.1051, "theoretical_loss": 3.794487733896289, "tokens_seen": 673254400 }, { "epoch": 1.1, "learning_rate": 0.00040203610832497495, "loss": 2.9246, "theoretical_loss": 3.7944499709184782, "tokens_seen": 673319936 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9991118907928467, "objective/train/theoretical_loss": 3.7944216517724643, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.7944216517724643, "tokens_seen": 673369088 }, { "epoch": 1.1, "learning_rate": 0.00040202607823470413, "loss": 3.0845, "theoretical_loss": 3.7944122126451045, "tokens_seen": 673385472 }, { "epoch": 1.1, "learning_rate": 0.0004020160481444333, "loss": 3.0255, "theoretical_loss": 3.794374459075123, "tokens_seen": 673451008 }, { "epoch": 1.1, "learning_rate": 0.0004020060180541625, "loss": 3.1477, "theoretical_loss": 3.7943367102074914, "tokens_seen": 673516544 }, { "epoch": 1.1, "learning_rate": 0.0004019959879638917, "loss": 3.0929, "theoretical_loss": 3.794298966041165, "tokens_seen": 673582080 }, { "epoch": 1.1, "learning_rate": 0.00040198595787362086, "loss": 2.923, "theoretical_loss": 3.7942612265751023, "tokens_seen": 673647616 }, { "epoch": 1.1, "learning_rate": 0.0004019759277833501, "loss": 2.9286, "theoretical_loss": 3.7942234918082596, "tokens_seen": 673713152 }, { "epoch": 1.1, "learning_rate": 0.0004019658976930792, "loss": 2.9132, "theoretical_loss": 3.794185761739596, "tokens_seen": 673778688 }, { "epoch": 1.1, "learning_rate": 0.00040195586760280845, "loss": 2.9874, "theoretical_loss": 3.7941480363680684, "tokens_seen": 673844224 }, { "epoch": 1.1, "learning_rate": 0.0004019458375125376, "loss": 2.8137, "theoretical_loss": 3.7941103156926363, "tokens_seen": 673909760 }, { "epoch": 1.1, "learning_rate": 0.0004019358074222668, "loss": 3.0107, "theoretical_loss": 3.7940725997122584, "tokens_seen": 673975296 }, { "epoch": 1.1, "learning_rate": 0.000401925777331996, "loss": 3.1353, "theoretical_loss": 3.794034888425893, "tokens_seen": 674040832 }, { "epoch": 1.1, "learning_rate": 0.0004019157472417252, "loss": 3.083, "theoretical_loss": 3.7939971818325002, "tokens_seen": 674106368 }, { "epoch": 1.1, "learning_rate": 0.00040190571715145436, "loss": 2.9363, "theoretical_loss": 3.7939594799310408, "tokens_seen": 674171904 }, { "epoch": 1.1, "learning_rate": 0.0004018956870611836, "loss": 3.0116, "theoretical_loss": 3.793921782720473, "tokens_seen": 674237440 }, { "epoch": 1.1, "learning_rate": 0.0004018856569709127, "loss": 3.0038, "theoretical_loss": 3.793884090199758, "tokens_seen": 674302976 }, { "epoch": 1.1, "learning_rate": 0.00040187562688064196, "loss": 2.911, "theoretical_loss": 3.7938464023678575, "tokens_seen": 674368512 }, { "epoch": 1.1, "learning_rate": 0.0004018655967903711, "loss": 2.9001, "theoretical_loss": 3.7938087192237324, "tokens_seen": 674434048 }, { "epoch": 1.1, "learning_rate": 0.0004018555667001003, "loss": 3.0449, "theoretical_loss": 3.793771040766343, "tokens_seen": 674499584 }, { "epoch": 1.1, "learning_rate": 0.0004018455366098295, "loss": 2.969, "theoretical_loss": 3.7937333669946525, "tokens_seen": 674565120 }, { "epoch": 1.1, "learning_rate": 0.0004018355065195587, "loss": 3.1923, "theoretical_loss": 3.793695697907622, "tokens_seen": 674630656 }, { "epoch": 1.1, "learning_rate": 0.00040182547642928786, "loss": 3.0447, "theoretical_loss": 3.7936580335042147, "tokens_seen": 674696192 }, { "epoch": 1.1, "learning_rate": 0.00040181544633901704, "loss": 3.0636, "theoretical_loss": 3.7936203737833933, "tokens_seen": 674761728 }, { "epoch": 1.1, "learning_rate": 0.0004018054162487462, "loss": 2.9297, "theoretical_loss": 3.7935827187441205, "tokens_seen": 674827264 }, { "epoch": 1.1, "learning_rate": 0.00040179538615847546, "loss": 2.9586, "theoretical_loss": 3.79354506838536, "tokens_seen": 674892800 }, { "epoch": 1.1, "learning_rate": 0.0004017853560682046, "loss": 3.1051, "theoretical_loss": 3.7935074227060754, "tokens_seen": 674958336 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.083796739578247, "objective/train/theoretical_loss": 3.7934791915168953, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.7934791915168953, "tokens_seen": 675007488 }, { "epoch": 1.1, "learning_rate": 0.0004017753259779338, "loss": 3.0721, "theoretical_loss": 3.793469781705231, "tokens_seen": 675023872 }, { "epoch": 1.1, "learning_rate": 0.00040176529588766295, "loss": 3.0254, "theoretical_loss": 3.793432145381792, "tokens_seen": 675089408 }, { "epoch": 1.1, "learning_rate": 0.0004017552657973922, "loss": 2.9069, "theoretical_loss": 3.793394513734722, "tokens_seen": 675154944 }, { "epoch": 1.1, "learning_rate": 0.00040174523570712137, "loss": 2.9108, "theoretical_loss": 3.7933568867629868, "tokens_seen": 675220480 }, { "epoch": 1.1, "learning_rate": 0.00040173520561685055, "loss": 2.8709, "theoretical_loss": 3.793319264465551, "tokens_seen": 675286016 }, { "epoch": 1.1, "learning_rate": 0.00040172517552657973, "loss": 2.9768, "theoretical_loss": 3.7932816468413817, "tokens_seen": 675351552 }, { "epoch": 1.1, "learning_rate": 0.00040171514543630896, "loss": 2.9531, "theoretical_loss": 3.7932440338894438, "tokens_seen": 675417088 }, { "epoch": 1.1, "learning_rate": 0.0004017051153460381, "loss": 2.8731, "theoretical_loss": 3.793206425608705, "tokens_seen": 675482624 }, { "epoch": 1.1, "learning_rate": 0.0004016950852557673, "loss": 2.9984, "theoretical_loss": 3.7931688219981305, "tokens_seen": 675548160 }, { "epoch": 1.1, "learning_rate": 0.00040168505516549645, "loss": 3.035, "theoretical_loss": 3.7931312230566885, "tokens_seen": 675613696 }, { "epoch": 1.1, "learning_rate": 0.0004016750250752257, "loss": 3.0731, "theoretical_loss": 3.793093628783346, "tokens_seen": 675679232 }, { "epoch": 1.1, "learning_rate": 0.00040166499498495487, "loss": 2.875, "theoretical_loss": 3.793056039177071, "tokens_seen": 675744768 }, { "epoch": 1.1, "learning_rate": 0.00040165496489468405, "loss": 2.9559, "theoretical_loss": 3.7930184542368313, "tokens_seen": 675810304 }, { "epoch": 1.1, "learning_rate": 0.0004016449348044133, "loss": 3.0803, "theoretical_loss": 3.7929808739615956, "tokens_seen": 675875840 }, { "epoch": 1.1, "learning_rate": 0.0004016349047141424, "loss": 2.8949, "theoretical_loss": 3.792943298350332, "tokens_seen": 675941376 }, { "epoch": 1.1, "learning_rate": 0.00040162487462387165, "loss": 3.081, "theoretical_loss": 3.7929057274020104, "tokens_seen": 676006912 }, { "epoch": 1.1, "learning_rate": 0.00040161484453360083, "loss": 2.9874, "theoretical_loss": 3.7928681611155994, "tokens_seen": 676072448 }, { "epoch": 1.1, "learning_rate": 0.00040160481444333, "loss": 2.9976, "theoretical_loss": 3.7928305994900695, "tokens_seen": 676137984 }, { "epoch": 1.1, "learning_rate": 0.0004015947843530592, "loss": 2.9832, "theoretical_loss": 3.7927930425243903, "tokens_seen": 676203520 }, { "epoch": 1.1, "learning_rate": 0.00040158475426278837, "loss": 2.9831, "theoretical_loss": 3.792755490217532, "tokens_seen": 676269056 }, { "epoch": 1.1, "learning_rate": 0.00040157472417251755, "loss": 2.8532, "theoretical_loss": 3.7927179425684656, "tokens_seen": 676334592 }, { "epoch": 1.1, "learning_rate": 0.0004015646940822468, "loss": 2.8491, "theoretical_loss": 3.792680399576162, "tokens_seen": 676400128 }, { "epoch": 1.1, "learning_rate": 0.0004015546639919759, "loss": 2.9103, "theoretical_loss": 3.7926428612395924, "tokens_seen": 676465664 }, { "epoch": 1.1, "learning_rate": 0.00040154463390170515, "loss": 2.96, "theoretical_loss": 3.7926053275577285, "tokens_seen": 676531200 }, { "epoch": 1.1, "learning_rate": 0.00040153460381143433, "loss": 2.9101, "theoretical_loss": 3.792567798529543, "tokens_seen": 676596736 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.127293348312378, "objective/train/theoretical_loss": 3.792539654811762, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.792539654811762, "tokens_seen": 676645888 }, { "epoch": 1.1, "learning_rate": 0.0004015245737211635, "loss": 3.0355, "theoretical_loss": 3.7925302741540072, "tokens_seen": 676662272 }, { "epoch": 1.1, "learning_rate": 0.0004015145436308927, "loss": 2.9405, "theoretical_loss": 3.7924927544300946, "tokens_seen": 676727808 }, { "epoch": 1.1, "learning_rate": 0.0004015045135406219, "loss": 3.1102, "theoretical_loss": 3.7924552393567774, "tokens_seen": 676793344 }, { "epoch": 1.1, "learning_rate": 0.00040149448345035106, "loss": 2.8451, "theoretical_loss": 3.79241772893303, "tokens_seen": 676858880 }, { "epoch": 1.1, "learning_rate": 0.0004014844533600803, "loss": 3.0202, "theoretical_loss": 3.7923802231578247, "tokens_seen": 676924416 }, { "epoch": 1.1, "learning_rate": 0.0004014744232698094, "loss": 3.0362, "theoretical_loss": 3.7923427220301367, "tokens_seen": 676989952 }, { "epoch": 1.1, "learning_rate": 0.00040146439317953865, "loss": 3.0488, "theoretical_loss": 3.792305225548939, "tokens_seen": 677055488 }, { "epoch": 1.1, "learning_rate": 0.0004014543630892678, "loss": 2.9705, "theoretical_loss": 3.7922677337132074, "tokens_seen": 677121024 }, { "epoch": 1.1, "learning_rate": 0.000401444332998997, "loss": 2.8486, "theoretical_loss": 3.7922302465219158, "tokens_seen": 677186560 }, { "epoch": 1.1, "learning_rate": 0.0004014343029087262, "loss": 2.9505, "theoretical_loss": 3.7921927639740405, "tokens_seen": 677252096 }, { "epoch": 1.1, "learning_rate": 0.0004014242728184554, "loss": 2.9307, "theoretical_loss": 3.7921552860685566, "tokens_seen": 677317632 }, { "epoch": 1.1, "learning_rate": 0.00040141424272818456, "loss": 2.9082, "theoretical_loss": 3.7921178128044395, "tokens_seen": 677383168 }, { "epoch": 1.1, "learning_rate": 0.0004014042126379138, "loss": 2.965, "theoretical_loss": 3.7920803441806665, "tokens_seen": 677448704 }, { "epoch": 1.1, "learning_rate": 0.0004013941825476429, "loss": 2.9158, "theoretical_loss": 3.792042880196213, "tokens_seen": 677514240 }, { "epoch": 1.1, "learning_rate": 0.00040138415245737216, "loss": 3.0066, "theoretical_loss": 3.7920054208500567, "tokens_seen": 677579776 }, { "epoch": 1.1, "learning_rate": 0.0004013741223671013, "loss": 3.1009, "theoretical_loss": 3.7919679661411747, "tokens_seen": 677645312 }, { "epoch": 1.1, "learning_rate": 0.0004013640922768305, "loss": 2.928, "theoretical_loss": 3.7919305160685446, "tokens_seen": 677710848 }, { "epoch": 1.1, "learning_rate": 0.0004013540621865597, "loss": 3.0602, "theoretical_loss": 3.7918930706311444, "tokens_seen": 677776384 }, { "epoch": 1.1, "learning_rate": 0.0004013440320962889, "loss": 2.9736, "theoretical_loss": 3.791855629827951, "tokens_seen": 677841920 }, { "epoch": 1.1, "learning_rate": 0.00040133400200601806, "loss": 2.9825, "theoretical_loss": 3.7918181936579445, "tokens_seen": 677907456 }, { "epoch": 1.1, "learning_rate": 0.00040132397191574724, "loss": 3.0282, "theoretical_loss": 3.791780762120103, "tokens_seen": 677972992 }, { "epoch": 1.1, "learning_rate": 0.0004013139418254764, "loss": 2.9225, "theoretical_loss": 3.791743335213406, "tokens_seen": 678038528 }, { "epoch": 1.1, "learning_rate": 0.00040130391173520566, "loss": 3.0694, "theoretical_loss": 3.7917059129368322, "tokens_seen": 678104064 }, { "epoch": 1.1, "learning_rate": 0.0004012938816449348, "loss": 2.8796, "theoretical_loss": 3.7916684952893625, "tokens_seen": 678169600 }, { "epoch": 1.1, "learning_rate": 0.000401283851554664, "loss": 3.0038, "theoretical_loss": 3.7916310822699764, "tokens_seen": 678235136 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1594769954681396, "objective/train/theoretical_loss": 3.7916030255420035, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.7916030255420035, "tokens_seen": 678284288 }, { "epoch": 1.1, "learning_rate": 0.00040127382146439315, "loss": 3.0853, "theoretical_loss": 3.791593673877654, "tokens_seen": 678300672 }, { "epoch": 1.1, "learning_rate": 0.0004012637913741224, "loss": 3.0829, "theoretical_loss": 3.7915562701113767, "tokens_seen": 678366208 }, { "epoch": 1.1, "learning_rate": 0.00040125376128385157, "loss": 3.1149, "theoretical_loss": 3.791518870970126, "tokens_seen": 678431744 }, { "epoch": 1.1, "learning_rate": 0.00040124373119358075, "loss": 2.962, "theoretical_loss": 3.791481476452882, "tokens_seen": 678497280 }, { "epoch": 1.1, "learning_rate": 0.00040123370110330993, "loss": 3.0348, "theoretical_loss": 3.791444086558627, "tokens_seen": 678562816 }, { "epoch": 1.1, "learning_rate": 0.00040122367101303916, "loss": 3.1042, "theoretical_loss": 3.791406701286344, "tokens_seen": 678628352 }, { "epoch": 1.1, "learning_rate": 0.0004012136409227683, "loss": 2.8851, "theoretical_loss": 3.791369320635014, "tokens_seen": 678693888 }, { "epoch": 1.1, "learning_rate": 0.0004012036108324975, "loss": 3.1344, "theoretical_loss": 3.7913319446036207, "tokens_seen": 678759424 }, { "epoch": 1.1, "learning_rate": 0.00040119358074222665, "loss": 3.0345, "theoretical_loss": 3.7912945731911467, "tokens_seen": 678824960 }, { "epoch": 1.1, "learning_rate": 0.0004011835506519559, "loss": 3.1002, "theoretical_loss": 3.7912572063965753, "tokens_seen": 678890496 }, { "epoch": 1.1, "learning_rate": 0.00040117352056168507, "loss": 3.0901, "theoretical_loss": 3.79121984421889, "tokens_seen": 678956032 }, { "epoch": 1.1, "learning_rate": 0.00040116349047141425, "loss": 3.0043, "theoretical_loss": 3.791182486657075, "tokens_seen": 679021568 }, { "epoch": 1.1, "learning_rate": 0.00040115346038114343, "loss": 2.8639, "theoretical_loss": 3.791145133710115, "tokens_seen": 679087104 }, { "epoch": 1.1, "learning_rate": 0.0004011434302908726, "loss": 3.0222, "theoretical_loss": 3.7911077853769948, "tokens_seen": 679152640 }, { "epoch": 1.1, "learning_rate": 0.0004011334002006018, "loss": 3.0847, "theoretical_loss": 3.7910704416566983, "tokens_seen": 679218176 }, { "epoch": 1.1, "learning_rate": 0.00040112337011033103, "loss": 3.0975, "theoretical_loss": 3.7910331025482114, "tokens_seen": 679283712 }, { "epoch": 1.1, "learning_rate": 0.00040111334002006016, "loss": 3.0651, "theoretical_loss": 3.7909957680505197, "tokens_seen": 679349248 }, { "epoch": 1.1, "learning_rate": 0.0004011033099297894, "loss": 2.9078, "theoretical_loss": 3.7909584381626087, "tokens_seen": 679414784 }, { "epoch": 1.1, "learning_rate": 0.0004010932798395185, "loss": 2.8657, "theoretical_loss": 3.7909211128834652, "tokens_seen": 679480320 }, { "epoch": 1.1, "learning_rate": 0.00040108324974924775, "loss": 3.063, "theoretical_loss": 3.7908837922120764, "tokens_seen": 679545856 }, { "epoch": 1.1, "learning_rate": 0.00040107321965897693, "loss": 3.0652, "theoretical_loss": 3.7908464761474274, "tokens_seen": 679611392 }, { "epoch": 1.1, "learning_rate": 0.0004010631895687061, "loss": 2.8113, "theoretical_loss": 3.7908091646885067, "tokens_seen": 679676928 }, { "epoch": 1.1, "learning_rate": 0.0004010531594784353, "loss": 2.9407, "theoretical_loss": 3.7907718578343017, "tokens_seen": 679742464 }, { "epoch": 1.1, "learning_rate": 0.00040104312938816453, "loss": 3.0372, "theoretical_loss": 3.7907345555838, "tokens_seen": 679808000 }, { "epoch": 1.1, "learning_rate": 0.00040103309929789366, "loss": 2.7183, "theoretical_loss": 3.7906972579359897, "tokens_seen": 679873536 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.038749933242798, "objective/train/theoretical_loss": 3.79066928772004, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.79066928772004, "tokens_seen": 679922688 }, { "epoch": 1.1, "learning_rate": 0.0004010230692076229, "loss": 3.0765, "theoretical_loss": 3.7906599648898593, "tokens_seen": 679939072 }, { "epoch": 1.1, "learning_rate": 0.000401013039117352, "loss": 2.8678, "theoretical_loss": 3.790622676444398, "tokens_seen": 680004608 }, { "epoch": 1.1, "learning_rate": 0.00040100300902708126, "loss": 2.8117, "theoretical_loss": 3.7905853925985946, "tokens_seen": 680070144 }, { "epoch": 1.1, "learning_rate": 0.00040099297893681044, "loss": 2.9501, "theoretical_loss": 3.7905481133514387, "tokens_seen": 680135680 }, { "epoch": 1.1, "learning_rate": 0.0004009829488465396, "loss": 2.8699, "theoretical_loss": 3.7905108387019197, "tokens_seen": 680201216 }, { "epoch": 1.1, "learning_rate": 0.0004009729187562688, "loss": 2.7933, "theoretical_loss": 3.790473568649028, "tokens_seen": 680266752 }, { "epoch": 1.1, "learning_rate": 0.000400962888665998, "loss": 2.8067, "theoretical_loss": 3.790436303191754, "tokens_seen": 680332288 }, { "epoch": 1.1, "learning_rate": 0.00040095285857572716, "loss": 2.9151, "theoretical_loss": 3.7903990423290885, "tokens_seen": 680397824 }, { "epoch": 1.1, "learning_rate": 0.0004009428284854564, "loss": 2.9495, "theoretical_loss": 3.7903617860600223, "tokens_seen": 680463360 }, { "epoch": 1.1, "learning_rate": 0.0004009327983951855, "loss": 2.9199, "theoretical_loss": 3.7903245343835468, "tokens_seen": 680528896 }, { "epoch": 1.1, "learning_rate": 0.00040092276830491476, "loss": 2.9101, "theoretical_loss": 3.7902872872986535, "tokens_seen": 680594432 }, { "epoch": 1.1, "learning_rate": 0.0004009127382146439, "loss": 3.1254, "theoretical_loss": 3.790250044804335, "tokens_seen": 680659968 }, { "epoch": 1.1, "learning_rate": 0.0004009027081243731, "loss": 3.0645, "theoretical_loss": 3.7902128068995835, "tokens_seen": 680725504 }, { "epoch": 1.1, "learning_rate": 0.00040089267803410236, "loss": 2.9491, "theoretical_loss": 3.790175573583391, "tokens_seen": 680791040 }, { "epoch": 1.1, "learning_rate": 0.0004008826479438315, "loss": 3.0706, "theoretical_loss": 3.7901383448547508, "tokens_seen": 680856576 }, { "epoch": 1.1, "learning_rate": 0.0004008726178535607, "loss": 2.9777, "theoretical_loss": 3.7901011207126567, "tokens_seen": 680922112 }, { "epoch": 1.1, "learning_rate": 0.0004008625877632899, "loss": 2.7552, "theoretical_loss": 3.790063901156101, "tokens_seen": 680987648 }, { "epoch": 1.1, "learning_rate": 0.0004008525576730191, "loss": 2.928, "theoretical_loss": 3.7900266861840786, "tokens_seen": 681053184 }, { "epoch": 1.1, "learning_rate": 0.00040084252758274826, "loss": 2.8153, "theoretical_loss": 3.7899894757955837, "tokens_seen": 681118720 }, { "epoch": 1.1, "learning_rate": 0.00040083249749247744, "loss": 2.8616, "theoretical_loss": 3.7899522699896107, "tokens_seen": 681184256 }, { "epoch": 1.1, "learning_rate": 0.0004008224674022066, "loss": 3.01, "theoretical_loss": 3.789915068765155, "tokens_seen": 681249792 }, { "epoch": 1.1, "learning_rate": 0.00040081243731193586, "loss": 3.0608, "theoretical_loss": 3.7898778721212105, "tokens_seen": 681315328 }, { "epoch": 1.1, "learning_rate": 0.000400802407221665, "loss": 2.8689, "theoretical_loss": 3.7898406800567734, "tokens_seen": 681380864 }, { "epoch": 1.1, "learning_rate": 0.0004007923771313942, "loss": 2.9371, "theoretical_loss": 3.7898034925708393, "tokens_seen": 681446400 }, { "epoch": 1.1, "learning_rate": 0.00040078234704112335, "loss": 2.931, "theoretical_loss": 3.7897663096624052, "tokens_seen": 681511936 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1582694053649902, "objective/train/theoretical_loss": 3.789738425484459, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.789738425484459, "tokens_seen": 681561088 }, { "epoch": 1.1, "learning_rate": 0.0004007723169508526, "loss": 2.9088, "theoretical_loss": 3.7897291313304664, "tokens_seen": 681577472 }, { "epoch": 1.1, "learning_rate": 0.00040076228686058177, "loss": 3.0373, "theoretical_loss": 3.78969195757402, "tokens_seen": 681643008 }, { "epoch": 1.1, "learning_rate": 0.00040075225677031095, "loss": 2.8627, "theoretical_loss": 3.7896547883920633, "tokens_seen": 681708544 }, { "epoch": 1.1, "learning_rate": 0.00040074222668004013, "loss": 2.9627, "theoretical_loss": 3.789617623783594, "tokens_seen": 681774080 }, { "epoch": 1.1, "learning_rate": 0.00040073219658976936, "loss": 3.0354, "theoretical_loss": 3.789580463747609, "tokens_seen": 681839616 }, { "epoch": 1.1, "learning_rate": 0.0004007221664994985, "loss": 3.0551, "theoretical_loss": 3.7895433082831067, "tokens_seen": 681905152 }, { "epoch": 1.1, "learning_rate": 0.0004007121364092277, "loss": 3.0034, "theoretical_loss": 3.789506157389085, "tokens_seen": 681970688 }, { "epoch": 1.1, "learning_rate": 0.00040070210631895685, "loss": 2.887, "theoretical_loss": 3.789469011064544, "tokens_seen": 682036224 }, { "epoch": 1.1, "learning_rate": 0.0004006920762286861, "loss": 2.807, "theoretical_loss": 3.7894318693084807, "tokens_seen": 682101760 }, { "epoch": 1.1, "learning_rate": 0.00040068204613841527, "loss": 2.9916, "theoretical_loss": 3.789394732119896, "tokens_seen": 682167296 }, { "epoch": 1.1, "learning_rate": 0.00040067201604814445, "loss": 3.1673, "theoretical_loss": 3.789357599497789, "tokens_seen": 682232832 }, { "epoch": 1.1, "learning_rate": 0.00040066198595787363, "loss": 3.0066, "theoretical_loss": 3.789320471441159, "tokens_seen": 682298368 }, { "epoch": 1.1, "learning_rate": 0.0004006519558676028, "loss": 3.0769, "theoretical_loss": 3.7892833479490067, "tokens_seen": 682363904 }, { "epoch": 1.1, "learning_rate": 0.000400641925777332, "loss": 2.8619, "theoretical_loss": 3.789246229020333, "tokens_seen": 682429440 }, { "epoch": 1.1, "learning_rate": 0.00040063189568706123, "loss": 3.0213, "theoretical_loss": 3.789209114654138, "tokens_seen": 682494976 }, { "epoch": 1.1, "learning_rate": 0.00040062186559679036, "loss": 2.8732, "theoretical_loss": 3.7891720048494233, "tokens_seen": 682560512 }, { "epoch": 1.1, "learning_rate": 0.0004006118355065196, "loss": 2.9126, "theoretical_loss": 3.7891348996051906, "tokens_seen": 682626048 }, { "epoch": 1.1, "learning_rate": 0.0004006018054162487, "loss": 2.9435, "theoretical_loss": 3.7890977989204413, "tokens_seen": 682691584 }, { "epoch": 1.1, "learning_rate": 0.00040059177532597795, "loss": 3.0501, "theoretical_loss": 3.7890607027941776, "tokens_seen": 682757120 }, { "epoch": 1.1, "learning_rate": 0.00040058174523570713, "loss": 2.9835, "theoretical_loss": 3.7890236112254025, "tokens_seen": 682822656 }, { "epoch": 1.1, "learning_rate": 0.0004005717151454363, "loss": 2.9616, "theoretical_loss": 3.7889865242131178, "tokens_seen": 682888192 }, { "epoch": 1.1, "learning_rate": 0.0004005616850551655, "loss": 3.0585, "theoretical_loss": 3.788949441756327, "tokens_seen": 682953728 }, { "epoch": 1.1, "learning_rate": 0.00040055165496489473, "loss": 3.0029, "theoretical_loss": 3.788912363854034, "tokens_seen": 683019264 }, { "epoch": 1.1, "learning_rate": 0.00040054162487462386, "loss": 2.9679, "theoretical_loss": 3.7888752905052416, "tokens_seen": 683084800 }, { "epoch": 1.1, "learning_rate": 0.0004005315947843531, "loss": 2.9898, "theoretical_loss": 3.7888382217089545, "tokens_seen": 683150336 }, { "epoch": 1.1, "objective/train/docs_used": 1075722, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8501954078674316, "objective/train/theoretical_loss": 3.788810423098721, "objective/train/tokens_used": 683395552, "theoretical_loss": 3.788810423098721, "tokens_seen": 683199488 }, { "epoch": 1.1, "learning_rate": 0.0004005215646940822, "loss": 2.8529, "theoretical_loss": 3.7888011574641762, "tokens_seen": 683215872 }, { "epoch": 1.1, "learning_rate": 0.00040051153460381146, "loss": 3.0038, "theoretical_loss": 3.788764097769912, "tokens_seen": 683281408 }, { "epoch": 1.1, "learning_rate": 0.00040050150451354064, "loss": 2.9846, "theoretical_loss": 3.788727042625167, "tokens_seen": 683346944 }, { "epoch": 1.1, "learning_rate": 0.0004004914744232698, "loss": 2.9167, "theoretical_loss": 3.788689992028946, "tokens_seen": 683412480 }, { "epoch": 2.0, "learning_rate": 0.000400481444332999, "loss": 3.8244, "theoretical_loss": 3.7886512095582843, "tokens_seen": 683481088 }, { "epoch": 2.0, "learning_rate": 0.0004004714142427282, "loss": 2.8719, "theoretical_loss": 3.788614168269223, "tokens_seen": 683546624 }, { "epoch": 2.0, "learning_rate": 0.00040046138415245736, "loss": 2.9984, "theoretical_loss": 3.7885771315256567, "tokens_seen": 683612160 }, { "epoch": 2.0, "learning_rate": 0.0004004513540621866, "loss": 2.92, "theoretical_loss": 3.7885400993265925, "tokens_seen": 683677696 }, { "epoch": 2.0, "learning_rate": 0.0004004413239719157, "loss": 2.9367, "theoretical_loss": 3.788503071671036, "tokens_seen": 683743232 }, { "epoch": 2.0, "learning_rate": 0.00040043129388164496, "loss": 3.1309, "theoretical_loss": 3.7884660485579964, "tokens_seen": 683808768 }, { "epoch": 2.0, "learning_rate": 0.0004004212637913741, "loss": 2.9928, "theoretical_loss": 3.788429029986479, "tokens_seen": 683874304 }, { "epoch": 2.0, "learning_rate": 0.0004004112337011033, "loss": 2.9938, "theoretical_loss": 3.788392015955493, "tokens_seen": 683939840 }, { "epoch": 2.0, "learning_rate": 0.0004004012036108325, "loss": 3.0819, "theoretical_loss": 3.7883550064640454, "tokens_seen": 684005376 }, { "epoch": 2.0, "learning_rate": 0.0004003911735205617, "loss": 2.9196, "theoretical_loss": 3.7883180015111457, "tokens_seen": 684070912 }, { "epoch": 2.0, "learning_rate": 0.00040038114343029087, "loss": 2.9721, "theoretical_loss": 3.788281001095802, "tokens_seen": 684136448 }, { "epoch": 2.0, "learning_rate": 0.0004003711133400201, "loss": 2.9215, "theoretical_loss": 3.788244005217023, "tokens_seen": 684201984 }, { "epoch": 2.0, "learning_rate": 0.00040036108324974923, "loss": 2.9718, "theoretical_loss": 3.7882070138738193, "tokens_seen": 684267520 }, { "epoch": 2.0, "learning_rate": 0.00040035105315947846, "loss": 2.9726, "theoretical_loss": 3.7881700270651986, "tokens_seen": 684333056 }, { "epoch": 2.0, "learning_rate": 0.0004003410230692076, "loss": 2.9655, "theoretical_loss": 3.7881330447901727, "tokens_seen": 684398592 }, { "epoch": 2.0, "learning_rate": 0.0004003309929789368, "loss": 2.931, "theoretical_loss": 3.788096067047751, "tokens_seen": 684464128 }, { "epoch": 2.0, "learning_rate": 0.000400320962888666, "loss": 2.9117, "theoretical_loss": 3.7880590938369436, "tokens_seen": 684529664 }, { "epoch": 2.0, "learning_rate": 0.0004003109327983952, "loss": 3.0039, "theoretical_loss": 3.788022125156762, "tokens_seen": 684595200 }, { "epoch": 2.0, "learning_rate": 0.00040030090270812437, "loss": 2.9369, "theoretical_loss": 3.7879851610062176, "tokens_seen": 684660736 }, { "epoch": 2.0, "learning_rate": 0.00040029087261785355, "loss": 2.9451, "theoretical_loss": 3.7879482013843213, "tokens_seen": 684726272 }, { "epoch": 2.0, "learning_rate": 0.00040028084252758273, "loss": 3.0178, "theoretical_loss": 3.7879112462900855, "tokens_seen": 684791808 }, { "epoch": 2.0, "objective/train/docs_used": 1110494, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.893524408340454, "objective/train/theoretical_loss": 3.7878927704405316, "objective/train/tokens_used": 705284576, "theoretical_loss": 3.7878927704405316, "tokens_seen": 684824576 }, { "epoch": 2.0, "learning_rate": 0.00040027081243731197, "loss": 3.0324, "theoretical_loss": 3.787874295722522, "tokens_seen": 684857344 }, { "epoch": 2.0, "learning_rate": 0.0004002607823470411, "loss": 2.9416, "theoretical_loss": 3.7878373496806432, "tokens_seen": 684922880 }, { "epoch": 2.0, "learning_rate": 0.00040025075225677033, "loss": 2.9874, "theoretical_loss": 3.787800408163462, "tokens_seen": 684988416 }, { "epoch": 2.0, "learning_rate": 0.00040024072216649946, "loss": 2.9579, "theoretical_loss": 3.787763471169991, "tokens_seen": 685053952 }, { "epoch": 2.0, "learning_rate": 0.0004002306920762287, "loss": 2.8909, "theoretical_loss": 3.7877265386992445, "tokens_seen": 685119488 }, { "epoch": 2.0, "learning_rate": 0.00040022066198595787, "loss": 2.9716, "theoretical_loss": 3.7876896107502356, "tokens_seen": 685185024 }, { "epoch": 2.0, "learning_rate": 0.00040021063189568705, "loss": 2.8033, "theoretical_loss": 3.7876526873219776, "tokens_seen": 685250560 }, { "epoch": 2.0, "learning_rate": 0.00040020060180541623, "loss": 2.9894, "theoretical_loss": 3.787615768413486, "tokens_seen": 685316096 }, { "epoch": 2.0, "learning_rate": 0.00040019057171514547, "loss": 2.9893, "theoretical_loss": 3.7875788540237747, "tokens_seen": 685381632 }, { "epoch": 2.0, "learning_rate": 0.0004001805416248746, "loss": 2.9882, "theoretical_loss": 3.787541944151859, "tokens_seen": 685447168 }, { "epoch": 2.0, "learning_rate": 0.00040017051153460383, "loss": 3.0908, "theoretical_loss": 3.7875050387967533, "tokens_seen": 685512704 }, { "epoch": 2.0, "learning_rate": 0.00040016048144433296, "loss": 2.7529, "theoretical_loss": 3.787468137957474, "tokens_seen": 685578240 }, { "epoch": 2.0, "learning_rate": 0.0004001504513540622, "loss": 2.8878, "theoretical_loss": 3.787431241633037, "tokens_seen": 685643776 }, { "epoch": 2.0, "learning_rate": 0.00040014042126379143, "loss": 2.8945, "theoretical_loss": 3.7873943498224576, "tokens_seen": 685709312 }, { "epoch": 2.0, "learning_rate": 0.00040013039117352056, "loss": 3.0703, "theoretical_loss": 3.787357462524753, "tokens_seen": 685774848 }, { "epoch": 2.0, "learning_rate": 0.0004001203610832498, "loss": 2.9049, "theoretical_loss": 3.7873205797389398, "tokens_seen": 685840384 }, { "epoch": 2.0, "learning_rate": 0.0004001103309929789, "loss": 3.0061, "theoretical_loss": 3.787283701464035, "tokens_seen": 685905920 }, { "epoch": 2.0, "learning_rate": 0.00040010030090270815, "loss": 3.0934, "theoretical_loss": 3.787246827699055, "tokens_seen": 685971456 }, { "epoch": 2.0, "learning_rate": 0.00040009027081243733, "loss": 3.1511, "theoretical_loss": 3.787209958443019, "tokens_seen": 686036992 }, { "epoch": 2.0, "learning_rate": 0.0004000802407221665, "loss": 3.0536, "theoretical_loss": 3.7871730936949453, "tokens_seen": 686102528 }, { "epoch": 2.0, "learning_rate": 0.0004000702106318957, "loss": 2.9862, "theoretical_loss": 3.7871362334538503, "tokens_seen": 686168064 }, { "epoch": 2.0, "learning_rate": 0.00040006018054162493, "loss": 2.9934, "theoretical_loss": 3.7870993777187536, "tokens_seen": 686233600 }, { "epoch": 2.0, "learning_rate": 0.00040005015045135406, "loss": 3.0619, "theoretical_loss": 3.7870625264886746, "tokens_seen": 686299136 }, { "epoch": 2.0, "learning_rate": 0.0004000401203610833, "loss": 3.1547, "theoretical_loss": 3.787025679762632, "tokens_seen": 686364672 }, { "epoch": 2.0, "learning_rate": 0.0004000300902708124, "loss": 3.1137, "theoretical_loss": 3.786988837539645, "tokens_seen": 686430208 }, { "epoch": 2.0, "objective/train/docs_used": 1113529, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2046334743499756, "objective/train/theoretical_loss": 3.7869704181164914, "objective/train/tokens_used": 706922976, "theoretical_loss": 3.7869704181164914, "tokens_seen": 686462976 }, { "epoch": 2.0, "learning_rate": 0.00040002006018054166, "loss": 2.9942, "theoretical_loss": 3.7869519998187338, "tokens_seen": 686495744 }, { "epoch": 2.0, "learning_rate": 0.00040001003009027084, "loss": 2.9745, "theoretical_loss": 3.786915166598919, "tokens_seen": 686561280 }, { "epoch": 2.0, "learning_rate": 0.0004, "loss": 2.8749, "theoretical_loss": 3.7868783378792203, "tokens_seen": 686626816 }, { "epoch": 2.0, "learning_rate": 0.0003999899699097292, "loss": 2.8615, "theoretical_loss": 3.7868415136586595, "tokens_seen": 686692352 }, { "epoch": 2.0, "learning_rate": 0.0003999799398194584, "loss": 3.2106, "theoretical_loss": 3.786804693936256, "tokens_seen": 686757888 }, { "epoch": 2.0, "learning_rate": 0.00039996990972918756, "loss": 2.8274, "theoretical_loss": 3.786767878711032, "tokens_seen": 686823424 }, { "epoch": 2.0, "learning_rate": 0.0003999598796389168, "loss": 3.0812, "theoretical_loss": 3.78673106798201, "tokens_seen": 686888960 }, { "epoch": 2.0, "learning_rate": 0.0003999498495486459, "loss": 2.9778, "theoretical_loss": 3.786694261748211, "tokens_seen": 686954496 }, { "epoch": 2.0, "learning_rate": 0.00039993981945837516, "loss": 3.0217, "theoretical_loss": 3.7866574600086573, "tokens_seen": 687020032 }, { "epoch": 2.0, "learning_rate": 0.0003999297893681043, "loss": 3.0482, "theoretical_loss": 3.786620662762372, "tokens_seen": 687085568 }, { "epoch": 2.0, "learning_rate": 0.0003999197592778335, "loss": 3.0649, "theoretical_loss": 3.7865838700083776, "tokens_seen": 687151104 }, { "epoch": 2.0, "learning_rate": 0.0003999097291875627, "loss": 3.1107, "theoretical_loss": 3.7865470817456974, "tokens_seen": 687216640 }, { "epoch": 2.0, "learning_rate": 0.0003998996990972919, "loss": 2.9955, "theoretical_loss": 3.786510297973355, "tokens_seen": 687282176 }, { "epoch": 2.0, "learning_rate": 0.00039988966900702107, "loss": 3.0366, "theoretical_loss": 3.786473518690374, "tokens_seen": 687347712 }, { "epoch": 2.0, "learning_rate": 0.0003998796389167503, "loss": 2.8403, "theoretical_loss": 3.786436743895779, "tokens_seen": 687413248 }, { "epoch": 2.0, "learning_rate": 0.00039986960882647943, "loss": 3.1674, "theoretical_loss": 3.7863999735885936, "tokens_seen": 687478784 }, { "epoch": 2.0, "learning_rate": 0.00039985957873620866, "loss": 2.8762, "theoretical_loss": 3.7863632077678435, "tokens_seen": 687544320 }, { "epoch": 2.0, "learning_rate": 0.0003998495486459378, "loss": 3.0533, "theoretical_loss": 3.786326446432553, "tokens_seen": 687609856 }, { "epoch": 2.0, "learning_rate": 0.000399839518555667, "loss": 3.0837, "theoretical_loss": 3.7862896895817477, "tokens_seen": 687675392 }, { "epoch": 2.0, "learning_rate": 0.0003998294884653962, "loss": 2.9002, "theoretical_loss": 3.7862529372144538, "tokens_seen": 687740928 }, { "epoch": 2.0, "learning_rate": 0.0003998194583751254, "loss": 2.8857, "theoretical_loss": 3.7862161893296955, "tokens_seen": 687806464 }, { "epoch": 2.0, "learning_rate": 0.00039980942828485457, "loss": 2.9969, "theoretical_loss": 3.786179445926501, "tokens_seen": 687872000 }, { "epoch": 2.0, "learning_rate": 0.00039979939819458375, "loss": 3.024, "theoretical_loss": 3.786142707003896, "tokens_seen": 687937536 }, { "epoch": 2.0, "learning_rate": 0.00039978936810431293, "loss": 2.9776, "theoretical_loss": 3.7861059725609074, "tokens_seen": 688003072 }, { "epoch": 2.0, "learning_rate": 0.00039977933801404217, "loss": 2.9684, "theoretical_loss": 3.7860692425965627, "tokens_seen": 688068608 }, { "epoch": 2.0, "objective/train/docs_used": 1116343, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.748493194580078, "objective/train/theoretical_loss": 3.786050879293578, "objective/train/tokens_used": 708561376, "theoretical_loss": 3.786050879293578, "tokens_seen": 688101376 }, { "epoch": 2.0, "learning_rate": 0.0003997693079237713, "loss": 2.8551, "theoretical_loss": 3.786032517109889, "tokens_seen": 688134144 }, { "epoch": 2.0, "learning_rate": 0.00039975927783350053, "loss": 2.9371, "theoretical_loss": 3.7859957960999147, "tokens_seen": 688199680 }, { "epoch": 2.0, "learning_rate": 0.00039974924774322966, "loss": 3.0617, "theoretical_loss": 3.785959079565666, "tokens_seen": 688265216 }, { "epoch": 2.0, "learning_rate": 0.0003997392176529589, "loss": 3.0557, "theoretical_loss": 3.785922367506174, "tokens_seen": 688330752 }, { "epoch": 2.0, "learning_rate": 0.00039972918756268807, "loss": 3.0624, "theoretical_loss": 3.7858856599204653, "tokens_seen": 688396288 }, { "epoch": 2.0, "learning_rate": 0.00039971915747241725, "loss": 2.9322, "theoretical_loss": 3.7858489568075697, "tokens_seen": 688461824 }, { "epoch": 2.0, "learning_rate": 0.00039970912738214643, "loss": 3.0232, "theoretical_loss": 3.7858122581665166, "tokens_seen": 688527360 }, { "epoch": 2.0, "learning_rate": 0.00039969909729187567, "loss": 2.913, "theoretical_loss": 3.785775563996335, "tokens_seen": 688592896 }, { "epoch": 2.0, "learning_rate": 0.0003996890672016048, "loss": 2.9389, "theoretical_loss": 3.785738874296056, "tokens_seen": 688658432 }, { "epoch": 2.0, "learning_rate": 0.00039967903711133403, "loss": 2.9983, "theoretical_loss": 3.7857021890647085, "tokens_seen": 688723968 }, { "epoch": 2.0, "learning_rate": 0.00039966900702106316, "loss": 2.8947, "theoretical_loss": 3.785665508301323, "tokens_seen": 688789504 }, { "epoch": 2.0, "learning_rate": 0.0003996589769307924, "loss": 2.9901, "theoretical_loss": 3.785628832004931, "tokens_seen": 688855040 }, { "epoch": 2.0, "learning_rate": 0.0003996489468405216, "loss": 3.0452, "theoretical_loss": 3.7855921601745637, "tokens_seen": 688920576 }, { "epoch": 2.0, "learning_rate": 0.00039963891675025076, "loss": 3.082, "theoretical_loss": 3.7855554928092525, "tokens_seen": 688986112 }, { "epoch": 2.0, "learning_rate": 0.00039962888665997994, "loss": 3.0359, "theoretical_loss": 3.7855188299080282, "tokens_seen": 689051648 }, { "epoch": 2.0, "learning_rate": 0.0003996188565697091, "loss": 3.0235, "theoretical_loss": 3.7854821714699236, "tokens_seen": 689117184 }, { "epoch": 2.0, "learning_rate": 0.0003996088264794383, "loss": 3.0201, "theoretical_loss": 3.7854455174939714, "tokens_seen": 689182720 }, { "epoch": 2.0, "learning_rate": 0.00039959879638916754, "loss": 3.1249, "theoretical_loss": 3.785408867979203, "tokens_seen": 689248256 }, { "epoch": 2.0, "learning_rate": 0.00039958876629889666, "loss": 3.0697, "theoretical_loss": 3.785372222924652, "tokens_seen": 689313792 }, { "epoch": 2.0, "learning_rate": 0.0003995787362086259, "loss": 3.0738, "theoretical_loss": 3.7853355823293517, "tokens_seen": 689379328 }, { "epoch": 2.0, "learning_rate": 0.0003995687061183551, "loss": 2.8972, "theoretical_loss": 3.785298946192336, "tokens_seen": 689444864 }, { "epoch": 2.0, "learning_rate": 0.00039955867602808426, "loss": 3.1359, "theoretical_loss": 3.7852623145126376, "tokens_seen": 689510400 }, { "epoch": 2.0, "learning_rate": 0.00039954864593781344, "loss": 3.2106, "theoretical_loss": 3.7852256872892918, "tokens_seen": 689575936 }, { "epoch": 2.0, "learning_rate": 0.0003995386158475426, "loss": 3.0656, "theoretical_loss": 3.785189064521332, "tokens_seen": 689641472 }, { "epoch": 2.0, "learning_rate": 0.0003995285857572718, "loss": 3.0717, "theoretical_loss": 3.785152446207794, "tokens_seen": 689707008 }, { "epoch": 2.0, "objective/train/docs_used": 1118568, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0141966342926025, "objective/train/theoretical_loss": 3.7851341387211317, "objective/train/tokens_used": 710199776, "theoretical_loss": 3.7851341387211317, "tokens_seen": 689739776 }, { "epoch": 2.0, "learning_rate": 0.00039951855566700104, "loss": 3.1283, "theoretical_loss": 3.785115832347712, "tokens_seen": 689772544 }, { "epoch": 2.0, "learning_rate": 0.00039950852557673017, "loss": 2.9899, "theoretical_loss": 3.785079222940122, "tokens_seen": 689838080 }, { "epoch": 2.0, "learning_rate": 0.0003994984954864594, "loss": 3.0493, "theoretical_loss": 3.7850426179840584, "tokens_seen": 689903616 }, { "epoch": 2.0, "learning_rate": 0.00039948846539618853, "loss": 3.0215, "theoretical_loss": 3.7850060174785582, "tokens_seen": 689969152 }, { "epoch": 2.0, "learning_rate": 0.00039947843530591776, "loss": 2.8938, "theoretical_loss": 3.784969421422658, "tokens_seen": 690034688 }, { "epoch": 2.0, "learning_rate": 0.00039946840521564694, "loss": 3.0367, "theoretical_loss": 3.784932829815393, "tokens_seen": 690100224 }, { "epoch": 2.0, "learning_rate": 0.0003994583751253761, "loss": 2.8606, "theoretical_loss": 3.784896242655801, "tokens_seen": 690165760 }, { "epoch": 2.0, "learning_rate": 0.0003994483450351053, "loss": 3.1891, "theoretical_loss": 3.7848596599429185, "tokens_seen": 690231296 }, { "epoch": 2.0, "learning_rate": 0.0003994383149448345, "loss": 2.9456, "theoretical_loss": 3.7848230816757837, "tokens_seen": 690296832 }, { "epoch": 2.0, "learning_rate": 0.00039942828485456367, "loss": 3.025, "theoretical_loss": 3.7847865078534335, "tokens_seen": 690362368 }, { "epoch": 2.0, "learning_rate": 0.0003994182547642929, "loss": 3.0936, "theoretical_loss": 3.7847499384749073, "tokens_seen": 690427904 }, { "epoch": 2.0, "learning_rate": 0.00039940822467402203, "loss": 3.1321, "theoretical_loss": 3.7847133735392413, "tokens_seen": 690493440 }, { "epoch": 2.0, "learning_rate": 0.00039939819458375127, "loss": 3.0824, "theoretical_loss": 3.784676813045476, "tokens_seen": 690558976 }, { "epoch": 2.0, "learning_rate": 0.0003993881644934805, "loss": 2.9596, "theoretical_loss": 3.78464025699265, "tokens_seen": 690624512 }, { "epoch": 2.0, "learning_rate": 0.00039937813440320963, "loss": 2.8597, "theoretical_loss": 3.784603705379801, "tokens_seen": 690690048 }, { "epoch": 2.0, "learning_rate": 0.00039936810431293886, "loss": 3.0192, "theoretical_loss": 3.7845671582059706, "tokens_seen": 690755584 }, { "epoch": 2.0, "learning_rate": 0.000399358074222668, "loss": 3.0122, "theoretical_loss": 3.784530615470197, "tokens_seen": 690821120 }, { "epoch": 2.0, "learning_rate": 0.0003993480441323972, "loss": 2.9583, "theoretical_loss": 3.7844940771715216, "tokens_seen": 690886656 }, { "epoch": 2.0, "learning_rate": 0.0003993380140421264, "loss": 2.9218, "theoretical_loss": 3.784457543308984, "tokens_seen": 690952192 }, { "epoch": 2.0, "learning_rate": 0.0003993279839518556, "loss": 3.0867, "theoretical_loss": 3.7844210138816257, "tokens_seen": 691017728 }, { "epoch": 2.0, "learning_rate": 0.00039931795386158477, "loss": 2.939, "theoretical_loss": 3.7843844888884863, "tokens_seen": 691083264 }, { "epoch": 2.0, "learning_rate": 0.00039930792377131395, "loss": 3.0207, "theoretical_loss": 3.7843479683286088, "tokens_seen": 691148800 }, { "epoch": 2.0, "learning_rate": 0.00039929789368104313, "loss": 2.9053, "theoretical_loss": 3.7843114522010337, "tokens_seen": 691214336 }, { "epoch": 2.0, "learning_rate": 0.00039928786359077237, "loss": 2.8726, "theoretical_loss": 3.784274940504803, "tokens_seen": 691279872 }, { "epoch": 2.0, "learning_rate": 0.0003992778335005015, "loss": 3.0863, "theoretical_loss": 3.7842384332389596, "tokens_seen": 691345408 }, { "epoch": 2.0, "objective/train/docs_used": 1121425, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7829933166503906, "objective/train/theoretical_loss": 3.7842201812671337, "objective/train/tokens_used": 711838176, "theoretical_loss": 3.7842201812671337, "tokens_seen": 691378176 }, { "epoch": 2.0, "learning_rate": 0.00039926780341023073, "loss": 2.9102, "theoretical_loss": 3.784201930402545, "tokens_seen": 691410944 }, { "epoch": 2.0, "learning_rate": 0.00039925777331995986, "loss": 2.9607, "theoretical_loss": 3.7841654319946034, "tokens_seen": 691476480 }, { "epoch": 2.0, "learning_rate": 0.0003992477432296891, "loss": 2.9107, "theoretical_loss": 3.7841289380141765, "tokens_seen": 691542016 }, { "epoch": 2.0, "learning_rate": 0.00039923771313941827, "loss": 2.9287, "theoretical_loss": 3.7840924484603082, "tokens_seen": 691607552 }, { "epoch": 2.0, "learning_rate": 0.00039922768304914745, "loss": 2.9949, "theoretical_loss": 3.784055963332042, "tokens_seen": 691673088 }, { "epoch": 2.0, "learning_rate": 0.00039921765295887663, "loss": 2.9438, "theoretical_loss": 3.7840194826284232, "tokens_seen": 691738624 }, { "epoch": 2.0, "learning_rate": 0.00039920762286860587, "loss": 3.184, "theoretical_loss": 3.7839830063484943, "tokens_seen": 691804160 }, { "epoch": 2.0, "learning_rate": 0.000399197592778335, "loss": 3.0698, "theoretical_loss": 3.7839465344913004, "tokens_seen": 691869696 }, { "epoch": 2.0, "learning_rate": 0.00039918756268806423, "loss": 2.952, "theoretical_loss": 3.7839100670558867, "tokens_seen": 691935232 }, { "epoch": 2.0, "learning_rate": 0.00039917753259779336, "loss": 2.978, "theoretical_loss": 3.7838736040412986, "tokens_seen": 692000768 }, { "epoch": 2.0, "learning_rate": 0.0003991675025075226, "loss": 3.0479, "theoretical_loss": 3.7838371454465816, "tokens_seen": 692066304 }, { "epoch": 2.0, "learning_rate": 0.0003991574724172518, "loss": 3.0259, "theoretical_loss": 3.783800691270781, "tokens_seen": 692131840 }, { "epoch": 2.0, "learning_rate": 0.00039914744232698096, "loss": 3.1142, "theoretical_loss": 3.7837642415129427, "tokens_seen": 692197376 }, { "epoch": 2.0, "learning_rate": 0.00039913741223671014, "loss": 3.0778, "theoretical_loss": 3.7837277961721134, "tokens_seen": 692262912 }, { "epoch": 2.0, "learning_rate": 0.0003991273821464393, "loss": 2.9055, "theoretical_loss": 3.78369135524734, "tokens_seen": 692328448 }, { "epoch": 2.0, "learning_rate": 0.0003991173520561685, "loss": 2.8973, "theoretical_loss": 3.7836549187376693, "tokens_seen": 692393984 }, { "epoch": 2.0, "learning_rate": 0.00039910732196589774, "loss": 3.051, "theoretical_loss": 3.7836184866421485, "tokens_seen": 692459520 }, { "epoch": 2.0, "learning_rate": 0.00039909729187562686, "loss": 3.0862, "theoretical_loss": 3.7835820589598246, "tokens_seen": 692525056 }, { "epoch": 2.0, "learning_rate": 0.0003990872617853561, "loss": 3.0068, "theoretical_loss": 3.7835456356897463, "tokens_seen": 692590592 }, { "epoch": 2.0, "learning_rate": 0.0003990772316950853, "loss": 2.8447, "theoretical_loss": 3.7835092168309616, "tokens_seen": 692656128 }, { "epoch": 2.0, "learning_rate": 0.00039906720160481446, "loss": 3.027, "theoretical_loss": 3.7834728023825184, "tokens_seen": 692721664 }, { "epoch": 2.0, "learning_rate": 0.00039905717151454364, "loss": 3.0253, "theoretical_loss": 3.783436392343466, "tokens_seen": 692787200 }, { "epoch": 2.0, "learning_rate": 0.0003990471414242728, "loss": 3.176, "theoretical_loss": 3.783399986712854, "tokens_seen": 692852736 }, { "epoch": 2.0, "learning_rate": 0.000399037111334002, "loss": 2.9335, "theoretical_loss": 3.78336358548973, "tokens_seen": 692918272 }, { "epoch": 2.0, "learning_rate": 0.00039902708124373124, "loss": 3.0727, "theoretical_loss": 3.7833271886731445, "tokens_seen": 692983808 }, { "epoch": 2.0, "objective/train/docs_used": 1124400, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0261852741241455, "objective/train/theoretical_loss": 3.783308991917007, "objective/train/tokens_used": 713476576, "theoretical_loss": 3.783308991917007, "tokens_seen": 693016576 }, { "epoch": 2.0, "learning_rate": 0.00039901705115346037, "loss": 3.0578, "theoretical_loss": 3.783290796262148, "tokens_seen": 693049344 }, { "epoch": 2.0, "learning_rate": 0.0003990070210631896, "loss": 3.0508, "theoretical_loss": 3.7832544082557904, "tokens_seen": 693114880 }, { "epoch": 2.0, "learning_rate": 0.00039899699097291873, "loss": 2.9956, "theoretical_loss": 3.783218024653121, "tokens_seen": 693180416 }, { "epoch": 2.0, "learning_rate": 0.00039898696088264796, "loss": 2.9639, "theoretical_loss": 3.7831816454531926, "tokens_seen": 693245952 }, { "epoch": 2.0, "learning_rate": 0.00039897693079237714, "loss": 2.8992, "theoretical_loss": 3.783145270655055, "tokens_seen": 693311488 }, { "epoch": 2.0, "learning_rate": 0.0003989669007021063, "loss": 2.9135, "theoretical_loss": 3.7831089002577594, "tokens_seen": 693377024 }, { "epoch": 2.0, "learning_rate": 0.0003989568706118355, "loss": 3.0087, "theoretical_loss": 3.7830725342603584, "tokens_seen": 693442560 }, { "epoch": 2.0, "learning_rate": 0.0003989468405215647, "loss": 2.9421, "theoretical_loss": 3.7830361726619035, "tokens_seen": 693508096 }, { "epoch": 2.0, "learning_rate": 0.00039893681043129387, "loss": 3.0788, "theoretical_loss": 3.782999815461447, "tokens_seen": 693573632 }, { "epoch": 2.0, "learning_rate": 0.0003989267803410231, "loss": 3.0401, "theoretical_loss": 3.7829634626580413, "tokens_seen": 693639168 }, { "epoch": 2.0, "learning_rate": 0.00039891675025075223, "loss": 2.9722, "theoretical_loss": 3.78292711425074, "tokens_seen": 693704704 }, { "epoch": 2.0, "learning_rate": 0.00039890672016048147, "loss": 2.8663, "theoretical_loss": 3.7828907702385948, "tokens_seen": 693770240 }, { "epoch": 2.0, "learning_rate": 0.00039889669007021065, "loss": 2.8507, "theoretical_loss": 3.7828544306206604, "tokens_seen": 693835776 }, { "epoch": 2.0, "learning_rate": 0.00039888665997993983, "loss": 2.9858, "theoretical_loss": 3.7828180953959905, "tokens_seen": 693901312 }, { "epoch": 2.0, "learning_rate": 0.000398876629889669, "loss": 2.9258, "theoretical_loss": 3.782781764563638, "tokens_seen": 693966848 }, { "epoch": 2.0, "learning_rate": 0.0003988665997993982, "loss": 2.7937, "theoretical_loss": 3.7827454381226584, "tokens_seen": 694032384 }, { "epoch": 2.0, "learning_rate": 0.00039885656970912737, "loss": 3.0516, "theoretical_loss": 3.7827091160721062, "tokens_seen": 694097920 }, { "epoch": 2.0, "learning_rate": 0.0003988465396188566, "loss": 2.945, "theoretical_loss": 3.7826727984110358, "tokens_seen": 694163456 }, { "epoch": 2.0, "learning_rate": 0.00039883650952858573, "loss": 3.0402, "theoretical_loss": 3.782636485138503, "tokens_seen": 694228992 }, { "epoch": 2.0, "learning_rate": 0.00039882647943831497, "loss": 3.1516, "theoretical_loss": 3.782600176253562, "tokens_seen": 694294528 }, { "epoch": 2.0, "learning_rate": 0.0003988164493480441, "loss": 2.9505, "theoretical_loss": 3.78256387175527, "tokens_seen": 694360064 }, { "epoch": 2.0, "learning_rate": 0.00039880641925777333, "loss": 2.9877, "theoretical_loss": 3.7825275716426825, "tokens_seen": 694425600 }, { "epoch": 2.0, "learning_rate": 0.0003987963891675025, "loss": 2.9887, "theoretical_loss": 3.7824912759148557, "tokens_seen": 694491136 }, { "epoch": 2.0, "learning_rate": 0.0003987863590772317, "loss": 3.0146, "theoretical_loss": 3.7824549845708466, "tokens_seen": 694556672 }, { "epoch": 2.0, "learning_rate": 0.0003987763289869609, "loss": 2.9566, "theoretical_loss": 3.782418697609712, "tokens_seen": 694622208 }, { "epoch": 2.0, "objective/train/docs_used": 1127334, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2494609355926514, "objective/train/theoretical_loss": 3.782400555772428, "objective/train/tokens_used": 715114976, "theoretical_loss": 3.782400555772428, "tokens_seen": 694654976 }, { "epoch": 2.0, "learning_rate": 0.00039876629889669006, "loss": 3.0187, "theoretical_loss": 3.7823824150305096, "tokens_seen": 694687744 }, { "epoch": 2.0, "learning_rate": 0.00039875626880641924, "loss": 2.9672, "theoretical_loss": 3.782346136832296, "tokens_seen": 694753280 }, { "epoch": 2.0, "learning_rate": 0.00039874623871614847, "loss": 2.9392, "theoretical_loss": 3.78230986301413, "tokens_seen": 694818816 }, { "epoch": 2.0, "learning_rate": 0.0003987362086258776, "loss": 3.115, "theoretical_loss": 3.782273593575069, "tokens_seen": 694884352 }, { "epoch": 2.0, "learning_rate": 0.00039872617853560683, "loss": 3.022, "theoretical_loss": 3.782237328514172, "tokens_seen": 694949888 }, { "epoch": 2.0, "learning_rate": 0.000398716148445336, "loss": 2.8176, "theoretical_loss": 3.7822010678304974, "tokens_seen": 695015424 }, { "epoch": 2.0, "learning_rate": 0.0003987061183550652, "loss": 3.059, "theoretical_loss": 3.7821648115231037, "tokens_seen": 695080960 }, { "epoch": 2.0, "learning_rate": 0.0003986960882647944, "loss": 2.9869, "theoretical_loss": 3.782128559591051, "tokens_seen": 695146496 }, { "epoch": 2.0, "learning_rate": 0.00039868605817452356, "loss": 3.067, "theoretical_loss": 3.7820923120333987, "tokens_seen": 695212032 }, { "epoch": 2.0, "learning_rate": 0.00039867602808425274, "loss": 2.989, "theoretical_loss": 3.782056068849206, "tokens_seen": 695277568 }, { "epoch": 2.0, "learning_rate": 0.000398665997993982, "loss": 3.0649, "theoretical_loss": 3.782019830037534, "tokens_seen": 695343104 }, { "epoch": 2.0, "learning_rate": 0.0003986559679037111, "loss": 3.2297, "theoretical_loss": 3.781983595597443, "tokens_seen": 695408640 }, { "epoch": 2.0, "learning_rate": 0.00039864593781344034, "loss": 3.0938, "theoretical_loss": 3.7819473655279925, "tokens_seen": 695474176 }, { "epoch": 2.0, "learning_rate": 0.0003986359077231695, "loss": 2.9986, "theoretical_loss": 3.781911139828245, "tokens_seen": 695539712 }, { "epoch": 2.0, "learning_rate": 0.0003986258776328987, "loss": 2.9467, "theoretical_loss": 3.7818749184972615, "tokens_seen": 695605248 }, { "epoch": 2.0, "learning_rate": 0.00039861584754262794, "loss": 2.9322, "theoretical_loss": 3.7818387015341033, "tokens_seen": 695670784 }, { "epoch": 2.0, "learning_rate": 0.00039860581745235706, "loss": 3.2409, "theoretical_loss": 3.7818024889378323, "tokens_seen": 695736320 }, { "epoch": 2.0, "learning_rate": 0.0003985957873620863, "loss": 2.9401, "theoretical_loss": 3.7817662807075108, "tokens_seen": 695801856 }, { "epoch": 2.0, "learning_rate": 0.0003985857572718155, "loss": 3.0238, "theoretical_loss": 3.781730076842201, "tokens_seen": 695867392 }, { "epoch": 2.0, "learning_rate": 0.00039857572718154466, "loss": 3.1276, "theoretical_loss": 3.7816938773409663, "tokens_seen": 695932928 }, { "epoch": 2.0, "learning_rate": 0.00039856569709127384, "loss": 2.9884, "theoretical_loss": 3.7816576822028694, "tokens_seen": 695998464 }, { "epoch": 2.0, "learning_rate": 0.000398555667001003, "loss": 3.0364, "theoretical_loss": 3.7816214914269732, "tokens_seen": 696064000 }, { "epoch": 2.0, "learning_rate": 0.0003985456369107322, "loss": 2.8881, "theoretical_loss": 3.7815853050123422, "tokens_seen": 696129536 }, { "epoch": 2.0, "learning_rate": 0.00039853560682046144, "loss": 2.9705, "theoretical_loss": 3.78154912295804, "tokens_seen": 696195072 }, { "epoch": 2.0, "learning_rate": 0.00039852557673019057, "loss": 3.1673, "theoretical_loss": 3.78151294526313, "tokens_seen": 696260608 }, { "epoch": 2.0, "objective/train/docs_used": 1130214, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9958178997039795, "objective/train/theoretical_loss": 3.7814948580501557, "objective/train/tokens_used": 716753376, "theoretical_loss": 3.7814948580501557, "tokens_seen": 696293376 }, { "epoch": 2.0, "learning_rate": 0.0003985155466399198, "loss": 2.998, "theoretical_loss": 3.7814767719266777, "tokens_seen": 696326144 }, { "epoch": 2.0, "learning_rate": 0.00039850551654964893, "loss": 2.8285, "theoretical_loss": 3.7814406029477485, "tokens_seen": 696391680 }, { "epoch": 2.0, "learning_rate": 0.00039849548645937816, "loss": 2.9389, "theoretical_loss": 3.7814044383254055, "tokens_seen": 696457216 }, { "epoch": 2.0, "learning_rate": 0.00039848545636910734, "loss": 3.043, "theoretical_loss": 3.7813682780587157, "tokens_seen": 696522752 }, { "epoch": 2.0, "learning_rate": 0.0003984754262788365, "loss": 2.8391, "theoretical_loss": 3.781332122146744, "tokens_seen": 696588288 }, { "epoch": 2.0, "learning_rate": 0.0003984653961885657, "loss": 3.0747, "theoretical_loss": 3.7812959705885563, "tokens_seen": 696653824 }, { "epoch": 2.0, "learning_rate": 0.0003984553660982949, "loss": 3.0921, "theoretical_loss": 3.7812598233832198, "tokens_seen": 696719360 }, { "epoch": 2.0, "learning_rate": 0.00039844533600802407, "loss": 3.024, "theoretical_loss": 3.7812236805297994, "tokens_seen": 696784896 }, { "epoch": 2.0, "learning_rate": 0.0003984353059177533, "loss": 3.1077, "theoretical_loss": 3.7811875420273635, "tokens_seen": 696850432 }, { "epoch": 2.0, "learning_rate": 0.00039842527582748243, "loss": 2.9558, "theoretical_loss": 3.781151407874978, "tokens_seen": 696915968 }, { "epoch": 2.0, "learning_rate": 0.00039841524573721167, "loss": 2.9539, "theoretical_loss": 3.781115278071711, "tokens_seen": 696981504 }, { "epoch": 2.0, "learning_rate": 0.00039840521564694085, "loss": 3.0497, "theoretical_loss": 3.7810791526166305, "tokens_seen": 697047040 }, { "epoch": 2.0, "learning_rate": 0.00039839518555667003, "loss": 2.9788, "theoretical_loss": 3.7810430315088035, "tokens_seen": 697112576 }, { "epoch": 2.0, "learning_rate": 0.0003983851554663992, "loss": 2.935, "theoretical_loss": 3.781006914747299, "tokens_seen": 697178112 }, { "epoch": 2.0, "learning_rate": 0.0003983751253761284, "loss": 2.8891, "theoretical_loss": 3.7809708023311845, "tokens_seen": 697243648 }, { "epoch": 2.0, "learning_rate": 0.00039836509528585757, "loss": 3.0577, "theoretical_loss": 3.78093469425953, "tokens_seen": 697309184 }, { "epoch": 2.0, "learning_rate": 0.0003983550651955868, "loss": 2.9334, "theoretical_loss": 3.780898590531404, "tokens_seen": 697374720 }, { "epoch": 2.0, "learning_rate": 0.00039834503510531593, "loss": 3.0846, "theoretical_loss": 3.780862491145876, "tokens_seen": 697440256 }, { "epoch": 2.0, "learning_rate": 0.00039833500501504517, "loss": 2.91, "theoretical_loss": 3.7808263961020154, "tokens_seen": 697505792 }, { "epoch": 2.0, "learning_rate": 0.0003983249749247743, "loss": 2.77, "theoretical_loss": 3.7807903053988934, "tokens_seen": 697571328 }, { "epoch": 2.0, "learning_rate": 0.00039831494483450353, "loss": 2.9907, "theoretical_loss": 3.7807542190355785, "tokens_seen": 697636864 }, { "epoch": 2.0, "learning_rate": 0.0003983049147442327, "loss": 3.0497, "theoretical_loss": 3.7807181370111422, "tokens_seen": 697702400 }, { "epoch": 2.0, "learning_rate": 0.0003982948846539619, "loss": 3.0127, "theoretical_loss": 3.780682059324656, "tokens_seen": 697767936 }, { "epoch": 2.0, "learning_rate": 0.0003982848545636911, "loss": 3.0392, "theoretical_loss": 3.7806459859751893, "tokens_seen": 697833472 }, { "epoch": 2.0, "learning_rate": 0.00039827482447342026, "loss": 2.9993, "theoretical_loss": 3.7806099169618146, "tokens_seen": 697899008 }, { "epoch": 2.0, "objective/train/docs_used": 1132035, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8705596923828125, "objective/train/theoretical_loss": 3.780591884080872, "objective/train/tokens_used": 718391776, "theoretical_loss": 3.780591884080872, "tokens_seen": 697931776 }, { "epoch": 2.0, "learning_rate": 0.00039826479438314944, "loss": 2.9503, "theoretical_loss": 3.7805738522836037, "tokens_seen": 697964544 }, { "epoch": 2.0, "learning_rate": 0.00039825476429287867, "loss": 2.9738, "theoretical_loss": 3.7805377919396284, "tokens_seen": 698030080 }, { "epoch": 2.0, "learning_rate": 0.0003982447342026078, "loss": 2.8914, "theoretical_loss": 3.78050173592896, "tokens_seen": 698095616 }, { "epoch": 2.0, "learning_rate": 0.00039823470411233703, "loss": 2.9321, "theoretical_loss": 3.7804656842506725, "tokens_seen": 698161152 }, { "epoch": 2.0, "learning_rate": 0.0003982246740220662, "loss": 3.0321, "theoretical_loss": 3.7804296369038384, "tokens_seen": 698226688 }, { "epoch": 2.0, "learning_rate": 0.0003982146439317954, "loss": 2.9387, "theoretical_loss": 3.7803935938875304, "tokens_seen": 698292224 }, { "epoch": 2.0, "learning_rate": 0.0003982046138415246, "loss": 2.9124, "theoretical_loss": 3.7803575552008217, "tokens_seen": 698357760 }, { "epoch": 2.0, "learning_rate": 0.00039819458375125376, "loss": 3.0017, "theoretical_loss": 3.7803215208427865, "tokens_seen": 698423296 }, { "epoch": 2.0, "learning_rate": 0.00039818455366098294, "loss": 3.0734, "theoretical_loss": 3.7802854908124983, "tokens_seen": 698488832 }, { "epoch": 2.0, "learning_rate": 0.0003981745235707122, "loss": 2.9375, "theoretical_loss": 3.7802494651090317, "tokens_seen": 698554368 }, { "epoch": 2.0, "learning_rate": 0.0003981644934804413, "loss": 3.1537, "theoretical_loss": 3.7802134437314616, "tokens_seen": 698619904 }, { "epoch": 2.0, "learning_rate": 0.00039815446339017054, "loss": 3.0837, "theoretical_loss": 3.780177426678862, "tokens_seen": 698685440 }, { "epoch": 2.0, "learning_rate": 0.00039814443329989966, "loss": 2.9871, "theoretical_loss": 3.780141413950309, "tokens_seen": 698750976 }, { "epoch": 2.0, "learning_rate": 0.0003981344032096289, "loss": 2.9521, "theoretical_loss": 3.7801054055448766, "tokens_seen": 698816512 }, { "epoch": 2.0, "learning_rate": 0.0003981243731193581, "loss": 2.9973, "theoretical_loss": 3.7800694014616423, "tokens_seen": 698882048 }, { "epoch": 2.0, "learning_rate": 0.00039811434302908726, "loss": 2.8696, "theoretical_loss": 3.78003340169968, "tokens_seen": 698947584 }, { "epoch": 2.0, "learning_rate": 0.00039810431293881644, "loss": 2.9101, "theoretical_loss": 3.779997406258068, "tokens_seen": 699013120 }, { "epoch": 2.0, "learning_rate": 0.0003980942828485457, "loss": 2.9686, "theoretical_loss": 3.7799614151358814, "tokens_seen": 699078656 }, { "epoch": 2.0, "learning_rate": 0.0003980842527582748, "loss": 3.2131, "theoretical_loss": 3.779925428332197, "tokens_seen": 699144192 }, { "epoch": 2.0, "learning_rate": 0.00039807422266800404, "loss": 3.0213, "theoretical_loss": 3.7798894458460928, "tokens_seen": 699209728 }, { "epoch": 2.0, "learning_rate": 0.00039806419257773317, "loss": 2.9765, "theoretical_loss": 3.7798534676766455, "tokens_seen": 699275264 }, { "epoch": 2.0, "learning_rate": 0.0003980541624874624, "loss": 2.9789, "theoretical_loss": 3.779817493822933, "tokens_seen": 699340800 }, { "epoch": 2.0, "learning_rate": 0.0003980441323971916, "loss": 3.0637, "theoretical_loss": 3.779781524284034, "tokens_seen": 699406336 }, { "epoch": 2.0, "learning_rate": 0.00039803410230692077, "loss": 2.911, "theoretical_loss": 3.779745559059025, "tokens_seen": 699471872 }, { "epoch": 2.0, "learning_rate": 0.00039802407221664995, "loss": 3.0516, "theoretical_loss": 3.7797095981469857, "tokens_seen": 699537408 }, { "epoch": 2.0, "objective/train/docs_used": 1134916, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8908395767211914, "objective/train/theoretical_loss": 3.779691619308042, "objective/train/tokens_used": 720030176, "theoretical_loss": 3.779691619308042, "tokens_seen": 699570176 }, { "epoch": 2.0, "learning_rate": 0.00039801404212637913, "loss": 3.0579, "theoretical_loss": 3.779673641546995, "tokens_seen": 699602944 }, { "epoch": 2.0, "learning_rate": 0.0003980040120361083, "loss": 3.0423, "theoretical_loss": 3.7796376892581316, "tokens_seen": 699668480 }, { "epoch": 2.0, "learning_rate": 0.00039799398194583754, "loss": 2.9993, "theoretical_loss": 3.7796017412794747, "tokens_seen": 699734016 }, { "epoch": 2.0, "learning_rate": 0.00039798395185556667, "loss": 2.9226, "theoretical_loss": 3.7795657976101045, "tokens_seen": 699799552 }, { "epoch": 2.0, "learning_rate": 0.0003979739217652959, "loss": 3.1785, "theoretical_loss": 3.7795298582491004, "tokens_seen": 699865088 }, { "epoch": 2.01, "learning_rate": 0.00039796389167502503, "loss": 2.9528, "theoretical_loss": 3.7794939231955427, "tokens_seen": 699930624 }, { "epoch": 2.01, "learning_rate": 0.00039795386158475427, "loss": 3.1873, "theoretical_loss": 3.7794579924485125, "tokens_seen": 699996160 }, { "epoch": 2.01, "learning_rate": 0.00039794383149448345, "loss": 3.1136, "theoretical_loss": 3.77942206600709, "tokens_seen": 700061696 }, { "epoch": 2.01, "learning_rate": 0.00039793380140421263, "loss": 2.9507, "theoretical_loss": 3.779386143870356, "tokens_seen": 700127232 }, { "epoch": 2.01, "learning_rate": 0.0003979237713139418, "loss": 3.0582, "theoretical_loss": 3.7793502260373923, "tokens_seen": 700192768 }, { "epoch": 2.01, "learning_rate": 0.00039791374122367105, "loss": 3.0885, "theoretical_loss": 3.779314312507281, "tokens_seen": 700258304 }, { "epoch": 2.01, "learning_rate": 0.0003979037111334002, "loss": 3.084, "theoretical_loss": 3.7792784032791027, "tokens_seen": 700323840 }, { "epoch": 2.01, "learning_rate": 0.0003978936810431294, "loss": 3.1684, "theoretical_loss": 3.779242498351941, "tokens_seen": 700389376 }, { "epoch": 2.01, "learning_rate": 0.0003978836509528586, "loss": 3.0231, "theoretical_loss": 3.779206597724877, "tokens_seen": 700454912 }, { "epoch": 2.01, "learning_rate": 0.00039787362086258777, "loss": 3.0712, "theoretical_loss": 3.779170701396995, "tokens_seen": 700520448 }, { "epoch": 2.01, "learning_rate": 0.000397863590772317, "loss": 2.9359, "theoretical_loss": 3.7791348093673767, "tokens_seen": 700585984 }, { "epoch": 2.01, "learning_rate": 0.00039785356068204613, "loss": 3.0177, "theoretical_loss": 3.7790989216351054, "tokens_seen": 700651520 }, { "epoch": 2.01, "learning_rate": 0.00039784353059177537, "loss": 2.9621, "theoretical_loss": 3.779063038199266, "tokens_seen": 700717056 }, { "epoch": 2.01, "learning_rate": 0.0003978335005015045, "loss": 3.2934, "theoretical_loss": 3.7790271590589413, "tokens_seen": 700782592 }, { "epoch": 2.01, "learning_rate": 0.00039782347041123373, "loss": 2.8504, "theoretical_loss": 3.7789912842132156, "tokens_seen": 700848128 }, { "epoch": 2.01, "learning_rate": 0.0003978134403209629, "loss": 3.0974, "theoretical_loss": 3.7789554136611736, "tokens_seen": 700913664 }, { "epoch": 2.01, "learning_rate": 0.0003978034102306921, "loss": 3.0672, "theoretical_loss": 3.7789195474019, "tokens_seen": 700979200 }, { "epoch": 2.01, "learning_rate": 0.0003977933801404213, "loss": 3.0414, "theoretical_loss": 3.7788836854344794, "tokens_seen": 701044736 }, { "epoch": 2.01, "learning_rate": 0.00039778335005015046, "loss": 3.0875, "theoretical_loss": 3.7788478277579975, "tokens_seen": 701110272 }, { "epoch": 2.01, "learning_rate": 0.00039777331995987964, "loss": 2.9639, "theoretical_loss": 3.77881197437154, "tokens_seen": 701175808 }, { "epoch": 2.01, "objective/train/docs_used": 1137822, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.101701498031616, "objective/train/theoretical_loss": 3.778794049286784, "objective/train/tokens_used": 721668576, "theoretical_loss": 3.778794049286784, "tokens_seen": 701208576 }, { "epoch": 2.01, "learning_rate": 0.00039776328986960887, "loss": 2.9504, "theoretical_loss": 3.778776125274192, "tokens_seen": 701241344 }, { "epoch": 2.01, "learning_rate": 0.000397753259779338, "loss": 3.0919, "theoretical_loss": 3.7787402804650405, "tokens_seen": 701306880 }, { "epoch": 2.01, "learning_rate": 0.00039774322968906723, "loss": 3.065, "theoretical_loss": 3.778704439943171, "tokens_seen": 701372416 }, { "epoch": 2.01, "learning_rate": 0.0003977331995987964, "loss": 2.9634, "theoretical_loss": 3.778668603707671, "tokens_seen": 701437952 }, { "epoch": 2.01, "learning_rate": 0.0003977231695085256, "loss": 3.1265, "theoretical_loss": 3.7786327717576276, "tokens_seen": 701503488 }, { "epoch": 2.01, "learning_rate": 0.0003977131394182548, "loss": 2.9186, "theoretical_loss": 3.7785969440921265, "tokens_seen": 701569024 }, { "epoch": 2.01, "learning_rate": 0.00039770310932798396, "loss": 3.0135, "theoretical_loss": 3.778561120710257, "tokens_seen": 701634560 }, { "epoch": 2.01, "learning_rate": 0.00039769307923771314, "loss": 3.0954, "theoretical_loss": 3.778525301611106, "tokens_seen": 701700096 }, { "epoch": 2.01, "learning_rate": 0.0003976830491474424, "loss": 3.1577, "theoretical_loss": 3.778489486793762, "tokens_seen": 701765632 }, { "epoch": 2.01, "learning_rate": 0.0003976730190571715, "loss": 3.094, "theoretical_loss": 3.778453676257313, "tokens_seen": 701831168 }, { "epoch": 2.01, "learning_rate": 0.00039766298896690074, "loss": 2.9975, "theoretical_loss": 3.778417870000848, "tokens_seen": 701896704 }, { "epoch": 2.01, "learning_rate": 0.00039765295887662987, "loss": 3.0818, "theoretical_loss": 3.7783820680234554, "tokens_seen": 701962240 }, { "epoch": 2.01, "learning_rate": 0.0003976429287863591, "loss": 3.1986, "theoretical_loss": 3.778346270324225, "tokens_seen": 702027776 }, { "epoch": 2.01, "learning_rate": 0.0003976328986960883, "loss": 3.0293, "theoretical_loss": 3.7783104769022455, "tokens_seen": 702093312 }, { "epoch": 2.01, "learning_rate": 0.00039762286860581746, "loss": 2.8848, "theoretical_loss": 3.7782746877566074, "tokens_seen": 702158848 }, { "epoch": 2.01, "learning_rate": 0.00039761283851554664, "loss": 2.9752, "theoretical_loss": 3.7782389028864003, "tokens_seen": 702224384 }, { "epoch": 2.01, "learning_rate": 0.0003976028084252759, "loss": 3.1119, "theoretical_loss": 3.778203122290715, "tokens_seen": 702289920 }, { "epoch": 2.01, "learning_rate": 0.000397592778335005, "loss": 2.9184, "theoretical_loss": 3.7781673459686416, "tokens_seen": 702355456 }, { "epoch": 2.01, "learning_rate": 0.00039758274824473424, "loss": 2.7699, "theoretical_loss": 3.7781315739192713, "tokens_seen": 702420992 }, { "epoch": 2.01, "learning_rate": 0.00039757271815446337, "loss": 3.1194, "theoretical_loss": 3.7780958061416947, "tokens_seen": 702486528 }, { "epoch": 2.01, "learning_rate": 0.0003975626880641926, "loss": 2.8179, "theoretical_loss": 3.778060042635004, "tokens_seen": 702552064 }, { "epoch": 2.01, "learning_rate": 0.0003975526579739218, "loss": 2.8427, "theoretical_loss": 3.77802428339829, "tokens_seen": 702617600 }, { "epoch": 2.01, "learning_rate": 0.00039754262788365097, "loss": 3.1034, "theoretical_loss": 3.777988528430645, "tokens_seen": 702683136 }, { "epoch": 2.01, "learning_rate": 0.00039753259779338015, "loss": 2.9692, "theoretical_loss": 3.777952777731162, "tokens_seen": 702748672 }, { "epoch": 2.01, "learning_rate": 0.00039752256770310933, "loss": 3.0001, "theoretical_loss": 3.7779170312989327, "tokens_seen": 702814208 }, { "epoch": 2.01, "objective/train/docs_used": 1140622, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.982654571533203, "objective/train/theoretical_loss": 3.7778991596827547, "objective/train/tokens_used": 723306976, "theoretical_loss": 3.7778991596827547, "tokens_seen": 702846976 }, { "epoch": 2.01, "learning_rate": 0.0003975125376128385, "loss": 2.9833, "theoretical_loss": 3.77788128913305, "tokens_seen": 702879744 }, { "epoch": 2.01, "learning_rate": 0.00039750250752256774, "loss": 3.0299, "theoretical_loss": 3.777845551232608, "tokens_seen": 702945280 }, { "epoch": 2.01, "learning_rate": 0.00039749247743229687, "loss": 2.8917, "theoretical_loss": 3.7778098175966983, "tokens_seen": 703010816 }, { "epoch": 2.01, "learning_rate": 0.0003974824473420261, "loss": 3.0471, "theoretical_loss": 3.777774088224416, "tokens_seen": 703076352 }, { "epoch": 2.01, "learning_rate": 0.00039747241725175523, "loss": 3.1119, "theoretical_loss": 3.7777383631148544, "tokens_seen": 703141888 }, { "epoch": 2.01, "learning_rate": 0.00039746238716148447, "loss": 2.8743, "theoretical_loss": 3.777702642267108, "tokens_seen": 703207424 }, { "epoch": 2.01, "learning_rate": 0.00039745235707121365, "loss": 3.1132, "theoretical_loss": 3.777666925680271, "tokens_seen": 703272960 }, { "epoch": 2.01, "learning_rate": 0.00039744232698094283, "loss": 2.9272, "theoretical_loss": 3.7776312133534375, "tokens_seen": 703338496 }, { "epoch": 2.01, "learning_rate": 0.000397432296890672, "loss": 3.0037, "theoretical_loss": 3.7775955052857038, "tokens_seen": 703404032 }, { "epoch": 2.01, "learning_rate": 0.00039742226680040125, "loss": 3.0838, "theoretical_loss": 3.777559801476165, "tokens_seen": 703469568 }, { "epoch": 2.01, "learning_rate": 0.0003974122367101304, "loss": 3.0274, "theoretical_loss": 3.777524101923916, "tokens_seen": 703535104 }, { "epoch": 2.01, "learning_rate": 0.0003974022066198596, "loss": 2.9988, "theoretical_loss": 3.777488406628053, "tokens_seen": 703600640 }, { "epoch": 2.01, "learning_rate": 0.00039739217652958874, "loss": 3.084, "theoretical_loss": 3.7774527155876725, "tokens_seen": 703666176 }, { "epoch": 2.01, "learning_rate": 0.00039738214643931797, "loss": 3.0325, "theoretical_loss": 3.77741702880187, "tokens_seen": 703731712 }, { "epoch": 2.01, "learning_rate": 0.00039737211634904715, "loss": 2.9068, "theoretical_loss": 3.777381346269743, "tokens_seen": 703797248 }, { "epoch": 2.01, "learning_rate": 0.00039736208625877633, "loss": 3.1556, "theoretical_loss": 3.7773456679903887, "tokens_seen": 703862784 }, { "epoch": 2.01, "learning_rate": 0.0003973520561685055, "loss": 3.04, "theoretical_loss": 3.777309993962903, "tokens_seen": 703928320 }, { "epoch": 2.01, "learning_rate": 0.0003973420260782347, "loss": 2.989, "theoretical_loss": 3.7772743241863846, "tokens_seen": 703993856 }, { "epoch": 2.01, "learning_rate": 0.0003973319959879639, "loss": 2.9298, "theoretical_loss": 3.777238658659931, "tokens_seen": 704059392 }, { "epoch": 2.01, "learning_rate": 0.0003973219658976931, "loss": 2.9779, "theoretical_loss": 3.7772029973826404, "tokens_seen": 704124928 }, { "epoch": 2.01, "learning_rate": 0.00039731193580742224, "loss": 3.0398, "theoretical_loss": 3.7771673403536106, "tokens_seen": 704190464 }, { "epoch": 2.01, "learning_rate": 0.0003973019057171515, "loss": 3.0579, "theoretical_loss": 3.777131687571941, "tokens_seen": 704256000 }, { "epoch": 2.01, "learning_rate": 0.0003972918756268806, "loss": 3.0316, "theoretical_loss": 3.7770960390367296, "tokens_seen": 704321536 }, { "epoch": 2.01, "learning_rate": 0.00039728184553660984, "loss": 2.9661, "theoretical_loss": 3.7770603947470764, "tokens_seen": 704387072 }, { "epoch": 2.01, "learning_rate": 0.000397271815446339, "loss": 3.0841, "theoretical_loss": 3.7770247547020803, "tokens_seen": 704452608 }, { "epoch": 2.01, "objective/train/docs_used": 1142051, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0680246353149414, "objective/train/theoretical_loss": 3.7770069362710474, "objective/train/tokens_used": 724945376, "theoretical_loss": 3.7770069362710474, "tokens_seen": 704485376 }, { "epoch": 2.01, "learning_rate": 0.0003972617853560682, "loss": 3.0622, "theoretical_loss": 3.7769891189008407, "tokens_seen": 704518144 }, { "epoch": 2.01, "learning_rate": 0.0003972517552657974, "loss": 2.9879, "theoretical_loss": 3.7769534873424586, "tokens_seen": 704583680 }, { "epoch": 2.01, "learning_rate": 0.0003972417251755266, "loss": 3.0423, "theoretical_loss": 3.7769178600260336, "tokens_seen": 704649216 }, { "epoch": 2.01, "learning_rate": 0.00039723169508525574, "loss": 3.0922, "theoretical_loss": 3.7768822369506667, "tokens_seen": 704714752 }, { "epoch": 2.01, "learning_rate": 0.000397221664994985, "loss": 2.8553, "theoretical_loss": 3.7768466181154583, "tokens_seen": 704780288 }, { "epoch": 2.01, "learning_rate": 0.0003972116349047141, "loss": 2.9161, "theoretical_loss": 3.776811003519509, "tokens_seen": 704845824 }, { "epoch": 2.01, "learning_rate": 0.00039720160481444334, "loss": 2.8681, "theoretical_loss": 3.7767753931619215, "tokens_seen": 704911360 }, { "epoch": 2.01, "learning_rate": 0.0003971915747241725, "loss": 3.0138, "theoretical_loss": 3.7767397870417962, "tokens_seen": 704976896 }, { "epoch": 2.01, "learning_rate": 0.0003971815446339017, "loss": 3.1709, "theoretical_loss": 3.7767041851582355, "tokens_seen": 705042432 }, { "epoch": 2.01, "learning_rate": 0.0003971715145436309, "loss": 3.0723, "theoretical_loss": 3.7766685875103416, "tokens_seen": 705107968 }, { "epoch": 2.01, "learning_rate": 0.00039716148445336007, "loss": 3.0234, "theoretical_loss": 3.776632994097217, "tokens_seen": 705173504 }, { "epoch": 2.01, "learning_rate": 0.00039715145436308925, "loss": 2.9521, "theoretical_loss": 3.7765974049179647, "tokens_seen": 705239040 }, { "epoch": 2.01, "learning_rate": 0.0003971414242728185, "loss": 3.058, "theoretical_loss": 3.7765618199716866, "tokens_seen": 705304576 }, { "epoch": 2.01, "learning_rate": 0.00039713139418254766, "loss": 2.9237, "theoretical_loss": 3.776526239257487, "tokens_seen": 705370112 }, { "epoch": 2.01, "learning_rate": 0.00039712136409227684, "loss": 3.1556, "theoretical_loss": 3.7764906627744694, "tokens_seen": 705435648 }, { "epoch": 2.01, "learning_rate": 0.0003971113340020061, "loss": 3.1541, "theoretical_loss": 3.7764550905217376, "tokens_seen": 705501184 }, { "epoch": 2.01, "learning_rate": 0.0003971013039117352, "loss": 3.0884, "theoretical_loss": 3.776419522498395, "tokens_seen": 705566720 }, { "epoch": 2.01, "learning_rate": 0.00039709127382146444, "loss": 3.0433, "theoretical_loss": 3.776383958703547, "tokens_seen": 705632256 }, { "epoch": 2.01, "learning_rate": 0.00039708124373119357, "loss": 3.0598, "theoretical_loss": 3.776348399136297, "tokens_seen": 705697792 }, { "epoch": 2.01, "learning_rate": 0.0003970712136409228, "loss": 2.737, "theoretical_loss": 3.7763128437957514, "tokens_seen": 705763328 }, { "epoch": 2.01, "learning_rate": 0.000397061183550652, "loss": 3.0029, "theoretical_loss": 3.776277292681014, "tokens_seen": 705828864 }, { "epoch": 2.01, "learning_rate": 0.00039705115346038117, "loss": 3.1169, "theoretical_loss": 3.776241745791191, "tokens_seen": 705894400 }, { "epoch": 2.01, "learning_rate": 0.00039704112337011035, "loss": 3.0092, "theoretical_loss": 3.7762062031253887, "tokens_seen": 705959936 }, { "epoch": 2.01, "learning_rate": 0.00039703109327983953, "loss": 2.9545, "theoretical_loss": 3.776170664682712, "tokens_seen": 706025472 }, { "epoch": 2.01, "learning_rate": 0.0003970210631895687, "loss": 3.0135, "theoretical_loss": 3.7761351304622677, "tokens_seen": 706091008 }, { "epoch": 2.01, "objective/train/docs_used": 1144616, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.111185312271118, "objective/train/theoretical_loss": 3.776117364935103, "objective/train/tokens_used": 726583776, "theoretical_loss": 3.776117364935103, "tokens_seen": 706123776 }, { "epoch": 2.01, "learning_rate": 0.00039701103309929794, "loss": 3.0719, "theoretical_loss": 3.776099600463162, "tokens_seen": 706156544 }, { "epoch": 2.01, "learning_rate": 0.00039700100300902707, "loss": 2.9156, "theoretical_loss": 3.776064074684502, "tokens_seen": 706222080 }, { "epoch": 2.01, "learning_rate": 0.0003969909729187563, "loss": 3.0956, "theoretical_loss": 3.776028553125395, "tokens_seen": 706287616 }, { "epoch": 2.01, "learning_rate": 0.00039698094282848543, "loss": 3.1162, "theoretical_loss": 3.775993035784948, "tokens_seen": 706353152 }, { "epoch": 2.01, "learning_rate": 0.00039697091273821467, "loss": 2.9201, "theoretical_loss": 3.775957522662269, "tokens_seen": 706418688 }, { "epoch": 2.01, "learning_rate": 0.00039696088264794385, "loss": 3.0248, "theoretical_loss": 3.7759220137564653, "tokens_seen": 706484224 }, { "epoch": 2.01, "learning_rate": 0.00039695085255767303, "loss": 3.1697, "theoretical_loss": 3.7758865090666456, "tokens_seen": 706549760 }, { "epoch": 2.01, "learning_rate": 0.0003969408224674022, "loss": 2.8273, "theoretical_loss": 3.775851008591918, "tokens_seen": 706615296 }, { "epoch": 2.01, "learning_rate": 0.00039693079237713145, "loss": 3.1201, "theoretical_loss": 3.7758155123313917, "tokens_seen": 706680832 }, { "epoch": 2.01, "learning_rate": 0.0003969207622868606, "loss": 3.1071, "theoretical_loss": 3.7757800202841754, "tokens_seen": 706746368 }, { "epoch": 2.01, "learning_rate": 0.0003969107321965898, "loss": 3.0149, "theoretical_loss": 3.7757445324493784, "tokens_seen": 706811904 }, { "epoch": 2.01, "learning_rate": 0.00039690070210631894, "loss": 2.8823, "theoretical_loss": 3.7757090488261102, "tokens_seen": 706877440 }, { "epoch": 2.01, "learning_rate": 0.00039689067201604817, "loss": 3.027, "theoretical_loss": 3.775673569413481, "tokens_seen": 706942976 }, { "epoch": 2.01, "learning_rate": 0.00039688064192577735, "loss": 2.9587, "theoretical_loss": 3.7756380942106, "tokens_seen": 707008512 }, { "epoch": 2.01, "learning_rate": 0.00039687061183550653, "loss": 3.1923, "theoretical_loss": 3.775602623216578, "tokens_seen": 707074048 }, { "epoch": 2.01, "learning_rate": 0.0003968605817452357, "loss": 2.9841, "theoretical_loss": 3.775567156430526, "tokens_seen": 707139584 }, { "epoch": 2.01, "learning_rate": 0.0003968505516549649, "loss": 3.0072, "theoretical_loss": 3.775531693851554, "tokens_seen": 707205120 }, { "epoch": 2.01, "learning_rate": 0.0003968405215646941, "loss": 2.9729, "theoretical_loss": 3.7754962354787747, "tokens_seen": 707270656 }, { "epoch": 2.01, "learning_rate": 0.0003968304914744233, "loss": 3.0515, "theoretical_loss": 3.7754607813112977, "tokens_seen": 707336192 }, { "epoch": 2.01, "learning_rate": 0.00039682046138415244, "loss": 2.9411, "theoretical_loss": 3.7754253313482358, "tokens_seen": 707401728 }, { "epoch": 2.01, "learning_rate": 0.0003968104312938817, "loss": 2.9601, "theoretical_loss": 3.7753898855887007, "tokens_seen": 707467264 }, { "epoch": 2.01, "learning_rate": 0.0003968004012036108, "loss": 3.1066, "theoretical_loss": 3.7753544440318048, "tokens_seen": 707532800 }, { "epoch": 2.01, "learning_rate": 0.00039679037111334004, "loss": 3.0609, "theoretical_loss": 3.77531900667666, "tokens_seen": 707598336 }, { "epoch": 2.01, "learning_rate": 0.0003967803410230692, "loss": 3.0026, "theoretical_loss": 3.77528357352238, "tokens_seen": 707663872 }, { "epoch": 2.01, "learning_rate": 0.0003967703109327984, "loss": 2.9723, "theoretical_loss": 3.7752481445680774, "tokens_seen": 707729408 }, { "epoch": 2.01, "objective/train/docs_used": 1147397, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.93155574798584, "objective/train/theoretical_loss": 3.7752304316656407, "objective/train/tokens_used": 728222176, "theoretical_loss": 3.7752304316656407, "tokens_seen": 707762176 }, { "epoch": 2.01, "learning_rate": 0.0003967602808425276, "loss": 2.9679, "theoretical_loss": 3.7752127198128655, "tokens_seen": 707794944 }, { "epoch": 2.01, "learning_rate": 0.0003967502507522568, "loss": 3.1664, "theoretical_loss": 3.7751772992558577, "tokens_seen": 707860480 }, { "epoch": 2.01, "learning_rate": 0.00039674022066198594, "loss": 2.9087, "theoretical_loss": 3.7751418828961683, "tokens_seen": 707926016 }, { "epoch": 2.01, "learning_rate": 0.0003967301905717152, "loss": 2.9441, "theoretical_loss": 3.775106470732911, "tokens_seen": 707991552 }, { "epoch": 2.01, "learning_rate": 0.0003967201604814443, "loss": 2.9105, "theoretical_loss": 3.7750710627652, "tokens_seen": 708057088 }, { "epoch": 2.01, "learning_rate": 0.00039671013039117354, "loss": 2.9786, "theoretical_loss": 3.7750356589921505, "tokens_seen": 708122624 }, { "epoch": 2.01, "learning_rate": 0.0003967001003009027, "loss": 2.8067, "theoretical_loss": 3.775000259412878, "tokens_seen": 708188160 }, { "epoch": 2.01, "learning_rate": 0.0003966900702106319, "loss": 3.0227, "theoretical_loss": 3.7749648640264963, "tokens_seen": 708253696 }, { "epoch": 2.01, "learning_rate": 0.0003966800401203611, "loss": 3.1877, "theoretical_loss": 3.774929472832121, "tokens_seen": 708319232 }, { "epoch": 2.01, "learning_rate": 0.00039667001003009027, "loss": 3.0664, "theoretical_loss": 3.7748940858288695, "tokens_seen": 708384768 }, { "epoch": 2.01, "learning_rate": 0.00039665997993981945, "loss": 3.0893, "theoretical_loss": 3.774858703015856, "tokens_seen": 708450304 }, { "epoch": 2.01, "learning_rate": 0.0003966499498495487, "loss": 3.1578, "theoretical_loss": 3.7748233243921976, "tokens_seen": 708515840 }, { "epoch": 2.01, "learning_rate": 0.0003966399197592778, "loss": 3.0583, "theoretical_loss": 3.7747879499570107, "tokens_seen": 708581376 }, { "epoch": 2.01, "learning_rate": 0.00039662988966900704, "loss": 3.1686, "theoretical_loss": 3.7747525797094115, "tokens_seen": 708646912 }, { "epoch": 2.01, "learning_rate": 0.00039661985957873617, "loss": 2.9951, "theoretical_loss": 3.7747172136485183, "tokens_seen": 708712448 }, { "epoch": 2.01, "learning_rate": 0.0003966098294884654, "loss": 3.024, "theoretical_loss": 3.7746818517734475, "tokens_seen": 708777984 }, { "epoch": 2.01, "learning_rate": 0.0003965997993981946, "loss": 2.9102, "theoretical_loss": 3.774646494083317, "tokens_seen": 708843520 }, { "epoch": 2.01, "learning_rate": 0.00039658976930792377, "loss": 3.0692, "theoretical_loss": 3.774611140577245, "tokens_seen": 708909056 }, { "epoch": 2.01, "learning_rate": 0.00039657973921765295, "loss": 3.0083, "theoretical_loss": 3.774575791254349, "tokens_seen": 708974592 }, { "epoch": 2.01, "learning_rate": 0.0003965697091273822, "loss": 3.0292, "theoretical_loss": 3.7745404461137477, "tokens_seen": 709040128 }, { "epoch": 2.01, "learning_rate": 0.0003965596790371113, "loss": 3.1349, "theoretical_loss": 3.77450510515456, "tokens_seen": 709105664 }, { "epoch": 2.01, "learning_rate": 0.00039654964894684055, "loss": 2.8256, "theoretical_loss": 3.774469768375905, "tokens_seen": 709171200 }, { "epoch": 2.01, "learning_rate": 0.0003965396188565697, "loss": 3.0662, "theoretical_loss": 3.774434435776901, "tokens_seen": 709236736 }, { "epoch": 2.01, "learning_rate": 0.0003965295887662989, "loss": 2.9864, "theoretical_loss": 3.7743991073566687, "tokens_seen": 709302272 }, { "epoch": 2.01, "learning_rate": 0.0003965195586760281, "loss": 2.8034, "theoretical_loss": 3.774363783114327, "tokens_seen": 709367808 }, { "epoch": 2.01, "objective/train/docs_used": 1150117, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9985620975494385, "objective/train/theoretical_loss": 3.77434612255959, "objective/train/tokens_used": 729860576, "theoretical_loss": 3.77434612255959, "tokens_seen": 709400576 }, { "epoch": 2.01, "learning_rate": 0.00039650952858575727, "loss": 2.9042, "theoretical_loss": 3.774328463048996, "tokens_seen": 709433344 }, { "epoch": 2.01, "learning_rate": 0.00039649949849548645, "loss": 2.8578, "theoretical_loss": 3.774293147159796, "tokens_seen": 709498880 }, { "epoch": 2.01, "learning_rate": 0.00039648946840521563, "loss": 3.0213, "theoretical_loss": 3.774257835445848, "tokens_seen": 709564416 }, { "epoch": 2.01, "learning_rate": 0.0003964794383149448, "loss": 3.1046, "theoretical_loss": 3.7742225279062724, "tokens_seen": 709629952 }, { "epoch": 2.01, "learning_rate": 0.00039646940822467405, "loss": 2.8338, "theoretical_loss": 3.7741872245401904, "tokens_seen": 709695488 }, { "epoch": 2.01, "learning_rate": 0.0003964593781344032, "loss": 2.8896, "theoretical_loss": 3.7741519253467226, "tokens_seen": 709761024 }, { "epoch": 2.01, "learning_rate": 0.0003964493480441324, "loss": 3.0698, "theoretical_loss": 3.7741166303249924, "tokens_seen": 709826560 }, { "epoch": 2.01, "learning_rate": 0.0003964393179538616, "loss": 3.0055, "theoretical_loss": 3.77408133947412, "tokens_seen": 709892096 }, { "epoch": 2.01, "learning_rate": 0.0003964292878635908, "loss": 2.7048, "theoretical_loss": 3.7740460527932282, "tokens_seen": 709957632 }, { "epoch": 2.01, "learning_rate": 0.00039641925777331996, "loss": 3.0385, "theoretical_loss": 3.77401077028144, "tokens_seen": 710023168 }, { "epoch": 2.01, "learning_rate": 0.00039640922768304914, "loss": 3.1381, "theoretical_loss": 3.7739754919378767, "tokens_seen": 710088704 }, { "epoch": 2.01, "learning_rate": 0.0003963991975927783, "loss": 2.9981, "theoretical_loss": 3.773940217761662, "tokens_seen": 710154240 }, { "epoch": 2.01, "learning_rate": 0.00039638916750250755, "loss": 2.9745, "theoretical_loss": 3.7739049477519195, "tokens_seen": 710219776 }, { "epoch": 2.01, "learning_rate": 0.00039637913741223673, "loss": 2.9052, "theoretical_loss": 3.7738696819077724, "tokens_seen": 710285312 }, { "epoch": 2.01, "learning_rate": 0.0003963691073219659, "loss": 3.1292, "theoretical_loss": 3.7738344202283436, "tokens_seen": 710350848 }, { "epoch": 2.01, "learning_rate": 0.0003963590772316951, "loss": 3.0449, "theoretical_loss": 3.7737991627127587, "tokens_seen": 710416384 }, { "epoch": 2.01, "learning_rate": 0.0003963490471414243, "loss": 3.0582, "theoretical_loss": 3.7737639093601407, "tokens_seen": 710481920 }, { "epoch": 2.01, "learning_rate": 0.0003963390170511535, "loss": 2.9376, "theoretical_loss": 3.7737286601696143, "tokens_seen": 710547456 }, { "epoch": 2.01, "learning_rate": 0.00039632898696088264, "loss": 2.9115, "theoretical_loss": 3.773693415140305, "tokens_seen": 710612992 }, { "epoch": 2.01, "learning_rate": 0.0003963189568706119, "loss": 3.0355, "theoretical_loss": 3.7736581742713367, "tokens_seen": 710678528 }, { "epoch": 2.01, "learning_rate": 0.000396308926780341, "loss": 2.7558, "theoretical_loss": 3.7736229375618358, "tokens_seen": 710744064 }, { "epoch": 2.01, "learning_rate": 0.00039629889669007024, "loss": 3.0142, "theoretical_loss": 3.773587705010928, "tokens_seen": 710809600 }, { "epoch": 2.01, "learning_rate": 0.0003962888665997994, "loss": 2.9493, "theoretical_loss": 3.7735524766177377, "tokens_seen": 710875136 }, { "epoch": 2.01, "learning_rate": 0.0003962788365095286, "loss": 2.9852, "theoretical_loss": 3.7735172523813927, "tokens_seen": 710940672 }, { "epoch": 2.01, "learning_rate": 0.0003962688064192578, "loss": 2.9948, "theoretical_loss": 3.7734820323010183, "tokens_seen": 711006208 }, { "epoch": 2.01, "objective/train/docs_used": 1152743, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.33278489112854, "objective/train/theoretical_loss": 3.773464423819047, "objective/train/tokens_used": 731498976, "theoretical_loss": 3.773464423819047, "tokens_seen": 711038976 }, { "epoch": 2.01, "learning_rate": 0.000396258776328987, "loss": 3.1842, "theoretical_loss": 3.773446816375742, "tokens_seen": 711071744 }, { "epoch": 2.01, "learning_rate": 0.00039624874623871614, "loss": 3.0527, "theoretical_loss": 3.7734116046046897, "tokens_seen": 711137280 }, { "epoch": 2.01, "learning_rate": 0.0003962387161484454, "loss": 2.9193, "theoretical_loss": 3.7733763969869893, "tokens_seen": 711202816 }, { "epoch": 2.01, "learning_rate": 0.0003962286860581745, "loss": 2.8429, "theoretical_loss": 3.7733411935217678, "tokens_seen": 711268352 }, { "epoch": 2.01, "learning_rate": 0.00039621865596790374, "loss": 3.0734, "theoretical_loss": 3.7733059942081537, "tokens_seen": 711333888 }, { "epoch": 2.01, "learning_rate": 0.0003962086258776329, "loss": 2.8593, "theoretical_loss": 3.7732707990452745, "tokens_seen": 711399424 }, { "epoch": 2.01, "learning_rate": 0.0003961985957873621, "loss": 2.9719, "theoretical_loss": 3.773235608032258, "tokens_seen": 711464960 }, { "epoch": 2.01, "learning_rate": 0.0003961885656970913, "loss": 2.9628, "theoretical_loss": 3.773200421168233, "tokens_seen": 711530496 }, { "epoch": 2.01, "learning_rate": 0.00039617853560682047, "loss": 2.9314, "theoretical_loss": 3.773165238452328, "tokens_seen": 711596032 }, { "epoch": 2.01, "learning_rate": 0.00039616850551654965, "loss": 2.8862, "theoretical_loss": 3.7731300598836732, "tokens_seen": 711661568 }, { "epoch": 2.01, "learning_rate": 0.0003961584754262789, "loss": 2.911, "theoretical_loss": 3.773094885461396, "tokens_seen": 711727104 }, { "epoch": 2.01, "learning_rate": 0.000396148445336008, "loss": 2.9683, "theoretical_loss": 3.773059715184628, "tokens_seen": 711792640 }, { "epoch": 2.01, "learning_rate": 0.00039613841524573724, "loss": 2.9682, "theoretical_loss": 3.7730245490524976, "tokens_seen": 711858176 }, { "epoch": 2.01, "learning_rate": 0.00039612838515546637, "loss": 2.8281, "theoretical_loss": 3.7729893870641353, "tokens_seen": 711923712 }, { "epoch": 2.01, "learning_rate": 0.0003961183550651956, "loss": 3.1436, "theoretical_loss": 3.7729542292186715, "tokens_seen": 711989248 }, { "epoch": 2.01, "learning_rate": 0.0003961083249749248, "loss": 3.1605, "theoretical_loss": 3.7729190755152366, "tokens_seen": 712054784 }, { "epoch": 2.01, "learning_rate": 0.00039609829488465397, "loss": 3.0626, "theoretical_loss": 3.772883925952961, "tokens_seen": 712120320 }, { "epoch": 2.01, "learning_rate": 0.00039608826479438315, "loss": 3.0671, "theoretical_loss": 3.7728487805309774, "tokens_seen": 712185856 }, { "epoch": 2.01, "learning_rate": 0.0003960782347041124, "loss": 2.9168, "theoretical_loss": 3.7728136392484153, "tokens_seen": 712251392 }, { "epoch": 2.01, "learning_rate": 0.0003960682046138415, "loss": 3.0369, "theoretical_loss": 3.7727785021044076, "tokens_seen": 712316928 }, { "epoch": 2.01, "learning_rate": 0.00039605817452357075, "loss": 3.0303, "theoretical_loss": 3.772743369098086, "tokens_seen": 712382464 }, { "epoch": 2.01, "learning_rate": 0.0003960481444332999, "loss": 3.0469, "theoretical_loss": 3.772708240228582, "tokens_seen": 712448000 }, { "epoch": 2.01, "learning_rate": 0.0003960381143430291, "loss": 2.8626, "theoretical_loss": 3.772673115495029, "tokens_seen": 712513536 }, { "epoch": 2.01, "learning_rate": 0.0003960280842527583, "loss": 2.9383, "theoretical_loss": 3.7726379948965594, "tokens_seen": 712579072 }, { "epoch": 2.01, "learning_rate": 0.00039601805416248747, "loss": 2.9399, "theoretical_loss": 3.7726028784323056, "tokens_seen": 712644608 }, { "epoch": 2.01, "objective/train/docs_used": 1155520, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.712702512741089, "objective/train/theoretical_loss": 3.7725853217502383, "objective/train/tokens_used": 733137376, "theoretical_loss": 3.7725853217502383, "tokens_seen": 712677376 }, { "epoch": 2.01, "learning_rate": 0.00039600802407221665, "loss": 2.7262, "theoretical_loss": 3.7725677661014005, "tokens_seen": 712710144 }, { "epoch": 2.01, "learning_rate": 0.00039599799398194583, "loss": 2.9801, "theoretical_loss": 3.772532657902979, "tokens_seen": 712775680 }, { "epoch": 2.01, "learning_rate": 0.000395987963891675, "loss": 3.0095, "theoretical_loss": 3.7724975538361747, "tokens_seen": 712841216 }, { "epoch": 2.01, "learning_rate": 0.00039597793380140425, "loss": 3.1322, "theoretical_loss": 3.77246245390012, "tokens_seen": 712906752 }, { "epoch": 2.01, "learning_rate": 0.0003959679037111334, "loss": 3.0705, "theoretical_loss": 3.7724273580939514, "tokens_seen": 712972288 }, { "epoch": 2.01, "learning_rate": 0.0003959578736208626, "loss": 2.91, "theoretical_loss": 3.772392266416801, "tokens_seen": 713037824 }, { "epoch": 2.01, "learning_rate": 0.0003959478435305918, "loss": 3.0139, "theoretical_loss": 3.7723571788678054, "tokens_seen": 713103360 }, { "epoch": 2.01, "learning_rate": 0.000395937813440321, "loss": 3.0371, "theoretical_loss": 3.772322095446099, "tokens_seen": 713168896 }, { "epoch": 2.01, "learning_rate": 0.00039592778335005016, "loss": 3.0974, "theoretical_loss": 3.772287016150817, "tokens_seen": 713234432 }, { "epoch": 2.01, "learning_rate": 0.00039591775325977934, "loss": 3.0335, "theoretical_loss": 3.7722519409810955, "tokens_seen": 713299968 }, { "epoch": 2.01, "learning_rate": 0.0003959077231695085, "loss": 3.1372, "theoretical_loss": 3.77221686993607, "tokens_seen": 713365504 }, { "epoch": 2.01, "learning_rate": 0.00039589769307923775, "loss": 2.9532, "theoretical_loss": 3.7721818030148757, "tokens_seen": 713431040 }, { "epoch": 2.01, "learning_rate": 0.0003958876629889669, "loss": 2.9672, "theoretical_loss": 3.772146740216651, "tokens_seen": 713496576 }, { "epoch": 2.01, "learning_rate": 0.0003958776328986961, "loss": 2.9461, "theoretical_loss": 3.7721116815405304, "tokens_seen": 713562112 }, { "epoch": 2.01, "learning_rate": 0.00039586760280842524, "loss": 2.9659, "theoretical_loss": 3.772076626985652, "tokens_seen": 713627648 }, { "epoch": 2.01, "learning_rate": 0.0003958575727181545, "loss": 2.9656, "theoretical_loss": 3.7720415765511524, "tokens_seen": 713693184 }, { "epoch": 2.01, "learning_rate": 0.00039584754262788366, "loss": 2.9975, "theoretical_loss": 3.7720065302361694, "tokens_seen": 713758720 }, { "epoch": 2.01, "learning_rate": 0.00039583751253761284, "loss": 3.0615, "theoretical_loss": 3.7719714880398403, "tokens_seen": 713824256 }, { "epoch": 2.01, "learning_rate": 0.000395827482447342, "loss": 2.8113, "theoretical_loss": 3.771936449961303, "tokens_seen": 713889792 }, { "epoch": 2.01, "learning_rate": 0.0003958174523570712, "loss": 2.8223, "theoretical_loss": 3.7719014159996966, "tokens_seen": 713955328 }, { "epoch": 2.01, "learning_rate": 0.0003958074222668004, "loss": 3.1251, "theoretical_loss": 3.7718663861541577, "tokens_seen": 714020864 }, { "epoch": 2.01, "learning_rate": 0.0003957973921765296, "loss": 3.0584, "theoretical_loss": 3.7718313604238265, "tokens_seen": 714086400 }, { "epoch": 2.01, "learning_rate": 0.00039578736208625875, "loss": 2.7785, "theoretical_loss": 3.771796338807842, "tokens_seen": 714151936 }, { "epoch": 2.01, "learning_rate": 0.000395777331995988, "loss": 2.9026, "theoretical_loss": 3.771761321305342, "tokens_seen": 714217472 }, { "epoch": 2.01, "learning_rate": 0.00039576730190571716, "loss": 3.0076, "theoretical_loss": 3.7717263079154675, "tokens_seen": 714283008 }, { "epoch": 2.01, "objective/train/docs_used": 1157062, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.037050247192383, "objective/train/theoretical_loss": 3.7717088027624954, "objective/train/tokens_used": 734775776, "theoretical_loss": 3.7717088027624954, "tokens_seen": 714315776 }, { "epoch": 2.01, "learning_rate": 0.00039575727181544634, "loss": 3.0643, "theoretical_loss": 3.771691298637357, "tokens_seen": 714348544 }, { "epoch": 2.01, "learning_rate": 0.0003957472417251755, "loss": 3.0341, "theoretical_loss": 3.7716562934701514, "tokens_seen": 714414080 }, { "epoch": 2.01, "learning_rate": 0.0003957372116349047, "loss": 3.0236, "theoretical_loss": 3.7716212924129904, "tokens_seen": 714479616 }, { "epoch": 2.01, "learning_rate": 0.0003957271815446339, "loss": 2.8807, "theoretical_loss": 3.7715862954650152, "tokens_seen": 714545152 }, { "epoch": 2.01, "learning_rate": 0.0003957171514543631, "loss": 3.0442, "theoretical_loss": 3.771551302625366, "tokens_seen": 714610688 }, { "epoch": 2.01, "learning_rate": 0.00039570712136409225, "loss": 3.0506, "theoretical_loss": 3.7715163138931835, "tokens_seen": 714676224 }, { "epoch": 2.01, "learning_rate": 0.0003956970912738215, "loss": 3.0922, "theoretical_loss": 3.7714813292676093, "tokens_seen": 714741760 }, { "epoch": 2.01, "learning_rate": 0.0003956870611835506, "loss": 2.9799, "theoretical_loss": 3.7714463487477854, "tokens_seen": 714807296 }, { "epoch": 2.01, "learning_rate": 0.00039567703109327985, "loss": 2.8774, "theoretical_loss": 3.7714113723328526, "tokens_seen": 714872832 }, { "epoch": 2.01, "learning_rate": 0.00039566700100300903, "loss": 3.0554, "theoretical_loss": 3.771376400021954, "tokens_seen": 714938368 }, { "epoch": 2.01, "learning_rate": 0.0003956569709127382, "loss": 2.8835, "theoretical_loss": 3.771341431814231, "tokens_seen": 715003904 }, { "epoch": 2.01, "learning_rate": 0.0003956469408224674, "loss": 3.01, "theoretical_loss": 3.7713064677088273, "tokens_seen": 715069440 }, { "epoch": 2.01, "learning_rate": 0.00039563691073219657, "loss": 2.9128, "theoretical_loss": 3.771271507704884, "tokens_seen": 715134976 }, { "epoch": 2.01, "learning_rate": 0.0003956268806419258, "loss": 2.8948, "theoretical_loss": 3.771236551801546, "tokens_seen": 715200512 }, { "epoch": 2.01, "learning_rate": 0.000395616850551655, "loss": 2.8664, "theoretical_loss": 3.771201599997956, "tokens_seen": 715266048 }, { "epoch": 2.01, "learning_rate": 0.00039560682046138417, "loss": 2.911, "theoretical_loss": 3.7711666522932568, "tokens_seen": 715331584 }, { "epoch": 2.01, "learning_rate": 0.00039559679037111335, "loss": 2.8907, "theoretical_loss": 3.7711317086865925, "tokens_seen": 715397120 }, { "epoch": 2.01, "learning_rate": 0.0003955867602808426, "loss": 2.8546, "theoretical_loss": 3.7710967691771087, "tokens_seen": 715462656 }, { "epoch": 2.01, "learning_rate": 0.0003955767301905717, "loss": 2.9286, "theoretical_loss": 3.771061833763948, "tokens_seen": 715528192 }, { "epoch": 2.01, "learning_rate": 0.00039556670010030095, "loss": 3.0488, "theoretical_loss": 3.771026902446256, "tokens_seen": 715593728 }, { "epoch": 2.01, "learning_rate": 0.0003955566700100301, "loss": 2.863, "theoretical_loss": 3.7709919752231773, "tokens_seen": 715659264 }, { "epoch": 2.01, "learning_rate": 0.0003955466399197593, "loss": 3.0399, "theoretical_loss": 3.7709570520938565, "tokens_seen": 715724800 }, { "epoch": 2.01, "learning_rate": 0.0003955366098294885, "loss": 3.0029, "theoretical_loss": 3.77092213305744, "tokens_seen": 715790336 }, { "epoch": 2.01, "learning_rate": 0.00039552657973921767, "loss": 2.994, "theoretical_loss": 3.7708872181130726, "tokens_seen": 715855872 }, { "epoch": 2.01, "learning_rate": 0.00039551654964894685, "loss": 3.0169, "theoretical_loss": 3.7708523072599007, "tokens_seen": 715921408 }, { "epoch": 2.01, "objective/train/docs_used": 1159922, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2523138523101807, "objective/train/theoretical_loss": 3.770834853367246, "objective/train/tokens_used": 736414176, "theoretical_loss": 3.770834853367246, "tokens_seen": 715954176 }, { "epoch": 2.01, "learning_rate": 0.00039550651955867603, "loss": 3.0593, "theoretical_loss": 3.7708174004970703, "tokens_seen": 715986944 }, { "epoch": 2.01, "learning_rate": 0.0003954964894684052, "loss": 3.0401, "theoretical_loss": 3.7707824978237277, "tokens_seen": 716052480 }, { "epoch": 2.01, "learning_rate": 0.00039548645937813445, "loss": 2.9216, "theoretical_loss": 3.7707475992390203, "tokens_seen": 716118016 }, { "epoch": 2.01, "learning_rate": 0.0003954764292878636, "loss": 2.9889, "theoretical_loss": 3.7707127047420936, "tokens_seen": 716183552 }, { "epoch": 2.01, "learning_rate": 0.0003954663991975928, "loss": 3.0132, "theoretical_loss": 3.770677814332097, "tokens_seen": 716249088 }, { "epoch": 2.01, "learning_rate": 0.000395456369107322, "loss": 3.0286, "theoretical_loss": 3.7706429280081752, "tokens_seen": 716314624 }, { "epoch": 2.01, "learning_rate": 0.0003954463390170512, "loss": 2.8885, "theoretical_loss": 3.770608045769478, "tokens_seen": 716380160 }, { "epoch": 2.01, "learning_rate": 0.00039543630892678036, "loss": 2.9411, "theoretical_loss": 3.7705731676151526, "tokens_seen": 716445696 }, { "epoch": 2.01, "learning_rate": 0.00039542627883650954, "loss": 2.9706, "theoretical_loss": 3.7705382935443468, "tokens_seen": 716511232 }, { "epoch": 2.01, "learning_rate": 0.0003954162487462387, "loss": 3.0912, "theoretical_loss": 3.77050342355621, "tokens_seen": 716576768 }, { "epoch": 2.01, "learning_rate": 0.00039540621865596795, "loss": 2.9667, "theoretical_loss": 3.77046855764989, "tokens_seen": 716642304 }, { "epoch": 2.01, "learning_rate": 0.0003953961885656971, "loss": 2.9857, "theoretical_loss": 3.7704336958245364, "tokens_seen": 716707840 }, { "epoch": 2.01, "learning_rate": 0.0003953861584754263, "loss": 2.9768, "theoretical_loss": 3.7703988380792985, "tokens_seen": 716773376 }, { "epoch": 2.01, "learning_rate": 0.00039537612838515544, "loss": 2.8564, "theoretical_loss": 3.7703639844133257, "tokens_seen": 716838912 }, { "epoch": 2.01, "learning_rate": 0.0003953660982948847, "loss": 3.01, "theoretical_loss": 3.7703291348257673, "tokens_seen": 716904448 }, { "epoch": 2.01, "learning_rate": 0.00039535606820461386, "loss": 3.0624, "theoretical_loss": 3.770294289315774, "tokens_seen": 716969984 }, { "epoch": 2.01, "learning_rate": 0.00039534603811434304, "loss": 2.8483, "theoretical_loss": 3.770259447882495, "tokens_seen": 717035520 }, { "epoch": 2.01, "learning_rate": 0.0003953360080240722, "loss": 2.9083, "theoretical_loss": 3.770224610525082, "tokens_seen": 717101056 }, { "epoch": 2.01, "learning_rate": 0.0003953259779338014, "loss": 3.069, "theoretical_loss": 3.7701897772426847, "tokens_seen": 717166592 }, { "epoch": 2.01, "learning_rate": 0.0003953159478435306, "loss": 2.7496, "theoretical_loss": 3.770154948034455, "tokens_seen": 717232128 }, { "epoch": 2.01, "learning_rate": 0.0003953059177532598, "loss": 2.9794, "theoretical_loss": 3.7701201228995433, "tokens_seen": 717297664 }, { "epoch": 2.01, "learning_rate": 0.00039529588766298895, "loss": 2.8598, "theoretical_loss": 3.7700853018371023, "tokens_seen": 717363200 }, { "epoch": 2.01, "learning_rate": 0.0003952858575727182, "loss": 3.0602, "theoretical_loss": 3.770050484846283, "tokens_seen": 717428736 }, { "epoch": 2.01, "learning_rate": 0.00039527582748244736, "loss": 2.8823, "theoretical_loss": 3.770015671926237, "tokens_seen": 717494272 }, { "epoch": 2.01, "learning_rate": 0.00039526579739217654, "loss": 3.0542, "theoretical_loss": 3.7699808630761185, "tokens_seen": 717559808 }, { "epoch": 2.01, "objective/train/docs_used": 1162641, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.243800163269043, "objective/train/theoretical_loss": 3.769963460177016, "objective/train/tokens_used": 738052576, "theoretical_loss": 3.769963460177016, "tokens_seen": 717592576 }, { "epoch": 2.01, "learning_rate": 0.0003952557673019057, "loss": 3.0921, "theoretical_loss": 3.7699460582950772, "tokens_seen": 717625344 }, { "epoch": 2.01, "learning_rate": 0.0003952457372116349, "loss": 2.8553, "theoretical_loss": 3.769911257582268, "tokens_seen": 717690880 }, { "epoch": 2.01, "learning_rate": 0.0003952357071213641, "loss": 2.9505, "theoretical_loss": 3.7698764609368434, "tokens_seen": 717756416 }, { "epoch": 2.01, "learning_rate": 0.0003952256770310933, "loss": 2.9981, "theoretical_loss": 3.7698416683579565, "tokens_seen": 717821952 }, { "epoch": 2.01, "learning_rate": 0.00039521564694082245, "loss": 3.0396, "theoretical_loss": 3.769806879844761, "tokens_seen": 717887488 }, { "epoch": 2.01, "learning_rate": 0.0003952056168505517, "loss": 2.9195, "theoretical_loss": 3.7697720953964104, "tokens_seen": 717953024 }, { "epoch": 2.01, "learning_rate": 0.0003951955867602808, "loss": 2.964, "theoretical_loss": 3.7697373150120597, "tokens_seen": 718018560 }, { "epoch": 2.01, "learning_rate": 0.00039518555667001005, "loss": 3.033, "theoretical_loss": 3.7697025386908622, "tokens_seen": 718084096 }, { "epoch": 2.01, "learning_rate": 0.00039517552657973923, "loss": 3.1288, "theoretical_loss": 3.769667766431973, "tokens_seen": 718149632 }, { "epoch": 2.01, "learning_rate": 0.0003951654964894684, "loss": 2.9644, "theoretical_loss": 3.7696329982345467, "tokens_seen": 718215168 }, { "epoch": 2.01, "learning_rate": 0.0003951554663991976, "loss": 2.8693, "theoretical_loss": 3.7695982340977388, "tokens_seen": 718280704 }, { "epoch": 2.01, "learning_rate": 0.00039514543630892677, "loss": 2.9933, "theoretical_loss": 3.7695634740207034, "tokens_seen": 718346240 }, { "epoch": 2.01, "learning_rate": 0.00039513540621865595, "loss": 3.0232, "theoretical_loss": 3.7695287180025976, "tokens_seen": 718411776 }, { "epoch": 2.01, "learning_rate": 0.0003951253761283852, "loss": 2.8833, "theoretical_loss": 3.7694939660425764, "tokens_seen": 718477312 }, { "epoch": 2.01, "learning_rate": 0.0003951153460381143, "loss": 2.9339, "theoretical_loss": 3.7694592181397963, "tokens_seen": 718542848 }, { "epoch": 2.01, "learning_rate": 0.00039510531594784355, "loss": 3.1105, "theoretical_loss": 3.7694244742934133, "tokens_seen": 718608384 }, { "epoch": 2.01, "learning_rate": 0.00039509528585757273, "loss": 2.8665, "theoretical_loss": 3.7693897345025844, "tokens_seen": 718673920 }, { "epoch": 2.01, "learning_rate": 0.0003950852557673019, "loss": 2.9808, "theoretical_loss": 3.769354998766466, "tokens_seen": 718739456 }, { "epoch": 2.01, "learning_rate": 0.0003950752256770311, "loss": 3.0648, "theoretical_loss": 3.7693202670842147, "tokens_seen": 718804992 }, { "epoch": 2.01, "learning_rate": 0.0003950651955867603, "loss": 2.7803, "theoretical_loss": 3.769285539454989, "tokens_seen": 718870528 }, { "epoch": 2.01, "learning_rate": 0.00039505516549648946, "loss": 2.913, "theoretical_loss": 3.7692508158779465, "tokens_seen": 718936064 }, { "epoch": 2.01, "learning_rate": 0.0003950451354062187, "loss": 2.9959, "theoretical_loss": 3.769216096352244, "tokens_seen": 719001600 }, { "epoch": 2.01, "learning_rate": 0.0003950351053159478, "loss": 2.8489, "theoretical_loss": 3.7691813808770407, "tokens_seen": 719067136 }, { "epoch": 2.01, "learning_rate": 0.00039502507522567705, "loss": 2.8923, "theoretical_loss": 3.769146669451494, "tokens_seen": 719132672 }, { "epoch": 2.01, "learning_rate": 0.0003950150451354062, "loss": 2.9522, "theoretical_loss": 3.7691119620747626, "tokens_seen": 719198208 }, { "epoch": 2.01, "objective/train/docs_used": 1165519, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1415786743164062, "objective/train/theoretical_loss": 3.7690946099044402, "objective/train/tokens_used": 739690976, "theoretical_loss": 3.7690946099044402, "tokens_seen": 719230976 }, { "epoch": 2.01, "learning_rate": 0.0003950050150451354, "loss": 2.985, "theoretical_loss": 3.769077258746006, "tokens_seen": 719263744 }, { "epoch": 2.01, "learning_rate": 0.0003949949849548646, "loss": 2.9869, "theoretical_loss": 3.7690425594643835, "tokens_seen": 719329280 }, { "epoch": 2.01, "learning_rate": 0.0003949849548645938, "loss": 2.9681, "theoretical_loss": 3.769007864229054, "tokens_seen": 719394816 }, { "epoch": 2.01, "learning_rate": 0.00039497492477432296, "loss": 3.0074, "theoretical_loss": 3.768973173039176, "tokens_seen": 719460352 }, { "epoch": 2.01, "learning_rate": 0.0003949648946840522, "loss": 2.9519, "theoretical_loss": 3.7689384858939112, "tokens_seen": 719525888 }, { "epoch": 2.01, "learning_rate": 0.0003949548645937813, "loss": 2.884, "theoretical_loss": 3.768903802792419, "tokens_seen": 719591424 }, { "epoch": 2.01, "learning_rate": 0.00039494483450351056, "loss": 2.7329, "theoretical_loss": 3.768869123733859, "tokens_seen": 719656960 }, { "epoch": 2.01, "learning_rate": 0.0003949348044132397, "loss": 2.9338, "theoretical_loss": 3.768834448717393, "tokens_seen": 719722496 }, { "epoch": 2.01, "learning_rate": 0.0003949247743229689, "loss": 2.9741, "theoretical_loss": 3.7687997777421822, "tokens_seen": 719788032 }, { "epoch": 2.01, "learning_rate": 0.0003949147442326981, "loss": 3.0027, "theoretical_loss": 3.7687651108073856, "tokens_seen": 719853568 }, { "epoch": 2.01, "learning_rate": 0.0003949047141424273, "loss": 3.1041, "theoretical_loss": 3.7687304479121666, "tokens_seen": 719919104 }, { "epoch": 2.01, "learning_rate": 0.0003948946840521565, "loss": 2.905, "theoretical_loss": 3.768695789055686, "tokens_seen": 719984640 }, { "epoch": 2.01, "learning_rate": 0.00039488465396188564, "loss": 3.119, "theoretical_loss": 3.768661134237106, "tokens_seen": 720050176 }, { "epoch": 2.01, "learning_rate": 0.0003948746238716149, "loss": 3.0023, "theoretical_loss": 3.7686264834555883, "tokens_seen": 720115712 }, { "epoch": 2.01, "learning_rate": 0.00039486459378134406, "loss": 2.908, "theoretical_loss": 3.7685918367102955, "tokens_seen": 720181248 }, { "epoch": 2.01, "learning_rate": 0.00039485456369107324, "loss": 2.9214, "theoretical_loss": 3.7685571940003904, "tokens_seen": 720246784 }, { "epoch": 2.01, "learning_rate": 0.0003948445336008024, "loss": 3.0443, "theoretical_loss": 3.7685225553250348, "tokens_seen": 720312320 }, { "epoch": 2.01, "learning_rate": 0.0003948345035105316, "loss": 3.0879, "theoretical_loss": 3.7684879206833934, "tokens_seen": 720377856 }, { "epoch": 2.01, "learning_rate": 0.0003948244734202608, "loss": 3.0959, "theoretical_loss": 3.7684532900746284, "tokens_seen": 720443392 }, { "epoch": 2.01, "learning_rate": 0.00039481444332999, "loss": 3.105, "theoretical_loss": 3.768418663497904, "tokens_seen": 720508928 }, { "epoch": 2.01, "learning_rate": 0.00039480441323971915, "loss": 3.0187, "theoretical_loss": 3.7683840409523848, "tokens_seen": 720574464 }, { "epoch": 2.01, "learning_rate": 0.0003947943831494484, "loss": 3.0388, "theoretical_loss": 3.768349422437233, "tokens_seen": 720640000 }, { "epoch": 2.01, "learning_rate": 0.00039478435305917756, "loss": 2.9491, "theoretical_loss": 3.768314807951614, "tokens_seen": 720705536 }, { "epoch": 2.01, "learning_rate": 0.00039477432296890674, "loss": 3.0701, "theoretical_loss": 3.7682801974946933, "tokens_seen": 720771072 }, { "epoch": 2.01, "learning_rate": 0.0003947642928786359, "loss": 2.834, "theoretical_loss": 3.768245591065634, "tokens_seen": 720836608 }, { "epoch": 2.01, "objective/train/docs_used": 1168349, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0201303958892822, "objective/train/theoretical_loss": 3.768228289361292, "objective/train/tokens_used": 741329376, "theoretical_loss": 3.768228289361292, "tokens_seen": 720869376 }, { "epoch": 2.01, "learning_rate": 0.0003947542627883651, "loss": 3.0997, "theoretical_loss": 3.7682109886636024, "tokens_seen": 720902144 }, { "epoch": 2.01, "learning_rate": 0.0003947442326980943, "loss": 2.9246, "theoretical_loss": 3.7681763902877634, "tokens_seen": 720967680 }, { "epoch": 2.01, "learning_rate": 0.0003947342026078235, "loss": 2.9592, "theoretical_loss": 3.768141795937283, "tokens_seen": 721033216 }, { "epoch": 2.01, "learning_rate": 0.00039472417251755265, "loss": 2.9907, "theoretical_loss": 3.768107205611327, "tokens_seen": 721098752 }, { "epoch": 2.01, "learning_rate": 0.0003947141424272819, "loss": 2.9573, "theoretical_loss": 3.768072619309061, "tokens_seen": 721164288 }, { "epoch": 2.01, "learning_rate": 0.000394704112337011, "loss": 2.9062, "theoretical_loss": 3.768038037029652, "tokens_seen": 721229824 }, { "epoch": 2.01, "learning_rate": 0.00039469408224674025, "loss": 2.9905, "theoretical_loss": 3.7680034587722666, "tokens_seen": 721295360 }, { "epoch": 2.01, "learning_rate": 0.00039468405215646943, "loss": 2.9657, "theoretical_loss": 3.767968884536071, "tokens_seen": 721360896 }, { "epoch": 2.01, "learning_rate": 0.0003946740220661986, "loss": 2.8559, "theoretical_loss": 3.7679343143202324, "tokens_seen": 721426432 }, { "epoch": 2.01, "learning_rate": 0.0003946639919759278, "loss": 2.9658, "theoretical_loss": 3.7678997481239187, "tokens_seen": 721491968 }, { "epoch": 2.01, "learning_rate": 0.00039465396188565697, "loss": 3.0306, "theoretical_loss": 3.767865185946297, "tokens_seen": 721557504 }, { "epoch": 2.01, "learning_rate": 0.00039464393179538615, "loss": 3.0114, "theoretical_loss": 3.7678306277865357, "tokens_seen": 721623040 }, { "epoch": 2.01, "learning_rate": 0.0003946339017051154, "loss": 2.9202, "theoretical_loss": 3.7677960736438023, "tokens_seen": 721688576 }, { "epoch": 2.01, "learning_rate": 0.0003946238716148445, "loss": 2.8935, "theoretical_loss": 3.7677615235172652, "tokens_seen": 721754112 }, { "epoch": 2.01, "learning_rate": 0.00039461384152457375, "loss": 2.9054, "theoretical_loss": 3.7677269774060935, "tokens_seen": 721819648 }, { "epoch": 2.01, "learning_rate": 0.00039460381143430293, "loss": 3.013, "theoretical_loss": 3.767692435309456, "tokens_seen": 721885184 }, { "epoch": 2.01, "learning_rate": 0.0003945937813440321, "loss": 2.8837, "theoretical_loss": 3.767657897226521, "tokens_seen": 721950720 }, { "epoch": 2.01, "learning_rate": 0.0003945837512537613, "loss": 3.067, "theoretical_loss": 3.767623363156458, "tokens_seen": 722016256 }, { "epoch": 2.01, "learning_rate": 0.0003945737211634905, "loss": 2.8968, "theoretical_loss": 3.767588833098438, "tokens_seen": 722081792 }, { "epoch": 2.01, "learning_rate": 0.00039456369107321966, "loss": 2.9872, "theoretical_loss": 3.767554307051629, "tokens_seen": 722147328 }, { "epoch": 2.01, "learning_rate": 0.0003945536609829489, "loss": 2.8563, "theoretical_loss": 3.7675197850152022, "tokens_seen": 722212864 }, { "epoch": 2.01, "learning_rate": 0.000394543630892678, "loss": 3.0826, "theoretical_loss": 3.7674852669883276, "tokens_seen": 722278400 }, { "epoch": 2.01, "learning_rate": 0.00039453360080240725, "loss": 3.123, "theoretical_loss": 3.7674507529701753, "tokens_seen": 722343936 }, { "epoch": 2.01, "learning_rate": 0.0003945235707121364, "loss": 3.1897, "theoretical_loss": 3.767416242959917, "tokens_seen": 722409472 }, { "epoch": 2.01, "learning_rate": 0.0003945135406218656, "loss": 3.1523, "theoretical_loss": 3.767381736956723, "tokens_seen": 722475008 }, { "epoch": 2.01, "objective/train/docs_used": 1169764, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2959272861480713, "objective/train/theoretical_loss": 3.767364485457516, "objective/train/tokens_used": 742967776, "theoretical_loss": 3.767364485457516, "tokens_seen": 722507776 }, { "epoch": 2.01, "learning_rate": 0.0003945035105315948, "loss": 3.1185, "theoretical_loss": 3.7673472349597645, "tokens_seen": 722540544 }, { "epoch": 2.01, "learning_rate": 0.000394493480441324, "loss": 3.0723, "theoretical_loss": 3.767312736968214, "tokens_seen": 722606080 }, { "epoch": 2.01, "learning_rate": 0.00039448345035105316, "loss": 3.0598, "theoretical_loss": 3.7672782429812433, "tokens_seen": 722671616 }, { "epoch": 2.01, "learning_rate": 0.0003944734202607824, "loss": 3.0169, "theoretical_loss": 3.7672437529980227, "tokens_seen": 722737152 }, { "epoch": 2.01, "learning_rate": 0.0003944633901705115, "loss": 2.8523, "theoretical_loss": 3.7672092670177264, "tokens_seen": 722802688 }, { "epoch": 2.01, "learning_rate": 0.00039445336008024076, "loss": 3.1171, "theoretical_loss": 3.767174785039526, "tokens_seen": 722868224 }, { "epoch": 2.01, "learning_rate": 0.0003944433299899699, "loss": 3.0567, "theoretical_loss": 3.767140307062595, "tokens_seen": 722933760 }, { "epoch": 2.01, "learning_rate": 0.0003944332998996991, "loss": 3.063, "theoretical_loss": 3.767105833086105, "tokens_seen": 722999296 }, { "epoch": 2.01, "learning_rate": 0.0003944232698094283, "loss": 3.0049, "theoretical_loss": 3.767071363109231, "tokens_seen": 723064832 }, { "epoch": 2.01, "learning_rate": 0.0003944132397191575, "loss": 2.8275, "theoretical_loss": 3.7670368971311454, "tokens_seen": 723130368 }, { "epoch": 2.01, "learning_rate": 0.00039440320962888666, "loss": 3.114, "theoretical_loss": 3.7670024351510225, "tokens_seen": 723195904 }, { "epoch": 2.01, "learning_rate": 0.00039439317953861584, "loss": 3.0687, "theoretical_loss": 3.7669679771680364, "tokens_seen": 723261440 }, { "epoch": 2.01, "learning_rate": 0.000394383149448345, "loss": 2.8891, "theoretical_loss": 3.7669335231813603, "tokens_seen": 723326976 }, { "epoch": 2.01, "learning_rate": 0.00039437311935807426, "loss": 2.9364, "theoretical_loss": 3.7668990731901704, "tokens_seen": 723392512 }, { "epoch": 2.01, "learning_rate": 0.0003943630892678034, "loss": 2.9788, "theoretical_loss": 3.76686462719364, "tokens_seen": 723458048 }, { "epoch": 2.01, "learning_rate": 0.0003943530591775326, "loss": 2.9313, "theoretical_loss": 3.766830185190945, "tokens_seen": 723523584 }, { "epoch": 2.01, "learning_rate": 0.00039434302908726175, "loss": 2.9306, "theoretical_loss": 3.76679574718126, "tokens_seen": 723589120 }, { "epoch": 2.01, "learning_rate": 0.000394332998996991, "loss": 3.0819, "theoretical_loss": 3.766761313163761, "tokens_seen": 723654656 }, { "epoch": 2.01, "learning_rate": 0.00039432296890672017, "loss": 3.201, "theoretical_loss": 3.7667268831376237, "tokens_seen": 723720192 }, { "epoch": 2.01, "learning_rate": 0.00039431293881644935, "loss": 2.8737, "theoretical_loss": 3.7666924571020237, "tokens_seen": 723785728 }, { "epoch": 2.01, "learning_rate": 0.00039430290872617853, "loss": 2.8965, "theoretical_loss": 3.766658035056137, "tokens_seen": 723851264 }, { "epoch": 2.01, "learning_rate": 0.00039429287863590776, "loss": 2.7251, "theoretical_loss": 3.7666236169991407, "tokens_seen": 723916800 }, { "epoch": 2.01, "learning_rate": 0.0003942828485456369, "loss": 2.9267, "theoretical_loss": 3.766589202930211, "tokens_seen": 723982336 }, { "epoch": 2.01, "learning_rate": 0.0003942728184553661, "loss": 3.077, "theoretical_loss": 3.7665547928485257, "tokens_seen": 724047872 }, { "epoch": 2.01, "learning_rate": 0.00039426278836509525, "loss": 3.0023, "theoretical_loss": 3.7665203867532613, "tokens_seen": 724113408 }, { "epoch": 2.01, "objective/train/docs_used": 1173596, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0672168731689453, "objective/train/theoretical_loss": 3.76650318520028, "objective/train/tokens_used": 744606176, "theoretical_loss": 3.76650318520028, "tokens_seen": 724146176 }, { "epoch": 2.01, "learning_rate": 0.0003942527582748245, "loss": 2.9857, "theoretical_loss": 3.7664859846435954, "tokens_seen": 724178944 }, { "epoch": 2.01, "learning_rate": 0.00039424272818455367, "loss": 2.8406, "theoretical_loss": 3.7664515865187056, "tokens_seen": 724244480 }, { "epoch": 2.01, "learning_rate": 0.00039423269809428285, "loss": 2.9989, "theoretical_loss": 3.7664171923777694, "tokens_seen": 724310016 }, { "epoch": 2.01, "learning_rate": 0.00039422266800401203, "loss": 2.8356, "theoretical_loss": 3.766382802219966, "tokens_seen": 724375552 }, { "epoch": 2.01, "learning_rate": 0.0003942126379137412, "loss": 3.0434, "theoretical_loss": 3.7663484160444733, "tokens_seen": 724441088 }, { "epoch": 2.01, "learning_rate": 0.0003942026078234704, "loss": 2.8591, "theoretical_loss": 3.76631403385047, "tokens_seen": 724506624 }, { "epoch": 2.01, "learning_rate": 0.00039419257773319963, "loss": 2.8569, "theoretical_loss": 3.7662796556371347, "tokens_seen": 724572160 }, { "epoch": 2.01, "learning_rate": 0.00039418254764292876, "loss": 2.8912, "theoretical_loss": 3.766245281403647, "tokens_seen": 724637696 }, { "epoch": 2.01, "learning_rate": 0.000394172517552658, "loss": 2.8854, "theoretical_loss": 3.7662109111491864, "tokens_seen": 724703232 }, { "epoch": 2.01, "learning_rate": 0.0003941624874623871, "loss": 2.8758, "theoretical_loss": 3.7661765448729314, "tokens_seen": 724768768 }, { "epoch": 2.01, "learning_rate": 0.00039415245737211635, "loss": 2.876, "theoretical_loss": 3.7661421825740633, "tokens_seen": 724834304 }, { "epoch": 2.01, "learning_rate": 0.0003941424272818456, "loss": 2.8287, "theoretical_loss": 3.7661078242517614, "tokens_seen": 724899840 }, { "epoch": 2.01, "learning_rate": 0.0003941323971915747, "loss": 3.038, "theoretical_loss": 3.7660734699052067, "tokens_seen": 724965376 }, { "epoch": 2.01, "learning_rate": 0.00039412236710130395, "loss": 2.9701, "theoretical_loss": 3.7660391195335787, "tokens_seen": 725030912 }, { "epoch": 2.01, "learning_rate": 0.00039411233701103313, "loss": 3.0332, "theoretical_loss": 3.7660047731360597, "tokens_seen": 725096448 }, { "epoch": 2.01, "learning_rate": 0.0003941023069207623, "loss": 2.8629, "theoretical_loss": 3.7659704307118296, "tokens_seen": 725161984 }, { "epoch": 2.01, "learning_rate": 0.0003940922768304915, "loss": 2.9496, "theoretical_loss": 3.7659360922600706, "tokens_seen": 725227520 }, { "epoch": 2.01, "learning_rate": 0.0003940822467402207, "loss": 3.1589, "theoretical_loss": 3.7659017577799636, "tokens_seen": 725293056 }, { "epoch": 2.01, "learning_rate": 0.00039407221664994986, "loss": 2.9116, "theoretical_loss": 3.7658674272706905, "tokens_seen": 725358592 }, { "epoch": 2.01, "learning_rate": 0.0003940621865596791, "loss": 3.0104, "theoretical_loss": 3.7658331007314336, "tokens_seen": 725424128 }, { "epoch": 2.01, "learning_rate": 0.0003940521564694082, "loss": 2.9388, "theoretical_loss": 3.7657987781613755, "tokens_seen": 725489664 }, { "epoch": 2.01, "learning_rate": 0.00039404212637913745, "loss": 3.0901, "theoretical_loss": 3.765764459559698, "tokens_seen": 725555200 }, { "epoch": 2.01, "learning_rate": 0.0003940320962888666, "loss": 3.0588, "theoretical_loss": 3.765730144925585, "tokens_seen": 725620736 }, { "epoch": 2.01, "learning_rate": 0.0003940220661985958, "loss": 3.035, "theoretical_loss": 3.765695834258218, "tokens_seen": 725686272 }, { "epoch": 2.01, "learning_rate": 0.000394012036108325, "loss": 3.1073, "theoretical_loss": 3.765661527556782, "tokens_seen": 725751808 }, { "epoch": 2.01, "objective/train/docs_used": 1175012, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5791220664978027, "objective/train/theoretical_loss": 3.7656443756930322, "objective/train/tokens_used": 746244576, "theoretical_loss": 3.7656443756930322, "tokens_seen": 725784576 }, { "epoch": 2.01, "learning_rate": 0.0003940020060180542, "loss": 2.9126, "theoretical_loss": 3.765627224820459, "tokens_seen": 725817344 }, { "epoch": 2.01, "learning_rate": 0.00039399197592778336, "loss": 3.0699, "theoretical_loss": 3.7655929260484338, "tokens_seen": 725882880 }, { "epoch": 2.01, "learning_rate": 0.0003939819458375126, "loss": 2.8666, "theoretical_loss": 3.765558631239889, "tokens_seen": 725948416 }, { "epoch": 2.01, "learning_rate": 0.0003939719157472417, "loss": 3.0349, "theoretical_loss": 3.7655243403940113, "tokens_seen": 726013952 }, { "epoch": 2.01, "learning_rate": 0.00039396188565697096, "loss": 3.004, "theoretical_loss": 3.765490053509983, "tokens_seen": 726079488 }, { "epoch": 2.01, "learning_rate": 0.0003939518555667001, "loss": 3.0418, "theoretical_loss": 3.7654557705869895, "tokens_seen": 726145024 }, { "epoch": 2.01, "learning_rate": 0.0003939418254764293, "loss": 3.0768, "theoretical_loss": 3.765421491624216, "tokens_seen": 726210560 }, { "epoch": 2.01, "learning_rate": 0.0003939317953861585, "loss": 2.9283, "theoretical_loss": 3.765387216620848, "tokens_seen": 726276096 }, { "epoch": 2.01, "learning_rate": 0.0003939217652958877, "loss": 3.0337, "theoretical_loss": 3.76535294557607, "tokens_seen": 726341632 }, { "epoch": 2.01, "learning_rate": 0.00039391173520561686, "loss": 2.9551, "theoretical_loss": 3.7653186784890678, "tokens_seen": 726407168 }, { "epoch": 2.01, "learning_rate": 0.00039390170511534604, "loss": 3.018, "theoretical_loss": 3.765284415359028, "tokens_seen": 726472704 }, { "epoch": 2.01, "learning_rate": 0.0003938916750250752, "loss": 3.0414, "theoretical_loss": 3.765250156185137, "tokens_seen": 726538240 }, { "epoch": 2.01, "learning_rate": 0.00039388164493480446, "loss": 2.9808, "theoretical_loss": 3.76521590096658, "tokens_seen": 726603776 }, { "epoch": 2.01, "learning_rate": 0.0003938716148445336, "loss": 2.7729, "theoretical_loss": 3.7651816497025443, "tokens_seen": 726669312 }, { "epoch": 2.01, "learning_rate": 0.0003938615847542628, "loss": 3.0073, "theoretical_loss": 3.765147402392217, "tokens_seen": 726734848 }, { "epoch": 2.01, "learning_rate": 0.00039385155466399195, "loss": 3.1182, "theoretical_loss": 3.7651131590347857, "tokens_seen": 726800384 }, { "epoch": 2.01, "learning_rate": 0.0003938415245737212, "loss": 2.7749, "theoretical_loss": 3.7650789196294365, "tokens_seen": 726865920 }, { "epoch": 2.01, "learning_rate": 0.00039383149448345037, "loss": 2.93, "theoretical_loss": 3.7650446841753578, "tokens_seen": 726931456 }, { "epoch": 2.01, "learning_rate": 0.00039382146439317955, "loss": 2.7314, "theoretical_loss": 3.765010452671737, "tokens_seen": 726996992 }, { "epoch": 2.01, "learning_rate": 0.00039381143430290873, "loss": 2.851, "theoretical_loss": 3.7649762251177634, "tokens_seen": 727062528 }, { "epoch": 2.01, "learning_rate": 0.00039380140421263796, "loss": 3.0506, "theoretical_loss": 3.7649420015126234, "tokens_seen": 727128064 }, { "epoch": 2.01, "learning_rate": 0.0003937913741223671, "loss": 2.9788, "theoretical_loss": 3.7649077818555075, "tokens_seen": 727193600 }, { "epoch": 2.01, "learning_rate": 0.0003937813440320963, "loss": 2.9804, "theoretical_loss": 3.764873566145603, "tokens_seen": 727259136 }, { "epoch": 2.01, "learning_rate": 0.00039377131394182545, "loss": 2.8698, "theoretical_loss": 3.7648393543821, "tokens_seen": 727324672 }, { "epoch": 2.01, "learning_rate": 0.0003937612838515547, "loss": 3.1203, "theoretical_loss": 3.7648051465641874, "tokens_seen": 727390208 }, { "epoch": 2.01, "objective/train/docs_used": 1178001, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7799723148345947, "objective/train/theoretical_loss": 3.7647880441345745, "objective/train/tokens_used": 747882976, "theoretical_loss": 3.7647880441345745, "tokens_seen": 727422976 }, { "epoch": 2.01, "learning_rate": 0.00039375125376128387, "loss": 2.9703, "theoretical_loss": 3.764770942691055, "tokens_seen": 727455744 }, { "epoch": 2.01, "learning_rate": 0.00039374122367101305, "loss": 3.0172, "theoretical_loss": 3.7647367427618916, "tokens_seen": 727521280 }, { "epoch": 2.01, "learning_rate": 0.00039373119358074223, "loss": 3.113, "theoretical_loss": 3.7647025467758883, "tokens_seen": 727586816 }, { "epoch": 2.01, "learning_rate": 0.0003937211634904714, "loss": 2.9776, "theoretical_loss": 3.764668354732235, "tokens_seen": 727652352 }, { "epoch": 2.01, "learning_rate": 0.0003937111334002006, "loss": 3.1522, "theoretical_loss": 3.764634166630122, "tokens_seen": 727717888 }, { "epoch": 2.01, "learning_rate": 0.00039370110330992983, "loss": 2.9419, "theoretical_loss": 3.7645999824687406, "tokens_seen": 727783424 }, { "epoch": 2.01, "learning_rate": 0.00039369107321965896, "loss": 3.0718, "theoretical_loss": 3.764565802247281, "tokens_seen": 727848960 }, { "epoch": 2.01, "learning_rate": 0.0003936810431293882, "loss": 3.0169, "theoretical_loss": 3.764531625964935, "tokens_seen": 727914496 }, { "epoch": 2.01, "learning_rate": 0.0003936710130391173, "loss": 2.9751, "theoretical_loss": 3.7644974536208933, "tokens_seen": 727980032 }, { "epoch": 2.01, "learning_rate": 0.00039366098294884655, "loss": 2.8406, "theoretical_loss": 3.764463285214349, "tokens_seen": 728045568 }, { "epoch": 2.01, "learning_rate": 0.00039365095285857573, "loss": 3.1638, "theoretical_loss": 3.7644291207444924, "tokens_seen": 728111104 }, { "epoch": 2.01, "learning_rate": 0.0003936409227683049, "loss": 3.0345, "theoretical_loss": 3.7643949602105167, "tokens_seen": 728176640 }, { "epoch": 2.01, "learning_rate": 0.0003936308926780341, "loss": 2.8726, "theoretical_loss": 3.764360803611614, "tokens_seen": 728242176 }, { "epoch": 2.01, "learning_rate": 0.00039362086258776333, "loss": 2.9434, "theoretical_loss": 3.764326650946977, "tokens_seen": 728307712 }, { "epoch": 2.01, "learning_rate": 0.00039361083249749246, "loss": 2.9727, "theoretical_loss": 3.7642925022157985, "tokens_seen": 728373248 }, { "epoch": 2.01, "learning_rate": 0.0003936008024072217, "loss": 2.9676, "theoretical_loss": 3.7642583574172717, "tokens_seen": 728438784 }, { "epoch": 2.01, "learning_rate": 0.0003935907723169508, "loss": 2.7674, "theoretical_loss": 3.7642242165505895, "tokens_seen": 728504320 }, { "epoch": 2.01, "learning_rate": 0.00039358074222668006, "loss": 2.9704, "theoretical_loss": 3.764190079614946, "tokens_seen": 728569856 }, { "epoch": 2.01, "learning_rate": 0.00039357071213640924, "loss": 3.0867, "theoretical_loss": 3.7641559466095353, "tokens_seen": 728635392 }, { "epoch": 2.01, "learning_rate": 0.0003935606820461384, "loss": 3.1484, "theoretical_loss": 3.764121817533551, "tokens_seen": 728700928 }, { "epoch": 2.01, "learning_rate": 0.0003935506519558676, "loss": 2.9097, "theoretical_loss": 3.7640876923861875, "tokens_seen": 728766464 }, { "epoch": 2.01, "learning_rate": 0.0003935406218655968, "loss": 2.8535, "theoretical_loss": 3.764053571166639, "tokens_seen": 728832000 }, { "epoch": 2.01, "learning_rate": 0.00039353059177532596, "loss": 3.0961, "theoretical_loss": 3.764019453874101, "tokens_seen": 728897536 }, { "epoch": 2.01, "learning_rate": 0.0003935205616850552, "loss": 3.0258, "theoretical_loss": 3.7639853405077677, "tokens_seen": 728963072 }, { "epoch": 2.01, "learning_rate": 0.0003935105315947843, "loss": 2.9133, "theoretical_loss": 3.7639512310668355, "tokens_seen": 729028608 }, { "epoch": 2.01, "objective/train/docs_used": 1181058, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7849576473236084, "objective/train/theoretical_loss": 3.7639341778181428, "objective/train/tokens_used": 749521376, "theoretical_loss": 3.7639341778181428, "tokens_seen": 729061376 }, { "epoch": 2.01, "learning_rate": 0.00039350050150451356, "loss": 2.9009, "theoretical_loss": 3.763917125550499, "tokens_seen": 729094144 }, { "epoch": 2.01, "learning_rate": 0.0003934904714142427, "loss": 2.9722, "theoretical_loss": 3.7638830239579537, "tokens_seen": 729159680 }, { "epoch": 2.01, "learning_rate": 0.0003934804413239719, "loss": 2.9891, "theoretical_loss": 3.7638489262883965, "tokens_seen": 729225216 }, { "epoch": 2.01, "learning_rate": 0.0003934704112337011, "loss": 3.0309, "theoretical_loss": 3.763814832541023, "tokens_seen": 729290752 }, { "epoch": 2.01, "learning_rate": 0.0003934603811434303, "loss": 2.8805, "theoretical_loss": 3.7637807427150296, "tokens_seen": 729356288 }, { "epoch": 2.01, "learning_rate": 0.00039345035105315946, "loss": 3.0333, "theoretical_loss": 3.7637466568096127, "tokens_seen": 729421824 }, { "epoch": 2.01, "learning_rate": 0.0003934403209628887, "loss": 3.0607, "theoretical_loss": 3.7637125748239697, "tokens_seen": 729487360 }, { "epoch": 2.01, "learning_rate": 0.00039343029087261783, "loss": 3.0586, "theoretical_loss": 3.763678496757298, "tokens_seen": 729552896 }, { "epoch": 2.01, "learning_rate": 0.00039342026078234706, "loss": 3.06, "theoretical_loss": 3.7636444226087953, "tokens_seen": 729618432 }, { "epoch": 2.01, "learning_rate": 0.0003934102306920762, "loss": 2.9975, "theoretical_loss": 3.763610352377658, "tokens_seen": 729683968 }, { "epoch": 2.01, "learning_rate": 0.0003934002006018054, "loss": 3.1266, "theoretical_loss": 3.763576286063085, "tokens_seen": 729749504 }, { "epoch": 2.01, "learning_rate": 0.00039339017051153466, "loss": 3.0372, "theoretical_loss": 3.763542223664273, "tokens_seen": 729815040 }, { "epoch": 2.01, "learning_rate": 0.0003933801404212638, "loss": 2.949, "theoretical_loss": 3.763508165180422, "tokens_seen": 729880576 }, { "epoch": 2.01, "learning_rate": 0.000393370110330993, "loss": 3.0233, "theoretical_loss": 3.76347411061073, "tokens_seen": 729946112 }, { "epoch": 2.01, "learning_rate": 0.00039336008024072215, "loss": 2.9338, "theoretical_loss": 3.7634400599543953, "tokens_seen": 730011648 }, { "epoch": 2.01, "learning_rate": 0.0003933500501504514, "loss": 2.7696, "theoretical_loss": 3.7634060132106173, "tokens_seen": 730077184 }, { "epoch": 2.01, "learning_rate": 0.00039334002006018057, "loss": 3.0165, "theoretical_loss": 3.7633719703785955, "tokens_seen": 730142720 }, { "epoch": 2.01, "learning_rate": 0.00039332998996990975, "loss": 2.9343, "theoretical_loss": 3.7633379314575293, "tokens_seen": 730208256 }, { "epoch": 2.01, "learning_rate": 0.00039331995987963893, "loss": 3.0135, "theoretical_loss": 3.763303896446618, "tokens_seen": 730273792 }, { "epoch": 2.01, "learning_rate": 0.00039330992978936816, "loss": 3.018, "theoretical_loss": 3.7632698653450625, "tokens_seen": 730339328 }, { "epoch": 2.01, "learning_rate": 0.0003932998996990973, "loss": 2.8879, "theoretical_loss": 3.763235838152062, "tokens_seen": 730404864 }, { "epoch": 2.01, "learning_rate": 0.0003932898696088265, "loss": 3.0518, "theoretical_loss": 3.763201814866817, "tokens_seen": 730470400 }, { "epoch": 2.01, "learning_rate": 0.00039327983951855565, "loss": 2.8727, "theoretical_loss": 3.7631677954885294, "tokens_seen": 730535936 }, { "epoch": 2.01, "learning_rate": 0.0003932698094282849, "loss": 2.8814, "theoretical_loss": 3.7631337800163993, "tokens_seen": 730601472 }, { "epoch": 2.01, "learning_rate": 0.00039325977933801407, "loss": 3.0176, "theoretical_loss": 3.763099768449628, "tokens_seen": 730667008 }, { "epoch": 2.01, "objective/train/docs_used": 1183729, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.231947898864746, "objective/train/theoretical_loss": 3.763082764130502, "objective/train/tokens_used": 751159776, "theoretical_loss": 3.763082764130502, "tokens_seen": 730699776 }, { "epoch": 2.01, "learning_rate": 0.00039324974924774325, "loss": 3.0366, "theoretical_loss": 3.7630657607874163, "tokens_seen": 730732544 }, { "epoch": 2.01, "learning_rate": 0.00039323971915747243, "loss": 2.9745, "theoretical_loss": 3.7630317570289664, "tokens_seen": 730798080 }, { "epoch": 2.01, "learning_rate": 0.0003932296890672016, "loss": 2.9481, "theoretical_loss": 3.7629977571734807, "tokens_seen": 730863616 }, { "epoch": 2.01, "learning_rate": 0.0003932196589769308, "loss": 2.9831, "theoretical_loss": 3.76296376122016, "tokens_seen": 730929152 }, { "epoch": 2.01, "learning_rate": 0.00039320962888666003, "loss": 2.8952, "theoretical_loss": 3.762929769168208, "tokens_seen": 730994688 }, { "epoch": 2.01, "learning_rate": 0.00039319959879638916, "loss": 2.9023, "theoretical_loss": 3.7628957810168258, "tokens_seen": 731060224 }, { "epoch": 2.01, "learning_rate": 0.0003931895687061184, "loss": 2.835, "theoretical_loss": 3.7628617967652174, "tokens_seen": 731125760 }, { "epoch": 2.01, "learning_rate": 0.0003931795386158475, "loss": 2.7248, "theoretical_loss": 3.7628278164125852, "tokens_seen": 731191296 }, { "epoch": 2.01, "learning_rate": 0.00039316950852557675, "loss": 2.9427, "theoretical_loss": 3.762793839958133, "tokens_seen": 731256832 }, { "epoch": 2.01, "learning_rate": 0.00039315947843530593, "loss": 3.0208, "theoretical_loss": 3.7627598674010643, "tokens_seen": 731322368 }, { "epoch": 2.01, "learning_rate": 0.0003931494483450351, "loss": 2.797, "theoretical_loss": 3.762725898740582, "tokens_seen": 731387904 }, { "epoch": 2.01, "learning_rate": 0.0003931394182547643, "loss": 2.8534, "theoretical_loss": 3.762691933975891, "tokens_seen": 731453440 }, { "epoch": 2.01, "learning_rate": 0.00039312938816449353, "loss": 2.9015, "theoretical_loss": 3.7626579731061955, "tokens_seen": 731518976 }, { "epoch": 2.01, "learning_rate": 0.00039311935807422266, "loss": 3.0818, "theoretical_loss": 3.7626240161306987, "tokens_seen": 731584512 }, { "epoch": 2.01, "learning_rate": 0.0003931093279839519, "loss": 2.9091, "theoretical_loss": 3.7625900630486067, "tokens_seen": 731650048 }, { "epoch": 2.01, "learning_rate": 0.000393099297893681, "loss": 3.0142, "theoretical_loss": 3.762556113859124, "tokens_seen": 731715584 }, { "epoch": 2.01, "learning_rate": 0.00039308926780341026, "loss": 2.8871, "theoretical_loss": 3.762522168561455, "tokens_seen": 731781120 }, { "epoch": 2.01, "learning_rate": 0.00039307923771313944, "loss": 3.0802, "theoretical_loss": 3.7624882271548064, "tokens_seen": 731846656 }, { "epoch": 2.01, "learning_rate": 0.0003930692076228686, "loss": 3.1035, "theoretical_loss": 3.7624542896383826, "tokens_seen": 731912192 }, { "epoch": 2.01, "learning_rate": 0.0003930591775325978, "loss": 3.0427, "theoretical_loss": 3.7624203560113902, "tokens_seen": 731977728 }, { "epoch": 2.01, "learning_rate": 0.000393049147442327, "loss": 2.9683, "theoretical_loss": 3.7623864262730353, "tokens_seen": 732043264 }, { "epoch": 2.01, "learning_rate": 0.00039303911735205616, "loss": 2.9136, "theoretical_loss": 3.762352500422524, "tokens_seen": 732108800 }, { "epoch": 2.01, "learning_rate": 0.0003930290872617854, "loss": 2.9735, "theoretical_loss": 3.7623185784590625, "tokens_seen": 732174336 }, { "epoch": 2.01, "learning_rate": 0.0003930190571715145, "loss": 3.0008, "theoretical_loss": 3.7622846603818574, "tokens_seen": 732239872 }, { "epoch": 2.01, "learning_rate": 0.00039300902708124376, "loss": 3.0495, "theoretical_loss": 3.7622507461901167, "tokens_seen": 732305408 }, { "epoch": 2.01, "objective/train/docs_used": 1186092, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2185251712799072, "objective/train/theoretical_loss": 3.762233790551048, "objective/train/tokens_used": 752798176, "theoretical_loss": 3.762233790551048, "tokens_seen": 732338176 }, { "epoch": 2.01, "learning_rate": 0.0003929989969909729, "loss": 2.913, "theoretical_loss": 3.7622168358830472, "tokens_seen": 732370944 }, { "epoch": 2.01, "learning_rate": 0.0003929889669007021, "loss": 3.0306, "theoretical_loss": 3.762182929459856, "tokens_seen": 732436480 }, { "epoch": 2.01, "learning_rate": 0.0003929789368104313, "loss": 3.0193, "theoretical_loss": 3.7621490269197517, "tokens_seen": 732502016 }, { "epoch": 2.01, "learning_rate": 0.0003929689067201605, "loss": 2.9754, "theoretical_loss": 3.7621151282619407, "tokens_seen": 732567552 }, { "epoch": 2.01, "learning_rate": 0.00039295887662988967, "loss": 2.8068, "theoretical_loss": 3.762081233485633, "tokens_seen": 732633088 }, { "epoch": 2.01, "learning_rate": 0.0003929488465396189, "loss": 2.9483, "theoretical_loss": 3.7620473425900354, "tokens_seen": 732698624 }, { "epoch": 2.01, "learning_rate": 0.00039293881644934803, "loss": 2.8819, "theoretical_loss": 3.762013455574358, "tokens_seen": 732764160 }, { "epoch": 2.01, "learning_rate": 0.00039292878635907726, "loss": 2.9319, "theoretical_loss": 3.7619795724378085, "tokens_seen": 732829696 }, { "epoch": 2.01, "learning_rate": 0.0003929187562688064, "loss": 3.0052, "theoretical_loss": 3.7619456931795963, "tokens_seen": 732895232 }, { "epoch": 2.02, "learning_rate": 0.0003929087261785356, "loss": 3.095, "theoretical_loss": 3.7619118177989312, "tokens_seen": 732960768 }, { "epoch": 2.02, "learning_rate": 0.0003928986960882648, "loss": 3.1476, "theoretical_loss": 3.761877946295022, "tokens_seen": 733026304 }, { "epoch": 2.02, "learning_rate": 0.000392888665997994, "loss": 3.0407, "theoretical_loss": 3.761844078667079, "tokens_seen": 733091840 }, { "epoch": 2.02, "learning_rate": 0.00039287863590772317, "loss": 2.9829, "theoretical_loss": 3.7618102149143127, "tokens_seen": 733157376 }, { "epoch": 2.02, "learning_rate": 0.00039286860581745235, "loss": 2.8787, "theoretical_loss": 3.7617763550359324, "tokens_seen": 733222912 }, { "epoch": 2.02, "learning_rate": 0.00039285857572718153, "loss": 2.9534, "theoretical_loss": 3.7617424990311488, "tokens_seen": 733288448 }, { "epoch": 2.02, "learning_rate": 0.00039284854563691077, "loss": 2.9998, "theoretical_loss": 3.761708646899173, "tokens_seen": 733353984 }, { "epoch": 2.02, "learning_rate": 0.0003928385155466399, "loss": 2.9768, "theoretical_loss": 3.7616747986392154, "tokens_seen": 733419520 }, { "epoch": 2.02, "learning_rate": 0.00039282848545636913, "loss": 2.9544, "theoretical_loss": 3.7616409542504883, "tokens_seen": 733485056 }, { "epoch": 2.02, "learning_rate": 0.00039281845536609825, "loss": 3.075, "theoretical_loss": 3.761607113732202, "tokens_seen": 733550592 }, { "epoch": 2.02, "learning_rate": 0.0003928084252758275, "loss": 3.0922, "theoretical_loss": 3.7615732770835684, "tokens_seen": 733616128 }, { "epoch": 2.02, "learning_rate": 0.00039279839518555667, "loss": 3.0415, "theoretical_loss": 3.7615394443038, "tokens_seen": 733681664 }, { "epoch": 2.02, "learning_rate": 0.00039278836509528585, "loss": 3.1131, "theoretical_loss": 3.761505615392108, "tokens_seen": 733747200 }, { "epoch": 2.02, "learning_rate": 0.00039277833500501503, "loss": 3.1399, "theoretical_loss": 3.7614717903477053, "tokens_seen": 733812736 }, { "epoch": 2.02, "learning_rate": 0.00039276830491474427, "loss": 2.8405, "theoretical_loss": 3.761437969169804, "tokens_seen": 733878272 }, { "epoch": 2.02, "learning_rate": 0.0003927582748244734, "loss": 2.9125, "theoretical_loss": 3.761404151857618, "tokens_seen": 733943808 }, { "epoch": 2.02, "objective/train/docs_used": 1188863, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9344232082366943, "objective/train/theoretical_loss": 3.7613872446509213, "objective/train/tokens_used": 754436576, "theoretical_loss": 3.7613872446509213, "tokens_seen": 733976576 }, { "epoch": 2.02, "learning_rate": 0.00039274824473420263, "loss": 2.8672, "theoretical_loss": 3.7613703384103587, "tokens_seen": 734009344 }, { "epoch": 2.02, "learning_rate": 0.00039273821464393176, "loss": 2.9511, "theoretical_loss": 3.7613365288272407, "tokens_seen": 734074880 }, { "epoch": 2.02, "learning_rate": 0.000392728184553661, "loss": 2.9683, "theoretical_loss": 3.7613027231074767, "tokens_seen": 734140416 }, { "epoch": 2.02, "learning_rate": 0.0003927181544633902, "loss": 2.9259, "theoretical_loss": 3.7612689212502812, "tokens_seen": 734205952 }, { "epoch": 2.02, "learning_rate": 0.00039270812437311936, "loss": 3.0079, "theoretical_loss": 3.7612351232548673, "tokens_seen": 734271488 }, { "epoch": 2.02, "learning_rate": 0.00039269809428284854, "loss": 3.0742, "theoretical_loss": 3.76120132912045, "tokens_seen": 734337024 }, { "epoch": 2.02, "learning_rate": 0.0003926880641925777, "loss": 3.0677, "theoretical_loss": 3.7611675388462427, "tokens_seen": 734402560 }, { "epoch": 2.02, "learning_rate": 0.0003926780341023069, "loss": 2.82, "theoretical_loss": 3.761133752431461, "tokens_seen": 734468096 }, { "epoch": 2.02, "learning_rate": 0.00039266800401203613, "loss": 3.0036, "theoretical_loss": 3.7610999698753194, "tokens_seen": 734533632 }, { "epoch": 2.02, "learning_rate": 0.00039265797392176526, "loss": 2.9449, "theoretical_loss": 3.761066191177033, "tokens_seen": 734599168 }, { "epoch": 2.02, "learning_rate": 0.0003926479438314945, "loss": 2.9183, "theoretical_loss": 3.761032416335817, "tokens_seen": 734664704 }, { "epoch": 2.02, "learning_rate": 0.00039263791374122373, "loss": 3.0888, "theoretical_loss": 3.7609986453508872, "tokens_seen": 734730240 }, { "epoch": 2.02, "learning_rate": 0.00039262788365095286, "loss": 3.0035, "theoretical_loss": 3.7609648782214595, "tokens_seen": 734795776 }, { "epoch": 2.02, "learning_rate": 0.0003926178535606821, "loss": 3.0775, "theoretical_loss": 3.7609311149467493, "tokens_seen": 734861312 }, { "epoch": 2.02, "learning_rate": 0.0003926078234704112, "loss": 3.0213, "theoretical_loss": 3.7608973555259735, "tokens_seen": 734926848 }, { "epoch": 2.02, "learning_rate": 0.00039259779338014046, "loss": 3.0858, "theoretical_loss": 3.7608635999583484, "tokens_seen": 734992384 }, { "epoch": 2.02, "learning_rate": 0.00039258776328986964, "loss": 2.9112, "theoretical_loss": 3.760829848243091, "tokens_seen": 735057920 }, { "epoch": 2.02, "learning_rate": 0.0003925777331995988, "loss": 3.02, "theoretical_loss": 3.7607961003794173, "tokens_seen": 735123456 }, { "epoch": 2.02, "learning_rate": 0.000392567703109328, "loss": 3.0098, "theoretical_loss": 3.760762356366545, "tokens_seen": 735188992 }, { "epoch": 2.02, "learning_rate": 0.0003925576730190572, "loss": 2.922, "theoretical_loss": 3.760728616203692, "tokens_seen": 735254528 }, { "epoch": 2.02, "learning_rate": 0.00039254764292878636, "loss": 3.0538, "theoretical_loss": 3.7606948798900754, "tokens_seen": 735320064 }, { "epoch": 2.02, "learning_rate": 0.0003925376128385156, "loss": 2.9259, "theoretical_loss": 3.760661147424913, "tokens_seen": 735385600 }, { "epoch": 2.02, "learning_rate": 0.0003925275827482447, "loss": 3.045, "theoretical_loss": 3.760627418807423, "tokens_seen": 735451136 }, { "epoch": 2.02, "learning_rate": 0.00039251755265797396, "loss": 3.0748, "theoretical_loss": 3.760593694036824, "tokens_seen": 735516672 }, { "epoch": 2.02, "learning_rate": 0.0003925075225677031, "loss": 2.8203, "theoretical_loss": 3.760559973112334, "tokens_seen": 735582208 }, { "epoch": 2.02, "objective/train/docs_used": 1191563, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0358664989471436, "objective/train/theoretical_loss": 3.760543114092136, "objective/train/tokens_used": 756074976, "theoretical_loss": 3.760543114092136, "tokens_seen": 735614976 }, { "epoch": 2.02, "learning_rate": 0.0003924974924774323, "loss": 2.9864, "theoretical_loss": 3.760526256033172, "tokens_seen": 735647744 }, { "epoch": 2.02, "learning_rate": 0.0003924874623871615, "loss": 3.1325, "theoretical_loss": 3.7604925427985574, "tokens_seen": 735713280 }, { "epoch": 2.02, "learning_rate": 0.0003924774322968907, "loss": 3.016, "theoretical_loss": 3.760458833407709, "tokens_seen": 735778816 }, { "epoch": 2.02, "learning_rate": 0.00039246740220661987, "loss": 2.8431, "theoretical_loss": 3.760425127859846, "tokens_seen": 735844352 }, { "epoch": 2.02, "learning_rate": 0.0003924573721163491, "loss": 3.0137, "theoretical_loss": 3.7603914261541886, "tokens_seen": 735909888 }, { "epoch": 2.02, "learning_rate": 0.00039244734202607823, "loss": 3.113, "theoretical_loss": 3.7603577282899563, "tokens_seen": 735975424 }, { "epoch": 2.02, "learning_rate": 0.00039243731193580746, "loss": 2.8957, "theoretical_loss": 3.7603240342663695, "tokens_seen": 736040960 }, { "epoch": 2.02, "learning_rate": 0.0003924272818455366, "loss": 2.9839, "theoretical_loss": 3.760290344082648, "tokens_seen": 736106496 }, { "epoch": 2.02, "learning_rate": 0.0003924172517552658, "loss": 3.1436, "theoretical_loss": 3.760256657738014, "tokens_seen": 736172032 }, { "epoch": 2.02, "learning_rate": 0.000392407221664995, "loss": 3.0716, "theoretical_loss": 3.7602229752316862, "tokens_seen": 736237568 }, { "epoch": 2.02, "learning_rate": 0.0003923971915747242, "loss": 2.9015, "theoretical_loss": 3.760189296562887, "tokens_seen": 736303104 }, { "epoch": 2.02, "learning_rate": 0.00039238716148445337, "loss": 2.9124, "theoretical_loss": 3.7601556217308376, "tokens_seen": 736368640 }, { "epoch": 2.02, "learning_rate": 0.00039237713139418255, "loss": 2.9086, "theoretical_loss": 3.760121950734759, "tokens_seen": 736434176 }, { "epoch": 2.02, "learning_rate": 0.00039236710130391173, "loss": 3.0229, "theoretical_loss": 3.7600882835738734, "tokens_seen": 736499712 }, { "epoch": 2.02, "learning_rate": 0.00039235707121364097, "loss": 2.8711, "theoretical_loss": 3.760054620247402, "tokens_seen": 736565248 }, { "epoch": 2.02, "learning_rate": 0.0003923470411233701, "loss": 2.9754, "theoretical_loss": 3.760020960754568, "tokens_seen": 736630784 }, { "epoch": 2.02, "learning_rate": 0.00039233701103309933, "loss": 2.862, "theoretical_loss": 3.759987305094593, "tokens_seen": 736696320 }, { "epoch": 2.02, "learning_rate": 0.00039232698094282846, "loss": 2.9701, "theoretical_loss": 3.7599536532666997, "tokens_seen": 736761856 }, { "epoch": 2.02, "learning_rate": 0.0003923169508525577, "loss": 2.9445, "theoretical_loss": 3.7599200052701116, "tokens_seen": 736827392 }, { "epoch": 2.02, "learning_rate": 0.00039230692076228687, "loss": 2.9467, "theoretical_loss": 3.7598863611040514, "tokens_seen": 736892928 }, { "epoch": 2.02, "learning_rate": 0.00039229689067201605, "loss": 3.0237, "theoretical_loss": 3.759852720767742, "tokens_seen": 736958464 }, { "epoch": 2.02, "learning_rate": 0.00039228686058174523, "loss": 2.9836, "theoretical_loss": 3.759819084260408, "tokens_seen": 737024000 }, { "epoch": 2.02, "learning_rate": 0.00039227683049147447, "loss": 2.8699, "theoretical_loss": 3.759785451581272, "tokens_seen": 737089536 }, { "epoch": 2.02, "learning_rate": 0.0003922668004012036, "loss": 2.9262, "theoretical_loss": 3.7597518227295588, "tokens_seen": 737155072 }, { "epoch": 2.02, "learning_rate": 0.00039225677031093283, "loss": 3.0161, "theoretical_loss": 3.759718197704492, "tokens_seen": 737220608 }, { "epoch": 2.02, "objective/train/docs_used": 1194303, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.368875741958618, "objective/train/theoretical_loss": 3.7597013866267086, "objective/train/tokens_used": 757713376, "theoretical_loss": 3.7597013866267086, "tokens_seen": 737253376 }, { "epoch": 2.02, "learning_rate": 0.00039224674022066196, "loss": 2.8845, "theoretical_loss": 3.7596845765052964, "tokens_seen": 737286144 }, { "epoch": 2.02, "learning_rate": 0.0003922367101303912, "loss": 2.9667, "theoretical_loss": 3.759650959131197, "tokens_seen": 737351680 }, { "epoch": 2.02, "learning_rate": 0.0003922266800401204, "loss": 2.8365, "theoretical_loss": 3.759617345581418, "tokens_seen": 737417216 }, { "epoch": 2.02, "learning_rate": 0.00039221664994984956, "loss": 2.8412, "theoretical_loss": 3.759583735855185, "tokens_seen": 737482752 }, { "epoch": 2.02, "learning_rate": 0.00039220661985957874, "loss": 3.1388, "theoretical_loss": 3.759550129951723, "tokens_seen": 737548288 }, { "epoch": 2.02, "learning_rate": 0.0003921965897693079, "loss": 3.0445, "theoretical_loss": 3.7595165278702583, "tokens_seen": 737613824 }, { "epoch": 2.02, "learning_rate": 0.0003921865596790371, "loss": 2.9696, "theoretical_loss": 3.759482929610016, "tokens_seen": 737679360 }, { "epoch": 2.02, "learning_rate": 0.00039217652958876633, "loss": 3.0479, "theoretical_loss": 3.7594493351702223, "tokens_seen": 737744896 }, { "epoch": 2.02, "learning_rate": 0.00039216649949849546, "loss": 2.9265, "theoretical_loss": 3.759415744550104, "tokens_seen": 737810432 }, { "epoch": 2.02, "learning_rate": 0.0003921564694082247, "loss": 3.0949, "theoretical_loss": 3.7593821577488864, "tokens_seen": 737875968 }, { "epoch": 2.02, "learning_rate": 0.0003921464393179539, "loss": 3.0198, "theoretical_loss": 3.759348574765797, "tokens_seen": 737941504 }, { "epoch": 2.02, "learning_rate": 0.00039213640922768306, "loss": 2.9943, "theoretical_loss": 3.7593149956000627, "tokens_seen": 738007040 }, { "epoch": 2.02, "learning_rate": 0.00039212637913741224, "loss": 2.9559, "theoretical_loss": 3.7592814202509106, "tokens_seen": 738072576 }, { "epoch": 2.02, "learning_rate": 0.0003921163490471414, "loss": 3.0107, "theoretical_loss": 3.759247848717568, "tokens_seen": 738138112 }, { "epoch": 2.02, "learning_rate": 0.0003921063189568706, "loss": 3.0686, "theoretical_loss": 3.7592142809992626, "tokens_seen": 738203648 }, { "epoch": 2.02, "learning_rate": 0.00039209628886659984, "loss": 2.9684, "theoretical_loss": 3.759180717095222, "tokens_seen": 738269184 }, { "epoch": 2.02, "learning_rate": 0.00039208625877632896, "loss": 2.9362, "theoretical_loss": 3.7591471570046746, "tokens_seen": 738334720 }, { "epoch": 2.02, "learning_rate": 0.0003920762286860582, "loss": 3.0014, "theoretical_loss": 3.7591136007268484, "tokens_seen": 738400256 }, { "epoch": 2.02, "learning_rate": 0.0003920661985957873, "loss": 3.0693, "theoretical_loss": 3.759080048260972, "tokens_seen": 738465792 }, { "epoch": 2.02, "learning_rate": 0.00039205616850551656, "loss": 3.2127, "theoretical_loss": 3.7590464996062747, "tokens_seen": 738531328 }, { "epoch": 2.02, "learning_rate": 0.00039204613841524574, "loss": 2.997, "theoretical_loss": 3.7590129547619844, "tokens_seen": 738596864 }, { "epoch": 2.02, "learning_rate": 0.0003920361083249749, "loss": 3.0581, "theoretical_loss": 3.7589794137273307, "tokens_seen": 738662400 }, { "epoch": 2.02, "learning_rate": 0.0003920260782347041, "loss": 3.037, "theoretical_loss": 3.758945876501543, "tokens_seen": 738727936 }, { "epoch": 2.02, "learning_rate": 0.0003920160481444333, "loss": 2.9872, "theoretical_loss": 3.7589123430838516, "tokens_seen": 738793472 }, { "epoch": 2.02, "learning_rate": 0.00039200601805416247, "loss": 2.9976, "theoretical_loss": 3.758878813473485, "tokens_seen": 738859008 }, { "epoch": 2.02, "objective/train/docs_used": 1195656, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.034820318222046, "objective/train/theoretical_loss": 3.758862050095809, "objective/train/tokens_used": 759351776, "theoretical_loss": 3.758862050095809, "tokens_seen": 738891776 }, { "epoch": 2.02, "learning_rate": 0.0003919959879638917, "loss": 2.7869, "theoretical_loss": 3.7588452876696747, "tokens_seen": 738924544 }, { "epoch": 2.02, "learning_rate": 0.00039198595787362083, "loss": 3.0608, "theoretical_loss": 3.75881176567165, "tokens_seen": 738990080 }, { "epoch": 2.02, "learning_rate": 0.00039197592778335007, "loss": 2.9276, "theoretical_loss": 3.758778247478642, "tokens_seen": 739055616 }, { "epoch": 2.02, "learning_rate": 0.00039196589769307925, "loss": 2.8499, "theoretical_loss": 3.7587447330898813, "tokens_seen": 739121152 }, { "epoch": 2.02, "learning_rate": 0.00039195586760280843, "loss": 2.9381, "theoretical_loss": 3.7587112225045987, "tokens_seen": 739186688 }, { "epoch": 2.02, "learning_rate": 0.0003919458375125376, "loss": 3.017, "theoretical_loss": 3.758677715722025, "tokens_seen": 739252224 }, { "epoch": 2.02, "learning_rate": 0.0003919358074222668, "loss": 2.9539, "theoretical_loss": 3.758644212741392, "tokens_seen": 739317760 }, { "epoch": 2.02, "learning_rate": 0.00039192577733199597, "loss": 3.1728, "theoretical_loss": 3.758610713561932, "tokens_seen": 739383296 }, { "epoch": 2.02, "learning_rate": 0.0003919157472417252, "loss": 3.0107, "theoretical_loss": 3.758577218182876, "tokens_seen": 739448832 }, { "epoch": 2.02, "learning_rate": 0.00039190571715145433, "loss": 3.1871, "theoretical_loss": 3.758543726603457, "tokens_seen": 739514368 }, { "epoch": 2.02, "learning_rate": 0.00039189568706118357, "loss": 2.883, "theoretical_loss": 3.7585102388229057, "tokens_seen": 739579904 }, { "epoch": 2.02, "learning_rate": 0.00039188565697091275, "loss": 2.9949, "theoretical_loss": 3.7584767548404563, "tokens_seen": 739645440 }, { "epoch": 2.02, "learning_rate": 0.00039187562688064193, "loss": 2.8956, "theoretical_loss": 3.758443274655341, "tokens_seen": 739710976 }, { "epoch": 2.02, "learning_rate": 0.00039186559679037117, "loss": 3.0567, "theoretical_loss": 3.758409798266792, "tokens_seen": 739776512 }, { "epoch": 2.02, "learning_rate": 0.0003918555667001003, "loss": 2.8157, "theoretical_loss": 3.7583763256740434, "tokens_seen": 739842048 }, { "epoch": 2.02, "learning_rate": 0.00039184553660982953, "loss": 3.0288, "theoretical_loss": 3.758342856876329, "tokens_seen": 739907584 }, { "epoch": 2.02, "learning_rate": 0.00039183550651955866, "loss": 2.9755, "theoretical_loss": 3.7583093918728805, "tokens_seen": 739973120 }, { "epoch": 2.02, "learning_rate": 0.0003918254764292879, "loss": 3.0117, "theoretical_loss": 3.7582759306629345, "tokens_seen": 740038656 }, { "epoch": 2.02, "learning_rate": 0.00039181544633901707, "loss": 2.8775, "theoretical_loss": 3.7582424732457227, "tokens_seen": 740104192 }, { "epoch": 2.02, "learning_rate": 0.00039180541624874625, "loss": 2.95, "theoretical_loss": 3.758209019620481, "tokens_seen": 740169728 }, { "epoch": 2.02, "learning_rate": 0.00039179538615847543, "loss": 3.1136, "theoretical_loss": 3.7581755697864434, "tokens_seen": 740235264 }, { "epoch": 2.02, "learning_rate": 0.00039178535606820467, "loss": 3.1204, "theoretical_loss": 3.758142123742844, "tokens_seen": 740300800 }, { "epoch": 2.02, "learning_rate": 0.0003917753259779338, "loss": 2.9173, "theoretical_loss": 3.758108681488919, "tokens_seen": 740366336 }, { "epoch": 2.02, "learning_rate": 0.00039176529588766303, "loss": 3.0938, "theoretical_loss": 3.7580752430239026, "tokens_seen": 740431872 }, { "epoch": 2.02, "learning_rate": 0.00039175526579739216, "loss": 3.0368, "theoretical_loss": 3.7580418083470306, "tokens_seen": 740497408 }, { "epoch": 2.02, "objective/train/docs_used": 1198392, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.92041015625, "objective/train/theoretical_loss": 3.7580250924289107, "objective/train/tokens_used": 760990176, "theoretical_loss": 3.7580250924289107, "tokens_seen": 740530176 }, { "epoch": 2.02, "learning_rate": 0.0003917452357071214, "loss": 2.9164, "theoretical_loss": 3.7580083774575392, "tokens_seen": 740562944 }, { "epoch": 2.02, "learning_rate": 0.0003917352056168506, "loss": 3.0002, "theoretical_loss": 3.7579749503546633, "tokens_seen": 740628480 }, { "epoch": 2.02, "learning_rate": 0.00039172517552657976, "loss": 2.9487, "theoretical_loss": 3.75794152703764, "tokens_seen": 740694016 }, { "epoch": 2.02, "learning_rate": 0.00039171514543630894, "loss": 2.9335, "theoretical_loss": 3.7579081075057044, "tokens_seen": 740759552 }, { "epoch": 2.02, "learning_rate": 0.0003917051153460381, "loss": 3.1383, "theoretical_loss": 3.757874691758094, "tokens_seen": 740825088 }, { "epoch": 2.02, "learning_rate": 0.0003916950852557673, "loss": 3.0553, "theoretical_loss": 3.757841279794045, "tokens_seen": 740890624 }, { "epoch": 2.02, "learning_rate": 0.00039168505516549653, "loss": 2.9672, "theoretical_loss": 3.757807871612795, "tokens_seen": 740956160 }, { "epoch": 2.02, "learning_rate": 0.00039167502507522566, "loss": 3.0656, "theoretical_loss": 3.75777446721358, "tokens_seen": 741021696 }, { "epoch": 2.02, "learning_rate": 0.0003916649949849549, "loss": 2.8996, "theoretical_loss": 3.7577410665956394, "tokens_seen": 741087232 }, { "epoch": 2.02, "learning_rate": 0.0003916549648946841, "loss": 3.0509, "theoretical_loss": 3.757707669758209, "tokens_seen": 741152768 }, { "epoch": 2.02, "learning_rate": 0.00039164493480441326, "loss": 3.0282, "theoretical_loss": 3.7576742767005276, "tokens_seen": 741218304 }, { "epoch": 2.02, "learning_rate": 0.00039163490471414244, "loss": 2.953, "theoretical_loss": 3.7576408874218328, "tokens_seen": 741283840 }, { "epoch": 2.02, "learning_rate": 0.0003916248746238716, "loss": 3.0419, "theoretical_loss": 3.7576075019213633, "tokens_seen": 741349376 }, { "epoch": 2.02, "learning_rate": 0.0003916148445336008, "loss": 2.9969, "theoretical_loss": 3.757574120198357, "tokens_seen": 741414912 }, { "epoch": 2.02, "learning_rate": 0.00039160481444333004, "loss": 3.0183, "theoretical_loss": 3.7575407422520537, "tokens_seen": 741480448 }, { "epoch": 2.02, "learning_rate": 0.00039159478435305916, "loss": 2.8245, "theoretical_loss": 3.7575073680816917, "tokens_seen": 741545984 }, { "epoch": 2.02, "learning_rate": 0.0003915847542627884, "loss": 2.9974, "theoretical_loss": 3.7574739976865095, "tokens_seen": 741611520 }, { "epoch": 2.02, "learning_rate": 0.0003915747241725175, "loss": 3.1244, "theoretical_loss": 3.757440631065748, "tokens_seen": 741677056 }, { "epoch": 2.02, "learning_rate": 0.00039156469408224676, "loss": 2.8801, "theoretical_loss": 3.757407268218646, "tokens_seen": 741742592 }, { "epoch": 2.02, "learning_rate": 0.00039155466399197594, "loss": 2.9868, "theoretical_loss": 3.7573739091444436, "tokens_seen": 741808128 }, { "epoch": 2.02, "learning_rate": 0.0003915446339017051, "loss": 3.0252, "theoretical_loss": 3.7573405538423805, "tokens_seen": 741873664 }, { "epoch": 2.02, "learning_rate": 0.0003915346038114343, "loss": 2.9377, "theoretical_loss": 3.757307202311697, "tokens_seen": 741939200 }, { "epoch": 2.02, "learning_rate": 0.0003915245737211635, "loss": 3.0818, "theoretical_loss": 3.7572738545516344, "tokens_seen": 742004736 }, { "epoch": 2.02, "learning_rate": 0.00039151454363089267, "loss": 2.8473, "theoretical_loss": 3.7572405105614326, "tokens_seen": 742070272 }, { "epoch": 2.02, "learning_rate": 0.0003915045135406219, "loss": 2.9072, "theoretical_loss": 3.757207170340333, "tokens_seen": 742135808 }, { "epoch": 2.02, "objective/train/docs_used": 1201322, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9505622386932373, "objective/train/theoretical_loss": 3.7571905016429588, "objective/train/tokens_used": 762628576, "theoretical_loss": 3.7571905016429588, "tokens_seen": 742168576 }, { "epoch": 2.02, "learning_rate": 0.00039149448345035103, "loss": 2.9787, "theoretical_loss": 3.757173833887576, "tokens_seen": 742201344 }, { "epoch": 2.02, "learning_rate": 0.00039148445336008027, "loss": 3.013, "theoretical_loss": 3.7571405012024037, "tokens_seen": 742266880 }, { "epoch": 2.02, "learning_rate": 0.00039147442326980945, "loss": 3.2193, "theoretical_loss": 3.7571071722840577, "tokens_seen": 742332416 }, { "epoch": 2.02, "learning_rate": 0.00039146439317953863, "loss": 2.9697, "theoretical_loss": 3.75707384713178, "tokens_seen": 742397952 }, { "epoch": 2.02, "learning_rate": 0.0003914543630892678, "loss": 2.9002, "theoretical_loss": 3.757040525744812, "tokens_seen": 742463488 }, { "epoch": 2.02, "learning_rate": 0.000391444332998997, "loss": 3.0624, "theoretical_loss": 3.7570072081223964, "tokens_seen": 742529024 }, { "epoch": 2.02, "learning_rate": 0.00039143430290872617, "loss": 2.8793, "theoretical_loss": 3.7569738942637754, "tokens_seen": 742594560 }, { "epoch": 2.02, "learning_rate": 0.0003914242728184554, "loss": 2.9702, "theoretical_loss": 3.756940584168192, "tokens_seen": 742660096 }, { "epoch": 2.02, "learning_rate": 0.00039141424272818453, "loss": 2.933, "theoretical_loss": 3.7569072778348893, "tokens_seen": 742725632 }, { "epoch": 2.02, "learning_rate": 0.00039140421263791377, "loss": 2.9704, "theoretical_loss": 3.75687397526311, "tokens_seen": 742791168 }, { "epoch": 2.02, "learning_rate": 0.0003913941825476429, "loss": 3.0607, "theoretical_loss": 3.7568406764520974, "tokens_seen": 742856704 }, { "epoch": 2.02, "learning_rate": 0.00039138415245737213, "loss": 2.9287, "theoretical_loss": 3.756807381401096, "tokens_seen": 742922240 }, { "epoch": 2.02, "learning_rate": 0.0003913741223671013, "loss": 2.9294, "theoretical_loss": 3.7567740901093485, "tokens_seen": 742987776 }, { "epoch": 2.02, "learning_rate": 0.0003913640922768305, "loss": 2.9302, "theoretical_loss": 3.756740802576099, "tokens_seen": 743053312 }, { "epoch": 2.02, "learning_rate": 0.0003913540621865597, "loss": 3.0147, "theoretical_loss": 3.7567075188005923, "tokens_seen": 743118848 }, { "epoch": 2.02, "learning_rate": 0.00039134403209628886, "loss": 2.9225, "theoretical_loss": 3.7566742387820726, "tokens_seen": 743184384 }, { "epoch": 2.02, "learning_rate": 0.00039133400200601804, "loss": 2.8799, "theoretical_loss": 3.756640962519785, "tokens_seen": 743249920 }, { "epoch": 2.02, "learning_rate": 0.00039132397191574727, "loss": 2.9482, "theoretical_loss": 3.7566076900129737, "tokens_seen": 743315456 }, { "epoch": 2.02, "learning_rate": 0.0003913139418254764, "loss": 3.0424, "theoretical_loss": 3.7565744212608845, "tokens_seen": 743380992 }, { "epoch": 2.02, "learning_rate": 0.00039130391173520563, "loss": 2.9027, "theoretical_loss": 3.7565411562627613, "tokens_seen": 743446528 }, { "epoch": 2.02, "learning_rate": 0.0003912938816449348, "loss": 2.7364, "theoretical_loss": 3.756507895017852, "tokens_seen": 743512064 }, { "epoch": 2.02, "learning_rate": 0.000391283851554664, "loss": 2.9351, "theoretical_loss": 3.7564746375254003, "tokens_seen": 743577600 }, { "epoch": 2.02, "learning_rate": 0.0003912738214643932, "loss": 2.8702, "theoretical_loss": 3.756441383784653, "tokens_seen": 743643136 }, { "epoch": 2.02, "learning_rate": 0.00039126379137412236, "loss": 3.1214, "theoretical_loss": 3.7564081337948565, "tokens_seen": 743708672 }, { "epoch": 2.02, "learning_rate": 0.00039125376128385154, "loss": 2.8962, "theoretical_loss": 3.756374887555257, "tokens_seen": 743774208 }, { "epoch": 2.02, "objective/train/docs_used": 1204150, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.21405029296875, "objective/train/theoretical_loss": 3.7563582658415453, "objective/train/tokens_used": 764266976, "theoretical_loss": 3.7563582658415453, "tokens_seen": 743806976 }, { "epoch": 2.02, "learning_rate": 0.0003912437311935808, "loss": 2.9222, "theoretical_loss": 3.756341645065101, "tokens_seen": 743839744 }, { "epoch": 2.02, "learning_rate": 0.0003912337011033099, "loss": 3.0716, "theoretical_loss": 3.756308406323635, "tokens_seen": 743905280 }, { "epoch": 2.02, "learning_rate": 0.00039122367101303914, "loss": 2.9871, "theoretical_loss": 3.756275171330107, "tokens_seen": 743970816 }, { "epoch": 2.02, "learning_rate": 0.00039121364092276826, "loss": 3.0549, "theoretical_loss": 3.756241940083764, "tokens_seen": 744036352 }, { "epoch": 2.02, "learning_rate": 0.0003912036108324975, "loss": 2.8774, "theoretical_loss": 3.7562087125838532, "tokens_seen": 744101888 }, { "epoch": 2.02, "learning_rate": 0.0003911935807422267, "loss": 3.1093, "theoretical_loss": 3.756175488829622, "tokens_seen": 744167424 }, { "epoch": 2.02, "learning_rate": 0.00039118355065195586, "loss": 3.0607, "theoretical_loss": 3.7561422688203194, "tokens_seen": 744232960 }, { "epoch": 2.02, "learning_rate": 0.00039117352056168504, "loss": 2.8924, "theoretical_loss": 3.7561090525551926, "tokens_seen": 744298496 }, { "epoch": 2.02, "learning_rate": 0.0003911634904714143, "loss": 3.0041, "theoretical_loss": 3.756075840033491, "tokens_seen": 744364032 }, { "epoch": 2.02, "learning_rate": 0.0003911534603811434, "loss": 3.0493, "theoretical_loss": 3.756042631254462, "tokens_seen": 744429568 }, { "epoch": 2.02, "learning_rate": 0.00039114343029087264, "loss": 3.0415, "theoretical_loss": 3.7560094262173553, "tokens_seen": 744495104 }, { "epoch": 2.02, "learning_rate": 0.0003911334002006018, "loss": 2.9476, "theoretical_loss": 3.7559762249214192, "tokens_seen": 744560640 }, { "epoch": 2.02, "learning_rate": 0.000391123370110331, "loss": 3.0497, "theoretical_loss": 3.7559430273659036, "tokens_seen": 744626176 }, { "epoch": 2.02, "learning_rate": 0.00039111334002006024, "loss": 3.0773, "theoretical_loss": 3.7559098335500574, "tokens_seen": 744691712 }, { "epoch": 2.02, "learning_rate": 0.00039110330992978936, "loss": 3.0055, "theoretical_loss": 3.7558766434731305, "tokens_seen": 744757248 }, { "epoch": 2.02, "learning_rate": 0.0003910932798395186, "loss": 3.0478, "theoretical_loss": 3.7558434571343735, "tokens_seen": 744822784 }, { "epoch": 2.02, "learning_rate": 0.00039108324974924773, "loss": 2.9645, "theoretical_loss": 3.7558102745330357, "tokens_seen": 744888320 }, { "epoch": 2.02, "learning_rate": 0.00039107321965897696, "loss": 3.0149, "theoretical_loss": 3.7557770956683676, "tokens_seen": 744953856 }, { "epoch": 2.02, "learning_rate": 0.00039106318956870614, "loss": 3.0298, "theoretical_loss": 3.7557439205396195, "tokens_seen": 745019392 }, { "epoch": 2.02, "learning_rate": 0.0003910531594784353, "loss": 2.8286, "theoretical_loss": 3.7557107491460426, "tokens_seen": 745084928 }, { "epoch": 2.02, "learning_rate": 0.0003910431293881645, "loss": 2.9395, "theoretical_loss": 3.7556775814868875, "tokens_seen": 745150464 }, { "epoch": 2.02, "learning_rate": 0.0003910330992978937, "loss": 2.9167, "theoretical_loss": 3.7556444175614057, "tokens_seen": 745216000 }, { "epoch": 2.02, "learning_rate": 0.00039102306920762287, "loss": 2.8756, "theoretical_loss": 3.755611257368849, "tokens_seen": 745281536 }, { "epoch": 2.02, "learning_rate": 0.0003910130391173521, "loss": 3.0638, "theoretical_loss": 3.7555781009084677, "tokens_seen": 745347072 }, { "epoch": 2.02, "learning_rate": 0.00039100300902708123, "loss": 3.0772, "theoretical_loss": 3.755544948179515, "tokens_seen": 745412608 }, { "epoch": 2.02, "objective/train/docs_used": 1207077, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0855767726898193, "objective/train/theoretical_loss": 3.7555283732140903, "objective/train/tokens_used": 765905376, "theoretical_loss": 3.7555283732140903, "tokens_seen": 745445376 }, { "epoch": 2.02, "learning_rate": 0.00039099297893681047, "loss": 3.0445, "theoretical_loss": 3.7555117991812423, "tokens_seen": 745478144 }, { "epoch": 2.02, "learning_rate": 0.00039098294884653965, "loss": 2.8757, "theoretical_loss": 3.7554786539129017, "tokens_seen": 745543680 }, { "epoch": 2.02, "learning_rate": 0.00039097291875626883, "loss": 3.0566, "theoretical_loss": 3.755445512373746, "tokens_seen": 745609216 }, { "epoch": 2.02, "learning_rate": 0.000390962888665998, "loss": 2.8855, "theoretical_loss": 3.7554123745630283, "tokens_seen": 745674752 }, { "epoch": 2.02, "learning_rate": 0.0003909528585757272, "loss": 3.0806, "theoretical_loss": 3.755379240480001, "tokens_seen": 745740288 }, { "epoch": 2.02, "learning_rate": 0.00039094282848545637, "loss": 2.855, "theoretical_loss": 3.7553461101239174, "tokens_seen": 745805824 }, { "epoch": 2.02, "learning_rate": 0.0003909327983951856, "loss": 2.8961, "theoretical_loss": 3.75531298349403, "tokens_seen": 745871360 }, { "epoch": 2.02, "learning_rate": 0.00039092276830491473, "loss": 3.0552, "theoretical_loss": 3.755279860589594, "tokens_seen": 745936896 }, { "epoch": 2.02, "learning_rate": 0.00039091273821464397, "loss": 3.0268, "theoretical_loss": 3.755246741409862, "tokens_seen": 746002432 }, { "epoch": 2.02, "learning_rate": 0.0003909027081243731, "loss": 2.8901, "theoretical_loss": 3.755213625954089, "tokens_seen": 746067968 }, { "epoch": 2.02, "learning_rate": 0.00039089267803410233, "loss": 3.0013, "theoretical_loss": 3.7551805142215278, "tokens_seen": 746133504 }, { "epoch": 2.02, "learning_rate": 0.0003908826479438315, "loss": 2.9585, "theoretical_loss": 3.7551474062114334, "tokens_seen": 746199040 }, { "epoch": 2.02, "learning_rate": 0.0003908726178535607, "loss": 3.0334, "theoretical_loss": 3.7551143019230615, "tokens_seen": 746264576 }, { "epoch": 2.02, "learning_rate": 0.0003908625877632899, "loss": 3.0026, "theoretical_loss": 3.755081201355665, "tokens_seen": 746330112 }, { "epoch": 2.02, "learning_rate": 0.00039085255767301906, "loss": 3.0517, "theoretical_loss": 3.755048104508501, "tokens_seen": 746395648 }, { "epoch": 2.02, "learning_rate": 0.00039084252758274824, "loss": 2.8324, "theoretical_loss": 3.755015011380823, "tokens_seen": 746461184 }, { "epoch": 2.02, "learning_rate": 0.00039083249749247747, "loss": 2.9445, "theoretical_loss": 3.754981921971888, "tokens_seen": 746526720 }, { "epoch": 2.02, "learning_rate": 0.0003908224674022066, "loss": 3.0008, "theoretical_loss": 3.7549488362809504, "tokens_seen": 746592256 }, { "epoch": 2.02, "learning_rate": 0.00039081243731193583, "loss": 2.9021, "theoretical_loss": 3.754915754307267, "tokens_seen": 746657792 }, { "epoch": 2.02, "learning_rate": 0.000390802407221665, "loss": 2.947, "theoretical_loss": 3.7548826760500935, "tokens_seen": 746723328 }, { "epoch": 2.02, "learning_rate": 0.0003907923771313942, "loss": 2.9265, "theoretical_loss": 3.7548496015086874, "tokens_seen": 746788864 }, { "epoch": 2.02, "learning_rate": 0.0003907823470411234, "loss": 3.0408, "theoretical_loss": 3.7548165306823034, "tokens_seen": 746854400 }, { "epoch": 2.02, "learning_rate": 0.00039077231695085256, "loss": 2.8808, "theoretical_loss": 3.754783463570199, "tokens_seen": 746919936 }, { "epoch": 2.02, "learning_rate": 0.00039076228686058174, "loss": 3.0714, "theoretical_loss": 3.7547504001716314, "tokens_seen": 746985472 }, { "epoch": 2.02, "learning_rate": 0.000390752256770311, "loss": 3.0357, "theoretical_loss": 3.7547173404858585, "tokens_seen": 747051008 }, { "epoch": 2.02, "objective/train/docs_used": 1209842, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1354238986968994, "objective/train/theoretical_loss": 3.7547008120350376, "objective/train/tokens_used": 767543776, "theoretical_loss": 3.7547008120350376, "tokens_seen": 747083776 }, { "epoch": 2.02, "learning_rate": 0.0003907422266800401, "loss": 2.9363, "theoretical_loss": 3.7546842845121366, "tokens_seen": 747116544 }, { "epoch": 2.02, "learning_rate": 0.00039073219658976934, "loss": 2.8852, "theoretical_loss": 3.754651232249724, "tokens_seen": 747182080 }, { "epoch": 2.02, "learning_rate": 0.00039072216649949846, "loss": 3.0378, "theoretical_loss": 3.754618183697878, "tokens_seen": 747247616 }, { "epoch": 2.02, "learning_rate": 0.0003907121364092277, "loss": 2.9781, "theoretical_loss": 3.754585138855857, "tokens_seen": 747313152 }, { "epoch": 2.02, "learning_rate": 0.0003907021063189569, "loss": 2.9063, "theoretical_loss": 3.754552097722919, "tokens_seen": 747378688 }, { "epoch": 2.02, "learning_rate": 0.00039069207622868606, "loss": 2.8037, "theoretical_loss": 3.7545190602983234, "tokens_seen": 747444224 }, { "epoch": 2.02, "learning_rate": 0.00039068204613841524, "loss": 3.0177, "theoretical_loss": 3.754486026581328, "tokens_seen": 747509760 }, { "epoch": 2.02, "learning_rate": 0.0003906720160481445, "loss": 3.023, "theoretical_loss": 3.7544529965711915, "tokens_seen": 747575296 }, { "epoch": 2.02, "learning_rate": 0.0003906619859578736, "loss": 2.9347, "theoretical_loss": 3.754419970267174, "tokens_seen": 747640832 }, { "epoch": 2.02, "learning_rate": 0.00039065195586760284, "loss": 2.9852, "theoretical_loss": 3.754386947668534, "tokens_seen": 747706368 }, { "epoch": 2.02, "learning_rate": 0.00039064192577733197, "loss": 3.004, "theoretical_loss": 3.754353928774532, "tokens_seen": 747771904 }, { "epoch": 2.02, "learning_rate": 0.0003906318956870612, "loss": 2.917, "theoretical_loss": 3.7543209135844267, "tokens_seen": 747837440 }, { "epoch": 2.02, "learning_rate": 0.0003906218655967904, "loss": 3.0933, "theoretical_loss": 3.754287902097478, "tokens_seen": 747902976 }, { "epoch": 2.02, "learning_rate": 0.00039061183550651957, "loss": 3.01, "theoretical_loss": 3.7542548943129477, "tokens_seen": 747968512 }, { "epoch": 2.02, "learning_rate": 0.00039060180541624875, "loss": 2.9584, "theoretical_loss": 3.7542218902300943, "tokens_seen": 748034048 }, { "epoch": 2.02, "learning_rate": 0.00039059177532597793, "loss": 3.0641, "theoretical_loss": 3.7541888898481797, "tokens_seen": 748099584 }, { "epoch": 2.02, "learning_rate": 0.0003905817452357071, "loss": 3.1031, "theoretical_loss": 3.7541558931664643, "tokens_seen": 748165120 }, { "epoch": 2.02, "learning_rate": 0.00039057171514543634, "loss": 2.7729, "theoretical_loss": 3.7541229001842096, "tokens_seen": 748230656 }, { "epoch": 2.02, "learning_rate": 0.00039056168505516547, "loss": 2.7876, "theoretical_loss": 3.7540899109006753, "tokens_seen": 748296192 }, { "epoch": 2.02, "learning_rate": 0.0003905516549648947, "loss": 3.0506, "theoretical_loss": 3.7540569253151244, "tokens_seen": 748361728 }, { "epoch": 2.02, "learning_rate": 0.00039054162487462383, "loss": 2.8966, "theoretical_loss": 3.7540239434268186, "tokens_seen": 748427264 }, { "epoch": 2.02, "learning_rate": 0.00039053159478435307, "loss": 3.0521, "theoretical_loss": 3.753990965235019, "tokens_seen": 748492800 }, { "epoch": 2.02, "learning_rate": 0.00039052156469408225, "loss": 2.9387, "theoretical_loss": 3.7539579907389884, "tokens_seen": 748558336 }, { "epoch": 2.02, "learning_rate": 0.00039051153460381143, "loss": 2.9573, "theoretical_loss": 3.7539250199379888, "tokens_seen": 748623872 }, { "epoch": 2.02, "learning_rate": 0.0003905015045135406, "loss": 2.9538, "theoretical_loss": 3.753892052831282, "tokens_seen": 748689408 }, { "epoch": 2.02, "objective/train/docs_used": 1211281, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0341784954071045, "objective/train/theoretical_loss": 3.753875570663058, "objective/train/tokens_used": 769182176, "theoretical_loss": 3.753875570663058, "tokens_seen": 748722176 }, { "epoch": 2.02, "learning_rate": 0.00039049147442326985, "loss": 3.0761, "theoretical_loss": 3.753859089418132, "tokens_seen": 748754944 }, { "epoch": 2.02, "learning_rate": 0.000390481444332999, "loss": 2.8766, "theoretical_loss": 3.753826129697801, "tokens_seen": 748820480 }, { "epoch": 2.02, "learning_rate": 0.0003904714142427282, "loss": 3.2221, "theoretical_loss": 3.753793173669552, "tokens_seen": 748886016 }, { "epoch": 2.02, "learning_rate": 0.00039046138415245734, "loss": 2.9559, "theoretical_loss": 3.7537602213326493, "tokens_seen": 748951552 }, { "epoch": 2.02, "learning_rate": 0.00039045135406218657, "loss": 2.9949, "theoretical_loss": 3.7537272726863558, "tokens_seen": 749017088 }, { "epoch": 2.02, "learning_rate": 0.00039044132397191575, "loss": 3.0521, "theoretical_loss": 3.753694327729935, "tokens_seen": 749082624 }, { "epoch": 2.02, "learning_rate": 0.00039043129388164493, "loss": 2.9528, "theoretical_loss": 3.753661386462652, "tokens_seen": 749148160 }, { "epoch": 2.02, "learning_rate": 0.0003904212637913741, "loss": 2.9072, "theoretical_loss": 3.75362844888377, "tokens_seen": 749213696 }, { "epoch": 2.02, "learning_rate": 0.0003904112337011033, "loss": 3.056, "theoretical_loss": 3.7535955149925537, "tokens_seen": 749279232 }, { "epoch": 2.02, "learning_rate": 0.0003904012036108325, "loss": 2.9635, "theoretical_loss": 3.753562584788268, "tokens_seen": 749344768 }, { "epoch": 2.02, "learning_rate": 0.0003903911735205617, "loss": 2.9733, "theoretical_loss": 3.7535296582701774, "tokens_seen": 749410304 }, { "epoch": 2.02, "learning_rate": 0.0003903811434302909, "loss": 2.8859, "theoretical_loss": 3.7534967354375475, "tokens_seen": 749475840 }, { "epoch": 2.02, "learning_rate": 0.0003903711133400201, "loss": 3.0316, "theoretical_loss": 3.7534638162896425, "tokens_seen": 749541376 }, { "epoch": 2.02, "learning_rate": 0.00039036108324974926, "loss": 2.9903, "theoretical_loss": 3.7534309008257294, "tokens_seen": 749606912 }, { "epoch": 2.02, "learning_rate": 0.00039035105315947844, "loss": 3.0452, "theoretical_loss": 3.7533979890450726, "tokens_seen": 749672448 }, { "epoch": 2.02, "learning_rate": 0.00039034102306920767, "loss": 3.1049, "theoretical_loss": 3.7533650809469385, "tokens_seen": 749737984 }, { "epoch": 2.02, "learning_rate": 0.0003903309929789368, "loss": 2.6141, "theoretical_loss": 3.7533321765305936, "tokens_seen": 749803520 }, { "epoch": 2.02, "learning_rate": 0.00039032096288866603, "loss": 3.0876, "theoretical_loss": 3.753299275795303, "tokens_seen": 749869056 }, { "epoch": 2.02, "learning_rate": 0.0003903109327983952, "loss": 2.8684, "theoretical_loss": 3.753266378740335, "tokens_seen": 749934592 }, { "epoch": 2.02, "learning_rate": 0.0003903009027081244, "loss": 3.148, "theoretical_loss": 3.753233485364955, "tokens_seen": 750000128 }, { "epoch": 2.02, "learning_rate": 0.0003902908726178536, "loss": 3.0374, "theoretical_loss": 3.753200595668431, "tokens_seen": 750065664 }, { "epoch": 2.02, "learning_rate": 0.00039028084252758276, "loss": 2.9374, "theoretical_loss": 3.753167709650029, "tokens_seen": 750131200 }, { "epoch": 2.02, "learning_rate": 0.00039027081243731194, "loss": 2.9408, "theoretical_loss": 3.753134827309017, "tokens_seen": 750196736 }, { "epoch": 2.02, "learning_rate": 0.0003902607823470412, "loss": 3.0315, "theoretical_loss": 3.7531019486446624, "tokens_seen": 750262272 }, { "epoch": 2.02, "learning_rate": 0.0003902507522567703, "loss": 2.9108, "theoretical_loss": 3.753069073656234, "tokens_seen": 750327808 }, { "epoch": 2.02, "objective/train/docs_used": 1215179, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8096323013305664, "objective/train/theoretical_loss": 3.7530526375402626, "objective/train/tokens_used": 770820576, "theoretical_loss": 3.7530526375402626, "tokens_seen": 750360576 }, { "epoch": 2.02, "learning_rate": 0.00039024072216649954, "loss": 2.9962, "theoretical_loss": 3.753036202342998, "tokens_seen": 750393344 }, { "epoch": 2.02, "learning_rate": 0.00039023069207622866, "loss": 2.9772, "theoretical_loss": 3.753003334704224, "tokens_seen": 750458880 }, { "epoch": 2.02, "learning_rate": 0.0003902206619859579, "loss": 3.069, "theoretical_loss": 3.7529704707391796, "tokens_seen": 750524416 }, { "epoch": 2.02, "learning_rate": 0.0003902106318956871, "loss": 2.9218, "theoretical_loss": 3.752937610447134, "tokens_seen": 750589952 }, { "epoch": 2.02, "learning_rate": 0.00039020060180541626, "loss": 3.035, "theoretical_loss": 3.752904753827356, "tokens_seen": 750655488 }, { "epoch": 2.02, "learning_rate": 0.00039019057171514544, "loss": 2.9366, "theoretical_loss": 3.7528719008791143, "tokens_seen": 750721024 }, { "epoch": 2.02, "learning_rate": 0.0003901805416248747, "loss": 2.9133, "theoretical_loss": 3.7528390516016783, "tokens_seen": 750786560 }, { "epoch": 2.02, "learning_rate": 0.0003901705115346038, "loss": 2.9397, "theoretical_loss": 3.752806205994318, "tokens_seen": 750852096 }, { "epoch": 2.02, "learning_rate": 0.00039016048144433304, "loss": 2.9685, "theoretical_loss": 3.752773364056303, "tokens_seen": 750917632 }, { "epoch": 2.02, "learning_rate": 0.00039015045135406217, "loss": 2.9716, "theoretical_loss": 3.752740525786902, "tokens_seen": 750983168 }, { "epoch": 2.02, "learning_rate": 0.0003901404212637914, "loss": 3.0698, "theoretical_loss": 3.7527076911853863, "tokens_seen": 751048704 }, { "epoch": 2.02, "learning_rate": 0.0003901303911735206, "loss": 3.1241, "theoretical_loss": 3.7526748602510254, "tokens_seen": 751114240 }, { "epoch": 2.02, "learning_rate": 0.00039012036108324977, "loss": 2.8635, "theoretical_loss": 3.7526420329830907, "tokens_seen": 751179776 }, { "epoch": 2.02, "learning_rate": 0.00039011033099297895, "loss": 3.1131, "theoretical_loss": 3.7526092093808527, "tokens_seen": 751245312 }, { "epoch": 2.02, "learning_rate": 0.00039010030090270813, "loss": 3.0279, "theoretical_loss": 3.7525763894435817, "tokens_seen": 751310848 }, { "epoch": 2.02, "learning_rate": 0.0003900902708124373, "loss": 3.053, "theoretical_loss": 3.7525435731705494, "tokens_seen": 751376384 }, { "epoch": 2.02, "learning_rate": 0.00039008024072216654, "loss": 2.9548, "theoretical_loss": 3.752510760561027, "tokens_seen": 751441920 }, { "epoch": 2.02, "learning_rate": 0.00039007021063189567, "loss": 3.0618, "theoretical_loss": 3.7524779516142868, "tokens_seen": 751507456 }, { "epoch": 2.02, "learning_rate": 0.0003900601805416249, "loss": 3.0182, "theoretical_loss": 3.752445146329599, "tokens_seen": 751572992 }, { "epoch": 2.02, "learning_rate": 0.00039005015045135403, "loss": 3.0826, "theoretical_loss": 3.7524123447062365, "tokens_seen": 751638528 }, { "epoch": 2.02, "learning_rate": 0.00039004012036108327, "loss": 2.965, "theoretical_loss": 3.7523795467434717, "tokens_seen": 751704064 }, { "epoch": 2.02, "learning_rate": 0.00039003009027081245, "loss": 2.9269, "theoretical_loss": 3.7523467524405767, "tokens_seen": 751769600 }, { "epoch": 2.02, "learning_rate": 0.00039002006018054163, "loss": 3.0019, "theoretical_loss": 3.7523139617968244, "tokens_seen": 751835136 }, { "epoch": 2.02, "learning_rate": 0.0003900100300902708, "loss": 3.0037, "theoretical_loss": 3.752281174811487, "tokens_seen": 751900672 }, { "epoch": 2.02, "learning_rate": 0.00039000000000000005, "loss": 2.7577, "theoretical_loss": 3.752248391483838, "tokens_seen": 751966208 }, { "epoch": 2.02, "objective/train/docs_used": 1216494, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.573331594467163, "objective/train/theoretical_loss": 3.7522320011914196, "objective/train/tokens_used": 772458976, "theoretical_loss": 3.7522320011914196, "tokens_seen": 751998976 }, { "epoch": 2.02, "learning_rate": 0.0003899899699097292, "loss": 2.9509, "theoretical_loss": 3.7522156118131504, "tokens_seen": 752031744 }, { "epoch": 2.02, "learning_rate": 0.0003899799398194584, "loss": 2.9025, "theoretical_loss": 3.7521828357986977, "tokens_seen": 752097280 }, { "epoch": 2.02, "learning_rate": 0.00038996990972918754, "loss": 2.9954, "theoretical_loss": 3.7521500634397533, "tokens_seen": 752162816 }, { "epoch": 2.02, "learning_rate": 0.00038995987963891677, "loss": 3.0992, "theoretical_loss": 3.752117294735592, "tokens_seen": 752228352 }, { "epoch": 2.02, "learning_rate": 0.00038994984954864595, "loss": 2.8849, "theoretical_loss": 3.7520845296854866, "tokens_seen": 752293888 }, { "epoch": 2.02, "learning_rate": 0.00038993981945837513, "loss": 2.9092, "theoretical_loss": 3.7520517682887125, "tokens_seen": 752359424 }, { "epoch": 2.02, "learning_rate": 0.0003899297893681043, "loss": 3.0964, "theoretical_loss": 3.752019010544543, "tokens_seen": 752424960 }, { "epoch": 2.02, "learning_rate": 0.0003899197592778335, "loss": 2.854, "theoretical_loss": 3.7519862564522537, "tokens_seen": 752490496 }, { "epoch": 2.02, "learning_rate": 0.0003899097291875627, "loss": 2.8702, "theoretical_loss": 3.751953506011119, "tokens_seen": 752556032 }, { "epoch": 2.02, "learning_rate": 0.0003898996990972919, "loss": 2.9357, "theoretical_loss": 3.7519207592204147, "tokens_seen": 752621568 }, { "epoch": 2.02, "learning_rate": 0.00038988966900702104, "loss": 2.8958, "theoretical_loss": 3.751888016079415, "tokens_seen": 752687104 }, { "epoch": 2.02, "learning_rate": 0.0003898796389167503, "loss": 3.0588, "theoretical_loss": 3.751855276587396, "tokens_seen": 752752640 }, { "epoch": 2.02, "learning_rate": 0.0003898696088264794, "loss": 2.9253, "theoretical_loss": 3.7518225407436336, "tokens_seen": 752818176 }, { "epoch": 2.02, "learning_rate": 0.00038985957873620864, "loss": 2.9558, "theoretical_loss": 3.7517898085474037, "tokens_seen": 752883712 }, { "epoch": 2.02, "learning_rate": 0.0003898495486459378, "loss": 3.0368, "theoretical_loss": 3.751757079997982, "tokens_seen": 752949248 }, { "epoch": 2.02, "learning_rate": 0.000389839518555667, "loss": 2.6505, "theoretical_loss": 3.7517243550946446, "tokens_seen": 753014784 }, { "epoch": 2.02, "learning_rate": 0.0003898294884653962, "loss": 3.0061, "theoretical_loss": 3.7516916338366695, "tokens_seen": 753080320 }, { "epoch": 2.02, "learning_rate": 0.0003898194583751254, "loss": 3.1629, "theoretical_loss": 3.751658916223332, "tokens_seen": 753145856 }, { "epoch": 2.02, "learning_rate": 0.00038980942828485454, "loss": 2.9735, "theoretical_loss": 3.7516262022539095, "tokens_seen": 753211392 }, { "epoch": 2.02, "learning_rate": 0.0003897993981945838, "loss": 2.8756, "theoretical_loss": 3.7515934919276788, "tokens_seen": 753276928 }, { "epoch": 2.02, "learning_rate": 0.0003897893681043129, "loss": 2.9543, "theoretical_loss": 3.751560785243918, "tokens_seen": 753342464 }, { "epoch": 2.02, "learning_rate": 0.00038977933801404214, "loss": 2.9944, "theoretical_loss": 3.7515280822019044, "tokens_seen": 753408000 }, { "epoch": 2.02, "learning_rate": 0.0003897693079237713, "loss": 2.9449, "theoretical_loss": 3.7514953828009157, "tokens_seen": 753473536 }, { "epoch": 2.02, "learning_rate": 0.0003897592778335005, "loss": 3.0303, "theoretical_loss": 3.7514626870402297, "tokens_seen": 753539072 }, { "epoch": 2.02, "learning_rate": 0.0003897492477432297, "loss": 2.9574, "theoretical_loss": 3.7514299949191248, "tokens_seen": 753604608 }, { "epoch": 2.02, "objective/train/docs_used": 1219211, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.047457456588745, "objective/train/theoretical_loss": 3.7514136502231894, "objective/train/tokens_used": 774097376, "theoretical_loss": 3.7514136502231894, "tokens_seen": 753637376 }, { "epoch": 2.02, "learning_rate": 0.00038973921765295886, "loss": 3.0837, "theoretical_loss": 3.7513973064368793, "tokens_seen": 753670144 }, { "epoch": 2.02, "learning_rate": 0.00038972918756268805, "loss": 3.0962, "theoretical_loss": 3.751364621592772, "tokens_seen": 753735680 }, { "epoch": 2.02, "learning_rate": 0.0003897191574724173, "loss": 3.0266, "theoretical_loss": 3.7513319403860814, "tokens_seen": 753801216 }, { "epoch": 2.02, "learning_rate": 0.0003897091273821464, "loss": 2.9624, "theoretical_loss": 3.7512992628160875, "tokens_seen": 753866752 }, { "epoch": 2.02, "learning_rate": 0.00038969909729187564, "loss": 3.0689, "theoretical_loss": 3.7512665888820678, "tokens_seen": 753932288 }, { "epoch": 2.02, "learning_rate": 0.00038968906720160477, "loss": 2.9882, "theoretical_loss": 3.751233918583303, "tokens_seen": 753997824 }, { "epoch": 2.02, "learning_rate": 0.000389679037111334, "loss": 3.0852, "theoretical_loss": 3.7512012519190723, "tokens_seen": 754063360 }, { "epoch": 2.02, "learning_rate": 0.0003896690070210632, "loss": 2.9892, "theoretical_loss": 3.7511685888886555, "tokens_seen": 754128896 }, { "epoch": 2.02, "learning_rate": 0.00038965897693079237, "loss": 2.9482, "theoretical_loss": 3.751135929491333, "tokens_seen": 754194432 }, { "epoch": 2.02, "learning_rate": 0.00038964894684052155, "loss": 2.8732, "theoretical_loss": 3.7511032737263843, "tokens_seen": 754259968 }, { "epoch": 2.02, "learning_rate": 0.0003896389167502508, "loss": 3.0106, "theoretical_loss": 3.7510706215930907, "tokens_seen": 754325504 }, { "epoch": 2.02, "learning_rate": 0.00038962888665997997, "loss": 2.9526, "theoretical_loss": 3.7510379730907326, "tokens_seen": 754391040 }, { "epoch": 2.02, "learning_rate": 0.00038961885656970915, "loss": 3.052, "theoretical_loss": 3.7510053282185907, "tokens_seen": 754456576 }, { "epoch": 2.02, "learning_rate": 0.00038960882647943833, "loss": 2.8672, "theoretical_loss": 3.7509726869759463, "tokens_seen": 754522112 }, { "epoch": 2.02, "learning_rate": 0.0003895987963891675, "loss": 2.9513, "theoretical_loss": 3.75094004936208, "tokens_seen": 754587648 }, { "epoch": 2.02, "learning_rate": 0.00038958876629889674, "loss": 2.9499, "theoretical_loss": 3.750907415376274, "tokens_seen": 754653184 }, { "epoch": 2.02, "learning_rate": 0.00038957873620862587, "loss": 3.0738, "theoretical_loss": 3.75087478501781, "tokens_seen": 754718720 }, { "epoch": 2.02, "learning_rate": 0.0003895687061183551, "loss": 2.9853, "theoretical_loss": 3.7508421582859697, "tokens_seen": 754784256 }, { "epoch": 2.02, "learning_rate": 0.00038955867602808423, "loss": 2.9767, "theoretical_loss": 3.750809535180035, "tokens_seen": 754849792 }, { "epoch": 2.02, "learning_rate": 0.00038954864593781347, "loss": 2.8335, "theoretical_loss": 3.7507769156992876, "tokens_seen": 754915328 }, { "epoch": 2.02, "learning_rate": 0.00038953861584754265, "loss": 3.0975, "theoretical_loss": 3.7507442998430114, "tokens_seen": 754980864 }, { "epoch": 2.02, "learning_rate": 0.00038952858575727183, "loss": 2.8461, "theoretical_loss": 3.7507116876104885, "tokens_seen": 755046400 }, { "epoch": 2.02, "learning_rate": 0.000389518555667001, "loss": 3.1078, "theoretical_loss": 3.7506790790010016, "tokens_seen": 755111936 }, { "epoch": 2.02, "learning_rate": 0.00038950852557673025, "loss": 3.0711, "theoretical_loss": 3.750646474013834, "tokens_seen": 755177472 }, { "epoch": 2.02, "learning_rate": 0.0003894984954864594, "loss": 2.944, "theoretical_loss": 3.7506138726482687, "tokens_seen": 755243008 }, { "epoch": 2.02, "objective/train/docs_used": 1221987, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1982581615448, "objective/train/theoretical_loss": 3.7505975733233634, "objective/train/tokens_used": 775735776, "theoretical_loss": 3.7505975733233634, "tokens_seen": 755275776 }, { "epoch": 2.02, "learning_rate": 0.0003894884653961886, "loss": 3.0754, "theoretical_loss": 3.75058127490359, "tokens_seen": 755308544 }, { "epoch": 2.02, "learning_rate": 0.00038947843530591774, "loss": 3.014, "theoretical_loss": 3.7505486807790804, "tokens_seen": 755374080 }, { "epoch": 2.02, "learning_rate": 0.00038946840521564697, "loss": 3.1046, "theoretical_loss": 3.750516090274025, "tokens_seen": 755439616 }, { "epoch": 2.02, "learning_rate": 0.00038945837512537615, "loss": 2.8266, "theoretical_loss": 3.7504835033877075, "tokens_seen": 755505152 }, { "epoch": 2.02, "learning_rate": 0.00038944834503510533, "loss": 2.9708, "theoretical_loss": 3.7504509201194116, "tokens_seen": 755570688 }, { "epoch": 2.02, "learning_rate": 0.0003894383149448345, "loss": 2.9402, "theoretical_loss": 3.750418340468423, "tokens_seen": 755636224 }, { "epoch": 2.02, "learning_rate": 0.0003894282848545637, "loss": 2.8766, "theoretical_loss": 3.750385764434026, "tokens_seen": 755701760 }, { "epoch": 2.02, "learning_rate": 0.0003894182547642929, "loss": 3.0003, "theoretical_loss": 3.7503531920155053, "tokens_seen": 755767296 }, { "epoch": 2.02, "learning_rate": 0.0003894082246740221, "loss": 3.0515, "theoretical_loss": 3.7503206232121453, "tokens_seen": 755832832 }, { "epoch": 2.02, "learning_rate": 0.00038939819458375124, "loss": 2.9007, "theoretical_loss": 3.7502880580232336, "tokens_seen": 755898368 }, { "epoch": 2.02, "learning_rate": 0.0003893881644934805, "loss": 2.9934, "theoretical_loss": 3.7502554964480534, "tokens_seen": 755963904 }, { "epoch": 2.02, "learning_rate": 0.0003893781344032096, "loss": 2.9665, "theoretical_loss": 3.7502229384858916, "tokens_seen": 756029440 }, { "epoch": 2.02, "learning_rate": 0.00038936810431293884, "loss": 3.0033, "theoretical_loss": 3.750190384136034, "tokens_seen": 756094976 }, { "epoch": 2.02, "learning_rate": 0.000389358074222668, "loss": 2.9595, "theoretical_loss": 3.750157833397767, "tokens_seen": 756160512 }, { "epoch": 2.02, "learning_rate": 0.0003893480441323972, "loss": 3.0438, "theoretical_loss": 3.750125286270377, "tokens_seen": 756226048 }, { "epoch": 2.02, "learning_rate": 0.0003893380140421264, "loss": 2.956, "theoretical_loss": 3.750092742753149, "tokens_seen": 756291584 }, { "epoch": 2.02, "learning_rate": 0.0003893279839518556, "loss": 2.8633, "theoretical_loss": 3.750060202845372, "tokens_seen": 756357120 }, { "epoch": 2.02, "learning_rate": 0.00038931795386158474, "loss": 3.0148, "theoretical_loss": 3.750027666546332, "tokens_seen": 756422656 }, { "epoch": 2.02, "learning_rate": 0.000389307923771314, "loss": 2.9736, "theoretical_loss": 3.7499951338553164, "tokens_seen": 756488192 }, { "epoch": 2.02, "learning_rate": 0.0003892978936810431, "loss": 3.0179, "theoretical_loss": 3.7499626047716124, "tokens_seen": 756553728 }, { "epoch": 2.02, "learning_rate": 0.00038928786359077234, "loss": 3.1135, "theoretical_loss": 3.749930079294507, "tokens_seen": 756619264 }, { "epoch": 2.02, "learning_rate": 0.0003892778335005015, "loss": 2.9023, "theoretical_loss": 3.7498975574232887, "tokens_seen": 756684800 }, { "epoch": 2.02, "learning_rate": 0.0003892678034102307, "loss": 2.9302, "theoretical_loss": 3.7498650391572452, "tokens_seen": 756750336 }, { "epoch": 2.02, "learning_rate": 0.0003892577733199599, "loss": 2.9912, "theoretical_loss": 3.749832524495665, "tokens_seen": 756815872 }, { "epoch": 2.02, "learning_rate": 0.00038924774322968906, "loss": 2.9842, "theoretical_loss": 3.749800013437837, "tokens_seen": 756881408 }, { "epoch": 2.02, "objective/train/docs_used": 1224816, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.909097671508789, "objective/train/theoretical_loss": 3.749783759260107, "objective/train/tokens_used": 777374176, "theoretical_loss": 3.749783759260107, "tokens_seen": 756914176 }, { "epoch": 2.02, "learning_rate": 0.00038923771313941825, "loss": 2.9216, "theoretical_loss": 3.7497675059830486, "tokens_seen": 756946944 }, { "epoch": 2.02, "learning_rate": 0.0003892276830491475, "loss": 2.827, "theoretical_loss": 3.7497350021305893, "tokens_seen": 757012480 }, { "epoch": 2.02, "learning_rate": 0.0003892176529588766, "loss": 2.7876, "theoretical_loss": 3.7497025018797476, "tokens_seen": 757078016 }, { "epoch": 2.02, "learning_rate": 0.00038920762286860584, "loss": 2.91, "theoretical_loss": 3.749670005229813, "tokens_seen": 757143552 }, { "epoch": 2.02, "learning_rate": 0.00038919759277833497, "loss": 3.0651, "theoretical_loss": 3.749637512180075, "tokens_seen": 757209088 }, { "epoch": 2.02, "learning_rate": 0.0003891875626880642, "loss": 2.9672, "theoretical_loss": 3.749605022729823, "tokens_seen": 757274624 }, { "epoch": 2.02, "learning_rate": 0.0003891775325977934, "loss": 2.9419, "theoretical_loss": 3.749572536878347, "tokens_seen": 757340160 }, { "epoch": 2.02, "learning_rate": 0.00038916750250752257, "loss": 2.9389, "theoretical_loss": 3.749540054624937, "tokens_seen": 757405696 }, { "epoch": 2.02, "learning_rate": 0.00038915747241725175, "loss": 3.0602, "theoretical_loss": 3.7495075759688836, "tokens_seen": 757471232 }, { "epoch": 2.02, "learning_rate": 0.000389147442326981, "loss": 3.2095, "theoretical_loss": 3.7494751009094758, "tokens_seen": 757536768 }, { "epoch": 2.02, "learning_rate": 0.0003891374122367101, "loss": 2.9328, "theoretical_loss": 3.7494426294460057, "tokens_seen": 757602304 }, { "epoch": 2.02, "learning_rate": 0.00038912738214643935, "loss": 2.9129, "theoretical_loss": 3.7494101615777637, "tokens_seen": 757667840 }, { "epoch": 2.02, "learning_rate": 0.0003891173520561685, "loss": 3.0136, "theoretical_loss": 3.7493776973040402, "tokens_seen": 757733376 }, { "epoch": 2.02, "learning_rate": 0.0003891073219658977, "loss": 2.8922, "theoretical_loss": 3.7493452366241273, "tokens_seen": 757798912 }, { "epoch": 2.02, "learning_rate": 0.0003890972918756269, "loss": 3.016, "theoretical_loss": 3.7493127795373162, "tokens_seen": 757864448 }, { "epoch": 2.02, "learning_rate": 0.00038908726178535607, "loss": 3.0678, "theoretical_loss": 3.749280326042898, "tokens_seen": 757929984 }, { "epoch": 2.02, "learning_rate": 0.00038907723169508525, "loss": 3.0442, "theoretical_loss": 3.7492478761401644, "tokens_seen": 757995520 }, { "epoch": 2.02, "learning_rate": 0.00038906720160481443, "loss": 3.17, "theoretical_loss": 3.7492154298284084, "tokens_seen": 758061056 }, { "epoch": 2.02, "learning_rate": 0.0003890571715145436, "loss": 3.0751, "theoretical_loss": 3.7491829871069218, "tokens_seen": 758126592 }, { "epoch": 2.02, "learning_rate": 0.00038904714142427285, "loss": 2.8053, "theoretical_loss": 3.7491505479749967, "tokens_seen": 758192128 }, { "epoch": 2.02, "learning_rate": 0.000389037111334002, "loss": 2.9799, "theoretical_loss": 3.7491181124319257, "tokens_seen": 758257664 }, { "epoch": 2.02, "learning_rate": 0.0003890270812437312, "loss": 3.0857, "theoretical_loss": 3.749085680477002, "tokens_seen": 758323200 }, { "epoch": 2.02, "learning_rate": 0.0003890170511534604, "loss": 3.0408, "theoretical_loss": 3.7490532521095186, "tokens_seen": 758388736 }, { "epoch": 2.02, "learning_rate": 0.0003890070210631896, "loss": 2.9118, "theoretical_loss": 3.7490208273287684, "tokens_seen": 758454272 }, { "epoch": 2.02, "learning_rate": 0.00038899699097291876, "loss": 3.098, "theoretical_loss": 3.7489884061340453, "tokens_seen": 758519808 }, { "epoch": 2.02, "objective/train/docs_used": 1227511, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0282721519470215, "objective/train/theoretical_loss": 3.7489721968812226, "objective/train/tokens_used": 779012576, "theoretical_loss": 3.7489721968812226, "tokens_seen": 758552576 }, { "epoch": 2.02, "learning_rate": 0.00038898696088264794, "loss": 3.0742, "theoretical_loss": 3.748955988524642, "tokens_seen": 758585344 }, { "epoch": 2.02, "learning_rate": 0.0003889769307923771, "loss": 2.8824, "theoretical_loss": 3.7489235744998535, "tokens_seen": 758650880 }, { "epoch": 2.02, "learning_rate": 0.00038896690070210635, "loss": 2.8241, "theoretical_loss": 3.748891164058973, "tokens_seen": 758716416 }, { "epoch": 2.02, "learning_rate": 0.0003889568706118355, "loss": 3.0376, "theoretical_loss": 3.748858757201295, "tokens_seen": 758781952 }, { "epoch": 2.02, "learning_rate": 0.0003889468405215647, "loss": 2.8984, "theoretical_loss": 3.7488263539261144, "tokens_seen": 758847488 }, { "epoch": 2.02, "learning_rate": 0.00038893681043129384, "loss": 2.8176, "theoretical_loss": 3.748793954232725, "tokens_seen": 758913024 }, { "epoch": 2.02, "learning_rate": 0.0003889267803410231, "loss": 3.1511, "theoretical_loss": 3.7487615581204214, "tokens_seen": 758978560 }, { "epoch": 2.02, "learning_rate": 0.00038891675025075226, "loss": 3.082, "theoretical_loss": 3.7487291655885, "tokens_seen": 759044096 }, { "epoch": 2.02, "learning_rate": 0.00038890672016048144, "loss": 3.0283, "theoretical_loss": 3.7486967766362547, "tokens_seen": 759109632 }, { "epoch": 2.02, "learning_rate": 0.0003888966900702106, "loss": 2.9911, "theoretical_loss": 3.7486643912629813, "tokens_seen": 759175168 }, { "epoch": 2.02, "learning_rate": 0.0003888866599799398, "loss": 3.0523, "theoretical_loss": 3.748632009467976, "tokens_seen": 759240704 }, { "epoch": 2.02, "learning_rate": 0.00038887662988966904, "loss": 3.1386, "theoretical_loss": 3.748599631250534, "tokens_seen": 759306240 }, { "epoch": 2.02, "learning_rate": 0.0003888665997993982, "loss": 2.8722, "theoretical_loss": 3.748567256609951, "tokens_seen": 759371776 }, { "epoch": 2.02, "learning_rate": 0.0003888565697091274, "loss": 2.967, "theoretical_loss": 3.7485348855455243, "tokens_seen": 759437312 }, { "epoch": 2.02, "learning_rate": 0.0003888465396188566, "loss": 3.0338, "theoretical_loss": 3.74850251805655, "tokens_seen": 759502848 }, { "epoch": 2.02, "learning_rate": 0.0003888365095285858, "loss": 2.9738, "theoretical_loss": 3.748470154142323, "tokens_seen": 759568384 }, { "epoch": 2.02, "learning_rate": 0.00038882647943831494, "loss": 2.9159, "theoretical_loss": 3.748437793802143, "tokens_seen": 759633920 }, { "epoch": 2.02, "learning_rate": 0.0003888164493480442, "loss": 3.0213, "theoretical_loss": 3.748405437035305, "tokens_seen": 759699456 }, { "epoch": 2.02, "learning_rate": 0.0003888064192577733, "loss": 3.0437, "theoretical_loss": 3.7483730838411065, "tokens_seen": 759764992 }, { "epoch": 2.02, "learning_rate": 0.00038879638916750254, "loss": 3.0161, "theoretical_loss": 3.7483407342188455, "tokens_seen": 759830528 }, { "epoch": 2.02, "learning_rate": 0.0003887863590772317, "loss": 2.9311, "theoretical_loss": 3.7483083881678194, "tokens_seen": 759896064 }, { "epoch": 2.02, "learning_rate": 0.0003887763289869609, "loss": 2.9976, "theoretical_loss": 3.7482760456873256, "tokens_seen": 759961600 }, { "epoch": 2.02, "learning_rate": 0.0003887662988966901, "loss": 2.9834, "theoretical_loss": 3.7482437067766625, "tokens_seen": 760027136 }, { "epoch": 2.02, "learning_rate": 0.00038875626880641927, "loss": 2.7954, "theoretical_loss": 3.7482113714351284, "tokens_seen": 760092672 }, { "epoch": 2.02, "learning_rate": 0.00038874623871614845, "loss": 3.0918, "theoretical_loss": 3.7481790396620216, "tokens_seen": 760158208 }, { "epoch": 2.02, "objective/train/docs_used": 1230181, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.57212233543396, "objective/train/theoretical_loss": 3.748162875113409, "objective/train/tokens_used": 780650976, "theoretical_loss": 3.748162875113409, "tokens_seen": 760190976 }, { "epoch": 2.02, "learning_rate": 0.0003887362086258777, "loss": 2.7343, "theoretical_loss": 3.7481467114566405, "tokens_seen": 760223744 }, { "epoch": 2.02, "learning_rate": 0.0003887261785356068, "loss": 3.0577, "theoretical_loss": 3.748114386818284, "tokens_seen": 760289280 }, { "epoch": 2.02, "learning_rate": 0.00038871614844533604, "loss": 3.1259, "theoretical_loss": 3.7480820657462512, "tokens_seen": 760354816 }, { "epoch": 2.02, "learning_rate": 0.00038870611835506517, "loss": 3.08, "theoretical_loss": 3.7480497482398407, "tokens_seen": 760420352 }, { "epoch": 2.02, "learning_rate": 0.0003886960882647944, "loss": 3.0164, "theoretical_loss": 3.7480174342983528, "tokens_seen": 760485888 }, { "epoch": 2.02, "learning_rate": 0.0003886860581745236, "loss": 2.8846, "theoretical_loss": 3.7479851239210866, "tokens_seen": 760551424 }, { "epoch": 2.02, "learning_rate": 0.00038867602808425277, "loss": 3.0802, "theoretical_loss": 3.7479528171073424, "tokens_seen": 760616960 }, { "epoch": 2.02, "learning_rate": 0.00038866599799398195, "loss": 2.9499, "theoretical_loss": 3.7479205138564193, "tokens_seen": 760682496 }, { "epoch": 2.02, "learning_rate": 0.0003886559679037112, "loss": 2.8773, "theoretical_loss": 3.7478882141676184, "tokens_seen": 760748032 }, { "epoch": 2.02, "learning_rate": 0.0003886459378134403, "loss": 2.8686, "theoretical_loss": 3.7478559180402393, "tokens_seen": 760813568 }, { "epoch": 2.02, "learning_rate": 0.00038863590772316955, "loss": 2.9896, "theoretical_loss": 3.747823625473583, "tokens_seen": 760879104 }, { "epoch": 2.02, "learning_rate": 0.0003886258776328987, "loss": 3.0895, "theoretical_loss": 3.7477913364669506, "tokens_seen": 760944640 }, { "epoch": 2.02, "learning_rate": 0.0003886158475426279, "loss": 3.1737, "theoretical_loss": 3.747759051019642, "tokens_seen": 761010176 }, { "epoch": 2.02, "learning_rate": 0.0003886058174523571, "loss": 2.7341, "theoretical_loss": 3.74772676913096, "tokens_seen": 761075712 }, { "epoch": 2.02, "learning_rate": 0.00038859578736208627, "loss": 3.026, "theoretical_loss": 3.747694490800204, "tokens_seen": 761141248 }, { "epoch": 2.02, "learning_rate": 0.00038858575727181545, "loss": 2.8167, "theoretical_loss": 3.7476622160266775, "tokens_seen": 761206784 }, { "epoch": 2.02, "learning_rate": 0.00038857572718154463, "loss": 3.0005, "theoretical_loss": 3.747629944809681, "tokens_seen": 761272320 }, { "epoch": 2.02, "learning_rate": 0.0003885656970912738, "loss": 2.8318, "theoretical_loss": 3.747597677148517, "tokens_seen": 761337856 }, { "epoch": 2.02, "learning_rate": 0.00038855566700100305, "loss": 2.9478, "theoretical_loss": 3.7475654130424876, "tokens_seen": 761403392 }, { "epoch": 2.02, "learning_rate": 0.0003885456369107322, "loss": 2.9538, "theoretical_loss": 3.7475331524908952, "tokens_seen": 761468928 }, { "epoch": 2.02, "learning_rate": 0.0003885356068204614, "loss": 2.8624, "theoretical_loss": 3.747500895493042, "tokens_seen": 761534464 }, { "epoch": 2.02, "learning_rate": 0.0003885255767301906, "loss": 2.888, "theoretical_loss": 3.747468642048231, "tokens_seen": 761600000 }, { "epoch": 2.02, "learning_rate": 0.0003885155466399198, "loss": 3.0037, "theoretical_loss": 3.747436392155765, "tokens_seen": 761665536 }, { "epoch": 2.02, "learning_rate": 0.00038850551654964896, "loss": 2.9093, "theoretical_loss": 3.747404145814947, "tokens_seen": 761731072 }, { "epoch": 2.02, "learning_rate": 0.00038849548645937814, "loss": 2.9989, "theoretical_loss": 3.7473719030250816, "tokens_seen": 761796608 }, { "epoch": 2.02, "objective/train/docs_used": 1231594, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.805318832397461, "objective/train/theoretical_loss": 3.7473557829615376, "objective/train/tokens_used": 782289376, "theoretical_loss": 3.7473557829615376, "tokens_seen": 761829376 }, { "epoch": 2.02, "learning_rate": 0.0003884854563691073, "loss": 2.9237, "theoretical_loss": 3.747339663785471, "tokens_seen": 761862144 }, { "epoch": 2.02, "learning_rate": 0.00038847542627883655, "loss": 3.1044, "theoretical_loss": 3.747307428095419, "tokens_seen": 761927680 }, { "epoch": 2.02, "learning_rate": 0.0003884653961885657, "loss": 2.93, "theoretical_loss": 3.74727519595423, "tokens_seen": 761993216 }, { "epoch": 2.02, "learning_rate": 0.0003884553660982949, "loss": 2.9495, "theoretical_loss": 3.747242967361208, "tokens_seen": 762058752 }, { "epoch": 2.02, "learning_rate": 0.00038844533600802404, "loss": 2.9321, "theoretical_loss": 3.747210742315657, "tokens_seen": 762124288 }, { "epoch": 2.02, "learning_rate": 0.0003884353059177533, "loss": 2.879, "theoretical_loss": 3.747178520816882, "tokens_seen": 762189824 }, { "epoch": 2.02, "learning_rate": 0.00038842527582748246, "loss": 2.8216, "theoretical_loss": 3.747146302864188, "tokens_seen": 762255360 }, { "epoch": 2.02, "learning_rate": 0.00038841524573721164, "loss": 3.1057, "theoretical_loss": 3.7471140884568785, "tokens_seen": 762320896 }, { "epoch": 2.02, "learning_rate": 0.0003884052156469408, "loss": 3.0044, "theoretical_loss": 3.74708187759426, "tokens_seen": 762386432 }, { "epoch": 2.02, "learning_rate": 0.00038839518555667, "loss": 3.0313, "theoretical_loss": 3.7470496702756373, "tokens_seen": 762451968 }, { "epoch": 2.02, "learning_rate": 0.0003883851554663992, "loss": 2.9162, "theoretical_loss": 3.747017466500316, "tokens_seen": 762517504 }, { "epoch": 2.02, "learning_rate": 0.0003883751253761284, "loss": 3.1144, "theoretical_loss": 3.746985266267602, "tokens_seen": 762583040 }, { "epoch": 2.02, "learning_rate": 0.00038836509528585755, "loss": 2.8975, "theoretical_loss": 3.7469530695768007, "tokens_seen": 762648576 }, { "epoch": 2.02, "learning_rate": 0.0003883550651955868, "loss": 3.0324, "theoretical_loss": 3.746920876427219, "tokens_seen": 762714112 }, { "epoch": 2.02, "learning_rate": 0.00038834503510531596, "loss": 3.0542, "theoretical_loss": 3.7468886868181617, "tokens_seen": 762779648 }, { "epoch": 2.02, "learning_rate": 0.00038833500501504514, "loss": 2.9809, "theoretical_loss": 3.7468565007489367, "tokens_seen": 762845184 }, { "epoch": 2.02, "learning_rate": 0.0003883249749247743, "loss": 2.873, "theoretical_loss": 3.74682431821885, "tokens_seen": 762910720 }, { "epoch": 2.02, "learning_rate": 0.0003883149448345035, "loss": 3.0379, "theoretical_loss": 3.7467921392272086, "tokens_seen": 762976256 }, { "epoch": 2.02, "learning_rate": 0.0003883049147442327, "loss": 2.9292, "theoretical_loss": 3.7467599637733198, "tokens_seen": 763041792 }, { "epoch": 2.02, "learning_rate": 0.0003882948846539619, "loss": 3.0297, "theoretical_loss": 3.7467277918564905, "tokens_seen": 763107328 }, { "epoch": 2.02, "learning_rate": 0.00038828485456369105, "loss": 2.9703, "theoretical_loss": 3.746695623476029, "tokens_seen": 763172864 }, { "epoch": 2.02, "learning_rate": 0.0003882748244734203, "loss": 2.9646, "theoretical_loss": 3.7466634586312413, "tokens_seen": 763238400 }, { "epoch": 2.02, "learning_rate": 0.0003882647943831494, "loss": 3.0741, "theoretical_loss": 3.7466312973214366, "tokens_seen": 763303936 }, { "epoch": 2.02, "learning_rate": 0.00038825476429287865, "loss": 2.914, "theoretical_loss": 3.7465991395459226, "tokens_seen": 763369472 }, { "epoch": 2.02, "learning_rate": 0.00038824473420260783, "loss": 2.9304, "theoretical_loss": 3.746566985304008, "tokens_seen": 763435008 }, { "epoch": 2.02, "objective/train/docs_used": 1234269, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8828282356262207, "objective/train/theoretical_loss": 3.7465509095079335, "objective/train/tokens_used": 783927776, "theoretical_loss": 3.7465509095079335, "tokens_seen": 763467776 }, { "epoch": 2.02, "learning_rate": 0.000388234704112337, "loss": 2.9072, "theoretical_loss": 3.746534834595, "tokens_seen": 763500544 }, { "epoch": 2.02, "learning_rate": 0.0003882246740220662, "loss": 3.0262, "theoretical_loss": 3.746502687418208, "tokens_seen": 763566080 }, { "epoch": 2.02, "learning_rate": 0.00038821464393179537, "loss": 2.914, "theoretical_loss": 3.746470543772941, "tokens_seen": 763631616 }, { "epoch": 2.02, "learning_rate": 0.00038820461384152455, "loss": 2.912, "theoretical_loss": 3.7464384036585074, "tokens_seen": 763697152 }, { "epoch": 2.02, "learning_rate": 0.0003881945837512538, "loss": 3.0265, "theoretical_loss": 3.7464062670742173, "tokens_seen": 763762688 }, { "epoch": 2.02, "learning_rate": 0.0003881845536609829, "loss": 3.0127, "theoretical_loss": 3.746374134019379, "tokens_seen": 763828224 }, { "epoch": 2.02, "learning_rate": 0.00038817452357071215, "loss": 2.8705, "theoretical_loss": 3.746342004493303, "tokens_seen": 763893760 }, { "epoch": 2.02, "learning_rate": 0.00038816449348044133, "loss": 2.9181, "theoretical_loss": 3.746309878495299, "tokens_seen": 763959296 }, { "epoch": 2.02, "learning_rate": 0.0003881544633901705, "loss": 2.8949, "theoretical_loss": 3.7462777560246767, "tokens_seen": 764024832 }, { "epoch": 2.02, "learning_rate": 0.0003881444332998997, "loss": 2.8709, "theoretical_loss": 3.7462456370807464, "tokens_seen": 764090368 }, { "epoch": 2.02, "learning_rate": 0.0003881344032096289, "loss": 3.1451, "theoretical_loss": 3.746213521662818, "tokens_seen": 764155904 }, { "epoch": 2.02, "learning_rate": 0.0003881243731193581, "loss": 3.0247, "theoretical_loss": 3.7461814097702026, "tokens_seen": 764221440 }, { "epoch": 2.02, "learning_rate": 0.0003881143430290873, "loss": 2.7751, "theoretical_loss": 3.7461493014022116, "tokens_seen": 764286976 }, { "epoch": 2.02, "learning_rate": 0.00038810431293881647, "loss": 2.996, "theoretical_loss": 3.7461171965581546, "tokens_seen": 764352512 }, { "epoch": 2.02, "learning_rate": 0.00038809428284854565, "loss": 2.91, "theoretical_loss": 3.7460850952373432, "tokens_seen": 764418048 }, { "epoch": 2.02, "learning_rate": 0.00038808425275827483, "loss": 3.0814, "theoretical_loss": 3.7460529974390893, "tokens_seen": 764483584 }, { "epoch": 2.02, "learning_rate": 0.000388074222668004, "loss": 2.9655, "theoretical_loss": 3.7460209031627034, "tokens_seen": 764549120 }, { "epoch": 2.02, "learning_rate": 0.00038806419257773325, "loss": 2.9215, "theoretical_loss": 3.7459888124074983, "tokens_seen": 764614656 }, { "epoch": 2.02, "learning_rate": 0.0003880541624874624, "loss": 2.9691, "theoretical_loss": 3.7459567251727854, "tokens_seen": 764680192 }, { "epoch": 2.02, "learning_rate": 0.0003880441323971916, "loss": 2.8626, "theoretical_loss": 3.7459246414578775, "tokens_seen": 764745728 }, { "epoch": 2.02, "learning_rate": 0.0003880341023069208, "loss": 3.0109, "theoretical_loss": 3.7458925612620853, "tokens_seen": 764811264 }, { "epoch": 2.02, "learning_rate": 0.00038802407221665, "loss": 3.0469, "theoretical_loss": 3.745860484584723, "tokens_seen": 764876800 }, { "epoch": 2.02, "learning_rate": 0.00038801404212637916, "loss": 3.1067, "theoretical_loss": 3.7458284114251024, "tokens_seen": 764942336 }, { "epoch": 2.02, "learning_rate": 0.00038800401203610834, "loss": 2.8558, "theoretical_loss": 3.745796341782537, "tokens_seen": 765007872 }, { "epoch": 2.02, "learning_rate": 0.0003879939819458375, "loss": 2.9111, "theoretical_loss": 3.745764275656339, "tokens_seen": 765073408 }, { "epoch": 2.02, "objective/train/docs_used": 1236952, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8716354370117188, "objective/train/theoretical_loss": 3.745748243911663, "objective/train/tokens_used": 785566176, "theoretical_loss": 3.745748243911663, "tokens_seen": 765106176 }, { "epoch": 2.02, "learning_rate": 0.00038798395185556675, "loss": 2.9123, "theoretical_loss": 3.745732213045822, "tokens_seen": 765138944 }, { "epoch": 2.02, "learning_rate": 0.0003879739217652959, "loss": 3.1075, "theoretical_loss": 3.7457001539503, "tokens_seen": 765204480 }, { "epoch": 2.02, "learning_rate": 0.0003879638916750251, "loss": 2.9189, "theoretical_loss": 3.7456680983690864, "tokens_seen": 765270016 }, { "epoch": 2.02, "learning_rate": 0.00038795386158475424, "loss": 3.0263, "theoretical_loss": 3.7456360463014944, "tokens_seen": 765335552 }, { "epoch": 2.02, "learning_rate": 0.0003879438314944835, "loss": 3.013, "theoretical_loss": 3.7456039977468394, "tokens_seen": 765401088 }, { "epoch": 2.02, "learning_rate": 0.00038793380140421266, "loss": 2.9286, "theoretical_loss": 3.7455719527044344, "tokens_seen": 765466624 }, { "epoch": 2.02, "learning_rate": 0.00038792377131394184, "loss": 3.0719, "theoretical_loss": 3.7455399111735943, "tokens_seen": 765532160 }, { "epoch": 2.02, "learning_rate": 0.000387913741223671, "loss": 2.9737, "theoretical_loss": 3.7455078731536338, "tokens_seen": 765597696 }, { "epoch": 2.02, "learning_rate": 0.0003879037111334002, "loss": 2.9923, "theoretical_loss": 3.745475838643867, "tokens_seen": 765663232 }, { "epoch": 2.02, "learning_rate": 0.0003878936810431294, "loss": 3.1357, "theoretical_loss": 3.7454438076436105, "tokens_seen": 765728768 }, { "epoch": 2.02, "learning_rate": 0.0003878836509528586, "loss": 3.0207, "theoretical_loss": 3.7454117801521782, "tokens_seen": 765794304 }, { "epoch": 2.02, "learning_rate": 0.00038787362086258775, "loss": 2.8854, "theoretical_loss": 3.7453797561688855, "tokens_seen": 765859840 }, { "epoch": 2.03, "learning_rate": 0.000387863590772317, "loss": 2.9685, "theoretical_loss": 3.7453477356930485, "tokens_seen": 765925376 }, { "epoch": 2.03, "learning_rate": 0.00038785356068204616, "loss": 2.9684, "theoretical_loss": 3.7453157187239827, "tokens_seen": 765990912 }, { "epoch": 2.03, "learning_rate": 0.00038784353059177534, "loss": 2.8542, "theoretical_loss": 3.745283705261004, "tokens_seen": 766056448 }, { "epoch": 2.03, "learning_rate": 0.0003878335005015045, "loss": 2.9173, "theoretical_loss": 3.745251695303429, "tokens_seen": 766121984 }, { "epoch": 2.03, "learning_rate": 0.0003878234704112337, "loss": 3.0208, "theoretical_loss": 3.7452196888505735, "tokens_seen": 766187520 }, { "epoch": 2.03, "learning_rate": 0.0003878134403209629, "loss": 3.0023, "theoretical_loss": 3.745187685901755, "tokens_seen": 766253056 }, { "epoch": 2.03, "learning_rate": 0.0003878034102306921, "loss": 3.1434, "theoretical_loss": 3.745155686456289, "tokens_seen": 766318592 }, { "epoch": 2.03, "learning_rate": 0.00038779338014042125, "loss": 3.0137, "theoretical_loss": 3.745123690513493, "tokens_seen": 766384128 }, { "epoch": 2.03, "learning_rate": 0.0003877833500501505, "loss": 3.0248, "theoretical_loss": 3.7450916980726845, "tokens_seen": 766449664 }, { "epoch": 2.03, "learning_rate": 0.0003877733199598796, "loss": 3.0254, "theoretical_loss": 3.74505970913318, "tokens_seen": 766515200 }, { "epoch": 2.03, "learning_rate": 0.00038776328986960885, "loss": 2.8812, "theoretical_loss": 3.7450277236942977, "tokens_seen": 766580736 }, { "epoch": 2.03, "learning_rate": 0.00038775325977933803, "loss": 3.0031, "theoretical_loss": 3.7449957417553548, "tokens_seen": 766646272 }, { "epoch": 2.03, "learning_rate": 0.0003877432296890672, "loss": 2.9909, "theoretical_loss": 3.744963763315669, "tokens_seen": 766711808 }, { "epoch": 2.03, "objective/train/docs_used": 1239797, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8591201305389404, "objective/train/theoretical_loss": 3.744947775407835, "objective/train/tokens_used": 787204576, "theoretical_loss": 3.744947775407835, "tokens_seen": 766744576 }, { "epoch": 2.03, "learning_rate": 0.0003877331995987964, "loss": 2.9351, "theoretical_loss": 3.7449317883745596, "tokens_seen": 766777344 }, { "epoch": 2.03, "learning_rate": 0.00038772316950852557, "loss": 2.8983, "theoretical_loss": 3.7448998169313437, "tokens_seen": 766842880 }, { "epoch": 2.03, "learning_rate": 0.00038771313941825475, "loss": 3.0584, "theoretical_loss": 3.74486784898534, "tokens_seen": 766908416 }, { "epoch": 2.03, "learning_rate": 0.000387703109327984, "loss": 2.9306, "theoretical_loss": 3.744835884535868, "tokens_seen": 766973952 }, { "epoch": 2.03, "learning_rate": 0.0003876930792377131, "loss": 2.9166, "theoretical_loss": 3.7448039235822446, "tokens_seen": 767039488 }, { "epoch": 2.03, "learning_rate": 0.00038768304914744235, "loss": 2.8787, "theoretical_loss": 3.744771966123791, "tokens_seen": 767105024 }, { "epoch": 2.03, "learning_rate": 0.00038767301905717153, "loss": 3.0484, "theoretical_loss": 3.744740012159825, "tokens_seen": 767170560 }, { "epoch": 2.03, "learning_rate": 0.0003876629889669007, "loss": 2.7983, "theoretical_loss": 3.7447080616896664, "tokens_seen": 767236096 }, { "epoch": 2.03, "learning_rate": 0.0003876529588766299, "loss": 2.9488, "theoretical_loss": 3.744676114712635, "tokens_seen": 767301632 }, { "epoch": 2.03, "learning_rate": 0.0003876429287863591, "loss": 3.0462, "theoretical_loss": 3.7446441712280505, "tokens_seen": 767367168 }, { "epoch": 2.03, "learning_rate": 0.00038763289869608826, "loss": 3.0955, "theoretical_loss": 3.744612231235233, "tokens_seen": 767432704 }, { "epoch": 2.03, "learning_rate": 0.0003876228686058175, "loss": 3.1075, "theoretical_loss": 3.7445802947335025, "tokens_seen": 767498240 }, { "epoch": 2.03, "learning_rate": 0.0003876128385155466, "loss": 2.8534, "theoretical_loss": 3.7445483617221793, "tokens_seen": 767563776 }, { "epoch": 2.03, "learning_rate": 0.00038760280842527585, "loss": 2.7855, "theoretical_loss": 3.744516432200584, "tokens_seen": 767629312 }, { "epoch": 2.03, "learning_rate": 0.000387592778335005, "loss": 2.9321, "theoretical_loss": 3.7444845061680376, "tokens_seen": 767694848 }, { "epoch": 2.03, "learning_rate": 0.0003875827482447342, "loss": 2.94, "theoretical_loss": 3.7444525836238607, "tokens_seen": 767760384 }, { "epoch": 2.03, "learning_rate": 0.0003875727181544634, "loss": 2.9773, "theoretical_loss": 3.7444206645673748, "tokens_seen": 767825920 }, { "epoch": 2.03, "learning_rate": 0.0003875626880641926, "loss": 3.0036, "theoretical_loss": 3.744388748997901, "tokens_seen": 767891456 }, { "epoch": 2.03, "learning_rate": 0.00038755265797392176, "loss": 2.9694, "theoretical_loss": 3.7443568369147604, "tokens_seen": 767956992 }, { "epoch": 2.03, "learning_rate": 0.000387542627883651, "loss": 2.8623, "theoretical_loss": 3.7443249283172753, "tokens_seen": 768022528 }, { "epoch": 2.03, "learning_rate": 0.0003875325977933801, "loss": 2.9503, "theoretical_loss": 3.7442930232047678, "tokens_seen": 768088064 }, { "epoch": 2.03, "learning_rate": 0.00038752256770310936, "loss": 3.0075, "theoretical_loss": 3.7442611215765593, "tokens_seen": 768153600 }, { "epoch": 2.03, "learning_rate": 0.0003875125376128385, "loss": 2.9524, "theoretical_loss": 3.7442292234319723, "tokens_seen": 768219136 }, { "epoch": 2.03, "learning_rate": 0.0003875025075225677, "loss": 2.9612, "theoretical_loss": 3.744197328770329, "tokens_seen": 768284672 }, { "epoch": 2.03, "learning_rate": 0.0003874924774322969, "loss": 2.8812, "theoretical_loss": 3.7441654375909525, "tokens_seen": 768350208 }, { "epoch": 2.03, "objective/train/docs_used": 1242843, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0903289318084717, "objective/train/theoretical_loss": 3.744149493306903, "objective/train/tokens_used": 788842976, "theoretical_loss": 3.744149493306903, "tokens_seen": 768382976 }, { "epoch": 2.03, "learning_rate": 0.0003874824473420261, "loss": 2.9253, "theoretical_loss": 3.744133549893166, "tokens_seen": 768415744 }, { "epoch": 2.03, "learning_rate": 0.00038747241725175526, "loss": 2.8956, "theoretical_loss": 3.7441016656762915, "tokens_seen": 768481280 }, { "epoch": 2.03, "learning_rate": 0.00038746238716148444, "loss": 2.9588, "theoretical_loss": 3.7440697849396525, "tokens_seen": 768546816 }, { "epoch": 2.03, "learning_rate": 0.0003874523570712136, "loss": 2.873, "theoretical_loss": 3.744037907682573, "tokens_seen": 768612352 }, { "epoch": 2.03, "learning_rate": 0.00038744232698094286, "loss": 2.9778, "theoretical_loss": 3.7440060339043764, "tokens_seen": 768677888 }, { "epoch": 2.03, "learning_rate": 0.000387432296890672, "loss": 2.8624, "theoretical_loss": 3.7439741636043857, "tokens_seen": 768743424 }, { "epoch": 2.03, "learning_rate": 0.0003874222668004012, "loss": 2.9778, "theoretical_loss": 3.743942296781926, "tokens_seen": 768808960 }, { "epoch": 2.03, "learning_rate": 0.00038741223671013035, "loss": 2.9486, "theoretical_loss": 3.7439104334363207, "tokens_seen": 768874496 }, { "epoch": 2.03, "learning_rate": 0.0003874022066198596, "loss": 2.9166, "theoretical_loss": 3.743878573566894, "tokens_seen": 768940032 }, { "epoch": 2.03, "learning_rate": 0.00038739217652958876, "loss": 2.9406, "theoretical_loss": 3.7438467171729712, "tokens_seen": 769005568 }, { "epoch": 2.03, "learning_rate": 0.00038738214643931795, "loss": 3.0008, "theoretical_loss": 3.743814864253877, "tokens_seen": 769071104 }, { "epoch": 2.03, "learning_rate": 0.0003873721163490472, "loss": 3.0089, "theoretical_loss": 3.743783014808936, "tokens_seen": 769136640 }, { "epoch": 2.03, "learning_rate": 0.00038736208625877636, "loss": 3.1256, "theoretical_loss": 3.7437511688374725, "tokens_seen": 769202176 }, { "epoch": 2.03, "learning_rate": 0.00038735205616850554, "loss": 2.7981, "theoretical_loss": 3.743719326338813, "tokens_seen": 769267712 }, { "epoch": 2.03, "learning_rate": 0.0003873420260782347, "loss": 2.8768, "theoretical_loss": 3.7436874873122825, "tokens_seen": 769333248 }, { "epoch": 2.03, "learning_rate": 0.0003873319959879639, "loss": 3.0697, "theoretical_loss": 3.743655651757207, "tokens_seen": 769398784 }, { "epoch": 2.03, "learning_rate": 0.0003873219658976931, "loss": 2.9943, "theoretical_loss": 3.7436238196729117, "tokens_seen": 769464320 }, { "epoch": 2.03, "learning_rate": 0.0003873119358074223, "loss": 2.8598, "theoretical_loss": 3.743591991058723, "tokens_seen": 769529856 }, { "epoch": 2.03, "learning_rate": 0.00038730190571715145, "loss": 2.9552, "theoretical_loss": 3.7435601659139675, "tokens_seen": 769595392 }, { "epoch": 2.03, "learning_rate": 0.0003872918756268807, "loss": 2.9959, "theoretical_loss": 3.743528344237971, "tokens_seen": 769660928 }, { "epoch": 2.03, "learning_rate": 0.0003872818455366098, "loss": 2.9001, "theoretical_loss": 3.74349652603006, "tokens_seen": 769726464 }, { "epoch": 2.03, "learning_rate": 0.00038727181544633905, "loss": 2.9617, "theoretical_loss": 3.743464711289562, "tokens_seen": 769792000 }, { "epoch": 2.03, "learning_rate": 0.00038726178535606823, "loss": 3.1393, "theoretical_loss": 3.7434329000158035, "tokens_seen": 769857536 }, { "epoch": 2.03, "learning_rate": 0.0003872517552657974, "loss": 2.9828, "theoretical_loss": 3.7434010922081127, "tokens_seen": 769923072 }, { "epoch": 2.03, "learning_rate": 0.0003872417251755266, "loss": 3.103, "theoretical_loss": 3.743369287865815, "tokens_seen": 769988608 }, { "epoch": 2.03, "objective/train/docs_used": 1244287, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.866001844406128, "objective/train/theoretical_loss": 3.743353386993979, "objective/train/tokens_used": 790481376, "theoretical_loss": 3.743353386993979, "tokens_seen": 770021376 }, { "epoch": 2.03, "learning_rate": 0.00038723169508525577, "loss": 2.9822, "theoretical_loss": 3.74333748698824, "tokens_seen": 770054144 }, { "epoch": 2.03, "learning_rate": 0.00038722166499498495, "loss": 2.9937, "theoretical_loss": 3.743305689574714, "tokens_seen": 770119680 }, { "epoch": 2.03, "learning_rate": 0.0003872116349047142, "loss": 2.9316, "theoretical_loss": 3.743273895624565, "tokens_seen": 770185216 }, { "epoch": 2.03, "learning_rate": 0.0003872016048144433, "loss": 2.9627, "theoretical_loss": 3.7432421051371225, "tokens_seen": 770250752 }, { "epoch": 2.03, "learning_rate": 0.00038719157472417255, "loss": 3.0039, "theoretical_loss": 3.7432103181117133, "tokens_seen": 770316288 }, { "epoch": 2.03, "learning_rate": 0.00038718154463390173, "loss": 2.8518, "theoretical_loss": 3.743178534547666, "tokens_seen": 770381824 }, { "epoch": 2.03, "learning_rate": 0.0003871715145436309, "loss": 2.9105, "theoretical_loss": 3.7431467544443104, "tokens_seen": 770447360 }, { "epoch": 2.03, "learning_rate": 0.0003871614844533601, "loss": 3.1047, "theoretical_loss": 3.7431149778009742, "tokens_seen": 770512896 }, { "epoch": 2.03, "learning_rate": 0.0003871514543630893, "loss": 3.0825, "theoretical_loss": 3.7430832046169877, "tokens_seen": 770578432 }, { "epoch": 2.03, "learning_rate": 0.00038714142427281846, "loss": 2.939, "theoretical_loss": 3.7430514348916786, "tokens_seen": 770643968 }, { "epoch": 2.03, "learning_rate": 0.0003871313941825477, "loss": 2.863, "theoretical_loss": 3.743019668624377, "tokens_seen": 770709504 }, { "epoch": 2.03, "learning_rate": 0.0003871213640922768, "loss": 2.8014, "theoretical_loss": 3.742987905814413, "tokens_seen": 770775040 }, { "epoch": 2.03, "learning_rate": 0.00038711133400200605, "loss": 2.8683, "theoretical_loss": 3.7429561464611156, "tokens_seen": 770840576 }, { "epoch": 2.03, "learning_rate": 0.0003871013039117352, "loss": 2.8998, "theoretical_loss": 3.742924390563816, "tokens_seen": 770906112 }, { "epoch": 2.03, "learning_rate": 0.0003870912738214644, "loss": 2.8782, "theoretical_loss": 3.742892638121843, "tokens_seen": 770971648 }, { "epoch": 2.03, "learning_rate": 0.0003870812437311936, "loss": 3.0839, "theoretical_loss": 3.742860889134527, "tokens_seen": 771037184 }, { "epoch": 2.03, "learning_rate": 0.0003870712136409228, "loss": 2.9433, "theoretical_loss": 3.7428291436012, "tokens_seen": 771102720 }, { "epoch": 2.03, "learning_rate": 0.00038706118355065196, "loss": 2.9893, "theoretical_loss": 3.742797401521191, "tokens_seen": 771168256 }, { "epoch": 2.03, "learning_rate": 0.0003870511534603812, "loss": 3.0, "theoretical_loss": 3.742765662893832, "tokens_seen": 771233792 }, { "epoch": 2.03, "learning_rate": 0.0003870411233701103, "loss": 2.9457, "theoretical_loss": 3.742733927718454, "tokens_seen": 771299328 }, { "epoch": 2.03, "learning_rate": 0.00038703109327983956, "loss": 2.9628, "theoretical_loss": 3.742702195994388, "tokens_seen": 771364864 }, { "epoch": 2.03, "learning_rate": 0.0003870210631895687, "loss": 2.9245, "theoretical_loss": 3.7426704677209655, "tokens_seen": 771430400 }, { "epoch": 2.03, "learning_rate": 0.0003870110330992979, "loss": 2.9644, "theoretical_loss": 3.742638742897518, "tokens_seen": 771495936 }, { "epoch": 2.03, "learning_rate": 0.0003870010030090271, "loss": 3.0606, "theoretical_loss": 3.742607021523378, "tokens_seen": 771561472 }, { "epoch": 2.03, "learning_rate": 0.0003869909729187563, "loss": 2.887, "theoretical_loss": 3.742575303597877, "tokens_seen": 771627008 }, { "epoch": 2.03, "objective/train/docs_used": 1247133, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.70950984954834, "objective/train/theoretical_loss": 3.7425594459281575, "objective/train/tokens_used": 792119776, "theoretical_loss": 3.7425594459281575, "tokens_seen": 771659776 }, { "epoch": 2.03, "learning_rate": 0.00038698094282848546, "loss": 2.9482, "theoretical_loss": 3.7425435891203476, "tokens_seen": 771692544 }, { "epoch": 2.03, "learning_rate": 0.00038697091273821464, "loss": 2.9353, "theoretical_loss": 3.7425118780901214, "tokens_seen": 771758080 }, { "epoch": 2.03, "learning_rate": 0.0003869608826479438, "loss": 3.1185, "theoretical_loss": 3.742480170506532, "tokens_seen": 771823616 }, { "epoch": 2.03, "learning_rate": 0.00038695085255767306, "loss": 3.052, "theoretical_loss": 3.742448466368912, "tokens_seen": 771889152 }, { "epoch": 2.03, "learning_rate": 0.0003869408224674022, "loss": 2.893, "theoretical_loss": 3.7424167656765936, "tokens_seen": 771954688 }, { "epoch": 2.03, "learning_rate": 0.0003869307923771314, "loss": 2.876, "theoretical_loss": 3.74238506842891, "tokens_seen": 772020224 }, { "epoch": 2.03, "learning_rate": 0.00038692076228686055, "loss": 2.846, "theoretical_loss": 3.742353374625196, "tokens_seen": 772085760 }, { "epoch": 2.03, "learning_rate": 0.0003869107321965898, "loss": 2.8207, "theoretical_loss": 3.742321684264784, "tokens_seen": 772151296 }, { "epoch": 2.03, "learning_rate": 0.00038690070210631896, "loss": 2.934, "theoretical_loss": 3.742289997347007, "tokens_seen": 772216832 }, { "epoch": 2.03, "learning_rate": 0.00038689067201604815, "loss": 2.8381, "theoretical_loss": 3.7422583138712002, "tokens_seen": 772282368 }, { "epoch": 2.03, "learning_rate": 0.00038688064192577733, "loss": 2.9892, "theoretical_loss": 3.7422266338366974, "tokens_seen": 772347904 }, { "epoch": 2.03, "learning_rate": 0.00038687061183550656, "loss": 2.9281, "theoretical_loss": 3.7421949572428326, "tokens_seen": 772413440 }, { "epoch": 2.03, "learning_rate": 0.0003868605817452357, "loss": 2.8291, "theoretical_loss": 3.7421632840889405, "tokens_seen": 772478976 }, { "epoch": 2.03, "learning_rate": 0.0003868505516549649, "loss": 2.9658, "theoretical_loss": 3.742131614374355, "tokens_seen": 772544512 }, { "epoch": 2.03, "learning_rate": 0.00038684052156469405, "loss": 3.005, "theoretical_loss": 3.742099948098412, "tokens_seen": 772610048 }, { "epoch": 2.03, "learning_rate": 0.0003868304914744233, "loss": 2.9678, "theoretical_loss": 3.7420682852604457, "tokens_seen": 772675584 }, { "epoch": 2.03, "learning_rate": 0.00038682046138415247, "loss": 2.9907, "theoretical_loss": 3.7420366258597912, "tokens_seen": 772741120 }, { "epoch": 2.03, "learning_rate": 0.00038681043129388165, "loss": 2.8471, "theoretical_loss": 3.7420049698957847, "tokens_seen": 772806656 }, { "epoch": 2.03, "learning_rate": 0.00038680040120361083, "loss": 3.0272, "theoretical_loss": 3.741973317367761, "tokens_seen": 772872192 }, { "epoch": 2.03, "learning_rate": 0.00038679037111334, "loss": 2.9724, "theoretical_loss": 3.7419416682750573, "tokens_seen": 772937728 }, { "epoch": 2.03, "learning_rate": 0.0003867803410230692, "loss": 2.9336, "theoretical_loss": 3.741910022617007, "tokens_seen": 773003264 }, { "epoch": 2.03, "learning_rate": 0.00038677031093279843, "loss": 2.936, "theoretical_loss": 3.741878380392948, "tokens_seen": 773068800 }, { "epoch": 2.03, "learning_rate": 0.00038676028084252755, "loss": 3.0086, "theoretical_loss": 3.7418467416022168, "tokens_seen": 773134336 }, { "epoch": 2.03, "learning_rate": 0.0003867502507522568, "loss": 3.0195, "theoretical_loss": 3.7418151062441485, "tokens_seen": 773199872 }, { "epoch": 2.03, "learning_rate": 0.0003867402206619859, "loss": 2.8157, "theoretical_loss": 3.7417834743180807, "tokens_seen": 773265408 }, { "epoch": 2.03, "objective/train/docs_used": 1250187, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0417487621307373, "objective/train/theoretical_loss": 3.74176765964184, "objective/train/tokens_used": 793758176, "theoretical_loss": 3.74176765964184, "tokens_seen": 773298176 }, { "epoch": 2.03, "learning_rate": 0.00038673019057171515, "loss": 2.9162, "theoretical_loss": 3.7417518458233507, "tokens_seen": 773330944 }, { "epoch": 2.03, "learning_rate": 0.00038672016048144433, "loss": 2.8994, "theoretical_loss": 3.7417202207592943, "tokens_seen": 773396480 }, { "epoch": 2.03, "learning_rate": 0.0003867101303911735, "loss": 2.9868, "theoretical_loss": 3.7416885991252498, "tokens_seen": 773462016 }, { "epoch": 2.03, "learning_rate": 0.0003867001003009027, "loss": 2.7907, "theoretical_loss": 3.7416569809205535, "tokens_seen": 773527552 }, { "epoch": 2.03, "learning_rate": 0.00038669007021063193, "loss": 2.9824, "theoretical_loss": 3.741625366144544, "tokens_seen": 773593088 }, { "epoch": 2.03, "learning_rate": 0.00038668004012036106, "loss": 2.9877, "theoretical_loss": 3.741593754796559, "tokens_seen": 773658624 }, { "epoch": 2.03, "learning_rate": 0.0003866700100300903, "loss": 2.9632, "theoretical_loss": 3.741562146875936, "tokens_seen": 773724160 }, { "epoch": 2.03, "learning_rate": 0.0003866599799398194, "loss": 3.0256, "theoretical_loss": 3.741530542382013, "tokens_seen": 773789696 }, { "epoch": 2.03, "learning_rate": 0.00038664994984954866, "loss": 2.8952, "theoretical_loss": 3.7414989413141293, "tokens_seen": 773855232 }, { "epoch": 2.03, "learning_rate": 0.00038663991975927784, "loss": 3.0606, "theoretical_loss": 3.7414673436716224, "tokens_seen": 773920768 }, { "epoch": 2.03, "learning_rate": 0.000386629889669007, "loss": 3.0375, "theoretical_loss": 3.7414357494538315, "tokens_seen": 773986304 }, { "epoch": 2.03, "learning_rate": 0.00038661985957873625, "loss": 3.0587, "theoretical_loss": 3.741404158660095, "tokens_seen": 774051840 }, { "epoch": 2.03, "learning_rate": 0.0003866098294884654, "loss": 2.7885, "theoretical_loss": 3.7413725712897525, "tokens_seen": 774117376 }, { "epoch": 2.03, "learning_rate": 0.0003865997993981946, "loss": 2.9436, "theoretical_loss": 3.7413409873421433, "tokens_seen": 774182912 }, { "epoch": 2.03, "learning_rate": 0.0003865897693079238, "loss": 2.8573, "theoretical_loss": 3.7413094068166064, "tokens_seen": 774248448 }, { "epoch": 2.03, "learning_rate": 0.000386579739217653, "loss": 2.8872, "theoretical_loss": 3.7412778297124816, "tokens_seen": 774313984 }, { "epoch": 2.03, "learning_rate": 0.00038656970912738216, "loss": 3.0533, "theoretical_loss": 3.7412462560291084, "tokens_seen": 774379520 }, { "epoch": 2.03, "learning_rate": 0.0003865596790371114, "loss": 2.7441, "theoretical_loss": 3.7412146857658275, "tokens_seen": 774445056 }, { "epoch": 2.03, "learning_rate": 0.0003865496489468405, "loss": 2.8261, "theoretical_loss": 3.7411831189219784, "tokens_seen": 774510592 }, { "epoch": 2.03, "learning_rate": 0.00038653961885656976, "loss": 3.0065, "theoretical_loss": 3.7411515554969013, "tokens_seen": 774576128 }, { "epoch": 2.03, "learning_rate": 0.0003865295887662989, "loss": 3.0589, "theoretical_loss": 3.741119995489937, "tokens_seen": 774641664 }, { "epoch": 2.03, "learning_rate": 0.0003865195586760281, "loss": 2.9191, "theoretical_loss": 3.741088438900427, "tokens_seen": 774707200 }, { "epoch": 2.03, "learning_rate": 0.0003865095285857573, "loss": 2.8615, "theoretical_loss": 3.741056885727711, "tokens_seen": 774772736 }, { "epoch": 2.03, "learning_rate": 0.0003864994984954865, "loss": 2.7685, "theoretical_loss": 3.7410253359711305, "tokens_seen": 774838272 }, { "epoch": 2.03, "learning_rate": 0.00038648946840521566, "loss": 2.6768, "theoretical_loss": 3.7409937896300276, "tokens_seen": 774903808 }, { "epoch": 2.03, "objective/train/docs_used": 1253205, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.865217447280884, "objective/train/theoretical_loss": 3.7409780177400735, "objective/train/tokens_used": 795396576, "theoretical_loss": 3.7409780177400735, "tokens_seen": 774936576 }, { "epoch": 2.03, "learning_rate": 0.00038647943831494484, "loss": 2.7338, "theoretical_loss": 3.740962246703742, "tokens_seen": 774969344 }, { "epoch": 2.03, "learning_rate": 0.000386469408224674, "loss": 2.7703, "theoretical_loss": 3.740930707191617, "tokens_seen": 775034880 }, { "epoch": 2.03, "learning_rate": 0.00038645937813440326, "loss": 2.9699, "theoretical_loss": 3.740899171092993, "tokens_seen": 775100416 }, { "epoch": 2.03, "learning_rate": 0.0003864493480441324, "loss": 3.0226, "theoretical_loss": 3.740867638407213, "tokens_seen": 775165952 }, { "epoch": 2.03, "learning_rate": 0.0003864393179538616, "loss": 2.9752, "theoretical_loss": 3.740836109133619, "tokens_seen": 775231488 }, { "epoch": 2.03, "learning_rate": 0.00038642928786359075, "loss": 2.9057, "theoretical_loss": 3.7408045832715526, "tokens_seen": 775297024 }, { "epoch": 2.03, "learning_rate": 0.00038641925777332, "loss": 2.9835, "theoretical_loss": 3.7407730608203575, "tokens_seen": 775362560 }, { "epoch": 2.03, "learning_rate": 0.00038640922768304917, "loss": 2.9092, "theoretical_loss": 3.740741541779376, "tokens_seen": 775428096 }, { "epoch": 2.03, "learning_rate": 0.00038639919759277835, "loss": 2.9344, "theoretical_loss": 3.7407100261479505, "tokens_seen": 775493632 }, { "epoch": 2.03, "learning_rate": 0.00038638916750250753, "loss": 2.8989, "theoretical_loss": 3.7406785139254244, "tokens_seen": 775559168 }, { "epoch": 2.03, "learning_rate": 0.00038637913741223676, "loss": 3.0466, "theoretical_loss": 3.740647005111141, "tokens_seen": 775624704 }, { "epoch": 2.03, "learning_rate": 0.0003863691073219659, "loss": 3.0212, "theoretical_loss": 3.740615499704444, "tokens_seen": 775690240 }, { "epoch": 2.03, "learning_rate": 0.0003863590772316951, "loss": 2.9809, "theoretical_loss": 3.740583997704676, "tokens_seen": 775755776 }, { "epoch": 2.03, "learning_rate": 0.00038634904714142425, "loss": 2.9925, "theoretical_loss": 3.7405524991111827, "tokens_seen": 775821312 }, { "epoch": 2.03, "learning_rate": 0.0003863390170511535, "loss": 2.935, "theoretical_loss": 3.7405210039233063, "tokens_seen": 775886848 }, { "epoch": 2.03, "learning_rate": 0.00038632898696088267, "loss": 2.9012, "theoretical_loss": 3.7404895121403916, "tokens_seen": 775952384 }, { "epoch": 2.03, "learning_rate": 0.00038631895687061185, "loss": 3.137, "theoretical_loss": 3.7404580237617835, "tokens_seen": 776017920 }, { "epoch": 2.03, "learning_rate": 0.00038630892678034103, "loss": 2.8622, "theoretical_loss": 3.7404265387868256, "tokens_seen": 776083456 }, { "epoch": 2.03, "learning_rate": 0.0003862988966900702, "loss": 2.9309, "theoretical_loss": 3.7403950572148634, "tokens_seen": 776148992 }, { "epoch": 2.03, "learning_rate": 0.0003862888665997994, "loss": 3.0402, "theoretical_loss": 3.740363579045241, "tokens_seen": 776214528 }, { "epoch": 2.03, "learning_rate": 0.00038627883650952863, "loss": 2.963, "theoretical_loss": 3.7403321042773046, "tokens_seen": 776280064 }, { "epoch": 2.03, "learning_rate": 0.00038626880641925775, "loss": 2.943, "theoretical_loss": 3.7403006329103983, "tokens_seen": 776345600 }, { "epoch": 2.03, "learning_rate": 0.000386258776328987, "loss": 2.8644, "theoretical_loss": 3.740269164943868, "tokens_seen": 776411136 }, { "epoch": 2.03, "learning_rate": 0.0003862487462387161, "loss": 3.0407, "theoretical_loss": 3.7402377003770595, "tokens_seen": 776476672 }, { "epoch": 2.03, "learning_rate": 0.00038623871614844535, "loss": 3.0983, "theoretical_loss": 3.7402062392093187, "tokens_seen": 776542208 }, { "epoch": 2.03, "objective/train/docs_used": 1255142, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.887727975845337, "objective/train/theoretical_loss": 3.740190509899894, "objective/train/tokens_used": 797034976, "theoretical_loss": 3.740190509899894, "tokens_seen": 776574976 }, { "epoch": 2.03, "learning_rate": 0.00038622868605817453, "loss": 2.9621, "theoretical_loss": 3.740174781439991, "tokens_seen": 776607744 }, { "epoch": 2.03, "learning_rate": 0.0003862186559679037, "loss": 2.9457, "theoretical_loss": 3.7401433270684232, "tokens_seen": 776673280 }, { "epoch": 2.03, "learning_rate": 0.0003862086258776329, "loss": 2.962, "theoretical_loss": 3.740111876093961, "tokens_seen": 776738816 }, { "epoch": 2.03, "learning_rate": 0.00038619859578736213, "loss": 2.9509, "theoretical_loss": 3.7400804285159515, "tokens_seen": 776804352 }, { "epoch": 2.03, "learning_rate": 0.00038618856569709126, "loss": 3.0892, "theoretical_loss": 3.7400489843337414, "tokens_seen": 776869888 }, { "epoch": 2.03, "learning_rate": 0.0003861785356068205, "loss": 3.0225, "theoretical_loss": 3.740017543546678, "tokens_seen": 776935424 }, { "epoch": 2.03, "learning_rate": 0.0003861685055165496, "loss": 2.9437, "theoretical_loss": 3.739986106154107, "tokens_seen": 777000960 }, { "epoch": 2.03, "learning_rate": 0.00038615847542627886, "loss": 2.8751, "theoretical_loss": 3.7399546721553762, "tokens_seen": 777066496 }, { "epoch": 2.03, "learning_rate": 0.00038614844533600804, "loss": 2.9977, "theoretical_loss": 3.7399232415498336, "tokens_seen": 777132032 }, { "epoch": 2.03, "learning_rate": 0.0003861384152457372, "loss": 2.8906, "theoretical_loss": 3.7398918143368265, "tokens_seen": 777197568 }, { "epoch": 2.03, "learning_rate": 0.0003861283851554664, "loss": 3.0772, "theoretical_loss": 3.7398603905157026, "tokens_seen": 777263104 }, { "epoch": 2.03, "learning_rate": 0.0003861183550651956, "loss": 2.9675, "theoretical_loss": 3.7398289700858105, "tokens_seen": 777328640 }, { "epoch": 2.03, "learning_rate": 0.00038610832497492476, "loss": 2.9644, "theoretical_loss": 3.7397975530464973, "tokens_seen": 777394176 }, { "epoch": 2.03, "learning_rate": 0.000386098294884654, "loss": 2.9565, "theoretical_loss": 3.739766139397112, "tokens_seen": 777459712 }, { "epoch": 2.03, "learning_rate": 0.0003860882647943831, "loss": 3.0918, "theoretical_loss": 3.7397347291370027, "tokens_seen": 777525248 }, { "epoch": 2.03, "learning_rate": 0.00038607823470411236, "loss": 2.8221, "theoretical_loss": 3.7397033222655187, "tokens_seen": 777590784 }, { "epoch": 2.03, "learning_rate": 0.0003860682046138415, "loss": 2.8925, "theoretical_loss": 3.739671918782008, "tokens_seen": 777656320 }, { "epoch": 2.03, "learning_rate": 0.0003860581745235707, "loss": 2.8453, "theoretical_loss": 3.739640518685821, "tokens_seen": 777721856 }, { "epoch": 2.03, "learning_rate": 0.0003860481444332999, "loss": 3.048, "theoretical_loss": 3.739609121976306, "tokens_seen": 777787392 }, { "epoch": 2.03, "learning_rate": 0.0003860381143430291, "loss": 2.8988, "theoretical_loss": 3.7395777286528125, "tokens_seen": 777852928 }, { "epoch": 2.03, "learning_rate": 0.00038602808425275826, "loss": 3.0469, "theoretical_loss": 3.73954633871469, "tokens_seen": 777918464 }, { "epoch": 2.03, "learning_rate": 0.0003860180541624875, "loss": 2.9811, "theoretical_loss": 3.7395149521612883, "tokens_seen": 777984000 }, { "epoch": 2.03, "learning_rate": 0.0003860080240722166, "loss": 2.9696, "theoretical_loss": 3.7394835689919574, "tokens_seen": 778049536 }, { "epoch": 2.03, "learning_rate": 0.00038599799398194586, "loss": 2.9717, "theoretical_loss": 3.7394521892060477, "tokens_seen": 778115072 }, { "epoch": 2.03, "learning_rate": 0.000385987963891675, "loss": 2.8859, "theoretical_loss": 3.7394208128029094, "tokens_seen": 778180608 }, { "epoch": 2.03, "objective/train/docs_used": 1257930, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.056715726852417, "objective/train/theoretical_loss": 3.739405125869676, "objective/train/tokens_used": 798673376, "theoretical_loss": 3.739405125869676, "tokens_seen": 778213376 }, { "epoch": 2.03, "learning_rate": 0.0003859779338014042, "loss": 2.9353, "theoretical_loss": 3.7393894397818928, "tokens_seen": 778246144 }, { "epoch": 2.03, "learning_rate": 0.0003859679037111334, "loss": 3.0001, "theoretical_loss": 3.7393580701423486, "tokens_seen": 778311680 }, { "epoch": 2.03, "learning_rate": 0.0003859578736208626, "loss": 2.9973, "theoretical_loss": 3.739326703883628, "tokens_seen": 778377216 }, { "epoch": 2.03, "learning_rate": 0.00038594784353059177, "loss": 2.9154, "theoretical_loss": 3.7392953410050813, "tokens_seen": 778442752 }, { "epoch": 2.03, "learning_rate": 0.00038593781344032095, "loss": 3.0689, "theoretical_loss": 3.7392639815060607, "tokens_seen": 778508288 }, { "epoch": 2.03, "learning_rate": 0.00038592778335005013, "loss": 2.9942, "theoretical_loss": 3.7392326253859167, "tokens_seen": 778573824 }, { "epoch": 2.03, "learning_rate": 0.00038591775325977937, "loss": 2.8973, "theoretical_loss": 3.739201272644001, "tokens_seen": 778639360 }, { "epoch": 2.03, "learning_rate": 0.0003859077231695085, "loss": 2.9592, "theoretical_loss": 3.7391699232796665, "tokens_seen": 778704896 }, { "epoch": 2.03, "learning_rate": 0.00038589769307923773, "loss": 2.7496, "theoretical_loss": 3.7391385772922634, "tokens_seen": 778770432 }, { "epoch": 2.03, "learning_rate": 0.00038588766298896685, "loss": 2.8568, "theoretical_loss": 3.739107234681145, "tokens_seen": 778835968 }, { "epoch": 2.03, "learning_rate": 0.0003858776328986961, "loss": 3.0081, "theoretical_loss": 3.739075895445663, "tokens_seen": 778901504 }, { "epoch": 2.03, "learning_rate": 0.0003858676028084253, "loss": 3.0482, "theoretical_loss": 3.73904455958517, "tokens_seen": 778967040 }, { "epoch": 2.03, "learning_rate": 0.00038585757271815445, "loss": 2.8045, "theoretical_loss": 3.739013227099019, "tokens_seen": 779032576 }, { "epoch": 2.03, "learning_rate": 0.0003858475426278837, "loss": 2.967, "theoretical_loss": 3.7389818979865623, "tokens_seen": 779098112 }, { "epoch": 2.03, "learning_rate": 0.00038583751253761287, "loss": 2.8066, "theoretical_loss": 3.738950572247153, "tokens_seen": 779163648 }, { "epoch": 2.03, "learning_rate": 0.00038582748244734205, "loss": 2.9395, "theoretical_loss": 3.738919249880145, "tokens_seen": 779229184 }, { "epoch": 2.03, "learning_rate": 0.00038581745235707123, "loss": 2.9803, "theoretical_loss": 3.7388879308848906, "tokens_seen": 779294720 }, { "epoch": 2.03, "learning_rate": 0.0003858074222668004, "loss": 3.0423, "theoretical_loss": 3.738856615260744, "tokens_seen": 779360256 }, { "epoch": 2.03, "learning_rate": 0.0003857973921765296, "loss": 2.8545, "theoretical_loss": 3.738825303007059, "tokens_seen": 779425792 }, { "epoch": 2.03, "learning_rate": 0.00038578736208625883, "loss": 2.9078, "theoretical_loss": 3.7387939941231885, "tokens_seen": 779491328 }, { "epoch": 2.03, "learning_rate": 0.00038577733199598796, "loss": 2.9713, "theoretical_loss": 3.7387626886084875, "tokens_seen": 779556864 }, { "epoch": 2.03, "learning_rate": 0.0003857673019057172, "loss": 2.973, "theoretical_loss": 3.73873138646231, "tokens_seen": 779622400 }, { "epoch": 2.03, "learning_rate": 0.0003857572718154463, "loss": 2.8872, "theoretical_loss": 3.738700087684011, "tokens_seen": 779687936 }, { "epoch": 2.03, "learning_rate": 0.00038574724172517555, "loss": 2.9885, "theoretical_loss": 3.738668792272944, "tokens_seen": 779753472 }, { "epoch": 2.03, "learning_rate": 0.00038573721163490473, "loss": 3.0497, "theoretical_loss": 3.7386375002284638, "tokens_seen": 779819008 }, { "epoch": 2.03, "objective/train/docs_used": 1260528, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.999164342880249, "objective/train/theoretical_loss": 3.7386218554684927, "objective/train/tokens_used": 800311776, "theoretical_loss": 3.7386218554684927, "tokens_seen": 779851776 }, { "epoch": 2.03, "learning_rate": 0.0003857271815446339, "loss": 2.9795, "theoretical_loss": 3.738606211549927, "tokens_seen": 779884544 }, { "epoch": 2.03, "learning_rate": 0.0003857171514543631, "loss": 3.0649, "theoretical_loss": 3.738574926236687, "tokens_seen": 779950080 }, { "epoch": 2.03, "learning_rate": 0.00038570712136409233, "loss": 2.9413, "theoretical_loss": 3.7385436442880993, "tokens_seen": 780015616 }, { "epoch": 2.03, "learning_rate": 0.00038569709127382146, "loss": 3.0142, "theoretical_loss": 3.738512365703521, "tokens_seen": 780081152 }, { "epoch": 2.03, "learning_rate": 0.0003856870611835507, "loss": 2.9717, "theoretical_loss": 3.738481090482306, "tokens_seen": 780146688 }, { "epoch": 2.03, "learning_rate": 0.0003856770310932798, "loss": 3.0986, "theoretical_loss": 3.7384498186238106, "tokens_seen": 780212224 }, { "epoch": 2.03, "learning_rate": 0.00038566700100300906, "loss": 3.0116, "theoretical_loss": 3.738418550127391, "tokens_seen": 780277760 }, { "epoch": 2.03, "learning_rate": 0.00038565697091273824, "loss": 2.9499, "theoretical_loss": 3.7383872849924034, "tokens_seen": 780343296 }, { "epoch": 2.03, "learning_rate": 0.0003856469408224674, "loss": 2.9476, "theoretical_loss": 3.738356023218204, "tokens_seen": 780408832 }, { "epoch": 2.03, "learning_rate": 0.0003856369107321966, "loss": 2.9054, "theoretical_loss": 3.7383247648041493, "tokens_seen": 780474368 }, { "epoch": 2.03, "learning_rate": 0.0003856268806419258, "loss": 2.9503, "theoretical_loss": 3.738293509749597, "tokens_seen": 780539904 }, { "epoch": 2.03, "learning_rate": 0.00038561685055165496, "loss": 2.8507, "theoretical_loss": 3.7382622580539024, "tokens_seen": 780605440 }, { "epoch": 2.03, "learning_rate": 0.0003856068204613842, "loss": 2.838, "theoretical_loss": 3.738231009716424, "tokens_seen": 780670976 }, { "epoch": 2.03, "learning_rate": 0.0003855967903711133, "loss": 3.0264, "theoretical_loss": 3.7381997647365184, "tokens_seen": 780736512 }, { "epoch": 2.03, "learning_rate": 0.00038558676028084256, "loss": 2.9039, "theoretical_loss": 3.7381685231135435, "tokens_seen": 780802048 }, { "epoch": 2.03, "learning_rate": 0.0003855767301905717, "loss": 2.8775, "theoretical_loss": 3.738137284846856, "tokens_seen": 780867584 }, { "epoch": 2.03, "learning_rate": 0.0003855667001003009, "loss": 3.004, "theoretical_loss": 3.7381060499358143, "tokens_seen": 780933120 }, { "epoch": 2.03, "learning_rate": 0.0003855566700100301, "loss": 2.9597, "theoretical_loss": 3.7380748183797765, "tokens_seen": 780998656 }, { "epoch": 2.03, "learning_rate": 0.0003855466399197593, "loss": 2.8489, "theoretical_loss": 3.7380435901781004, "tokens_seen": 781064192 }, { "epoch": 2.03, "learning_rate": 0.00038553660982948846, "loss": 3.0411, "theoretical_loss": 3.738012365330145, "tokens_seen": 781129728 }, { "epoch": 2.03, "learning_rate": 0.0003855265797392177, "loss": 2.9866, "theoretical_loss": 3.7379811438352677, "tokens_seen": 781195264 }, { "epoch": 2.03, "learning_rate": 0.0003855165496489468, "loss": 3.0336, "theoretical_loss": 3.737949925692828, "tokens_seen": 781260800 }, { "epoch": 2.03, "learning_rate": 0.00038550651955867606, "loss": 3.0743, "theoretical_loss": 3.7379187109021847, "tokens_seen": 781326336 }, { "epoch": 2.03, "learning_rate": 0.0003854964894684052, "loss": 2.9861, "theoretical_loss": 3.7378874994626967, "tokens_seen": 781391872 }, { "epoch": 2.03, "learning_rate": 0.0003854864593781344, "loss": 2.9074, "theoretical_loss": 3.737856291373723, "tokens_seen": 781457408 }, { "epoch": 2.03, "objective/train/docs_used": 1263282, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0654704570770264, "objective/train/theoretical_loss": 3.7378406885854787, "objective/train/tokens_used": 801950176, "theoretical_loss": 3.7378406885854787, "tokens_seen": 781490176 }, { "epoch": 2.03, "learning_rate": 0.0003854764292878636, "loss": 2.9453, "theoretical_loss": 3.737825086634623, "tokens_seen": 781522944 }, { "epoch": 2.03, "learning_rate": 0.0003854663991975928, "loss": 2.9848, "theoretical_loss": 3.7377938852447565, "tokens_seen": 781588480 }, { "epoch": 2.03, "learning_rate": 0.00038545636910732197, "loss": 3.0801, "theoretical_loss": 3.7377626872034835, "tokens_seen": 781654016 }, { "epoch": 2.03, "learning_rate": 0.00038544633901705115, "loss": 2.9396, "theoretical_loss": 3.7377314925101635, "tokens_seen": 781719552 }, { "epoch": 2.03, "learning_rate": 0.00038543630892678033, "loss": 3.0454, "theoretical_loss": 3.7377003011641565, "tokens_seen": 781785088 }, { "epoch": 2.03, "learning_rate": 0.00038542627883650957, "loss": 2.8604, "theoretical_loss": 3.7376691131648228, "tokens_seen": 781850624 }, { "epoch": 2.03, "learning_rate": 0.0003854162487462387, "loss": 2.9644, "theoretical_loss": 3.737637928511523, "tokens_seen": 781916160 }, { "epoch": 2.03, "learning_rate": 0.00038540621865596793, "loss": 2.8823, "theoretical_loss": 3.7376067472036185, "tokens_seen": 781981696 }, { "epoch": 2.03, "learning_rate": 0.00038539618856569705, "loss": 2.9981, "theoretical_loss": 3.737575569240468, "tokens_seen": 782047232 }, { "epoch": 2.03, "learning_rate": 0.0003853861584754263, "loss": 3.0883, "theoretical_loss": 3.737544394621435, "tokens_seen": 782112768 }, { "epoch": 2.03, "learning_rate": 0.00038537612838515547, "loss": 3.0511, "theoretical_loss": 3.737513223345878, "tokens_seen": 782178304 }, { "epoch": 2.03, "learning_rate": 0.00038536609829488465, "loss": 2.8917, "theoretical_loss": 3.7374820554131607, "tokens_seen": 782243840 }, { "epoch": 2.03, "learning_rate": 0.00038535606820461383, "loss": 3.0143, "theoretical_loss": 3.737450890822643, "tokens_seen": 782309376 }, { "epoch": 2.03, "learning_rate": 0.00038534603811434307, "loss": 2.9562, "theoretical_loss": 3.7374197295736877, "tokens_seen": 782374912 }, { "epoch": 2.03, "learning_rate": 0.0003853360080240722, "loss": 3.013, "theoretical_loss": 3.737388571665656, "tokens_seen": 782440448 }, { "epoch": 2.03, "learning_rate": 0.00038532597793380143, "loss": 3.071, "theoretical_loss": 3.73735741709791, "tokens_seen": 782505984 }, { "epoch": 2.03, "learning_rate": 0.00038531594784353056, "loss": 3.0392, "theoretical_loss": 3.737326265869812, "tokens_seen": 782571520 }, { "epoch": 2.03, "learning_rate": 0.0003853059177532598, "loss": 2.7966, "theoretical_loss": 3.7372951179807234, "tokens_seen": 782637056 }, { "epoch": 2.03, "learning_rate": 0.000385295887662989, "loss": 2.7049, "theoretical_loss": 3.737263973430009, "tokens_seen": 782702592 }, { "epoch": 2.03, "learning_rate": 0.00038528585757271816, "loss": 3.1411, "theoretical_loss": 3.737232832217029, "tokens_seen": 782768128 }, { "epoch": 2.03, "learning_rate": 0.00038527582748244734, "loss": 2.9983, "theoretical_loss": 3.7372016943411484, "tokens_seen": 782833664 }, { "epoch": 2.03, "learning_rate": 0.0003852657973921765, "loss": 2.9534, "theoretical_loss": 3.737170559801729, "tokens_seen": 782899200 }, { "epoch": 2.03, "learning_rate": 0.0003852557673019057, "loss": 2.7963, "theoretical_loss": 3.7371394285981343, "tokens_seen": 782964736 }, { "epoch": 2.03, "learning_rate": 0.00038524573721163493, "loss": 2.818, "theoretical_loss": 3.737108300729728, "tokens_seen": 783030272 }, { "epoch": 2.03, "learning_rate": 0.00038523570712136406, "loss": 2.8542, "theoretical_loss": 3.737077176195873, "tokens_seen": 783095808 }, { "epoch": 2.03, "objective/train/docs_used": 1266049, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9632413387298584, "objective/train/theoretical_loss": 3.737061615179204, "objective/train/tokens_used": 803588576, "theoretical_loss": 3.737061615179204, "tokens_seen": 783128576 }, { "epoch": 2.03, "learning_rate": 0.0003852256770310933, "loss": 2.9675, "theoretical_loss": 3.7370460549959343, "tokens_seen": 783161344 }, { "epoch": 2.03, "learning_rate": 0.0003852156469408225, "loss": 2.955, "theoretical_loss": 3.737014937129275, "tokens_seen": 783226880 }, { "epoch": 2.03, "learning_rate": 0.00038520561685055166, "loss": 3.1045, "theoretical_loss": 3.736983822595259, "tokens_seen": 783292416 }, { "epoch": 2.03, "learning_rate": 0.00038519558676028084, "loss": 2.9987, "theoretical_loss": 3.7369527113932506, "tokens_seen": 783357952 }, { "epoch": 2.03, "learning_rate": 0.00038518555667001, "loss": 2.9415, "theoretical_loss": 3.736921603522615, "tokens_seen": 783423488 }, { "epoch": 2.03, "learning_rate": 0.0003851755265797392, "loss": 2.8461, "theoretical_loss": 3.7368904989827163, "tokens_seen": 783489024 }, { "epoch": 2.03, "learning_rate": 0.00038516549648946844, "loss": 3.0187, "theoretical_loss": 3.7368593977729194, "tokens_seen": 783554560 }, { "epoch": 2.03, "learning_rate": 0.00038515546639919756, "loss": 3.0628, "theoretical_loss": 3.73682829989259, "tokens_seen": 783620096 }, { "epoch": 2.03, "learning_rate": 0.0003851454363089268, "loss": 3.0662, "theoretical_loss": 3.736797205341092, "tokens_seen": 783685632 }, { "epoch": 2.03, "learning_rate": 0.0003851354062186559, "loss": 2.9827, "theoretical_loss": 3.7367661141177915, "tokens_seen": 783751168 }, { "epoch": 2.03, "learning_rate": 0.00038512537612838516, "loss": 2.8437, "theoretical_loss": 3.736735026222054, "tokens_seen": 783816704 }, { "epoch": 2.03, "learning_rate": 0.0003851153460381144, "loss": 2.9349, "theoretical_loss": 3.7367039416532446, "tokens_seen": 783882240 }, { "epoch": 2.03, "learning_rate": 0.0003851053159478435, "loss": 3.1156, "theoretical_loss": 3.7366728604107298, "tokens_seen": 783947776 }, { "epoch": 2.03, "learning_rate": 0.00038509528585757276, "loss": 2.9164, "theoretical_loss": 3.7366417824938756, "tokens_seen": 784013312 }, { "epoch": 2.03, "learning_rate": 0.0003850852557673019, "loss": 2.9101, "theoretical_loss": 3.736610707902048, "tokens_seen": 784078848 }, { "epoch": 2.03, "learning_rate": 0.0003850752256770311, "loss": 2.8751, "theoretical_loss": 3.7365796366346133, "tokens_seen": 784144384 }, { "epoch": 2.03, "learning_rate": 0.0003850651955867603, "loss": 3.0549, "theoretical_loss": 3.7365485686909388, "tokens_seen": 784209920 }, { "epoch": 2.03, "learning_rate": 0.0003850551654964895, "loss": 3.025, "theoretical_loss": 3.73651750407039, "tokens_seen": 784275456 }, { "epoch": 2.03, "learning_rate": 0.00038504513540621866, "loss": 2.9719, "theoretical_loss": 3.7364864427723345, "tokens_seen": 784340992 }, { "epoch": 2.03, "learning_rate": 0.0003850351053159479, "loss": 3.0005, "theoretical_loss": 3.7364553847961393, "tokens_seen": 784406528 }, { "epoch": 2.03, "learning_rate": 0.000385025075225677, "loss": 2.8865, "theoretical_loss": 3.7364243301411717, "tokens_seen": 784472064 }, { "epoch": 2.03, "learning_rate": 0.00038501504513540626, "loss": 2.9985, "theoretical_loss": 3.7363932788067995, "tokens_seen": 784537600 }, { "epoch": 2.03, "learning_rate": 0.0003850050150451354, "loss": 3.0063, "theoretical_loss": 3.7363622307923894, "tokens_seen": 784603136 }, { "epoch": 2.03, "learning_rate": 0.0003849949849548646, "loss": 3.0227, "theoretical_loss": 3.7363311860973094, "tokens_seen": 784668672 }, { "epoch": 2.03, "learning_rate": 0.0003849849548645938, "loss": 3.0386, "theoretical_loss": 3.7363001447209285, "tokens_seen": 784734208 }, { "epoch": 2.03, "objective/train/docs_used": 1267573, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.114417314529419, "objective/train/theoretical_loss": 3.736284625277052, "objective/train/tokens_used": 805226976, "theoretical_loss": 3.736284625277052, "tokens_seen": 784766976 }, { "epoch": 2.03, "learning_rate": 0.000384974924774323, "loss": 2.9536, "theoretical_loss": 3.736269106662613, "tokens_seen": 784799744 }, { "epoch": 2.03, "learning_rate": 0.00038496489468405217, "loss": 2.9987, "theoretical_loss": 3.7362380719217327, "tokens_seen": 784865280 }, { "epoch": 2.03, "learning_rate": 0.00038495486459378135, "loss": 3.0573, "theoretical_loss": 3.736207040497656, "tokens_seen": 784930816 }, { "epoch": 2.03, "learning_rate": 0.00038494483450351053, "loss": 3.094, "theoretical_loss": 3.73617601238975, "tokens_seen": 784996352 }, { "epoch": 2.03, "learning_rate": 0.00038493480441323977, "loss": 3.1075, "theoretical_loss": 3.736144987597386, "tokens_seen": 785061888 }, { "epoch": 2.03, "learning_rate": 0.0003849247743229689, "loss": 2.9768, "theoretical_loss": 3.7361139661199307, "tokens_seen": 785127424 }, { "epoch": 2.03, "learning_rate": 0.00038491474423269813, "loss": 2.8979, "theoretical_loss": 3.7360829479567546, "tokens_seen": 785192960 }, { "epoch": 2.03, "learning_rate": 0.00038490471414242725, "loss": 3.0039, "theoretical_loss": 3.736051933107226, "tokens_seen": 785258496 }, { "epoch": 2.03, "learning_rate": 0.0003848946840521565, "loss": 2.819, "theoretical_loss": 3.7360209215707156, "tokens_seen": 785324032 }, { "epoch": 2.03, "learning_rate": 0.00038488465396188567, "loss": 3.0174, "theoretical_loss": 3.7359899133465917, "tokens_seen": 785389568 }, { "epoch": 2.03, "learning_rate": 0.00038487462387161485, "loss": 2.7808, "theoretical_loss": 3.7359589084342253, "tokens_seen": 785455104 }, { "epoch": 2.03, "learning_rate": 0.00038486459378134403, "loss": 3.0273, "theoretical_loss": 3.7359279068329863, "tokens_seen": 785520640 }, { "epoch": 2.03, "learning_rate": 0.00038485456369107327, "loss": 2.9712, "theoretical_loss": 3.7358969085422444, "tokens_seen": 785586176 }, { "epoch": 2.03, "learning_rate": 0.0003848445336008024, "loss": 3.0238, "theoretical_loss": 3.73586591356137, "tokens_seen": 785651712 }, { "epoch": 2.03, "learning_rate": 0.00038483450351053163, "loss": 2.9452, "theoretical_loss": 3.7358349218897335, "tokens_seen": 785717248 }, { "epoch": 2.03, "learning_rate": 0.00038482447342026076, "loss": 3.2082, "theoretical_loss": 3.7358039335267064, "tokens_seen": 785782784 }, { "epoch": 2.03, "learning_rate": 0.00038481444332999, "loss": 2.9417, "theoretical_loss": 3.7357729484716584, "tokens_seen": 785848320 }, { "epoch": 2.03, "learning_rate": 0.0003848044132397192, "loss": 3.0147, "theoretical_loss": 3.735741966723962, "tokens_seen": 785913856 }, { "epoch": 2.03, "learning_rate": 0.00038479438314944836, "loss": 3.0546, "theoretical_loss": 3.7357109882829875, "tokens_seen": 785979392 }, { "epoch": 2.03, "learning_rate": 0.00038478435305917754, "loss": 2.8338, "theoretical_loss": 3.7356800131481065, "tokens_seen": 786044928 }, { "epoch": 2.03, "learning_rate": 0.0003847743229689067, "loss": 3.0212, "theoretical_loss": 3.7356490413186902, "tokens_seen": 786110464 }, { "epoch": 2.03, "learning_rate": 0.0003847642928786359, "loss": 2.9665, "theoretical_loss": 3.735618072794111, "tokens_seen": 786176000 }, { "epoch": 2.03, "learning_rate": 0.00038475426278836513, "loss": 2.8682, "theoretical_loss": 3.73558710757374, "tokens_seen": 786241536 }, { "epoch": 2.03, "learning_rate": 0.00038474423269809426, "loss": 2.8639, "theoretical_loss": 3.73555614565695, "tokens_seen": 786307072 }, { "epoch": 2.03, "learning_rate": 0.0003847342026078235, "loss": 2.9266, "theoretical_loss": 3.735525187043113, "tokens_seen": 786372608 }, { "epoch": 2.03, "objective/train/docs_used": 1270619, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1743619441986084, "objective/train/theoretical_loss": 3.7355097089746057, "objective/train/tokens_used": 806865376, "theoretical_loss": 3.7355097089746057, "tokens_seen": 786405376 }, { "epoch": 2.03, "learning_rate": 0.0003847241725175527, "loss": 2.9693, "theoretical_loss": 3.735494231731601, "tokens_seen": 786438144 }, { "epoch": 2.03, "learning_rate": 0.00038471414242728186, "loss": 2.9565, "theoretical_loss": 3.7354632797217877, "tokens_seen": 786503680 }, { "epoch": 2.03, "learning_rate": 0.00038470411233701104, "loss": 2.7665, "theoretical_loss": 3.735432331013045, "tokens_seen": 786569216 }, { "epoch": 2.03, "learning_rate": 0.0003846940822467402, "loss": 2.9139, "theoretical_loss": 3.735401385604746, "tokens_seen": 786634752 }, { "epoch": 2.03, "learning_rate": 0.0003846840521564694, "loss": 3.0172, "theoretical_loss": 3.7353704434962633, "tokens_seen": 786700288 }, { "epoch": 2.03, "learning_rate": 0.00038467402206619864, "loss": 2.8204, "theoretical_loss": 3.735339504686971, "tokens_seen": 786765824 }, { "epoch": 2.03, "learning_rate": 0.00038466399197592776, "loss": 2.8671, "theoretical_loss": 3.735308569176243, "tokens_seen": 786831360 }, { "epoch": 2.03, "learning_rate": 0.000384653961885657, "loss": 2.8492, "theoretical_loss": 3.735277636963451, "tokens_seen": 786896896 }, { "epoch": 2.03, "learning_rate": 0.0003846439317953861, "loss": 2.9423, "theoretical_loss": 3.7352467080479705, "tokens_seen": 786962432 }, { "epoch": 2.03, "learning_rate": 0.00038463390170511536, "loss": 2.8835, "theoretical_loss": 3.735215782429175, "tokens_seen": 787027968 }, { "epoch": 2.03, "learning_rate": 0.00038462387161484454, "loss": 3.0586, "theoretical_loss": 3.735184860106439, "tokens_seen": 787093504 }, { "epoch": 2.03, "learning_rate": 0.0003846138415245737, "loss": 2.9785, "theoretical_loss": 3.735153941079136, "tokens_seen": 787159040 }, { "epoch": 2.03, "learning_rate": 0.0003846038114343029, "loss": 3.1068, "theoretical_loss": 3.735123025346641, "tokens_seen": 787224576 }, { "epoch": 2.03, "learning_rate": 0.0003845937813440321, "loss": 2.912, "theoretical_loss": 3.7350921129083283, "tokens_seen": 787290112 }, { "epoch": 2.03, "learning_rate": 0.00038458375125376127, "loss": 2.9662, "theoretical_loss": 3.735061203763573, "tokens_seen": 787355648 }, { "epoch": 2.03, "learning_rate": 0.0003845737211634905, "loss": 2.9652, "theoretical_loss": 3.73503029791175, "tokens_seen": 787421184 }, { "epoch": 2.03, "learning_rate": 0.00038456369107321963, "loss": 3.1269, "theoretical_loss": 3.7349993953522347, "tokens_seen": 787486720 }, { "epoch": 2.03, "learning_rate": 0.00038455366098294886, "loss": 2.9541, "theoretical_loss": 3.7349684960844023, "tokens_seen": 787552256 }, { "epoch": 2.03, "learning_rate": 0.00038454363089267805, "loss": 2.9243, "theoretical_loss": 3.734937600107628, "tokens_seen": 787617792 }, { "epoch": 2.03, "learning_rate": 0.00038453360080240723, "loss": 2.9501, "theoretical_loss": 3.734906707421288, "tokens_seen": 787683328 }, { "epoch": 2.03, "learning_rate": 0.0003845235707121364, "loss": 2.9042, "theoretical_loss": 3.7348758180247574, "tokens_seen": 787748864 }, { "epoch": 2.03, "learning_rate": 0.0003845135406218656, "loss": 2.958, "theoretical_loss": 3.734844931917413, "tokens_seen": 787814400 }, { "epoch": 2.03, "learning_rate": 0.00038450351053159477, "loss": 3.0263, "theoretical_loss": 3.73481404909863, "tokens_seen": 787879936 }, { "epoch": 2.03, "learning_rate": 0.000384493480441324, "loss": 3.0094, "theoretical_loss": 3.7347831695677867, "tokens_seen": 787945472 }, { "epoch": 2.03, "learning_rate": 0.00038448345035105313, "loss": 3.0426, "theoretical_loss": 3.7347522933242576, "tokens_seen": 788011008 }, { "epoch": 2.03, "objective/train/docs_used": 1273240, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0862505435943604, "objective/train/theoretical_loss": 3.734736856435041, "objective/train/tokens_used": 808503776, "theoretical_loss": 3.734736856435041, "tokens_seen": 788043776 }, { "epoch": 2.03, "learning_rate": 0.00038447342026078237, "loss": 3.0143, "theoretical_loss": 3.73472142036742, "tokens_seen": 788076544 }, { "epoch": 2.03, "learning_rate": 0.0003844633901705115, "loss": 2.8852, "theoretical_loss": 3.734690550696651, "tokens_seen": 788142080 }, { "epoch": 2.03, "learning_rate": 0.00038445336008024073, "loss": 2.9274, "theoretical_loss": 3.7346596843113273, "tokens_seen": 788207616 }, { "epoch": 2.03, "learning_rate": 0.0003844433299899699, "loss": 2.9986, "theoretical_loss": 3.7346288212108267, "tokens_seen": 788273152 }, { "epoch": 2.03, "learning_rate": 0.0003844332998996991, "loss": 2.7528, "theoretical_loss": 3.734597961394526, "tokens_seen": 788338688 }, { "epoch": 2.03, "learning_rate": 0.0003844232698094283, "loss": 2.9959, "theoretical_loss": 3.734567104861803, "tokens_seen": 788404224 }, { "epoch": 2.03, "learning_rate": 0.00038441323971915745, "loss": 2.8445, "theoretical_loss": 3.7345362516120355, "tokens_seen": 788469760 }, { "epoch": 2.03, "learning_rate": 0.00038440320962888664, "loss": 2.9687, "theoretical_loss": 3.7345054016446015, "tokens_seen": 788535296 }, { "epoch": 2.03, "learning_rate": 0.00038439317953861587, "loss": 2.9754, "theoretical_loss": 3.7344745549588785, "tokens_seen": 788600832 }, { "epoch": 2.03, "learning_rate": 0.00038438314944834505, "loss": 2.9018, "theoretical_loss": 3.7344437115542446, "tokens_seen": 788666368 }, { "epoch": 2.03, "learning_rate": 0.00038437311935807423, "loss": 2.7258, "theoretical_loss": 3.734412871430079, "tokens_seen": 788731904 }, { "epoch": 2.03, "learning_rate": 0.00038436308926780347, "loss": 2.9211, "theoretical_loss": 3.73438203458576, "tokens_seen": 788797440 }, { "epoch": 2.03, "learning_rate": 0.0003843530591775326, "loss": 2.7764, "theoretical_loss": 3.734351201020666, "tokens_seen": 788862976 }, { "epoch": 2.03, "learning_rate": 0.00038434302908726183, "loss": 2.9857, "theoretical_loss": 3.734320370734176, "tokens_seen": 788928512 }, { "epoch": 2.03, "learning_rate": 0.00038433299899699096, "loss": 2.7998, "theoretical_loss": 3.7342895437256702, "tokens_seen": 788994048 }, { "epoch": 2.03, "learning_rate": 0.0003843229689067202, "loss": 3.1116, "theoretical_loss": 3.734258719994526, "tokens_seen": 789059584 }, { "epoch": 2.03, "learning_rate": 0.0003843129388164494, "loss": 2.725, "theoretical_loss": 3.7342278995401235, "tokens_seen": 789125120 }, { "epoch": 2.03, "learning_rate": 0.00038430290872617856, "loss": 3.001, "theoretical_loss": 3.7341970823618427, "tokens_seen": 789190656 }, { "epoch": 2.03, "learning_rate": 0.00038429287863590774, "loss": 2.9792, "theoretical_loss": 3.7341662684590626, "tokens_seen": 789256192 }, { "epoch": 2.03, "learning_rate": 0.0003842828485456369, "loss": 2.9035, "theoretical_loss": 3.734135457831164, "tokens_seen": 789321728 }, { "epoch": 2.03, "learning_rate": 0.0003842728184553661, "loss": 3.0321, "theoretical_loss": 3.7341046504775264, "tokens_seen": 789387264 }, { "epoch": 2.03, "learning_rate": 0.00038426278836509533, "loss": 2.9982, "theoretical_loss": 3.7340738463975303, "tokens_seen": 789452800 }, { "epoch": 2.03, "learning_rate": 0.00038425275827482446, "loss": 2.7938, "theoretical_loss": 3.734043045590556, "tokens_seen": 789518336 }, { "epoch": 2.03, "learning_rate": 0.0003842427281845537, "loss": 3.1226, "theoretical_loss": 3.7340122480559836, "tokens_seen": 789583872 }, { "epoch": 2.03, "learning_rate": 0.0003842326980942829, "loss": 2.9233, "theoretical_loss": 3.733981453793195, "tokens_seen": 789649408 }, { "epoch": 2.03, "objective/train/docs_used": 1276035, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8479435443878174, "objective/train/theoretical_loss": 3.7339660578885256, "objective/train/tokens_used": 810142176, "theoretical_loss": 3.7339660578885256, "tokens_seen": 789682176 }, { "epoch": 2.03, "learning_rate": 0.00038422266800401206, "loss": 2.9616, "theoretical_loss": 3.73395066280157, "tokens_seen": 789714944 }, { "epoch": 2.03, "learning_rate": 0.00038421263791374124, "loss": 2.9523, "theoretical_loss": 3.733919875080491, "tokens_seen": 789780480 }, { "epoch": 2.03, "learning_rate": 0.0003842026078234704, "loss": 2.8547, "theoretical_loss": 3.733889090629338, "tokens_seen": 789846016 }, { "epoch": 2.03, "learning_rate": 0.0003841925777331996, "loss": 2.8009, "theoretical_loss": 3.733858309447492, "tokens_seen": 789911552 }, { "epoch": 2.03, "learning_rate": 0.00038418254764292884, "loss": 3.012, "theoretical_loss": 3.7338275315343368, "tokens_seen": 789977088 }, { "epoch": 2.03, "learning_rate": 0.00038417251755265796, "loss": 2.9447, "theoretical_loss": 3.7337967568892525, "tokens_seen": 790042624 }, { "epoch": 2.03, "learning_rate": 0.0003841624874623872, "loss": 2.9174, "theoretical_loss": 3.733765985511621, "tokens_seen": 790108160 }, { "epoch": 2.03, "learning_rate": 0.0003841524573721163, "loss": 2.8996, "theoretical_loss": 3.7337352174008256, "tokens_seen": 790173696 }, { "epoch": 2.03, "learning_rate": 0.00038414242728184556, "loss": 2.8941, "theoretical_loss": 3.733704452556247, "tokens_seen": 790239232 }, { "epoch": 2.03, "learning_rate": 0.00038413239719157474, "loss": 2.7346, "theoretical_loss": 3.7336736909772688, "tokens_seen": 790304768 }, { "epoch": 2.03, "learning_rate": 0.0003841223671013039, "loss": 2.9289, "theoretical_loss": 3.7336429326632734, "tokens_seen": 790370304 }, { "epoch": 2.03, "learning_rate": 0.0003841123370110331, "loss": 3.0659, "theoretical_loss": 3.733612177613643, "tokens_seen": 790435840 }, { "epoch": 2.03, "learning_rate": 0.0003841023069207623, "loss": 2.9122, "theoretical_loss": 3.7335814258277606, "tokens_seen": 790501376 }, { "epoch": 2.03, "learning_rate": 0.00038409227683049147, "loss": 3.0489, "theoretical_loss": 3.73355067730501, "tokens_seen": 790566912 }, { "epoch": 2.03, "learning_rate": 0.0003840822467402207, "loss": 2.9978, "theoretical_loss": 3.7335199320447745, "tokens_seen": 790632448 }, { "epoch": 2.03, "learning_rate": 0.00038407221664994983, "loss": 2.9831, "theoretical_loss": 3.733489190046437, "tokens_seen": 790697984 }, { "epoch": 2.03, "learning_rate": 0.00038406218655967907, "loss": 2.688, "theoretical_loss": 3.733458451309381, "tokens_seen": 790763520 }, { "epoch": 2.03, "learning_rate": 0.00038405215646940825, "loss": 3.0115, "theoretical_loss": 3.733427715832991, "tokens_seen": 790829056 }, { "epoch": 2.03, "learning_rate": 0.00038404212637913743, "loss": 3.0347, "theoretical_loss": 3.7333969836166503, "tokens_seen": 790894592 }, { "epoch": 2.03, "learning_rate": 0.0003840320962888666, "loss": 2.9955, "theoretical_loss": 3.7333662546597433, "tokens_seen": 790960128 }, { "epoch": 2.03, "learning_rate": 0.0003840220661985958, "loss": 3.0427, "theoretical_loss": 3.7333355289616543, "tokens_seen": 791025664 }, { "epoch": 2.03, "learning_rate": 0.00038401203610832497, "loss": 2.9945, "theoretical_loss": 3.733304806521767, "tokens_seen": 791091200 }, { "epoch": 2.03, "learning_rate": 0.0003840020060180542, "loss": 2.9839, "theoretical_loss": 3.733274087339468, "tokens_seen": 791156736 }, { "epoch": 2.03, "learning_rate": 0.00038399197592778333, "loss": 3.133, "theoretical_loss": 3.73324337141414, "tokens_seen": 791222272 }, { "epoch": 2.03, "learning_rate": 0.00038398194583751257, "loss": 2.7711, "theoretical_loss": 3.7332126587451686, "tokens_seen": 791287808 }, { "epoch": 2.03, "objective/train/docs_used": 1278846, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.949676990509033, "objective/train/theoretical_loss": 3.7331973036316253, "objective/train/tokens_used": 811780576, "theoretical_loss": 3.7331973036316253, "tokens_seen": 791320576 }, { "epoch": 2.03, "learning_rate": 0.0003839719157472417, "loss": 2.901, "theoretical_loss": 3.7331819493319394, "tokens_seen": 791353344 }, { "epoch": 2.03, "learning_rate": 0.00038396188565697093, "loss": 2.8902, "theoretical_loss": 3.733151243173838, "tokens_seen": 791418880 }, { "epoch": 2.03, "learning_rate": 0.0003839518555667001, "loss": 3.0889, "theoretical_loss": 3.733120540270248, "tokens_seen": 791484416 }, { "epoch": 2.03, "learning_rate": 0.0003839418254764293, "loss": 3.0766, "theoretical_loss": 3.733089840620557, "tokens_seen": 791549952 }, { "epoch": 2.03, "learning_rate": 0.0003839317953861585, "loss": 2.891, "theoretical_loss": 3.73305914422415, "tokens_seen": 791615488 }, { "epoch": 2.03, "learning_rate": 0.00038392176529588765, "loss": 2.9414, "theoretical_loss": 3.7330284510804126, "tokens_seen": 791681024 }, { "epoch": 2.03, "learning_rate": 0.00038391173520561684, "loss": 3.031, "theoretical_loss": 3.7329977611887317, "tokens_seen": 791746560 }, { "epoch": 2.03, "learning_rate": 0.00038390170511534607, "loss": 2.8007, "theoretical_loss": 3.7329670745484926, "tokens_seen": 791812096 }, { "epoch": 2.03, "learning_rate": 0.0003838916750250752, "loss": 2.7927, "theoretical_loss": 3.732936391159083, "tokens_seen": 791877632 }, { "epoch": 2.03, "learning_rate": 0.00038388164493480443, "loss": 2.9518, "theoretical_loss": 3.732905711019889, "tokens_seen": 791943168 }, { "epoch": 2.03, "learning_rate": 0.0003838716148445336, "loss": 2.8816, "theoretical_loss": 3.7328750341302968, "tokens_seen": 792008704 }, { "epoch": 2.03, "learning_rate": 0.0003838615847542628, "loss": 2.9217, "theoretical_loss": 3.732844360489694, "tokens_seen": 792074240 }, { "epoch": 2.03, "learning_rate": 0.000383851554663992, "loss": 3.0496, "theoretical_loss": 3.7328136900974673, "tokens_seen": 792139776 }, { "epoch": 2.03, "learning_rate": 0.00038384152457372116, "loss": 2.8649, "theoretical_loss": 3.7327830229530043, "tokens_seen": 792205312 }, { "epoch": 2.03, "learning_rate": 0.00038383149448345034, "loss": 2.7952, "theoretical_loss": 3.7327523590556924, "tokens_seen": 792270848 }, { "epoch": 2.03, "learning_rate": 0.0003838214643931796, "loss": 2.8037, "theoretical_loss": 3.7327216984049194, "tokens_seen": 792336384 }, { "epoch": 2.03, "learning_rate": 0.0003838114343029087, "loss": 2.847, "theoretical_loss": 3.7326910410000727, "tokens_seen": 792401920 }, { "epoch": 2.03, "learning_rate": 0.00038380140421263794, "loss": 2.8966, "theoretical_loss": 3.732660386840541, "tokens_seen": 792467456 }, { "epoch": 2.03, "learning_rate": 0.00038379137412236706, "loss": 2.681, "theoretical_loss": 3.7326297359257117, "tokens_seen": 792532992 }, { "epoch": 2.03, "learning_rate": 0.0003837813440320963, "loss": 2.834, "theoretical_loss": 3.7325990882549727, "tokens_seen": 792598528 }, { "epoch": 2.03, "learning_rate": 0.0003837713139418255, "loss": 2.8482, "theoretical_loss": 3.7325684438277134, "tokens_seen": 792664064 }, { "epoch": 2.03, "learning_rate": 0.00038376128385155466, "loss": 2.8176, "theoretical_loss": 3.7325378026433222, "tokens_seen": 792729600 }, { "epoch": 2.03, "learning_rate": 0.00038375125376128384, "loss": 3.0178, "theoretical_loss": 3.732507164701187, "tokens_seen": 792795136 }, { "epoch": 2.03, "learning_rate": 0.0003837412236710131, "loss": 2.988, "theoretical_loss": 3.7324765300006977, "tokens_seen": 792860672 }, { "epoch": 2.03, "learning_rate": 0.0003837311935807422, "loss": 2.9853, "theoretical_loss": 3.7324458985412434, "tokens_seen": 792926208 }, { "epoch": 2.03, "objective/train/docs_used": 1280247, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.932974100112915, "objective/train/theoretical_loss": 3.7324305840267136, "objective/train/tokens_used": 813418976, "theoretical_loss": 3.7324305840267136, "tokens_seen": 792958976 }, { "epoch": 2.03, "learning_rate": 0.00038372116349047144, "loss": 2.9438, "theoretical_loss": 3.732415270322213, "tokens_seen": 792991744 }, { "epoch": 2.03, "learning_rate": 0.00038371113340020057, "loss": 3.074, "theoretical_loss": 3.7323846453429965, "tokens_seen": 793057280 }, { "epoch": 2.03, "learning_rate": 0.0003837011033099298, "loss": 2.9147, "theoretical_loss": 3.7323540236029826, "tokens_seen": 793122816 }, { "epoch": 2.03, "learning_rate": 0.000383691073219659, "loss": 3.0599, "theoretical_loss": 3.7323234051015617, "tokens_seen": 793188352 }, { "epoch": 2.03, "learning_rate": 0.00038368104312938816, "loss": 2.8761, "theoretical_loss": 3.732292789838124, "tokens_seen": 793253888 }, { "epoch": 2.03, "learning_rate": 0.00038367101303911735, "loss": 2.7172, "theoretical_loss": 3.732262177812059, "tokens_seen": 793319424 }, { "epoch": 2.03, "learning_rate": 0.0003836609829488465, "loss": 2.9373, "theoretical_loss": 3.7322315690227565, "tokens_seen": 793384960 }, { "epoch": 2.03, "learning_rate": 0.0003836509528585757, "loss": 2.8619, "theoretical_loss": 3.7322009634696083, "tokens_seen": 793450496 }, { "epoch": 2.03, "learning_rate": 0.00038364092276830494, "loss": 2.9373, "theoretical_loss": 3.7321703611520043, "tokens_seen": 793516032 }, { "epoch": 2.03, "learning_rate": 0.0003836308926780341, "loss": 2.8968, "theoretical_loss": 3.732139762069335, "tokens_seen": 793581568 }, { "epoch": 2.03, "learning_rate": 0.0003836208625877633, "loss": 2.8394, "theoretical_loss": 3.732109166220992, "tokens_seen": 793647104 }, { "epoch": 2.03, "learning_rate": 0.0003836108324974925, "loss": 2.8891, "theoretical_loss": 3.7320785736063655, "tokens_seen": 793712640 }, { "epoch": 2.03, "learning_rate": 0.00038360080240722167, "loss": 2.9623, "theoretical_loss": 3.732047984224848, "tokens_seen": 793778176 }, { "epoch": 2.03, "learning_rate": 0.0003835907723169509, "loss": 2.9752, "theoretical_loss": 3.73201739807583, "tokens_seen": 793843712 }, { "epoch": 2.03, "learning_rate": 0.00038358074222668003, "loss": 3.0049, "theoretical_loss": 3.7319868151587032, "tokens_seen": 793909248 }, { "epoch": 2.03, "learning_rate": 0.00038357071213640927, "loss": 2.8354, "theoretical_loss": 3.7319562354728593, "tokens_seen": 793974784 }, { "epoch": 2.03, "learning_rate": 0.00038356068204613845, "loss": 2.879, "theoretical_loss": 3.7319256590176906, "tokens_seen": 794040320 }, { "epoch": 2.03, "learning_rate": 0.00038355065195586763, "loss": 2.944, "theoretical_loss": 3.7318950857925888, "tokens_seen": 794105856 }, { "epoch": 2.03, "learning_rate": 0.0003835406218655968, "loss": 2.9525, "theoretical_loss": 3.7318645157969463, "tokens_seen": 794171392 }, { "epoch": 2.03, "learning_rate": 0.000383530591775326, "loss": 2.9207, "theoretical_loss": 3.7318339490301558, "tokens_seen": 794236928 }, { "epoch": 2.03, "learning_rate": 0.00038352056168505517, "loss": 2.9533, "theoretical_loss": 3.7318033854916095, "tokens_seen": 794302464 }, { "epoch": 2.03, "learning_rate": 0.0003835105315947844, "loss": 2.9439, "theoretical_loss": 3.7317728251807, "tokens_seen": 794368000 }, { "epoch": 2.03, "learning_rate": 0.00038350050150451353, "loss": 3.0007, "theoretical_loss": 3.731742268096821, "tokens_seen": 794433536 }, { "epoch": 2.03, "learning_rate": 0.00038349047141424277, "loss": 2.9492, "theoretical_loss": 3.7317117142393643, "tokens_seen": 794499072 }, { "epoch": 2.03, "learning_rate": 0.0003834804413239719, "loss": 2.9234, "theoretical_loss": 3.7316811636077243, "tokens_seen": 794564608 }, { "epoch": 2.03, "objective/train/docs_used": 1282668, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0267624855041504, "objective/train/theoretical_loss": 3.7316658895013957, "objective/train/tokens_used": 815057376, "theoretical_loss": 3.7316658895013957, "tokens_seen": 794597376 }, { "epoch": 2.03, "learning_rate": 0.00038347041123370113, "loss": 2.982, "theoretical_loss": 3.7316506162012937, "tokens_seen": 794630144 }, { "epoch": 2.03, "learning_rate": 0.0003834603811434303, "loss": 2.9005, "theoretical_loss": 3.7316200720194663, "tokens_seen": 794695680 }, { "epoch": 2.03, "learning_rate": 0.0003834503510531595, "loss": 2.9658, "theoretical_loss": 3.7315895310616365, "tokens_seen": 794761216 }, { "epoch": 2.03, "learning_rate": 0.0003834403209628887, "loss": 3.0354, "theoretical_loss": 3.731558993327197, "tokens_seen": 794826752 }, { "epoch": 2.03, "learning_rate": 0.00038343029087261786, "loss": 2.8783, "theoretical_loss": 3.731528458815543, "tokens_seen": 794892288 }, { "epoch": 2.03, "learning_rate": 0.00038342026078234704, "loss": 2.9689, "theoretical_loss": 3.7314979275260676, "tokens_seen": 794957824 }, { "epoch": 2.03, "learning_rate": 0.00038341023069207627, "loss": 2.878, "theoretical_loss": 3.7314673994581655, "tokens_seen": 795023360 }, { "epoch": 2.03, "learning_rate": 0.0003834002006018054, "loss": 2.9409, "theoretical_loss": 3.731436874611232, "tokens_seen": 795088896 }, { "epoch": 2.03, "learning_rate": 0.00038339017051153463, "loss": 2.9079, "theoretical_loss": 3.731406352984661, "tokens_seen": 795154432 }, { "epoch": 2.03, "learning_rate": 0.0003833801404212638, "loss": 2.909, "theoretical_loss": 3.7313758345778476, "tokens_seen": 795219968 }, { "epoch": 2.03, "learning_rate": 0.000383370110330993, "loss": 2.9913, "theoretical_loss": 3.7313453193901873, "tokens_seen": 795285504 }, { "epoch": 2.03, "learning_rate": 0.0003833600802407222, "loss": 3.0217, "theoretical_loss": 3.7313148074210742, "tokens_seen": 795351040 }, { "epoch": 2.03, "learning_rate": 0.00038335005015045136, "loss": 2.7686, "theoretical_loss": 3.7312842986699053, "tokens_seen": 795416576 }, { "epoch": 2.03, "learning_rate": 0.00038334002006018054, "loss": 3.0318, "theoretical_loss": 3.7312537931360743, "tokens_seen": 795482112 }, { "epoch": 2.03, "learning_rate": 0.0003833299899699098, "loss": 2.9319, "theoretical_loss": 3.731223290818978, "tokens_seen": 795547648 }, { "epoch": 2.03, "learning_rate": 0.0003833199598796389, "loss": 3.0288, "theoretical_loss": 3.731192791718012, "tokens_seen": 795613184 }, { "epoch": 2.03, "learning_rate": 0.00038330992978936814, "loss": 2.9647, "theoretical_loss": 3.731162295832573, "tokens_seen": 795678720 }, { "epoch": 2.03, "learning_rate": 0.00038329989969909726, "loss": 2.9911, "theoretical_loss": 3.731131803162056, "tokens_seen": 795744256 }, { "epoch": 2.03, "learning_rate": 0.0003832898696088265, "loss": 2.8274, "theoretical_loss": 3.7311013137058575, "tokens_seen": 795809792 }, { "epoch": 2.03, "learning_rate": 0.0003832798395185557, "loss": 2.9393, "theoretical_loss": 3.731070827463375, "tokens_seen": 795875328 }, { "epoch": 2.03, "learning_rate": 0.00038326980942828486, "loss": 2.8838, "theoretical_loss": 3.731040344434004, "tokens_seen": 795940864 }, { "epoch": 2.03, "learning_rate": 0.00038325977933801404, "loss": 2.9022, "theoretical_loss": 3.7310098646171426, "tokens_seen": 796006400 }, { "epoch": 2.03, "learning_rate": 0.0003832497492477433, "loss": 2.9141, "theoretical_loss": 3.7309793880121864, "tokens_seen": 796071936 }, { "epoch": 2.03, "learning_rate": 0.0003832397191574724, "loss": 2.8697, "theoretical_loss": 3.7309489146185335, "tokens_seen": 796137472 }, { "epoch": 2.03, "learning_rate": 0.00038322968906720164, "loss": 2.8655, "theoretical_loss": 3.730918444435581, "tokens_seen": 796203008 }, { "epoch": 2.03, "objective/train/docs_used": 1285455, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9700472354888916, "objective/train/theoretical_loss": 3.7309032105479294, "objective/train/tokens_used": 816695776, "theoretical_loss": 3.7309032105479294, "tokens_seen": 796235776 }, { "epoch": 2.03, "learning_rate": 0.00038321965897693077, "loss": 2.78, "theoretical_loss": 3.730887977462727, "tokens_seen": 796268544 }, { "epoch": 2.03, "learning_rate": 0.00038320962888666, "loss": 3.0615, "theoretical_loss": 3.730857513699368, "tokens_seen": 796334080 }, { "epoch": 2.03, "learning_rate": 0.0003831995987963892, "loss": 2.9095, "theoretical_loss": 3.7308270531449015, "tokens_seen": 796399616 }, { "epoch": 2.03, "learning_rate": 0.00038318956870611836, "loss": 2.7818, "theoretical_loss": 3.7307965957987275, "tokens_seen": 796465152 }, { "epoch": 2.03, "learning_rate": 0.00038317953861584755, "loss": 2.8989, "theoretical_loss": 3.7307661416602422, "tokens_seen": 796530688 }, { "epoch": 2.03, "learning_rate": 0.0003831695085255767, "loss": 2.8865, "theoretical_loss": 3.730735690728845, "tokens_seen": 796596224 }, { "epoch": 2.03, "learning_rate": 0.0003831594784353059, "loss": 2.8572, "theoretical_loss": 3.7307052430039334, "tokens_seen": 796661760 }, { "epoch": 2.03, "learning_rate": 0.00038314944834503514, "loss": 2.9955, "theoretical_loss": 3.730674798484907, "tokens_seen": 796727296 }, { "epoch": 2.03, "learning_rate": 0.00038313941825476427, "loss": 2.8966, "theoretical_loss": 3.730644357171164, "tokens_seen": 796792832 }, { "epoch": 2.03, "learning_rate": 0.0003831293881644935, "loss": 2.8454, "theoretical_loss": 3.7306139190621037, "tokens_seen": 796858368 }, { "epoch": 2.03, "learning_rate": 0.00038311935807422263, "loss": 2.9434, "theoretical_loss": 3.7305834841571253, "tokens_seen": 796923904 }, { "epoch": 2.03, "learning_rate": 0.00038310932798395187, "loss": 2.8946, "theoretical_loss": 3.7305530524556274, "tokens_seen": 796989440 }, { "epoch": 2.03, "learning_rate": 0.00038309929789368105, "loss": 2.9355, "theoretical_loss": 3.73052262395701, "tokens_seen": 797054976 }, { "epoch": 2.03, "learning_rate": 0.00038308926780341023, "loss": 3.0002, "theoretical_loss": 3.7304921986606727, "tokens_seen": 797120512 }, { "epoch": 2.03, "learning_rate": 0.0003830792377131394, "loss": 2.7597, "theoretical_loss": 3.730461776566015, "tokens_seen": 797186048 }, { "epoch": 2.03, "learning_rate": 0.00038306920762286865, "loss": 2.9295, "theoretical_loss": 3.730431357672437, "tokens_seen": 797251584 }, { "epoch": 2.03, "learning_rate": 0.0003830591775325978, "loss": 2.99, "theoretical_loss": 3.730400941979338, "tokens_seen": 797317120 }, { "epoch": 2.03, "learning_rate": 0.000383049147442327, "loss": 2.8774, "theoretical_loss": 3.7303705294861196, "tokens_seen": 797382656 }, { "epoch": 2.03, "learning_rate": 0.00038303911735205614, "loss": 2.9869, "theoretical_loss": 3.7303401201921815, "tokens_seen": 797448192 }, { "epoch": 2.03, "learning_rate": 0.00038302908726178537, "loss": 2.9549, "theoretical_loss": 3.730309714096924, "tokens_seen": 797513728 }, { "epoch": 2.03, "learning_rate": 0.00038301905717151455, "loss": 2.9645, "theoretical_loss": 3.730279311199748, "tokens_seen": 797579264 }, { "epoch": 2.03, "learning_rate": 0.00038300902708124373, "loss": 2.896, "theoretical_loss": 3.730248911500055, "tokens_seen": 797644800 }, { "epoch": 2.03, "learning_rate": 0.0003829989969909729, "loss": 2.9723, "theoretical_loss": 3.7302185149972455, "tokens_seen": 797710336 }, { "epoch": 2.03, "learning_rate": 0.0003829889669007021, "loss": 2.8647, "theoretical_loss": 3.7301881216907207, "tokens_seen": 797775872 }, { "epoch": 2.03, "learning_rate": 0.0003829789368104313, "loss": 3.0142, "theoretical_loss": 3.730157731579882, "tokens_seen": 797841408 }, { "epoch": 2.03, "objective/train/docs_used": 1288022, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8650410175323486, "objective/train/theoretical_loss": 3.7301425377226582, "objective/train/tokens_used": 818334176, "theoretical_loss": 3.7301425377226582, "tokens_seen": 797874176 }, { "epoch": 2.03, "learning_rate": 0.0003829689067201605, "loss": 2.942, "theoretical_loss": 3.730127344664131, "tokens_seen": 797906944 }, { "epoch": 2.03, "learning_rate": 0.00038295887662988964, "loss": 2.8966, "theoretical_loss": 3.73009696094287, "tokens_seen": 797972480 }, { "epoch": 2.03, "learning_rate": 0.0003829488465396189, "loss": 3.0049, "theoretical_loss": 3.730066580415499, "tokens_seen": 798038016 }, { "epoch": 2.03, "learning_rate": 0.000382938816449348, "loss": 2.8549, "theoretical_loss": 3.730036203081422, "tokens_seen": 798103552 }, { "epoch": 2.03, "learning_rate": 0.00038292878635907724, "loss": 2.8098, "theoretical_loss": 3.7300058289400404, "tokens_seen": 798169088 }, { "epoch": 2.03, "learning_rate": 0.0003829187562688064, "loss": 3.0263, "theoretical_loss": 3.7299754579907565, "tokens_seen": 798234624 }, { "epoch": 2.03, "learning_rate": 0.0003829087261785356, "loss": 3.0282, "theoretical_loss": 3.729945090232973, "tokens_seen": 798300160 }, { "epoch": 2.03, "learning_rate": 0.0003828986960882648, "loss": 3.0056, "theoretical_loss": 3.7299147256660925, "tokens_seen": 798365696 }, { "epoch": 2.03, "learning_rate": 0.000382888665997994, "loss": 2.8926, "theoretical_loss": 3.729884364289517, "tokens_seen": 798431232 }, { "epoch": 2.03, "learning_rate": 0.0003828786359077232, "loss": 2.8689, "theoretical_loss": 3.7298540061026513, "tokens_seen": 798496768 }, { "epoch": 2.03, "learning_rate": 0.0003828686058174524, "loss": 2.9532, "theoretical_loss": 3.7298236511048968, "tokens_seen": 798562304 }, { "epoch": 2.03, "learning_rate": 0.00038285857572718156, "loss": 2.9209, "theoretical_loss": 3.729793299295658, "tokens_seen": 798627840 }, { "epoch": 2.03, "learning_rate": 0.00038284854563691074, "loss": 3.0153, "theoretical_loss": 3.7297629506743375, "tokens_seen": 798693376 }, { "epoch": 2.03, "learning_rate": 0.00038283851554664, "loss": 3.0405, "theoretical_loss": 3.729732605240339, "tokens_seen": 798758912 }, { "epoch": 2.03, "learning_rate": 0.0003828284854563691, "loss": 3.009, "theoretical_loss": 3.729702262993067, "tokens_seen": 798824448 }, { "epoch": 2.03, "learning_rate": 0.00038281845536609834, "loss": 2.8896, "theoretical_loss": 3.729671923931925, "tokens_seen": 798889984 }, { "epoch": 2.04, "learning_rate": 0.00038280842527582746, "loss": 2.7614, "theoretical_loss": 3.7296415880563174, "tokens_seen": 798955520 }, { "epoch": 2.04, "learning_rate": 0.0003827983951855567, "loss": 2.8425, "theoretical_loss": 3.7296112553656475, "tokens_seen": 799021056 }, { "epoch": 2.04, "learning_rate": 0.0003827883650952859, "loss": 3.0859, "theoretical_loss": 3.729580925859321, "tokens_seen": 799086592 }, { "epoch": 2.04, "learning_rate": 0.00038277833500501506, "loss": 2.8535, "theoretical_loss": 3.7295505995367417, "tokens_seen": 799152128 }, { "epoch": 2.04, "learning_rate": 0.00038276830491474424, "loss": 2.9025, "theoretical_loss": 3.729520276397314, "tokens_seen": 799217664 }, { "epoch": 2.04, "learning_rate": 0.0003827582748244735, "loss": 3.0043, "theoretical_loss": 3.7294899564404442, "tokens_seen": 799283200 }, { "epoch": 2.04, "learning_rate": 0.0003827482447342026, "loss": 2.8999, "theoretical_loss": 3.729459639665536, "tokens_seen": 799348736 }, { "epoch": 2.04, "learning_rate": 0.00038273821464393184, "loss": 2.9774, "theoretical_loss": 3.7294293260719953, "tokens_seen": 799414272 }, { "epoch": 2.04, "learning_rate": 0.00038272818455366097, "loss": 2.896, "theoretical_loss": 3.7293990156592276, "tokens_seen": 799479808 }, { "epoch": 2.04, "objective/train/docs_used": 1290903, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0261290073394775, "objective/train/theoretical_loss": 3.7293838616454478, "objective/train/tokens_used": 819972576, "theoretical_loss": 3.7293838616454478, "tokens_seen": 799512576 }, { "epoch": 2.04, "learning_rate": 0.0003827181544633902, "loss": 2.8819, "theoretical_loss": 3.729368708426638, "tokens_seen": 799545344 }, { "epoch": 2.04, "learning_rate": 0.0003827081243731194, "loss": 2.9894, "theoretical_loss": 3.7293384043736317, "tokens_seen": 799610880 }, { "epoch": 2.04, "learning_rate": 0.00038269809428284856, "loss": 2.9481, "theoretical_loss": 3.7293081034996165, "tokens_seen": 799676416 }, { "epoch": 2.04, "learning_rate": 0.00038268806419257775, "loss": 2.9285, "theoretical_loss": 3.7292778058039966, "tokens_seen": 799741952 }, { "epoch": 2.04, "learning_rate": 0.0003826780341023069, "loss": 2.9996, "theoretical_loss": 3.7292475112861787, "tokens_seen": 799807488 }, { "epoch": 2.04, "learning_rate": 0.0003826680040120361, "loss": 2.8797, "theoretical_loss": 3.729217219945569, "tokens_seen": 799873024 }, { "epoch": 2.04, "learning_rate": 0.00038265797392176534, "loss": 2.9573, "theoretical_loss": 3.729186931781575, "tokens_seen": 799938560 }, { "epoch": 2.04, "learning_rate": 0.00038264794383149447, "loss": 2.9953, "theoretical_loss": 3.7291566467936015, "tokens_seen": 800004096 }, { "epoch": 2.04, "learning_rate": 0.0003826379137412237, "loss": 2.9437, "theoretical_loss": 3.7291263649810573, "tokens_seen": 800069632 }, { "epoch": 2.04, "learning_rate": 0.00038262788365095283, "loss": 2.8786, "theoretical_loss": 3.729096086343348, "tokens_seen": 800135168 }, { "epoch": 2.04, "learning_rate": 0.00038261785356068207, "loss": 2.9645, "theoretical_loss": 3.7290658108798818, "tokens_seen": 800200704 }, { "epoch": 2.04, "learning_rate": 0.00038260782347041125, "loss": 2.9084, "theoretical_loss": 3.729035538590065, "tokens_seen": 800266240 }, { "epoch": 2.04, "learning_rate": 0.00038259779338014043, "loss": 2.8779, "theoretical_loss": 3.729005269473306, "tokens_seen": 800331776 }, { "epoch": 2.04, "learning_rate": 0.0003825877632898696, "loss": 2.8583, "theoretical_loss": 3.728975003529011, "tokens_seen": 800397312 }, { "epoch": 2.04, "learning_rate": 0.00038257773319959885, "loss": 2.9541, "theoretical_loss": 3.7289447407565897, "tokens_seen": 800462848 }, { "epoch": 2.04, "learning_rate": 0.000382567703109328, "loss": 2.7681, "theoretical_loss": 3.7289144811554484, "tokens_seen": 800528384 }, { "epoch": 2.04, "learning_rate": 0.0003825576730190572, "loss": 2.9674, "theoretical_loss": 3.728884224724996, "tokens_seen": 800593920 }, { "epoch": 2.04, "learning_rate": 0.00038254764292878634, "loss": 3.0605, "theoretical_loss": 3.7288539714646407, "tokens_seen": 800659456 }, { "epoch": 2.04, "learning_rate": 0.00038253761283851557, "loss": 3.0562, "theoretical_loss": 3.7288237213737907, "tokens_seen": 800724992 }, { "epoch": 2.04, "learning_rate": 0.00038252758274824475, "loss": 2.9551, "theoretical_loss": 3.7287934744518547, "tokens_seen": 800790528 }, { "epoch": 2.04, "learning_rate": 0.00038251755265797393, "loss": 2.9652, "theoretical_loss": 3.7287632306982417, "tokens_seen": 800856064 }, { "epoch": 2.04, "learning_rate": 0.0003825075225677031, "loss": 2.9373, "theoretical_loss": 3.72873299011236, "tokens_seen": 800921600 }, { "epoch": 2.04, "learning_rate": 0.0003824974924774323, "loss": 2.9015, "theoretical_loss": 3.7287027526936187, "tokens_seen": 800987136 }, { "epoch": 2.04, "learning_rate": 0.0003824874623871615, "loss": 3.049, "theoretical_loss": 3.728672518441428, "tokens_seen": 801052672 }, { "epoch": 2.04, "learning_rate": 0.0003824774322968907, "loss": 3.0964, "theoretical_loss": 3.7286422873551963, "tokens_seen": 801118208 }, { "epoch": 2.04, "objective/train/docs_used": 1293809, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.075124502182007, "objective/train/theoretical_loss": 3.72862717299913, "objective/train/tokens_used": 821610976, "theoretical_loss": 3.72862717299913, "tokens_seen": 801150976 }, { "epoch": 2.04, "learning_rate": 0.00038246740220661984, "loss": 2.93, "theoretical_loss": 3.728612059434333, "tokens_seen": 801183744 }, { "epoch": 2.04, "learning_rate": 0.0003824573721163491, "loss": 2.9183, "theoretical_loss": 3.7285818346782484, "tokens_seen": 801249280 }, { "epoch": 2.04, "learning_rate": 0.0003824473420260782, "loss": 2.9814, "theoretical_loss": 3.728551613086352, "tokens_seen": 801314816 }, { "epoch": 2.04, "learning_rate": 0.00038243731193580744, "loss": 2.9784, "theoretical_loss": 3.728521394658054, "tokens_seen": 801380352 }, { "epoch": 2.04, "learning_rate": 0.0003824272818455366, "loss": 2.8758, "theoretical_loss": 3.7284911793927646, "tokens_seen": 801445888 }, { "epoch": 2.04, "learning_rate": 0.0003824172517552658, "loss": 3.0752, "theoretical_loss": 3.7284609672898936, "tokens_seen": 801511424 }, { "epoch": 2.04, "learning_rate": 0.000382407221664995, "loss": 2.9437, "theoretical_loss": 3.728430758348852, "tokens_seen": 801576960 }, { "epoch": 2.04, "learning_rate": 0.0003823971915747242, "loss": 2.9303, "theoretical_loss": 3.7284005525690507, "tokens_seen": 801642496 }, { "epoch": 2.04, "learning_rate": 0.00038238716148445334, "loss": 2.9274, "theoretical_loss": 3.7283703499498992, "tokens_seen": 801708032 }, { "epoch": 2.04, "learning_rate": 0.0003823771313941826, "loss": 2.8934, "theoretical_loss": 3.7283401504908102, "tokens_seen": 801773568 }, { "epoch": 2.04, "learning_rate": 0.0003823671013039117, "loss": 2.986, "theoretical_loss": 3.7283099541911935, "tokens_seen": 801839104 }, { "epoch": 2.04, "learning_rate": 0.00038235707121364094, "loss": 2.9309, "theoretical_loss": 3.728279761050461, "tokens_seen": 801904640 }, { "epoch": 2.04, "learning_rate": 0.0003823470411233701, "loss": 2.9313, "theoretical_loss": 3.728249571068024, "tokens_seen": 801970176 }, { "epoch": 2.04, "learning_rate": 0.0003823370110330993, "loss": 2.9467, "theoretical_loss": 3.7282193842432934, "tokens_seen": 802035712 }, { "epoch": 2.04, "learning_rate": 0.0003823269809428285, "loss": 2.8768, "theoretical_loss": 3.728189200575682, "tokens_seen": 802101248 }, { "epoch": 2.04, "learning_rate": 0.00038231695085255766, "loss": 3.0786, "theoretical_loss": 3.7281590200646013, "tokens_seen": 802166784 }, { "epoch": 2.04, "learning_rate": 0.00038230692076228685, "loss": 3.0991, "theoretical_loss": 3.728128842709463, "tokens_seen": 802232320 }, { "epoch": 2.04, "learning_rate": 0.0003822968906720161, "loss": 2.9355, "theoretical_loss": 3.7280986685096797, "tokens_seen": 802297856 }, { "epoch": 2.04, "learning_rate": 0.0003822868605817452, "loss": 3.0325, "theoretical_loss": 3.7280684974646636, "tokens_seen": 802363392 }, { "epoch": 2.04, "learning_rate": 0.00038227683049147444, "loss": 3.0631, "theoretical_loss": 3.7280383295738275, "tokens_seen": 802428928 }, { "epoch": 2.04, "learning_rate": 0.00038226680040120357, "loss": 2.9905, "theoretical_loss": 3.728008164836584, "tokens_seen": 802494464 }, { "epoch": 2.04, "learning_rate": 0.0003822567703109328, "loss": 3.0585, "theoretical_loss": 3.7279780032523453, "tokens_seen": 802560000 }, { "epoch": 2.04, "learning_rate": 0.000382246740220662, "loss": 2.9453, "theoretical_loss": 3.7279478448205254, "tokens_seen": 802625536 }, { "epoch": 2.04, "learning_rate": 0.00038223671013039117, "loss": 2.8561, "theoretical_loss": 3.7279176895405373, "tokens_seen": 802691072 }, { "epoch": 2.04, "learning_rate": 0.00038222668004012035, "loss": 3.0306, "theoretical_loss": 3.7278875374117932, "tokens_seen": 802756608 }, { "epoch": 2.04, "objective/train/docs_used": 1296393, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9357025623321533, "objective/train/theoretical_loss": 3.727872462528955, "objective/train/tokens_used": 823249376, "theoretical_loss": 3.727872462528955, "tokens_seen": 802789376 }, { "epoch": 2.04, "learning_rate": 0.0003822166499498496, "loss": 2.7929, "theoretical_loss": 3.7278573884337076, "tokens_seen": 802822144 }, { "epoch": 2.04, "learning_rate": 0.0003822066198595787, "loss": 3.0928, "theoretical_loss": 3.727827242605694, "tokens_seen": 802887680 }, { "epoch": 2.04, "learning_rate": 0.00038219658976930795, "loss": 3.0315, "theoretical_loss": 3.727797099927166, "tokens_seen": 802953216 }, { "epoch": 2.04, "learning_rate": 0.0003821865596790371, "loss": 3.0138, "theoretical_loss": 3.727766960397538, "tokens_seen": 803018752 }, { "epoch": 2.04, "learning_rate": 0.0003821765295887663, "loss": 2.9144, "theoretical_loss": 3.7277368240162234, "tokens_seen": 803084288 }, { "epoch": 2.04, "learning_rate": 0.0003821664994984955, "loss": 3.0137, "theoretical_loss": 3.727706690782637, "tokens_seen": 803149824 }, { "epoch": 2.04, "learning_rate": 0.00038215646940822467, "loss": 2.9206, "theoretical_loss": 3.7276765606961924, "tokens_seen": 803215360 }, { "epoch": 2.04, "learning_rate": 0.00038214643931795385, "loss": 2.852, "theoretical_loss": 3.7276464337563047, "tokens_seen": 803280896 }, { "epoch": 2.04, "learning_rate": 0.00038213640922768303, "loss": 3.1955, "theoretical_loss": 3.7276163099623894, "tokens_seen": 803346432 }, { "epoch": 2.04, "learning_rate": 0.00038212637913741227, "loss": 2.9117, "theoretical_loss": 3.72758618931386, "tokens_seen": 803411968 }, { "epoch": 2.04, "learning_rate": 0.00038211634904714145, "loss": 3.0035, "theoretical_loss": 3.727556071810133, "tokens_seen": 803477504 }, { "epoch": 2.04, "learning_rate": 0.00038210631895687063, "loss": 3.0603, "theoretical_loss": 3.727525957450622, "tokens_seen": 803543040 }, { "epoch": 2.04, "learning_rate": 0.0003820962888665998, "loss": 2.8323, "theoretical_loss": 3.727495846234743, "tokens_seen": 803608576 }, { "epoch": 2.04, "learning_rate": 0.00038208625877632905, "loss": 2.8462, "theoretical_loss": 3.7274657381619125, "tokens_seen": 803674112 }, { "epoch": 2.04, "learning_rate": 0.0003820762286860582, "loss": 3.0531, "theoretical_loss": 3.7274356332315444, "tokens_seen": 803739648 }, { "epoch": 2.04, "learning_rate": 0.0003820661985957874, "loss": 2.9266, "theoretical_loss": 3.727405531443056, "tokens_seen": 803805184 }, { "epoch": 2.04, "learning_rate": 0.00038205616850551654, "loss": 3.0402, "theoretical_loss": 3.7273754327958626, "tokens_seen": 803870720 }, { "epoch": 2.04, "learning_rate": 0.00038204613841524577, "loss": 3.0399, "theoretical_loss": 3.72734533728938, "tokens_seen": 803936256 }, { "epoch": 2.04, "learning_rate": 0.00038203610832497495, "loss": 3.0659, "theoretical_loss": 3.727315244923026, "tokens_seen": 804001792 }, { "epoch": 2.04, "learning_rate": 0.00038202607823470413, "loss": 3.0343, "theoretical_loss": 3.7272851556962148, "tokens_seen": 804067328 }, { "epoch": 2.04, "learning_rate": 0.0003820160481444333, "loss": 3.0763, "theoretical_loss": 3.727255069608365, "tokens_seen": 804132864 }, { "epoch": 2.04, "learning_rate": 0.0003820060180541625, "loss": 2.9636, "theoretical_loss": 3.727224986658892, "tokens_seen": 804198400 }, { "epoch": 2.04, "learning_rate": 0.0003819959879638917, "loss": 3.0388, "theoretical_loss": 3.727194906847213, "tokens_seen": 804263936 }, { "epoch": 2.04, "learning_rate": 0.0003819859578736209, "loss": 2.9701, "theoretical_loss": 3.727164830172746, "tokens_seen": 804329472 }, { "epoch": 2.04, "learning_rate": 0.00038197592778335004, "loss": 2.8332, "theoretical_loss": 3.727134756634907, "tokens_seen": 804395008 }, { "epoch": 2.04, "objective/train/docs_used": 1299284, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.931384563446045, "objective/train/theoretical_loss": 3.7271197210420413, "objective/train/tokens_used": 824887776, "theoretical_loss": 3.7271197210420413, "tokens_seen": 804427776 }, { "epoch": 2.04, "learning_rate": 0.0003819658976930793, "loss": 3.0347, "theoretical_loss": 3.727104686233114, "tokens_seen": 804460544 }, { "epoch": 2.04, "learning_rate": 0.0003819558676028084, "loss": 3.0072, "theoretical_loss": 3.7270746189667845, "tokens_seen": 804526080 }, { "epoch": 2.04, "learning_rate": 0.00038194583751253764, "loss": 3.027, "theoretical_loss": 3.727044554835336, "tokens_seen": 804591616 }, { "epoch": 2.04, "learning_rate": 0.0003819358074222668, "loss": 3.0001, "theoretical_loss": 3.727014493838187, "tokens_seen": 804657152 }, { "epoch": 2.04, "learning_rate": 0.000381925777331996, "loss": 2.9004, "theoretical_loss": 3.7269844359747544, "tokens_seen": 804722688 }, { "epoch": 2.04, "learning_rate": 0.0003819157472417252, "loss": 2.9461, "theoretical_loss": 3.7269543812444574, "tokens_seen": 804788224 }, { "epoch": 2.04, "learning_rate": 0.0003819057171514544, "loss": 2.9874, "theoretical_loss": 3.7269243296467134, "tokens_seen": 804853760 }, { "epoch": 2.04, "learning_rate": 0.00038189568706118354, "loss": 2.9614, "theoretical_loss": 3.7268942811809413, "tokens_seen": 804919296 }, { "epoch": 2.04, "learning_rate": 0.0003818856569709128, "loss": 2.7737, "theoretical_loss": 3.72686423584656, "tokens_seen": 804984832 }, { "epoch": 2.04, "learning_rate": 0.0003818756268806419, "loss": 2.9896, "theoretical_loss": 3.726834193642988, "tokens_seen": 805050368 }, { "epoch": 2.04, "learning_rate": 0.00038186559679037114, "loss": 2.8336, "theoretical_loss": 3.726804154569644, "tokens_seen": 805115904 }, { "epoch": 2.04, "learning_rate": 0.0003818555667001003, "loss": 2.9042, "theoretical_loss": 3.726774118625948, "tokens_seen": 805181440 }, { "epoch": 2.04, "learning_rate": 0.0003818455366098295, "loss": 2.8611, "theoretical_loss": 3.726744085811318, "tokens_seen": 805246976 }, { "epoch": 2.04, "learning_rate": 0.0003818355065195587, "loss": 3.0064, "theoretical_loss": 3.7267140561251737, "tokens_seen": 805312512 }, { "epoch": 2.04, "learning_rate": 0.00038182547642928786, "loss": 2.9618, "theoretical_loss": 3.7266840295669352, "tokens_seen": 805378048 }, { "epoch": 2.04, "learning_rate": 0.00038181544633901705, "loss": 2.8468, "theoretical_loss": 3.7266540061360223, "tokens_seen": 805443584 }, { "epoch": 2.04, "learning_rate": 0.0003818054162487463, "loss": 2.9699, "theoretical_loss": 3.7266239858318544, "tokens_seen": 805509120 }, { "epoch": 2.04, "learning_rate": 0.0003817953861584754, "loss": 2.9419, "theoretical_loss": 3.7265939686538516, "tokens_seen": 805574656 }, { "epoch": 2.04, "learning_rate": 0.00038178535606820464, "loss": 2.8419, "theoretical_loss": 3.726563954601434, "tokens_seen": 805640192 }, { "epoch": 2.04, "learning_rate": 0.00038177532597793377, "loss": 3.0414, "theoretical_loss": 3.726533943674022, "tokens_seen": 805705728 }, { "epoch": 2.04, "learning_rate": 0.000381765295887663, "loss": 2.9264, "theoretical_loss": 3.726503935871036, "tokens_seen": 805771264 }, { "epoch": 2.04, "learning_rate": 0.0003817552657973922, "loss": 2.9039, "theoretical_loss": 3.7264739311918964, "tokens_seen": 805836800 }, { "epoch": 2.04, "learning_rate": 0.00038174523570712137, "loss": 3.0096, "theoretical_loss": 3.726443929636025, "tokens_seen": 805902336 }, { "epoch": 2.04, "learning_rate": 0.00038173520561685055, "loss": 2.9977, "theoretical_loss": 3.726413931202842, "tokens_seen": 805967872 }, { "epoch": 2.04, "learning_rate": 0.0003817251755265798, "loss": 3.0412, "theoretical_loss": 3.7263839358917688, "tokens_seen": 806033408 }, { "epoch": 2.04, "objective/train/docs_used": 1302019, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5174736976623535, "objective/train/theoretical_loss": 3.726368939406842, "objective/train/tokens_used": 826526176, "theoretical_loss": 3.726368939406842, "tokens_seen": 806066176 }, { "epoch": 2.04, "learning_rate": 0.0003817151454363089, "loss": 2.6985, "theoretical_loss": 3.7263539437022257, "tokens_seen": 806098944 }, { "epoch": 2.04, "learning_rate": 0.00038170511534603815, "loss": 3.0521, "theoretical_loss": 3.726323954633635, "tokens_seen": 806164480 }, { "epoch": 2.04, "learning_rate": 0.0003816950852557673, "loss": 3.0161, "theoretical_loss": 3.726293968685419, "tokens_seen": 806230016 }, { "epoch": 2.04, "learning_rate": 0.0003816850551654965, "loss": 3.0945, "theoretical_loss": 3.7262639858569977, "tokens_seen": 806295552 }, { "epoch": 2.04, "learning_rate": 0.0003816750250752257, "loss": 2.9371, "theoretical_loss": 3.726234006147794, "tokens_seen": 806361088 }, { "epoch": 2.04, "learning_rate": 0.00038166499498495487, "loss": 2.9206, "theoretical_loss": 3.7262040295572296, "tokens_seen": 806426624 }, { "epoch": 2.04, "learning_rate": 0.00038165496489468405, "loss": 3.1069, "theoretical_loss": 3.7261740560847274, "tokens_seen": 806492160 }, { "epoch": 2.04, "learning_rate": 0.00038164493480441323, "loss": 2.9952, "theoretical_loss": 3.7261440857297083, "tokens_seen": 806557696 }, { "epoch": 2.04, "learning_rate": 0.0003816349047141424, "loss": 2.7587, "theoretical_loss": 3.726114118491596, "tokens_seen": 806623232 }, { "epoch": 2.04, "learning_rate": 0.00038162487462387165, "loss": 2.9562, "theoretical_loss": 3.726084154369813, "tokens_seen": 806688768 }, { "epoch": 2.04, "learning_rate": 0.0003816148445336008, "loss": 2.9276, "theoretical_loss": 3.7260541933637814, "tokens_seen": 806754304 }, { "epoch": 2.04, "learning_rate": 0.00038160481444333, "loss": 2.9476, "theoretical_loss": 3.726024235472925, "tokens_seen": 806819840 }, { "epoch": 2.04, "learning_rate": 0.00038159478435305914, "loss": 3.0698, "theoretical_loss": 3.725994280696666, "tokens_seen": 806885376 }, { "epoch": 2.04, "learning_rate": 0.0003815847542627884, "loss": 2.8141, "theoretical_loss": 3.725964329034429, "tokens_seen": 806950912 }, { "epoch": 2.04, "learning_rate": 0.00038157472417251756, "loss": 2.8968, "theoretical_loss": 3.725934380485636, "tokens_seen": 807016448 }, { "epoch": 2.04, "learning_rate": 0.00038156469408224674, "loss": 2.9297, "theoretical_loss": 3.7259044350497112, "tokens_seen": 807081984 }, { "epoch": 2.04, "learning_rate": 0.0003815546639919759, "loss": 2.9145, "theoretical_loss": 3.7258744927260787, "tokens_seen": 807147520 }, { "epoch": 2.04, "learning_rate": 0.00038154463390170515, "loss": 2.8641, "theoretical_loss": 3.725844553514161, "tokens_seen": 807213056 }, { "epoch": 2.04, "learning_rate": 0.0003815346038114343, "loss": 2.8782, "theoretical_loss": 3.725814617413384, "tokens_seen": 807278592 }, { "epoch": 2.04, "learning_rate": 0.0003815245737211635, "loss": 2.8404, "theoretical_loss": 3.7257846844231706, "tokens_seen": 807344128 }, { "epoch": 2.04, "learning_rate": 0.00038151454363089264, "loss": 2.8945, "theoretical_loss": 3.7257547545429457, "tokens_seen": 807409664 }, { "epoch": 2.04, "learning_rate": 0.0003815045135406219, "loss": 2.8487, "theoretical_loss": 3.725724827772133, "tokens_seen": 807475200 }, { "epoch": 2.04, "learning_rate": 0.00038149448345035106, "loss": 2.8967, "theoretical_loss": 3.725694904110158, "tokens_seen": 807540736 }, { "epoch": 2.04, "learning_rate": 0.00038148445336008024, "loss": 2.8785, "theoretical_loss": 3.7256649835564453, "tokens_seen": 807606272 }, { "epoch": 2.04, "learning_rate": 0.0003814744232698094, "loss": 2.8923, "theoretical_loss": 3.72563506611042, "tokens_seen": 807671808 }, { "epoch": 2.04, "objective/train/docs_used": 1303383, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5928823947906494, "objective/train/theoretical_loss": 3.72562010855261, "objective/train/tokens_used": 828164576, "theoretical_loss": 3.72562010855261, "tokens_seen": 807704576 }, { "epoch": 2.04, "learning_rate": 0.0003814643931795386, "loss": 2.9211, "theoretical_loss": 3.7256051517715063, "tokens_seen": 807737344 }, { "epoch": 2.04, "learning_rate": 0.0003814543630892678, "loss": 2.8267, "theoretical_loss": 3.7255752405391305, "tokens_seen": 807802880 }, { "epoch": 2.04, "learning_rate": 0.000381444332998997, "loss": 2.8334, "theoretical_loss": 3.7255453324127172, "tokens_seen": 807868416 }, { "epoch": 2.04, "learning_rate": 0.00038143430290872614, "loss": 2.8992, "theoretical_loss": 3.725515427391693, "tokens_seen": 807933952 }, { "epoch": 2.04, "learning_rate": 0.0003814242728184554, "loss": 3.0182, "theoretical_loss": 3.7254855254754826, "tokens_seen": 807999488 }, { "epoch": 2.04, "learning_rate": 0.00038141424272818456, "loss": 2.982, "theoretical_loss": 3.725455626663512, "tokens_seen": 808065024 }, { "epoch": 2.04, "learning_rate": 0.00038140421263791374, "loss": 3.0092, "theoretical_loss": 3.7254257309552075, "tokens_seen": 808130560 }, { "epoch": 2.04, "learning_rate": 0.0003813941825476429, "loss": 2.866, "theoretical_loss": 3.7253958383499954, "tokens_seen": 808196096 }, { "epoch": 2.04, "learning_rate": 0.0003813841524573721, "loss": 3.0528, "theoretical_loss": 3.725365948847301, "tokens_seen": 808261632 }, { "epoch": 2.04, "learning_rate": 0.00038137412236710134, "loss": 3.0108, "theoretical_loss": 3.725336062446553, "tokens_seen": 808327168 }, { "epoch": 2.04, "learning_rate": 0.0003813640922768305, "loss": 2.72, "theoretical_loss": 3.7253061791471755, "tokens_seen": 808392704 }, { "epoch": 2.04, "learning_rate": 0.0003813540621865597, "loss": 2.9887, "theoretical_loss": 3.7252762989485966, "tokens_seen": 808458240 }, { "epoch": 2.04, "learning_rate": 0.0003813440320962889, "loss": 3.0818, "theoretical_loss": 3.7252464218502435, "tokens_seen": 808523776 }, { "epoch": 2.04, "learning_rate": 0.00038133400200601806, "loss": 2.841, "theoretical_loss": 3.7252165478515424, "tokens_seen": 808589312 }, { "epoch": 2.04, "learning_rate": 0.00038132397191574725, "loss": 2.8909, "theoretical_loss": 3.725186676951921, "tokens_seen": 808654848 }, { "epoch": 2.04, "learning_rate": 0.0003813139418254765, "loss": 3.0019, "theoretical_loss": 3.725156809150806, "tokens_seen": 808720384 }, { "epoch": 2.04, "learning_rate": 0.0003813039117352056, "loss": 2.9765, "theoretical_loss": 3.7251269444476263, "tokens_seen": 808785920 }, { "epoch": 2.04, "learning_rate": 0.00038129388164493484, "loss": 2.9458, "theoretical_loss": 3.725097082841809, "tokens_seen": 808851456 }, { "epoch": 2.04, "learning_rate": 0.00038128385155466397, "loss": 2.9903, "theoretical_loss": 3.725067224332781, "tokens_seen": 808916992 }, { "epoch": 2.04, "learning_rate": 0.0003812738214643932, "loss": 2.9516, "theoretical_loss": 3.725037368919972, "tokens_seen": 808982528 }, { "epoch": 2.04, "learning_rate": 0.0003812637913741224, "loss": 2.9651, "theoretical_loss": 3.7250075166028083, "tokens_seen": 809048064 }, { "epoch": 2.04, "learning_rate": 0.00038125376128385157, "loss": 2.8761, "theoretical_loss": 3.7249776673807196, "tokens_seen": 809113600 }, { "epoch": 2.04, "learning_rate": 0.00038124373119358075, "loss": 2.9951, "theoretical_loss": 3.724947821253134, "tokens_seen": 809179136 }, { "epoch": 2.04, "learning_rate": 0.00038123370110331, "loss": 3.0492, "theoretical_loss": 3.72491797821948, "tokens_seen": 809244672 }, { "epoch": 2.04, "learning_rate": 0.0003812236710130391, "loss": 2.9006, "theoretical_loss": 3.7248881382791863, "tokens_seen": 809310208 }, { "epoch": 2.04, "objective/train/docs_used": 1306217, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8504345417022705, "objective/train/theoretical_loss": 3.7248732194688707, "objective/train/tokens_used": 829802976, "theoretical_loss": 3.7248732194688707, "tokens_seen": 809342976 }, { "epoch": 2.04, "learning_rate": 0.00038121364092276835, "loss": 2.8907, "theoretical_loss": 3.7248583014316816, "tokens_seen": 809375744 }, { "epoch": 2.04, "learning_rate": 0.0003812036108324975, "loss": 3.0753, "theoretical_loss": 3.724828467676395, "tokens_seen": 809441280 }, { "epoch": 2.04, "learning_rate": 0.0003811935807422267, "loss": 3.0038, "theoretical_loss": 3.724798637012756, "tokens_seen": 809506816 }, { "epoch": 2.04, "learning_rate": 0.0003811835506519559, "loss": 2.8916, "theoretical_loss": 3.724768809440194, "tokens_seen": 809572352 }, { "epoch": 2.04, "learning_rate": 0.00038117352056168507, "loss": 2.8503, "theoretical_loss": 3.7247389849581385, "tokens_seen": 809637888 }, { "epoch": 2.04, "learning_rate": 0.00038116349047141425, "loss": 2.9445, "theoretical_loss": 3.724709163566019, "tokens_seen": 809703424 }, { "epoch": 2.04, "learning_rate": 0.00038115346038114343, "loss": 3.0242, "theoretical_loss": 3.724679345263265, "tokens_seen": 809768960 }, { "epoch": 2.04, "learning_rate": 0.0003811434302908726, "loss": 2.9272, "theoretical_loss": 3.724649530049308, "tokens_seen": 809834496 }, { "epoch": 2.04, "learning_rate": 0.00038113340020060185, "loss": 2.9476, "theoretical_loss": 3.7246197179235754, "tokens_seen": 809900032 }, { "epoch": 2.04, "learning_rate": 0.000381123370110331, "loss": 2.8275, "theoretical_loss": 3.7245899088855, "tokens_seen": 809965568 }, { "epoch": 2.04, "learning_rate": 0.0003811133400200602, "loss": 2.9071, "theoretical_loss": 3.724560102934511, "tokens_seen": 810031104 }, { "epoch": 2.04, "learning_rate": 0.00038110330992978934, "loss": 2.8814, "theoretical_loss": 3.7245303000700387, "tokens_seen": 810096640 }, { "epoch": 2.04, "learning_rate": 0.0003810932798395186, "loss": 3.0121, "theoretical_loss": 3.724500500291515, "tokens_seen": 810162176 }, { "epoch": 2.04, "learning_rate": 0.00038108324974924776, "loss": 2.9155, "theoretical_loss": 3.72447070359837, "tokens_seen": 810227712 }, { "epoch": 2.04, "learning_rate": 0.00038107321965897694, "loss": 2.8572, "theoretical_loss": 3.724440909990034, "tokens_seen": 810293248 }, { "epoch": 2.04, "learning_rate": 0.0003810631895687061, "loss": 3.0068, "theoretical_loss": 3.72441111946594, "tokens_seen": 810358784 }, { "epoch": 2.04, "learning_rate": 0.00038105315947843535, "loss": 2.7611, "theoretical_loss": 3.724381332025518, "tokens_seen": 810424320 }, { "epoch": 2.04, "learning_rate": 0.0003810431293881645, "loss": 2.818, "theoretical_loss": 3.7243515476681996, "tokens_seen": 810489856 }, { "epoch": 2.04, "learning_rate": 0.0003810330992978937, "loss": 2.8551, "theoretical_loss": 3.724321766393417, "tokens_seen": 810555392 }, { "epoch": 2.04, "learning_rate": 0.00038102306920762284, "loss": 3.0499, "theoretical_loss": 3.7242919882006014, "tokens_seen": 810620928 }, { "epoch": 2.04, "learning_rate": 0.0003810130391173521, "loss": 2.9323, "theoretical_loss": 3.724262213089185, "tokens_seen": 810686464 }, { "epoch": 2.04, "learning_rate": 0.00038100300902708126, "loss": 2.9335, "theoretical_loss": 3.724232441058599, "tokens_seen": 810752000 }, { "epoch": 2.04, "learning_rate": 0.00038099297893681044, "loss": 3.0558, "theoretical_loss": 3.7242026721082775, "tokens_seen": 810817536 }, { "epoch": 2.04, "learning_rate": 0.0003809829488465396, "loss": 3.0686, "theoretical_loss": 3.7241729062376514, "tokens_seen": 810883072 }, { "epoch": 2.04, "learning_rate": 0.0003809729187562688, "loss": 3.0444, "theoretical_loss": 3.724143143446154, "tokens_seen": 810948608 }, { "epoch": 2.04, "objective/train/docs_used": 1309160, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.107858896255493, "objective/train/theoretical_loss": 3.724128263204901, "objective/train/tokens_used": 831441376, "theoretical_loss": 3.724128263204901, "tokens_seen": 810981376 }, { "epoch": 2.04, "learning_rate": 0.000380962888665998, "loss": 2.9156, "theoretical_loss": 3.724113383733217, "tokens_seen": 811014144 }, { "epoch": 2.04, "learning_rate": 0.0003809528585757272, "loss": 3.0161, "theoretical_loss": 3.7240836270982745, "tokens_seen": 811079680 }, { "epoch": 2.04, "learning_rate": 0.00038094282848545635, "loss": 3.0652, "theoretical_loss": 3.7240538735407585, "tokens_seen": 811145216 }, { "epoch": 2.04, "learning_rate": 0.0003809327983951856, "loss": 2.9237, "theoretical_loss": 3.7240241230601026, "tokens_seen": 811210752 }, { "epoch": 2.04, "learning_rate": 0.00038092276830491476, "loss": 3.0717, "theoretical_loss": 3.72399437565574, "tokens_seen": 811276288 }, { "epoch": 2.04, "learning_rate": 0.00038091273821464394, "loss": 2.9493, "theoretical_loss": 3.723964631327104, "tokens_seen": 811341824 }, { "epoch": 2.04, "learning_rate": 0.0003809027081243731, "loss": 2.9949, "theoretical_loss": 3.7239348900736284, "tokens_seen": 811407360 }, { "epoch": 2.04, "learning_rate": 0.0003808926780341023, "loss": 2.8906, "theoretical_loss": 3.723905151894747, "tokens_seen": 811472896 }, { "epoch": 2.04, "learning_rate": 0.0003808826479438315, "loss": 2.8882, "theoretical_loss": 3.723875416789893, "tokens_seen": 811538432 }, { "epoch": 2.04, "learning_rate": 0.0003808726178535607, "loss": 2.9673, "theoretical_loss": 3.7238456847585013, "tokens_seen": 811603968 }, { "epoch": 2.04, "learning_rate": 0.00038086258776328985, "loss": 2.9086, "theoretical_loss": 3.7238159558000055, "tokens_seen": 811669504 }, { "epoch": 2.04, "learning_rate": 0.0003808525576730191, "loss": 2.7723, "theoretical_loss": 3.72378622991384, "tokens_seen": 811735040 }, { "epoch": 2.04, "learning_rate": 0.0003808425275827482, "loss": 3.075, "theoretical_loss": 3.7237565070994396, "tokens_seen": 811800576 }, { "epoch": 2.04, "learning_rate": 0.00038083249749247745, "loss": 2.8804, "theoretical_loss": 3.723726787356239, "tokens_seen": 811866112 }, { "epoch": 2.04, "learning_rate": 0.0003808224674022066, "loss": 2.8997, "theoretical_loss": 3.7236970706836723, "tokens_seen": 811931648 }, { "epoch": 2.04, "learning_rate": 0.0003808124373119358, "loss": 2.8707, "theoretical_loss": 3.723667357081175, "tokens_seen": 811997184 }, { "epoch": 2.04, "learning_rate": 0.000380802407221665, "loss": 2.8615, "theoretical_loss": 3.723637646548182, "tokens_seen": 812062720 }, { "epoch": 2.04, "learning_rate": 0.00038079237713139417, "loss": 2.9642, "theoretical_loss": 3.723607939084129, "tokens_seen": 812128256 }, { "epoch": 2.04, "learning_rate": 0.00038078234704112335, "loss": 3.0169, "theoretical_loss": 3.7235782346884507, "tokens_seen": 812193792 }, { "epoch": 2.04, "learning_rate": 0.0003807723169508526, "loss": 2.748, "theoretical_loss": 3.7235485333605833, "tokens_seen": 812259328 }, { "epoch": 2.04, "learning_rate": 0.0003807622868605817, "loss": 2.966, "theoretical_loss": 3.7235188350999615, "tokens_seen": 812324864 }, { "epoch": 2.04, "learning_rate": 0.00038075225677031095, "loss": 2.8826, "theoretical_loss": 3.7234891399060217, "tokens_seen": 812390400 }, { "epoch": 2.04, "learning_rate": 0.00038074222668004013, "loss": 2.9496, "theoretical_loss": 3.7234594477782004, "tokens_seen": 812455936 }, { "epoch": 2.04, "learning_rate": 0.0003807321965897693, "loss": 2.9777, "theoretical_loss": 3.723429758715933, "tokens_seen": 812521472 }, { "epoch": 2.04, "learning_rate": 0.0003807221664994985, "loss": 2.9973, "theoretical_loss": 3.723400072718656, "tokens_seen": 812587008 }, { "epoch": 2.04, "objective/train/docs_used": 1311485, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8776917457580566, "objective/train/theoretical_loss": 3.7233852308692126, "objective/train/tokens_used": 833079776, "theoretical_loss": 3.7233852308692126, "tokens_seen": 812619776 }, { "epoch": 2.04, "learning_rate": 0.0003807121364092277, "loss": 2.8888, "theoretical_loss": 3.7233703897858055, "tokens_seen": 812652544 }, { "epoch": 2.04, "learning_rate": 0.00038070210631895685, "loss": 3.0924, "theoretical_loss": 3.7233407099168185, "tokens_seen": 812718080 }, { "epoch": 2.04, "learning_rate": 0.0003806920762286861, "loss": 3.0105, "theoretical_loss": 3.7233110331111314, "tokens_seen": 812783616 }, { "epoch": 2.04, "learning_rate": 0.0003806820461384152, "loss": 2.7895, "theoretical_loss": 3.7232813593681824, "tokens_seen": 812849152 }, { "epoch": 2.04, "learning_rate": 0.00038067201604814445, "loss": 3.0019, "theoretical_loss": 3.723251688687406, "tokens_seen": 812914688 }, { "epoch": 2.04, "learning_rate": 0.0003806619859578736, "loss": 2.6297, "theoretical_loss": 3.7232220210682416, "tokens_seen": 812980224 }, { "epoch": 2.04, "learning_rate": 0.0003806519558676028, "loss": 3.0009, "theoretical_loss": 3.7231923565101255, "tokens_seen": 813045760 }, { "epoch": 2.04, "learning_rate": 0.000380641925777332, "loss": 2.8605, "theoretical_loss": 3.723162695012495, "tokens_seen": 813111296 }, { "epoch": 2.04, "learning_rate": 0.0003806318956870612, "loss": 2.7566, "theoretical_loss": 3.7231330365747883, "tokens_seen": 813176832 }, { "epoch": 2.04, "learning_rate": 0.0003806218655967904, "loss": 2.909, "theoretical_loss": 3.7231033811964434, "tokens_seen": 813242368 }, { "epoch": 2.04, "learning_rate": 0.00038061183550651954, "loss": 3.0328, "theoretical_loss": 3.7230737288768974, "tokens_seen": 813307904 }, { "epoch": 2.04, "learning_rate": 0.0003806018054162488, "loss": 2.9548, "theoretical_loss": 3.7230440796155886, "tokens_seen": 813373440 }, { "epoch": 2.04, "learning_rate": 0.00038059177532597796, "loss": 2.8225, "theoretical_loss": 3.7230144334119553, "tokens_seen": 813438976 }, { "epoch": 2.04, "learning_rate": 0.00038058174523570714, "loss": 2.7699, "theoretical_loss": 3.722984790265436, "tokens_seen": 813504512 }, { "epoch": 2.04, "learning_rate": 0.0003805717151454363, "loss": 2.7987, "theoretical_loss": 3.722955150175469, "tokens_seen": 813570048 }, { "epoch": 2.04, "learning_rate": 0.00038056168505516555, "loss": 3.0552, "theoretical_loss": 3.722925513141493, "tokens_seen": 813635584 }, { "epoch": 2.04, "learning_rate": 0.0003805516549648947, "loss": 3.0546, "theoretical_loss": 3.722895879162947, "tokens_seen": 813701120 }, { "epoch": 2.04, "learning_rate": 0.0003805416248746239, "loss": 2.9237, "theoretical_loss": 3.72286624823927, "tokens_seen": 813766656 }, { "epoch": 2.04, "learning_rate": 0.00038053159478435304, "loss": 3.0715, "theoretical_loss": 3.722836620369901, "tokens_seen": 813832192 }, { "epoch": 2.04, "learning_rate": 0.0003805215646940823, "loss": 2.9305, "theoretical_loss": 3.7228069955542784, "tokens_seen": 813897728 }, { "epoch": 2.04, "learning_rate": 0.00038051153460381146, "loss": 3.0157, "theoretical_loss": 3.722777373791843, "tokens_seen": 813963264 }, { "epoch": 2.04, "learning_rate": 0.00038050150451354064, "loss": 3.0563, "theoretical_loss": 3.7227477550820334, "tokens_seen": 814028800 }, { "epoch": 2.04, "learning_rate": 0.0003804914744232698, "loss": 2.8049, "theoretical_loss": 3.72271813942429, "tokens_seen": 814094336 }, { "epoch": 2.04, "learning_rate": 0.000380481444332999, "loss": 2.8856, "theoretical_loss": 3.7226885268180516, "tokens_seen": 814159872 }, { "epoch": 2.04, "learning_rate": 0.0003804714142427282, "loss": 2.9253, "theoretical_loss": 3.722658917262759, "tokens_seen": 814225408 }, { "epoch": 2.04, "objective/train/docs_used": 1314346, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9782323837280273, "objective/train/theoretical_loss": 3.722644113629043, "objective/train/tokens_used": 834718176, "theoretical_loss": 3.722644113629043, "tokens_seen": 814258176 }, { "epoch": 2.04, "learning_rate": 0.0003804613841524574, "loss": 2.9598, "theoretical_loss": 3.7226293107578523, "tokens_seen": 814290944 }, { "epoch": 2.04, "learning_rate": 0.00038045135406218655, "loss": 2.9986, "theoretical_loss": 3.7225997073027717, "tokens_seen": 814356480 }, { "epoch": 2.04, "learning_rate": 0.0003804413239719158, "loss": 2.9237, "theoretical_loss": 3.722570106896957, "tokens_seen": 814422016 }, { "epoch": 2.04, "learning_rate": 0.00038043129388164496, "loss": 2.7598, "theoretical_loss": 3.7225405095398503, "tokens_seen": 814487552 }, { "epoch": 2.04, "learning_rate": 0.00038042126379137414, "loss": 2.9467, "theoretical_loss": 3.722510915230891, "tokens_seen": 814553088 }, { "epoch": 2.04, "learning_rate": 0.0003804112337011033, "loss": 3.0094, "theoretical_loss": 3.72248132396952, "tokens_seen": 814618624 }, { "epoch": 2.04, "learning_rate": 0.0003804012036108325, "loss": 2.8886, "theoretical_loss": 3.722451735755179, "tokens_seen": 814684160 }, { "epoch": 2.04, "learning_rate": 0.0003803911735205617, "loss": 2.9042, "theoretical_loss": 3.7224221505873087, "tokens_seen": 814749696 }, { "epoch": 2.04, "learning_rate": 0.0003803811434302909, "loss": 2.9462, "theoretical_loss": 3.7223925684653505, "tokens_seen": 814815232 }, { "epoch": 2.04, "learning_rate": 0.00038037111334002005, "loss": 2.8856, "theoretical_loss": 3.7223629893887464, "tokens_seen": 814880768 }, { "epoch": 2.04, "learning_rate": 0.0003803610832497493, "loss": 2.9543, "theoretical_loss": 3.722333413356937, "tokens_seen": 814946304 }, { "epoch": 2.04, "learning_rate": 0.0003803510531594784, "loss": 2.9681, "theoretical_loss": 3.7223038403693645, "tokens_seen": 815011840 }, { "epoch": 2.04, "learning_rate": 0.00038034102306920765, "loss": 2.5217, "theoretical_loss": 3.7222742704254714, "tokens_seen": 815077376 }, { "epoch": 2.04, "learning_rate": 0.00038033099297893683, "loss": 3.0729, "theoretical_loss": 3.722244703524699, "tokens_seen": 815142912 }, { "epoch": 2.04, "learning_rate": 0.000380320962888666, "loss": 3.0075, "theoretical_loss": 3.72221513966649, "tokens_seen": 815208448 }, { "epoch": 2.04, "learning_rate": 0.0003803109327983952, "loss": 2.9806, "theoretical_loss": 3.7221855788502864, "tokens_seen": 815273984 }, { "epoch": 2.04, "learning_rate": 0.00038030090270812437, "loss": 2.862, "theoretical_loss": 3.7221560210755307, "tokens_seen": 815339520 }, { "epoch": 2.04, "learning_rate": 0.00038029087261785355, "loss": 3.1158, "theoretical_loss": 3.7221264663416656, "tokens_seen": 815405056 }, { "epoch": 2.04, "learning_rate": 0.0003802808425275828, "loss": 2.8687, "theoretical_loss": 3.7220969146481337, "tokens_seen": 815470592 }, { "epoch": 2.04, "learning_rate": 0.0003802708124373119, "loss": 2.944, "theoretical_loss": 3.7220673659943784, "tokens_seen": 815536128 }, { "epoch": 2.04, "learning_rate": 0.00038026078234704115, "loss": 2.9494, "theoretical_loss": 3.7220378203798425, "tokens_seen": 815601664 }, { "epoch": 2.04, "learning_rate": 0.00038025075225677033, "loss": 3.0526, "theoretical_loss": 3.7220082778039694, "tokens_seen": 815667200 }, { "epoch": 2.04, "learning_rate": 0.0003802407221664995, "loss": 2.9955, "theoretical_loss": 3.7219787382662024, "tokens_seen": 815732736 }, { "epoch": 2.04, "learning_rate": 0.0003802306920762287, "loss": 2.9713, "theoretical_loss": 3.7219492017659848, "tokens_seen": 815798272 }, { "epoch": 2.04, "learning_rate": 0.0003802206619859579, "loss": 3.055, "theoretical_loss": 3.7219196683027604, "tokens_seen": 815863808 }, { "epoch": 2.04, "objective/train/docs_used": 1317099, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.709165096282959, "objective/train/theoretical_loss": 3.721904902709847, "objective/train/tokens_used": 836356576, "theoretical_loss": 3.721904902709847, "tokens_seen": 815896576 }, { "epoch": 2.04, "learning_rate": 0.00038021063189568705, "loss": 2.9711, "theoretical_loss": 3.7218901378759734, "tokens_seen": 815929344 }, { "epoch": 2.04, "learning_rate": 0.0003802006018054163, "loss": 2.9652, "theoretical_loss": 3.7218606104850673, "tokens_seen": 815994880 }, { "epoch": 2.04, "learning_rate": 0.0003801905717151454, "loss": 3.0465, "theoretical_loss": 3.7218310861294865, "tokens_seen": 816060416 }, { "epoch": 2.04, "learning_rate": 0.00038018054162487465, "loss": 2.9298, "theoretical_loss": 3.7218015648086746, "tokens_seen": 816125952 }, { "epoch": 2.04, "learning_rate": 0.0003801705115346038, "loss": 2.9707, "theoretical_loss": 3.7217720465220765, "tokens_seen": 816191488 }, { "epoch": 2.04, "learning_rate": 0.000380160481444333, "loss": 2.9513, "theoretical_loss": 3.721742531269137, "tokens_seen": 816257024 }, { "epoch": 2.04, "learning_rate": 0.0003801504513540622, "loss": 3.083, "theoretical_loss": 3.7217130190493006, "tokens_seen": 816322560 }, { "epoch": 2.04, "learning_rate": 0.0003801404212637914, "loss": 2.8488, "theoretical_loss": 3.721683509862012, "tokens_seen": 816388096 }, { "epoch": 2.04, "learning_rate": 0.00038013039117352056, "loss": 2.939, "theoretical_loss": 3.7216540037067163, "tokens_seen": 816453632 }, { "epoch": 2.04, "learning_rate": 0.00038012036108324974, "loss": 2.8983, "theoretical_loss": 3.721624500582858, "tokens_seen": 816519168 }, { "epoch": 2.04, "learning_rate": 0.0003801103309929789, "loss": 2.9153, "theoretical_loss": 3.721595000489884, "tokens_seen": 816584704 }, { "epoch": 2.04, "learning_rate": 0.00038010030090270816, "loss": 2.9534, "theoretical_loss": 3.721565503427238, "tokens_seen": 816650240 }, { "epoch": 2.04, "learning_rate": 0.0003800902708124373, "loss": 3.0763, "theoretical_loss": 3.7215360093943666, "tokens_seen": 816715776 }, { "epoch": 2.04, "learning_rate": 0.0003800802407221665, "loss": 3.1101, "theoretical_loss": 3.721506518390715, "tokens_seen": 816781312 }, { "epoch": 2.04, "learning_rate": 0.0003800702106318957, "loss": 2.89, "theoretical_loss": 3.721477030415729, "tokens_seen": 816846848 }, { "epoch": 2.04, "learning_rate": 0.0003800601805416249, "loss": 2.9608, "theoretical_loss": 3.7214475454688554, "tokens_seen": 816912384 }, { "epoch": 2.04, "learning_rate": 0.00038005015045135406, "loss": 3.053, "theoretical_loss": 3.7214180635495397, "tokens_seen": 816977920 }, { "epoch": 2.04, "learning_rate": 0.00038004012036108324, "loss": 2.8602, "theoretical_loss": 3.7213885846572277, "tokens_seen": 817043456 }, { "epoch": 2.04, "learning_rate": 0.0003800300902708124, "loss": 2.9142, "theoretical_loss": 3.7213591087913667, "tokens_seen": 817108992 }, { "epoch": 2.04, "learning_rate": 0.00038002006018054166, "loss": 2.9383, "theoretical_loss": 3.7213296359514034, "tokens_seen": 817174528 }, { "epoch": 2.04, "learning_rate": 0.0003800100300902708, "loss": 3.0904, "theoretical_loss": 3.721300166136784, "tokens_seen": 817240064 }, { "epoch": 2.04, "learning_rate": 0.00038, "loss": 2.8559, "theoretical_loss": 3.721270699346956, "tokens_seen": 817305600 }, { "epoch": 2.04, "learning_rate": 0.00037998996990972915, "loss": 2.965, "theoretical_loss": 3.7212412355813655, "tokens_seen": 817371136 }, { "epoch": 2.04, "learning_rate": 0.0003799799398194584, "loss": 3.0204, "theoretical_loss": 3.7212117748394604, "tokens_seen": 817436672 }, { "epoch": 2.04, "learning_rate": 0.00037996990972918756, "loss": 2.9033, "theoretical_loss": 3.7211823171206873, "tokens_seen": 817502208 }, { "epoch": 2.04, "objective/train/docs_used": 1320055, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.834714889526367, "objective/train/theoretical_loss": 3.7211675893948026, "objective/train/tokens_used": 837994976, "theoretical_loss": 3.7211675893948026, "tokens_seen": 817534976 }, { "epoch": 2.04, "learning_rate": 0.00037995987963891675, "loss": 2.9242, "theoretical_loss": 3.7211528624244945, "tokens_seen": 817567744 }, { "epoch": 2.04, "learning_rate": 0.0003799498495486459, "loss": 2.922, "theoretical_loss": 3.7211234107503293, "tokens_seen": 817633280 }, { "epoch": 2.04, "learning_rate": 0.00037993981945837516, "loss": 2.8838, "theoretical_loss": 3.721093962097639, "tokens_seen": 817698816 }, { "epoch": 2.04, "learning_rate": 0.0003799297893681043, "loss": 3.0578, "theoretical_loss": 3.7210645164658724, "tokens_seen": 817764352 }, { "epoch": 2.04, "learning_rate": 0.0003799197592778335, "loss": 2.9194, "theoretical_loss": 3.721035073854477, "tokens_seen": 817829888 }, { "epoch": 2.04, "learning_rate": 0.00037990972918756265, "loss": 2.9065, "theoretical_loss": 3.7210056342629008, "tokens_seen": 817895424 }, { "epoch": 2.04, "learning_rate": 0.0003798996990972919, "loss": 2.9042, "theoretical_loss": 3.7209761976905926, "tokens_seen": 817960960 }, { "epoch": 2.04, "learning_rate": 0.00037988966900702107, "loss": 3.0238, "theoretical_loss": 3.7209467641370004, "tokens_seen": 818026496 }, { "epoch": 2.04, "learning_rate": 0.00037987963891675025, "loss": 2.9724, "theoretical_loss": 3.7209173336015735, "tokens_seen": 818092032 }, { "epoch": 2.04, "learning_rate": 0.0003798696088264795, "loss": 3.0805, "theoretical_loss": 3.72088790608376, "tokens_seen": 818157568 }, { "epoch": 2.04, "learning_rate": 0.0003798595787362086, "loss": 2.9811, "theoretical_loss": 3.720858481583009, "tokens_seen": 818223104 }, { "epoch": 2.04, "learning_rate": 0.00037984954864593785, "loss": 2.962, "theoretical_loss": 3.7208290600987697, "tokens_seen": 818288640 }, { "epoch": 2.04, "learning_rate": 0.00037983951855566703, "loss": 2.9911, "theoretical_loss": 3.7207996416304914, "tokens_seen": 818354176 }, { "epoch": 2.04, "learning_rate": 0.0003798294884653962, "loss": 2.7856, "theoretical_loss": 3.7207702261776228, "tokens_seen": 818419712 }, { "epoch": 2.04, "learning_rate": 0.0003798194583751254, "loss": 2.9373, "theoretical_loss": 3.7207408137396145, "tokens_seen": 818485248 }, { "epoch": 2.04, "learning_rate": 0.00037980942828485457, "loss": 2.8806, "theoretical_loss": 3.7207114043159146, "tokens_seen": 818550784 }, { "epoch": 2.04, "learning_rate": 0.00037979939819458375, "loss": 2.9551, "theoretical_loss": 3.7206819979059746, "tokens_seen": 818616320 }, { "epoch": 2.04, "learning_rate": 0.000379789368104313, "loss": 2.8709, "theoretical_loss": 3.720652594509243, "tokens_seen": 818681856 }, { "epoch": 2.04, "learning_rate": 0.0003797793380140421, "loss": 3.0025, "theoretical_loss": 3.7206231941251713, "tokens_seen": 818747392 }, { "epoch": 2.04, "learning_rate": 0.00037976930792377135, "loss": 3.0401, "theoretical_loss": 3.7205937967532083, "tokens_seen": 818812928 }, { "epoch": 2.04, "learning_rate": 0.00037975927783350053, "loss": 3.0214, "theoretical_loss": 3.7205644023928053, "tokens_seen": 818878464 }, { "epoch": 2.04, "learning_rate": 0.0003797492477432297, "loss": 2.9106, "theoretical_loss": 3.7205350110434123, "tokens_seen": 818944000 }, { "epoch": 2.04, "learning_rate": 0.0003797392176529589, "loss": 2.9399, "theoretical_loss": 3.72050562270448, "tokens_seen": 819009536 }, { "epoch": 2.04, "learning_rate": 0.0003797291875626881, "loss": 3.0579, "theoretical_loss": 3.7204762373754594, "tokens_seen": 819075072 }, { "epoch": 2.04, "learning_rate": 0.00037971915747241725, "loss": 2.9216, "theoretical_loss": 3.720446855055801, "tokens_seen": 819140608 }, { "debugging/Self-BLEU-5": 0.5784833545734609, "debugging/distinct-1-grams": 0.751683025940353, "debugging/distinct-2-grams": 0.9438828390481878, "debugging/entropy-1-grams": 6.234822116975876, "debugging/entropy-2-grams": 7.324681304022325, "debugging/length": 556.4285714285714, "debugging/num_segments": 21, "epoch": 2.04, "objective/train/docs_used": 1323057, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8465468883514404, "objective/train/theoretical_loss": 3.720432165024312, "objective/train/tokens_used": 839633376, "theoretical_loss": 3.720432165024312, "tokens_seen": 819173376 }, { "epoch": 2.04, "learning_rate": 0.0003797091273821465, "loss": 3.1506, "theoretical_loss": 3.7204174757449566, "tokens_seen": 819206144 }, { "epoch": 2.04, "learning_rate": 0.0003796990972918756, "loss": 2.9627, "theoretical_loss": 3.7203880994423777, "tokens_seen": 819271680 }, { "epoch": 2.04, "learning_rate": 0.00037968906720160485, "loss": 2.8166, "theoretical_loss": 3.7203587261475137, "tokens_seen": 819337216 }, { "epoch": 2.04, "learning_rate": 0.000379679037111334, "loss": 2.9864, "theoretical_loss": 3.7203293558598185, "tokens_seen": 819402752 }, { "epoch": 2.04, "learning_rate": 0.0003796690070210632, "loss": 2.8265, "theoretical_loss": 3.7202999885787422, "tokens_seen": 819468288 }, { "epoch": 2.04, "learning_rate": 0.0003796589769307924, "loss": 2.9471, "theoretical_loss": 3.720270624303737, "tokens_seen": 819533824 }, { "epoch": 2.04, "learning_rate": 0.0003796489468405216, "loss": 3.0519, "theoretical_loss": 3.7202412630342554, "tokens_seen": 819599360 }, { "epoch": 2.04, "learning_rate": 0.00037963891675025076, "loss": 3.0037, "theoretical_loss": 3.7202119047697484, "tokens_seen": 819664896 }, { "epoch": 2.04, "learning_rate": 0.00037962888665997994, "loss": 3.0567, "theoretical_loss": 3.720182549509669, "tokens_seen": 819730432 }, { "epoch": 2.04, "learning_rate": 0.0003796188565697091, "loss": 2.8441, "theoretical_loss": 3.7201531972534694, "tokens_seen": 819795968 }, { "epoch": 2.04, "learning_rate": 0.00037960882647943836, "loss": 2.9474, "theoretical_loss": 3.7201238480006027, "tokens_seen": 819861504 }, { "epoch": 2.04, "learning_rate": 0.0003795987963891675, "loss": 3.0012, "theoretical_loss": 3.7200945017505207, "tokens_seen": 819927040 }, { "epoch": 2.04, "learning_rate": 0.0003795887662988967, "loss": 2.6555, "theoretical_loss": 3.7200651585026763, "tokens_seen": 819992576 }, { "epoch": 2.04, "learning_rate": 0.0003795787362086259, "loss": 2.9011, "theoretical_loss": 3.7200358182565227, "tokens_seen": 820058112 }, { "epoch": 2.04, "learning_rate": 0.0003795687061183551, "loss": 2.8915, "theoretical_loss": 3.720006481011513, "tokens_seen": 820123648 }, { "epoch": 2.04, "learning_rate": 0.00037955867602808426, "loss": 2.9514, "theoretical_loss": 3.7199771467671003, "tokens_seen": 820189184 }, { "epoch": 2.04, "learning_rate": 0.00037954864593781344, "loss": 2.9086, "theoretical_loss": 3.7199478155227386, "tokens_seen": 820254720 }, { "epoch": 2.04, "learning_rate": 0.0003795386158475426, "loss": 3.0309, "theoretical_loss": 3.71991848727788, "tokens_seen": 820320256 }, { "epoch": 2.04, "learning_rate": 0.00037952858575727186, "loss": 2.8724, "theoretical_loss": 3.7198891620319796, "tokens_seen": 820385792 }, { "epoch": 2.04, "learning_rate": 0.000379518555667001, "loss": 2.9301, "theoretical_loss": 3.719859839784491, "tokens_seen": 820451328 }, { "epoch": 2.04, "learning_rate": 0.0003795085255767302, "loss": 2.8847, "theoretical_loss": 3.719830520534867, "tokens_seen": 820516864 }, { "epoch": 2.04, "learning_rate": 0.00037949849548645935, "loss": 3.0645, "theoretical_loss": 3.719801204282563, "tokens_seen": 820582400 }, { "epoch": 2.04, "learning_rate": 0.0003794884653961886, "loss": 2.9593, "theoretical_loss": 3.7197718910270328, "tokens_seen": 820647936 }, { "epoch": 2.04, "learning_rate": 0.00037947843530591776, "loss": 2.9224, "theoretical_loss": 3.719742580767731, "tokens_seen": 820713472 }, { "epoch": 2.04, "learning_rate": 0.00037946840521564695, "loss": 2.9554, "theoretical_loss": 3.719713273504111, "tokens_seen": 820779008 }, { "epoch": 2.04, "objective/train/docs_used": 1325039, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8949291706085205, "objective/train/theoretical_loss": 3.719698620995512, "objective/train/tokens_used": 841271776, "theoretical_loss": 3.719698620995512, "tokens_seen": 820811776 }, { "epoch": 2.04, "learning_rate": 0.0003794583751253761, "loss": 2.7278, "theoretical_loss": 3.7196839692356285, "tokens_seen": 820844544 }, { "epoch": 2.04, "learning_rate": 0.00037944834503510536, "loss": 2.9929, "theoretical_loss": 3.7196546679617386, "tokens_seen": 820910080 }, { "epoch": 2.04, "learning_rate": 0.0003794383149448345, "loss": 2.8594, "theoretical_loss": 3.7196253696818955, "tokens_seen": 820975616 }, { "epoch": 2.04, "learning_rate": 0.0003794282848545637, "loss": 2.8303, "theoretical_loss": 3.7195960743955547, "tokens_seen": 821041152 }, { "epoch": 2.04, "learning_rate": 0.00037941825476429285, "loss": 2.956, "theoretical_loss": 3.7195667821021714, "tokens_seen": 821106688 }, { "epoch": 2.04, "learning_rate": 0.0003794082246740221, "loss": 2.9004, "theoretical_loss": 3.7195374928012006, "tokens_seen": 821172224 }, { "epoch": 2.04, "learning_rate": 0.00037939819458375127, "loss": 2.8914, "theoretical_loss": 3.7195082064920983, "tokens_seen": 821237760 }, { "epoch": 2.04, "learning_rate": 0.00037938816449348045, "loss": 2.9762, "theoretical_loss": 3.7194789231743197, "tokens_seen": 821303296 }, { "epoch": 2.04, "learning_rate": 0.00037937813440320963, "loss": 2.8892, "theoretical_loss": 3.719449642847321, "tokens_seen": 821368832 }, { "epoch": 2.04, "learning_rate": 0.0003793681043129388, "loss": 2.7984, "theoretical_loss": 3.719420365510558, "tokens_seen": 821434368 }, { "epoch": 2.04, "learning_rate": 0.000379358074222668, "loss": 2.8938, "theoretical_loss": 3.7193910911634878, "tokens_seen": 821499904 }, { "epoch": 2.04, "learning_rate": 0.00037934804413239723, "loss": 2.957, "theoretical_loss": 3.719361819805565, "tokens_seen": 821565440 }, { "epoch": 2.04, "learning_rate": 0.00037933801404212635, "loss": 3.0268, "theoretical_loss": 3.7193325514362465, "tokens_seen": 821630976 }, { "epoch": 2.04, "learning_rate": 0.0003793279839518556, "loss": 2.8709, "theoretical_loss": 3.7193032860549886, "tokens_seen": 821696512 }, { "epoch": 2.04, "learning_rate": 0.0003793179538615847, "loss": 3.0926, "theoretical_loss": 3.7192740236612485, "tokens_seen": 821762048 }, { "epoch": 2.04, "learning_rate": 0.00037930792377131395, "loss": 2.9047, "theoretical_loss": 3.7192447642544835, "tokens_seen": 821827584 }, { "epoch": 2.04, "learning_rate": 0.00037929789368104313, "loss": 2.716, "theoretical_loss": 3.719215507834149, "tokens_seen": 821893120 }, { "epoch": 2.04, "learning_rate": 0.0003792878635907723, "loss": 2.8547, "theoretical_loss": 3.7191862543997036, "tokens_seen": 821958656 }, { "epoch": 2.04, "learning_rate": 0.0003792778335005015, "loss": 2.8314, "theoretical_loss": 3.7191570039506034, "tokens_seen": 822024192 }, { "epoch": 2.04, "learning_rate": 0.00037926780341023073, "loss": 3.0047, "theoretical_loss": 3.719127756486307, "tokens_seen": 822089728 }, { "epoch": 2.04, "learning_rate": 0.00037925777331995986, "loss": 3.1329, "theoretical_loss": 3.7190985120062705, "tokens_seen": 822155264 }, { "epoch": 2.04, "learning_rate": 0.0003792477432296891, "loss": 3.1125, "theoretical_loss": 3.7190692705099524, "tokens_seen": 822220800 }, { "epoch": 2.04, "learning_rate": 0.0003792377131394182, "loss": 3.0314, "theoretical_loss": 3.71904003199681, "tokens_seen": 822286336 }, { "epoch": 2.04, "learning_rate": 0.00037922768304914746, "loss": 2.8854, "theoretical_loss": 3.719010796466302, "tokens_seen": 822351872 }, { "epoch": 2.04, "learning_rate": 0.00037921765295887664, "loss": 2.9366, "theoretical_loss": 3.718981563917886, "tokens_seen": 822417408 }, { "epoch": 2.04, "objective/train/docs_used": 1327913, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9839601516723633, "objective/train/theoretical_loss": 3.718966948761793, "objective/train/tokens_used": 842910176, "theoretical_loss": 3.718966948761793, "tokens_seen": 822450176 }, { "epoch": 2.04, "learning_rate": 0.0003792076228686058, "loss": 2.9376, "theoretical_loss": 3.7189523343510205, "tokens_seen": 822482944 }, { "epoch": 2.04, "learning_rate": 0.000379197592778335, "loss": 3.036, "theoretical_loss": 3.718923107765163, "tokens_seen": 822548480 }, { "epoch": 2.04, "learning_rate": 0.0003791875626880642, "loss": 3.0581, "theoretical_loss": 3.7188938841597734, "tokens_seen": 822614016 }, { "epoch": 2.04, "learning_rate": 0.00037917753259779336, "loss": 2.8463, "theoretical_loss": 3.718864663534309, "tokens_seen": 822679552 }, { "epoch": 2.04, "learning_rate": 0.0003791675025075226, "loss": 2.908, "theoretical_loss": 3.718835445888229, "tokens_seen": 822745088 }, { "epoch": 2.04, "learning_rate": 0.0003791574724172517, "loss": 2.9281, "theoretical_loss": 3.718806231220993, "tokens_seen": 822810624 }, { "epoch": 2.04, "learning_rate": 0.00037914744232698096, "loss": 2.9359, "theoretical_loss": 3.7187770195320593, "tokens_seen": 822876160 }, { "epoch": 2.04, "learning_rate": 0.0003791374122367101, "loss": 3.0153, "theoretical_loss": 3.7187478108208873, "tokens_seen": 822941696 }, { "epoch": 2.04, "learning_rate": 0.0003791273821464393, "loss": 2.953, "theoretical_loss": 3.718718605086937, "tokens_seen": 823007232 }, { "epoch": 2.04, "learning_rate": 0.00037911735205616856, "loss": 2.668, "theoretical_loss": 3.718689402329667, "tokens_seen": 823072768 }, { "epoch": 2.04, "learning_rate": 0.0003791073219658977, "loss": 3.0679, "theoretical_loss": 3.718660202548537, "tokens_seen": 823138304 }, { "epoch": 2.04, "learning_rate": 0.0003790972918756269, "loss": 3.0636, "theoretical_loss": 3.718631005743007, "tokens_seen": 823203840 }, { "epoch": 2.04, "learning_rate": 0.0003790872617853561, "loss": 3.1405, "theoretical_loss": 3.7186018119125377, "tokens_seen": 823269376 }, { "epoch": 2.04, "learning_rate": 0.0003790772316950853, "loss": 2.8887, "theoretical_loss": 3.7185726210565875, "tokens_seen": 823334912 }, { "epoch": 2.04, "learning_rate": 0.00037906720160481446, "loss": 3.0556, "theoretical_loss": 3.7185434331746183, "tokens_seen": 823400448 }, { "epoch": 2.04, "learning_rate": 0.00037905717151454364, "loss": 3.0467, "theoretical_loss": 3.718514248266089, "tokens_seen": 823465984 }, { "epoch": 2.04, "learning_rate": 0.0003790471414242728, "loss": 2.8527, "theoretical_loss": 3.7184850663304614, "tokens_seen": 823531520 }, { "epoch": 2.04, "learning_rate": 0.00037903711133400206, "loss": 2.8909, "theoretical_loss": 3.718455887367195, "tokens_seen": 823597056 }, { "epoch": 2.04, "learning_rate": 0.0003790270812437312, "loss": 2.9331, "theoretical_loss": 3.718426711375751, "tokens_seen": 823662592 }, { "epoch": 2.04, "learning_rate": 0.0003790170511534604, "loss": 2.9137, "theoretical_loss": 3.7183975383555903, "tokens_seen": 823728128 }, { "epoch": 2.04, "learning_rate": 0.00037900702106318955, "loss": 3.0885, "theoretical_loss": 3.7183683683061743, "tokens_seen": 823793664 }, { "epoch": 2.04, "learning_rate": 0.0003789969909729188, "loss": 3.041, "theoretical_loss": 3.7183392012269634, "tokens_seen": 823859200 }, { "epoch": 2.04, "learning_rate": 0.00037898696088264796, "loss": 2.9435, "theoretical_loss": 3.71831003711742, "tokens_seen": 823924736 }, { "epoch": 2.04, "learning_rate": 0.00037897693079237715, "loss": 2.9151, "theoretical_loss": 3.7182808759770047, "tokens_seen": 823990272 }, { "epoch": 2.04, "learning_rate": 0.0003789669007021063, "loss": 2.9861, "theoretical_loss": 3.7182517178051793, "tokens_seen": 824055808 }, { "epoch": 2.04, "objective/train/docs_used": 1329241, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9945855140686035, "objective/train/theoretical_loss": 3.7182371398323193, "objective/train/tokens_used": 844548576, "theoretical_loss": 3.7182371398323193, "tokens_seen": 824088576 }, { "epoch": 2.04, "learning_rate": 0.00037895687061183556, "loss": 2.9006, "theoretical_loss": 3.7182225626014054, "tokens_seen": 824121344 }, { "epoch": 2.04, "learning_rate": 0.0003789468405215647, "loss": 2.8357, "theoretical_loss": 3.7181934103651457, "tokens_seen": 824186880 }, { "epoch": 2.04, "learning_rate": 0.0003789368104312939, "loss": 3.0264, "theoretical_loss": 3.7181642610958607, "tokens_seen": 824252416 }, { "epoch": 2.04, "learning_rate": 0.00037892678034102305, "loss": 2.9867, "theoretical_loss": 3.7181351147930144, "tokens_seen": 824317952 }, { "epoch": 2.04, "learning_rate": 0.0003789167502507523, "loss": 2.8263, "theoretical_loss": 3.718105971456068, "tokens_seen": 824383488 }, { "epoch": 2.04, "learning_rate": 0.00037890672016048147, "loss": 2.9161, "theoretical_loss": 3.7180768310844834, "tokens_seen": 824449024 }, { "epoch": 2.04, "learning_rate": 0.00037889669007021065, "loss": 2.9939, "theoretical_loss": 3.7180476936777245, "tokens_seen": 824514560 }, { "epoch": 2.04, "learning_rate": 0.00037888665997993983, "loss": 2.8082, "theoretical_loss": 3.7180185592352535, "tokens_seen": 824580096 }, { "epoch": 2.04, "learning_rate": 0.000378876629889669, "loss": 3.0709, "theoretical_loss": 3.7179894277565335, "tokens_seen": 824645632 }, { "epoch": 2.04, "learning_rate": 0.0003788665997993982, "loss": 2.8271, "theoretical_loss": 3.717960299241027, "tokens_seen": 824711168 }, { "epoch": 2.04, "learning_rate": 0.00037885656970912743, "loss": 3.0409, "theoretical_loss": 3.7179311736881973, "tokens_seen": 824776704 }, { "epoch": 2.04, "learning_rate": 0.00037884653961885655, "loss": 2.9865, "theoretical_loss": 3.7179020510975076, "tokens_seen": 824842240 }, { "epoch": 2.04, "learning_rate": 0.0003788365095285858, "loss": 2.9091, "theoretical_loss": 3.717872931468422, "tokens_seen": 824907776 }, { "epoch": 2.04, "learning_rate": 0.0003788264794383149, "loss": 2.9229, "theoretical_loss": 3.717843814800403, "tokens_seen": 824973312 }, { "epoch": 2.04, "learning_rate": 0.00037881644934804415, "loss": 2.9599, "theoretical_loss": 3.717814701092915, "tokens_seen": 825038848 }, { "epoch": 2.04, "learning_rate": 0.00037880641925777333, "loss": 2.8793, "theoretical_loss": 3.7177855903454224, "tokens_seen": 825104384 }, { "epoch": 2.04, "learning_rate": 0.0003787963891675025, "loss": 2.8642, "theoretical_loss": 3.717756482557388, "tokens_seen": 825169920 }, { "epoch": 2.04, "learning_rate": 0.0003787863590772317, "loss": 3.0832, "theoretical_loss": 3.717727377728276, "tokens_seen": 825235456 }, { "epoch": 2.04, "learning_rate": 0.00037877632898696093, "loss": 2.8327, "theoretical_loss": 3.7176982758575523, "tokens_seen": 825300992 }, { "epoch": 2.04, "learning_rate": 0.00037876629889669006, "loss": 2.9156, "theoretical_loss": 3.7176691769446792, "tokens_seen": 825366528 }, { "epoch": 2.04, "learning_rate": 0.0003787562688064193, "loss": 2.8532, "theoretical_loss": 3.7176400809891224, "tokens_seen": 825432064 }, { "epoch": 2.04, "learning_rate": 0.0003787462387161484, "loss": 2.9224, "theoretical_loss": 3.717610987990346, "tokens_seen": 825497600 }, { "epoch": 2.04, "learning_rate": 0.00037873620862587766, "loss": 2.9233, "theoretical_loss": 3.7175818979478157, "tokens_seen": 825563136 }, { "epoch": 2.04, "learning_rate": 0.00037872617853560684, "loss": 3.087, "theoretical_loss": 3.7175528108609956, "tokens_seen": 825628672 }, { "epoch": 2.04, "learning_rate": 0.000378716148445336, "loss": 2.9119, "theoretical_loss": 3.7175237267293517, "tokens_seen": 825694208 }, { "epoch": 2.04, "objective/train/docs_used": 1332234, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.964531898498535, "objective/train/theoretical_loss": 3.7175091857715534, "objective/train/tokens_used": 846186976, "theoretical_loss": 3.7175091857715534, "tokens_seen": 825726976 }, { "epoch": 2.04, "learning_rate": 0.0003787061183550652, "loss": 3.0652, "theoretical_loss": 3.717494645552348, "tokens_seen": 825759744 }, { "epoch": 2.04, "learning_rate": 0.0003786960882647944, "loss": 3.0047, "theoretical_loss": 3.717465567329451, "tokens_seen": 825825280 }, { "epoch": 2.04, "learning_rate": 0.00037868605817452356, "loss": 2.9884, "theoretical_loss": 3.7174364920601257, "tokens_seen": 825890816 }, { "epoch": 2.04, "learning_rate": 0.0003786760280842528, "loss": 2.8382, "theoretical_loss": 3.717407419743838, "tokens_seen": 825956352 }, { "epoch": 2.04, "learning_rate": 0.0003786659979939819, "loss": 2.9706, "theoretical_loss": 3.717378350380053, "tokens_seen": 826021888 }, { "epoch": 2.04, "learning_rate": 0.00037865596790371116, "loss": 2.903, "theoretical_loss": 3.717349283968238, "tokens_seen": 826087424 }, { "epoch": 2.04, "learning_rate": 0.0003786459378134403, "loss": 2.7872, "theoretical_loss": 3.7173202205078577, "tokens_seen": 826152960 }, { "epoch": 2.04, "learning_rate": 0.0003786359077231695, "loss": 3.0769, "theoretical_loss": 3.7172911599983793, "tokens_seen": 826218496 }, { "epoch": 2.04, "learning_rate": 0.0003786258776328987, "loss": 3.0148, "theoretical_loss": 3.7172621024392685, "tokens_seen": 826284032 }, { "epoch": 2.04, "learning_rate": 0.0003786158475426279, "loss": 2.8276, "theoretical_loss": 3.7172330478299918, "tokens_seen": 826349568 }, { "epoch": 2.04, "learning_rate": 0.00037860581745235706, "loss": 3.0171, "theoretical_loss": 3.7172039961700163, "tokens_seen": 826415104 }, { "epoch": 2.04, "learning_rate": 0.0003785957873620863, "loss": 2.8536, "theoretical_loss": 3.7171749474588083, "tokens_seen": 826480640 }, { "epoch": 2.04, "learning_rate": 0.0003785857572718154, "loss": 2.9844, "theoretical_loss": 3.7171459016958353, "tokens_seen": 826546176 }, { "epoch": 2.04, "learning_rate": 0.00037857572718154466, "loss": 2.8717, "theoretical_loss": 3.717116858880564, "tokens_seen": 826611712 }, { "epoch": 2.04, "learning_rate": 0.0003785656970912738, "loss": 2.9968, "theoretical_loss": 3.7170878190124617, "tokens_seen": 826677248 }, { "epoch": 2.04, "learning_rate": 0.000378555667001003, "loss": 2.917, "theoretical_loss": 3.7170587820909953, "tokens_seen": 826742784 }, { "epoch": 2.04, "learning_rate": 0.0003785456369107322, "loss": 3.1072, "theoretical_loss": 3.7170297481156322, "tokens_seen": 826808320 }, { "epoch": 2.04, "learning_rate": 0.0003785356068204614, "loss": 2.9683, "theoretical_loss": 3.717000717085841, "tokens_seen": 826873856 }, { "epoch": 2.04, "learning_rate": 0.00037852557673019057, "loss": 2.9914, "theoretical_loss": 3.716971689001088, "tokens_seen": 826939392 }, { "epoch": 2.04, "learning_rate": 0.00037851554663991975, "loss": 3.0238, "theoretical_loss": 3.716942663860843, "tokens_seen": 827004928 }, { "epoch": 2.04, "learning_rate": 0.00037850551654964893, "loss": 2.8231, "theoretical_loss": 3.716913641664572, "tokens_seen": 827070464 }, { "epoch": 2.04, "learning_rate": 0.00037849548645937816, "loss": 2.9864, "theoretical_loss": 3.7168846224117447, "tokens_seen": 827136000 }, { "epoch": 2.04, "learning_rate": 0.0003784854563691073, "loss": 3.0751, "theoretical_loss": 3.7168556061018285, "tokens_seen": 827201536 }, { "epoch": 2.04, "learning_rate": 0.0003784754262788365, "loss": 2.8913, "theoretical_loss": 3.716826592734292, "tokens_seen": 827267072 }, { "epoch": 2.04, "learning_rate": 0.00037846539618856565, "loss": 2.907, "theoretical_loss": 3.7167975823086037, "tokens_seen": 827332608 }, { "epoch": 2.04, "objective/train/docs_used": 1334883, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.006484270095825, "objective/train/theoretical_loss": 3.7167830781987874, "objective/train/tokens_used": 847825376, "theoretical_loss": 3.7167830781987874, "tokens_seen": 827365376 }, { "epoch": 2.04, "learning_rate": 0.0003784553660982949, "loss": 2.8212, "theoretical_loss": 3.716768574824233, "tokens_seen": 827398144 }, { "epoch": 2.04, "learning_rate": 0.00037844533600802407, "loss": 2.8698, "theoretical_loss": 3.716739570280648, "tokens_seen": 827463680 }, { "epoch": 2.04, "learning_rate": 0.00037843530591775325, "loss": 2.9078, "theoretical_loss": 3.7167105686773176, "tokens_seen": 827529216 }, { "epoch": 2.04, "learning_rate": 0.00037842527582748243, "loss": 2.8393, "theoretical_loss": 3.716681570013712, "tokens_seen": 827594752 }, { "epoch": 2.04, "learning_rate": 0.00037841524573721167, "loss": 2.8626, "theoretical_loss": 3.716652574289299, "tokens_seen": 827660288 }, { "epoch": 2.04, "learning_rate": 0.0003784052156469408, "loss": 2.876, "theoretical_loss": 3.7166235815035487, "tokens_seen": 827725824 }, { "epoch": 2.04, "learning_rate": 0.00037839518555667003, "loss": 2.892, "theoretical_loss": 3.716594591655931, "tokens_seen": 827791360 }, { "epoch": 2.04, "learning_rate": 0.00037838515546639916, "loss": 3.0024, "theoretical_loss": 3.716565604745915, "tokens_seen": 827856896 }, { "epoch": 2.04, "learning_rate": 0.0003783751253761284, "loss": 3.1253, "theoretical_loss": 3.7165366207729704, "tokens_seen": 827922432 }, { "epoch": 2.04, "learning_rate": 0.00037836509528585763, "loss": 2.9897, "theoretical_loss": 3.716507639736568, "tokens_seen": 827987968 }, { "epoch": 2.04, "learning_rate": 0.00037835506519558675, "loss": 3.0705, "theoretical_loss": 3.7164786616361773, "tokens_seen": 828053504 }, { "epoch": 2.04, "learning_rate": 0.000378345035105316, "loss": 2.9435, "theoretical_loss": 3.716449686471268, "tokens_seen": 828119040 }, { "epoch": 2.04, "learning_rate": 0.0003783350050150451, "loss": 2.9977, "theoretical_loss": 3.7164207142413117, "tokens_seen": 828184576 }, { "epoch": 2.04, "learning_rate": 0.00037832497492477435, "loss": 2.974, "theoretical_loss": 3.7163917449457777, "tokens_seen": 828250112 }, { "epoch": 2.04, "learning_rate": 0.00037831494483450353, "loss": 2.9994, "theoretical_loss": 3.7163627785841373, "tokens_seen": 828315648 }, { "epoch": 2.04, "learning_rate": 0.0003783049147442327, "loss": 2.8947, "theoretical_loss": 3.716333815155861, "tokens_seen": 828381184 }, { "epoch": 2.04, "learning_rate": 0.0003782948846539619, "loss": 3.0344, "theoretical_loss": 3.7163048546604203, "tokens_seen": 828446720 }, { "epoch": 2.04, "learning_rate": 0.00037828485456369113, "loss": 2.9099, "theoretical_loss": 3.716275897097286, "tokens_seen": 828512256 }, { "epoch": 2.04, "learning_rate": 0.00037827482447342026, "loss": 2.8092, "theoretical_loss": 3.7162469424659283, "tokens_seen": 828577792 }, { "epoch": 2.04, "learning_rate": 0.0003782647943831495, "loss": 2.8576, "theoretical_loss": 3.7162179907658195, "tokens_seen": 828643328 }, { "epoch": 2.04, "learning_rate": 0.0003782547642928786, "loss": 2.9369, "theoretical_loss": 3.716189041996431, "tokens_seen": 828708864 }, { "epoch": 2.04, "learning_rate": 0.00037824473420260786, "loss": 3.0788, "theoretical_loss": 3.7161600961572345, "tokens_seen": 828774400 }, { "epoch": 2.04, "learning_rate": 0.00037823470411233704, "loss": 3.0443, "theoretical_loss": 3.716131153247701, "tokens_seen": 828839936 }, { "epoch": 2.04, "learning_rate": 0.0003782246740220662, "loss": 2.9032, "theoretical_loss": 3.716102213267303, "tokens_seen": 828905472 }, { "epoch": 2.04, "learning_rate": 0.0003782146439317954, "loss": 3.0717, "theoretical_loss": 3.7160732762155124, "tokens_seen": 828971008 }, { "epoch": 2.04, "objective/train/docs_used": 1337648, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8371119499206543, "objective/train/theoretical_loss": 3.71605880878768, "objective/train/tokens_used": 849463776, "theoretical_loss": 3.71605880878768, "tokens_seen": 829003776 }, { "epoch": 2.04, "learning_rate": 0.0003782046138415246, "loss": 2.8208, "theoretical_loss": 3.7160443420918012, "tokens_seen": 829036544 }, { "epoch": 2.04, "learning_rate": 0.00037819458375125376, "loss": 2.9032, "theoretical_loss": 3.7160154108956425, "tokens_seen": 829102080 }, { "epoch": 2.04, "learning_rate": 0.000378184553660983, "loss": 2.75, "theoretical_loss": 3.715986482626507, "tokens_seen": 829167616 }, { "epoch": 2.04, "learning_rate": 0.0003781745235707121, "loss": 2.8675, "theoretical_loss": 3.7159575572838692, "tokens_seen": 829233152 }, { "epoch": 2.04, "learning_rate": 0.00037816449348044136, "loss": 2.9092, "theoretical_loss": 3.7159286348672005, "tokens_seen": 829298688 }, { "epoch": 2.04, "learning_rate": 0.0003781544633901705, "loss": 3.0256, "theoretical_loss": 3.715899715375974, "tokens_seen": 829364224 }, { "epoch": 2.04, "learning_rate": 0.0003781444332998997, "loss": 3.0021, "theoretical_loss": 3.7158707988096626, "tokens_seen": 829429760 }, { "epoch": 2.04, "learning_rate": 0.0003781344032096289, "loss": 3.0238, "theoretical_loss": 3.71584188516774, "tokens_seen": 829495296 }, { "epoch": 2.04, "learning_rate": 0.0003781243731193581, "loss": 2.8788, "theoretical_loss": 3.715812974449679, "tokens_seen": 829560832 }, { "epoch": 2.04, "learning_rate": 0.00037811434302908726, "loss": 2.9165, "theoretical_loss": 3.7157840666549524, "tokens_seen": 829626368 }, { "epoch": 2.04, "learning_rate": 0.0003781043129388165, "loss": 2.9787, "theoretical_loss": 3.7157551617830347, "tokens_seen": 829691904 }, { "epoch": 2.04, "learning_rate": 0.0003780942828485456, "loss": 2.871, "theoretical_loss": 3.7157262598333993, "tokens_seen": 829757440 }, { "epoch": 2.04, "learning_rate": 0.00037808425275827486, "loss": 2.9764, "theoretical_loss": 3.7156973608055193, "tokens_seen": 829822976 }, { "epoch": 2.04, "learning_rate": 0.000378074222668004, "loss": 2.9726, "theoretical_loss": 3.7156684646988696, "tokens_seen": 829888512 }, { "epoch": 2.04, "learning_rate": 0.0003780641925777332, "loss": 2.9167, "theoretical_loss": 3.7156395715129236, "tokens_seen": 829954048 }, { "epoch": 2.04, "learning_rate": 0.0003780541624874624, "loss": 2.8516, "theoretical_loss": 3.7156106812471563, "tokens_seen": 830019584 }, { "epoch": 2.04, "learning_rate": 0.0003780441323971916, "loss": 3.0056, "theoretical_loss": 3.7155817939010407, "tokens_seen": 830085120 }, { "epoch": 2.04, "learning_rate": 0.00037803410230692077, "loss": 2.85, "theoretical_loss": 3.7155529094740523, "tokens_seen": 830150656 }, { "epoch": 2.04, "learning_rate": 0.00037802407221664995, "loss": 3.0795, "theoretical_loss": 3.7155240279656647, "tokens_seen": 830216192 }, { "epoch": 2.04, "learning_rate": 0.00037801404212637913, "loss": 2.822, "theoretical_loss": 3.715495149375354, "tokens_seen": 830281728 }, { "epoch": 2.04, "learning_rate": 0.00037800401203610836, "loss": 2.9092, "theoretical_loss": 3.7154662737025945, "tokens_seen": 830347264 }, { "epoch": 2.04, "learning_rate": 0.0003779939819458375, "loss": 3.0331, "theoretical_loss": 3.7154374009468607, "tokens_seen": 830412800 }, { "epoch": 2.04, "learning_rate": 0.00037798395185556673, "loss": 2.8584, "theoretical_loss": 3.7154085311076286, "tokens_seen": 830478336 }, { "epoch": 2.04, "learning_rate": 0.00037797392176529585, "loss": 2.9342, "theoretical_loss": 3.715379664184373, "tokens_seen": 830543872 }, { "epoch": 2.04, "learning_rate": 0.0003779638916750251, "loss": 2.9333, "theoretical_loss": 3.7153508001765685, "tokens_seen": 830609408 }, { "epoch": 2.04, "objective/train/docs_used": 1340604, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.828305721282959, "objective/train/theoretical_loss": 3.7153363692657972, "objective/train/tokens_used": 851102176, "theoretical_loss": 3.7153363692657972, "tokens_seen": 830642176 }, { "epoch": 2.04, "learning_rate": 0.00037795386158475427, "loss": 2.9311, "theoretical_loss": 3.7153219390836925, "tokens_seen": 830674944 }, { "epoch": 2.04, "learning_rate": 0.00037794383149448345, "loss": 3.0246, "theoretical_loss": 3.7152930809052194, "tokens_seen": 830740480 }, { "epoch": 2.04, "learning_rate": 0.00037793380140421263, "loss": 2.733, "theoretical_loss": 3.7152642256406248, "tokens_seen": 830806016 }, { "epoch": 2.04, "learning_rate": 0.00037792377131394187, "loss": 3.012, "theoretical_loss": 3.715235373289386, "tokens_seen": 830871552 }, { "epoch": 2.04, "learning_rate": 0.000377913741223671, "loss": 2.9708, "theoretical_loss": 3.715206523850978, "tokens_seen": 830937088 }, { "epoch": 2.04, "learning_rate": 0.00037790371113340023, "loss": 2.9445, "theoretical_loss": 3.715177677324877, "tokens_seen": 831002624 }, { "epoch": 2.04, "learning_rate": 0.00037789368104312936, "loss": 2.9813, "theoretical_loss": 3.71514883371056, "tokens_seen": 831068160 }, { "epoch": 2.04, "learning_rate": 0.0003778836509528586, "loss": 2.8761, "theoretical_loss": 3.7151199930075025, "tokens_seen": 831133696 }, { "epoch": 2.04, "learning_rate": 0.0003778736208625878, "loss": 3.0293, "theoretical_loss": 3.7150911552151826, "tokens_seen": 831199232 }, { "epoch": 2.04, "learning_rate": 0.00037786359077231695, "loss": 2.9785, "theoretical_loss": 3.715062320333076, "tokens_seen": 831264768 }, { "epoch": 2.04, "learning_rate": 0.00037785356068204614, "loss": 2.9045, "theoretical_loss": 3.7150334883606604, "tokens_seen": 831330304 }, { "epoch": 2.04, "learning_rate": 0.0003778435305917753, "loss": 2.8783, "theoretical_loss": 3.7150046592974113, "tokens_seen": 831395840 }, { "epoch": 2.04, "learning_rate": 0.0003778335005015045, "loss": 3.0986, "theoretical_loss": 3.714975833142808, "tokens_seen": 831461376 }, { "epoch": 2.04, "learning_rate": 0.00037782347041123373, "loss": 2.8999, "theoretical_loss": 3.7149470098963255, "tokens_seen": 831526912 }, { "epoch": 2.04, "learning_rate": 0.00037781344032096286, "loss": 2.8266, "theoretical_loss": 3.7149181895574435, "tokens_seen": 831592448 }, { "epoch": 2.04, "learning_rate": 0.0003778034102306921, "loss": 2.9531, "theoretical_loss": 3.714889372125638, "tokens_seen": 831657984 }, { "epoch": 2.04, "learning_rate": 0.0003777933801404213, "loss": 2.8786, "theoretical_loss": 3.7148605576003875, "tokens_seen": 831723520 }, { "epoch": 2.04, "learning_rate": 0.00037778335005015046, "loss": 2.9211, "theoretical_loss": 3.7148317459811695, "tokens_seen": 831789056 }, { "epoch": 2.04, "learning_rate": 0.00037777331995987964, "loss": 3.0057, "theoretical_loss": 3.7148029372674625, "tokens_seen": 831854592 }, { "epoch": 2.05, "learning_rate": 0.0003777632898696088, "loss": 2.8094, "theoretical_loss": 3.7147741314587437, "tokens_seen": 831920128 }, { "epoch": 2.05, "learning_rate": 0.000377753259779338, "loss": 2.9274, "theoretical_loss": 3.7147453285544922, "tokens_seen": 831985664 }, { "epoch": 2.05, "learning_rate": 0.00037774322968906724, "loss": 2.9912, "theoretical_loss": 3.714716528554186, "tokens_seen": 832051200 }, { "epoch": 2.05, "learning_rate": 0.00037773319959879636, "loss": 2.9748, "theoretical_loss": 3.7146877314573037, "tokens_seen": 832116736 }, { "epoch": 2.05, "learning_rate": 0.0003777231695085256, "loss": 2.9168, "theoretical_loss": 3.714658937263324, "tokens_seen": 832182272 }, { "epoch": 2.05, "learning_rate": 0.0003777131394182547, "loss": 3.0075, "theoretical_loss": 3.714630145971726, "tokens_seen": 832247808 }, { "epoch": 2.05, "objective/train/docs_used": 1343375, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.065286159515381, "objective/train/theoretical_loss": 3.714615751414157, "objective/train/tokens_used": 852740576, "theoretical_loss": 3.714615751414157, "tokens_seen": 832280576 }, { "epoch": 2.05, "learning_rate": 0.00037770310932798396, "loss": 2.8322, "theoretical_loss": 3.7146013575819876, "tokens_seen": 832313344 }, { "epoch": 2.05, "learning_rate": 0.00037769307923771314, "loss": 3.049, "theoretical_loss": 3.714572572093589, "tokens_seen": 832378880 }, { "epoch": 2.05, "learning_rate": 0.0003776830491474423, "loss": 2.902, "theoretical_loss": 3.714543789506009, "tokens_seen": 832444416 }, { "epoch": 2.05, "learning_rate": 0.0003776730190571715, "loss": 2.8847, "theoretical_loss": 3.7145150098187267, "tokens_seen": 832509952 }, { "epoch": 2.05, "learning_rate": 0.0003776629889669007, "loss": 3.0249, "theoretical_loss": 3.714486233031222, "tokens_seen": 832575488 }, { "epoch": 2.05, "learning_rate": 0.00037765295887662987, "loss": 2.9091, "theoretical_loss": 3.7144574591429746, "tokens_seen": 832641024 }, { "epoch": 2.05, "learning_rate": 0.0003776429287863591, "loss": 3.0453, "theoretical_loss": 3.7144286881534634, "tokens_seen": 832706560 }, { "epoch": 2.05, "learning_rate": 0.00037763289869608823, "loss": 3.0086, "theoretical_loss": 3.714399920062169, "tokens_seen": 832772096 }, { "epoch": 2.05, "learning_rate": 0.00037762286860581746, "loss": 3.0524, "theoretical_loss": 3.714371154868571, "tokens_seen": 832837632 }, { "epoch": 2.05, "learning_rate": 0.0003776128385155467, "loss": 2.8887, "theoretical_loss": 3.71434239257215, "tokens_seen": 832903168 }, { "epoch": 2.05, "learning_rate": 0.0003776028084252758, "loss": 3.0896, "theoretical_loss": 3.714313633172386, "tokens_seen": 832968704 }, { "epoch": 2.05, "learning_rate": 0.00037759277833500506, "loss": 2.8974, "theoretical_loss": 3.714284876668759, "tokens_seen": 833034240 }, { "epoch": 2.05, "learning_rate": 0.0003775827482447342, "loss": 2.9648, "theoretical_loss": 3.7142561230607507, "tokens_seen": 833099776 }, { "epoch": 2.05, "learning_rate": 0.0003775727181544634, "loss": 3.0313, "theoretical_loss": 3.7142273723478403, "tokens_seen": 833165312 }, { "epoch": 2.05, "learning_rate": 0.0003775626880641926, "loss": 2.9873, "theoretical_loss": 3.7141986245295096, "tokens_seen": 833230848 }, { "epoch": 2.05, "learning_rate": 0.0003775526579739218, "loss": 3.0442, "theoretical_loss": 3.7141698796052394, "tokens_seen": 833296384 }, { "epoch": 2.05, "learning_rate": 0.00037754262788365097, "loss": 2.8443, "theoretical_loss": 3.7141411375745106, "tokens_seen": 833361920 }, { "epoch": 2.05, "learning_rate": 0.00037753259779338015, "loss": 3.0684, "theoretical_loss": 3.7141123984368045, "tokens_seen": 833427456 }, { "epoch": 2.05, "learning_rate": 0.00037752256770310933, "loss": 2.9741, "theoretical_loss": 3.7140836621916025, "tokens_seen": 833492992 }, { "epoch": 2.05, "learning_rate": 0.00037751253761283857, "loss": 2.9687, "theoretical_loss": 3.7140549288383866, "tokens_seen": 833558528 }, { "epoch": 2.05, "learning_rate": 0.0003775025075225677, "loss": 3.0514, "theoretical_loss": 3.714026198376637, "tokens_seen": 833624064 }, { "epoch": 2.05, "learning_rate": 0.00037749247743229693, "loss": 2.9313, "theoretical_loss": 3.7139974708058365, "tokens_seen": 833689600 }, { "epoch": 2.05, "learning_rate": 0.00037748244734202605, "loss": 3.0109, "theoretical_loss": 3.7139687461254667, "tokens_seen": 833755136 }, { "epoch": 2.05, "learning_rate": 0.0003774724172517553, "loss": 2.8889, "theoretical_loss": 3.71394002433501, "tokens_seen": 833820672 }, { "epoch": 2.05, "learning_rate": 0.00037746238716148447, "loss": 2.9149, "theoretical_loss": 3.713911305433948, "tokens_seen": 833886208 }, { "epoch": 2.05, "objective/train/docs_used": 1346191, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9313440322875977, "objective/train/theoretical_loss": 3.7138969470667784, "objective/train/tokens_used": 854378976, "theoretical_loss": 3.7138969470667784, "tokens_seen": 833918976 }, { "epoch": 2.05, "learning_rate": 0.00037745235707121365, "loss": 2.9581, "theoretical_loss": 3.7138825894217633, "tokens_seen": 833951744 }, { "epoch": 2.05, "learning_rate": 0.00037744232698094283, "loss": 2.998, "theoretical_loss": 3.713853876297938, "tokens_seen": 834017280 }, { "epoch": 2.05, "learning_rate": 0.00037743229689067207, "loss": 2.8687, "theoretical_loss": 3.7138251660619552, "tokens_seen": 834082816 }, { "epoch": 2.05, "learning_rate": 0.0003774222668004012, "loss": 3.0473, "theoretical_loss": 3.713796458713297, "tokens_seen": 834148352 }, { "epoch": 2.05, "learning_rate": 0.00037741223671013043, "loss": 2.814, "theoretical_loss": 3.713767754251447, "tokens_seen": 834213888 }, { "epoch": 2.05, "learning_rate": 0.00037740220661985956, "loss": 2.8829, "theoretical_loss": 3.713739052675887, "tokens_seen": 834279424 }, { "epoch": 2.05, "learning_rate": 0.0003773921765295888, "loss": 2.9343, "theoretical_loss": 3.7137103539861007, "tokens_seen": 834344960 }, { "epoch": 2.05, "learning_rate": 0.000377382146439318, "loss": 2.922, "theoretical_loss": 3.7136816581815717, "tokens_seen": 834410496 }, { "epoch": 2.05, "learning_rate": 0.00037737211634904715, "loss": 2.9911, "theoretical_loss": 3.713652965261783, "tokens_seen": 834476032 }, { "epoch": 2.05, "learning_rate": 0.00037736208625877634, "loss": 2.828, "theoretical_loss": 3.713624275226218, "tokens_seen": 834541568 }, { "epoch": 2.05, "learning_rate": 0.0003773520561685055, "loss": 2.723, "theoretical_loss": 3.71359558807436, "tokens_seen": 834607104 }, { "epoch": 2.05, "learning_rate": 0.0003773420260782347, "loss": 2.8835, "theoretical_loss": 3.7135669038056935, "tokens_seen": 834672640 }, { "epoch": 2.05, "learning_rate": 0.00037733199598796393, "loss": 2.9842, "theoretical_loss": 3.713538222419702, "tokens_seen": 834738176 }, { "epoch": 2.05, "learning_rate": 0.00037732196589769306, "loss": 2.9725, "theoretical_loss": 3.7135095439158694, "tokens_seen": 834803712 }, { "epoch": 2.05, "learning_rate": 0.0003773119358074223, "loss": 2.9924, "theoretical_loss": 3.71348086829368, "tokens_seen": 834869248 }, { "epoch": 2.05, "learning_rate": 0.0003773019057171515, "loss": 2.9388, "theoretical_loss": 3.713452195552618, "tokens_seen": 834934784 }, { "epoch": 2.05, "learning_rate": 0.00037729187562688066, "loss": 3.0566, "theoretical_loss": 3.7134235256921677, "tokens_seen": 835000320 }, { "epoch": 2.05, "learning_rate": 0.00037728184553660984, "loss": 2.747, "theoretical_loss": 3.7133948587118137, "tokens_seen": 835065856 }, { "epoch": 2.05, "learning_rate": 0.000377271815446339, "loss": 2.9213, "theoretical_loss": 3.7133661946110412, "tokens_seen": 835131392 }, { "epoch": 2.05, "learning_rate": 0.0003772617853560682, "loss": 2.9489, "theoretical_loss": 3.7133375333893346, "tokens_seen": 835196928 }, { "epoch": 2.05, "learning_rate": 0.00037725175526579744, "loss": 2.8326, "theoretical_loss": 3.7133088750461782, "tokens_seen": 835262464 }, { "epoch": 2.05, "learning_rate": 0.00037724172517552656, "loss": 3.0145, "theoretical_loss": 3.7132802195810584, "tokens_seen": 835328000 }, { "epoch": 2.05, "learning_rate": 0.0003772316950852558, "loss": 2.9155, "theoretical_loss": 3.7132515669934594, "tokens_seen": 835393536 }, { "epoch": 2.05, "learning_rate": 0.0003772216649949849, "loss": 3.0071, "theoretical_loss": 3.7132229172828666, "tokens_seen": 835459072 }, { "epoch": 2.05, "learning_rate": 0.00037721163490471416, "loss": 3.0545, "theoretical_loss": 3.713194270448766, "tokens_seen": 835524608 }, { "epoch": 2.05, "objective/train/docs_used": 1347568, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8355417251586914, "objective/train/theoretical_loss": 3.7131799481102394, "objective/train/tokens_used": 856017376, "theoretical_loss": 3.7131799481102394, "tokens_seen": 835557376 }, { "epoch": 2.05, "learning_rate": 0.00037720160481444334, "loss": 2.9356, "theoretical_loss": 3.713165626490643, "tokens_seen": 835590144 }, { "epoch": 2.05, "learning_rate": 0.0003771915747241725, "loss": 2.9872, "theoretical_loss": 3.713136985407983, "tokens_seen": 835655680 }, { "epoch": 2.05, "learning_rate": 0.0003771815446339017, "loss": 2.9401, "theoretical_loss": 3.713108347200272, "tokens_seen": 835721216 }, { "epoch": 2.05, "learning_rate": 0.0003771715145436309, "loss": 3.0416, "theoretical_loss": 3.713079711866997, "tokens_seen": 835786752 }, { "epoch": 2.05, "learning_rate": 0.00037716148445336007, "loss": 3.0611, "theoretical_loss": 3.7130510794076423, "tokens_seen": 835852288 }, { "epoch": 2.05, "learning_rate": 0.0003771514543630893, "loss": 2.8146, "theoretical_loss": 3.713022449821696, "tokens_seen": 835917824 }, { "epoch": 2.05, "learning_rate": 0.00037714142427281843, "loss": 3.0682, "theoretical_loss": 3.7129938231086426, "tokens_seen": 835983360 }, { "epoch": 2.05, "learning_rate": 0.00037713139418254766, "loss": 2.9376, "theoretical_loss": 3.7129651992679706, "tokens_seen": 836048896 }, { "epoch": 2.05, "learning_rate": 0.00037712136409227685, "loss": 3.0376, "theoretical_loss": 3.712936578299165, "tokens_seen": 836114432 }, { "epoch": 2.05, "learning_rate": 0.000377111334002006, "loss": 3.0191, "theoretical_loss": 3.7129079602017137, "tokens_seen": 836179968 }, { "epoch": 2.05, "learning_rate": 0.0003771013039117352, "loss": 2.8231, "theoretical_loss": 3.712879344975103, "tokens_seen": 836245504 }, { "epoch": 2.05, "learning_rate": 0.0003770912738214644, "loss": 2.9138, "theoretical_loss": 3.7128507326188203, "tokens_seen": 836311040 }, { "epoch": 2.05, "learning_rate": 0.00037708124373119357, "loss": 2.9406, "theoretical_loss": 3.712822123132353, "tokens_seen": 836376576 }, { "epoch": 2.05, "learning_rate": 0.0003770712136409228, "loss": 2.7915, "theoretical_loss": 3.7127935165151875, "tokens_seen": 836442112 }, { "epoch": 2.05, "learning_rate": 0.00037706118355065193, "loss": 3.0626, "theoretical_loss": 3.7127649127668123, "tokens_seen": 836507648 }, { "epoch": 2.05, "learning_rate": 0.00037705115346038117, "loss": 3.0655, "theoretical_loss": 3.7127363118867147, "tokens_seen": 836573184 }, { "epoch": 2.05, "learning_rate": 0.0003770411233701103, "loss": 2.8428, "theoretical_loss": 3.712707713874382, "tokens_seen": 836638720 }, { "epoch": 2.05, "learning_rate": 0.00037703109327983953, "loss": 2.9454, "theoretical_loss": 3.7126791187293025, "tokens_seen": 836704256 }, { "epoch": 2.05, "learning_rate": 0.0003770210631895687, "loss": 3.0744, "theoretical_loss": 3.7126505264509637, "tokens_seen": 836769792 }, { "epoch": 2.05, "learning_rate": 0.0003770110330992979, "loss": 2.8801, "theoretical_loss": 3.712621937038854, "tokens_seen": 836835328 }, { "epoch": 2.05, "learning_rate": 0.0003770010030090271, "loss": 3.2025, "theoretical_loss": 3.7125933504924618, "tokens_seen": 836900864 }, { "epoch": 2.05, "learning_rate": 0.00037699097291875625, "loss": 2.8573, "theoretical_loss": 3.7125647668112753, "tokens_seen": 836966400 }, { "epoch": 2.05, "learning_rate": 0.00037698094282848544, "loss": 2.6744, "theoretical_loss": 3.712536185994783, "tokens_seen": 837031936 }, { "epoch": 2.05, "learning_rate": 0.00037697091273821467, "loss": 3.0397, "theoretical_loss": 3.7125076080424737, "tokens_seen": 837097472 }, { "epoch": 2.05, "learning_rate": 0.0003769608826479438, "loss": 2.9486, "theoretical_loss": 3.712479032953836, "tokens_seen": 837163008 }, { "epoch": 2.05, "objective/train/docs_used": 1350397, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.29722261428833, "objective/train/theoretical_loss": 3.712464746483234, "objective/train/tokens_used": 857655776, "theoretical_loss": 3.712464746483234, "tokens_seen": 837195776 }, { "epoch": 2.05, "learning_rate": 0.00037695085255767303, "loss": 3.0491, "theoretical_loss": 3.712450460728359, "tokens_seen": 837228544 }, { "epoch": 2.05, "learning_rate": 0.0003769408224674022, "loss": 2.8499, "theoretical_loss": 3.7124218913655316, "tokens_seen": 837294080 }, { "epoch": 2.05, "learning_rate": 0.0003769307923771314, "loss": 3.1533, "theoretical_loss": 3.712393324864842, "tokens_seen": 837359616 }, { "epoch": 2.05, "learning_rate": 0.0003769207622868606, "loss": 2.8723, "theoretical_loss": 3.712364761225781, "tokens_seen": 837425152 }, { "epoch": 2.05, "learning_rate": 0.00037691073219658976, "loss": 2.9623, "theoretical_loss": 3.7123362004478375, "tokens_seen": 837490688 }, { "epoch": 2.05, "learning_rate": 0.00037690070210631894, "loss": 2.8175, "theoretical_loss": 3.7123076425305013, "tokens_seen": 837556224 }, { "epoch": 2.05, "learning_rate": 0.0003768906720160482, "loss": 3.1571, "theoretical_loss": 3.7122790874732616, "tokens_seen": 837621760 }, { "epoch": 2.05, "learning_rate": 0.0003768806419257773, "loss": 2.8948, "theoretical_loss": 3.7122505352756083, "tokens_seen": 837687296 }, { "epoch": 2.05, "learning_rate": 0.00037687061183550654, "loss": 3.0573, "theoretical_loss": 3.712221985937031, "tokens_seen": 837752832 }, { "epoch": 2.05, "learning_rate": 0.0003768605817452357, "loss": 2.9405, "theoretical_loss": 3.712193439457021, "tokens_seen": 837818368 }, { "epoch": 2.05, "learning_rate": 0.0003768505516549649, "loss": 2.933, "theoretical_loss": 3.712164895835068, "tokens_seen": 837883904 }, { "epoch": 2.05, "learning_rate": 0.00037684052156469413, "loss": 2.9983, "theoretical_loss": 3.712136355070661, "tokens_seen": 837949440 }, { "epoch": 2.05, "learning_rate": 0.00037683049147442326, "loss": 2.8018, "theoretical_loss": 3.712107817163292, "tokens_seen": 838014976 }, { "epoch": 2.05, "learning_rate": 0.0003768204613841525, "loss": 2.7804, "theoretical_loss": 3.7120792821124513, "tokens_seen": 838080512 }, { "epoch": 2.05, "learning_rate": 0.0003768104312938817, "loss": 2.9337, "theoretical_loss": 3.71205074991763, "tokens_seen": 838146048 }, { "epoch": 2.05, "learning_rate": 0.00037680040120361086, "loss": 3.0956, "theoretical_loss": 3.712022220578317, "tokens_seen": 838211584 }, { "epoch": 2.05, "learning_rate": 0.00037679037111334004, "loss": 2.9785, "theoretical_loss": 3.711993694094006, "tokens_seen": 838277120 }, { "epoch": 2.05, "learning_rate": 0.0003767803410230692, "loss": 3.0397, "theoretical_loss": 3.711965170464186, "tokens_seen": 838342656 }, { "epoch": 2.05, "learning_rate": 0.0003767703109327984, "loss": 3.0015, "theoretical_loss": 3.71193664968835, "tokens_seen": 838408192 }, { "epoch": 2.05, "learning_rate": 0.00037676028084252764, "loss": 3.1117, "theoretical_loss": 3.7119081317659877, "tokens_seen": 838473728 }, { "epoch": 2.05, "learning_rate": 0.00037675025075225676, "loss": 2.8481, "theoretical_loss": 3.711879616696592, "tokens_seen": 838539264 }, { "epoch": 2.05, "learning_rate": 0.000376740220661986, "loss": 3.0265, "theoretical_loss": 3.7118511044796536, "tokens_seen": 838604800 }, { "epoch": 2.05, "learning_rate": 0.0003767301905717151, "loss": 2.935, "theoretical_loss": 3.711822595114665, "tokens_seen": 838670336 }, { "epoch": 2.05, "learning_rate": 0.00037672016048144436, "loss": 2.8883, "theoretical_loss": 3.711794088601118, "tokens_seen": 838735872 }, { "epoch": 2.05, "learning_rate": 0.00037671013039117354, "loss": 2.9281, "theoretical_loss": 3.7117655849385036, "tokens_seen": 838801408 }, { "epoch": 2.05, "objective/train/docs_used": 1352942, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1044812202453613, "objective/train/theoretical_loss": 3.711751334176138, "objective/train/tokens_used": 859294176, "theoretical_loss": 3.711751334176138, "tokens_seen": 838834176 }, { "epoch": 2.05, "learning_rate": 0.0003767001003009027, "loss": 3.1477, "theoretical_loss": 3.711737084126315, "tokens_seen": 838866944 }, { "epoch": 2.05, "learning_rate": 0.0003766900702106319, "loss": 2.9682, "theoretical_loss": 3.711708586164044, "tokens_seen": 838932480 }, { "epoch": 2.05, "learning_rate": 0.0003766800401203611, "loss": 2.8912, "theoretical_loss": 3.711680091051184, "tokens_seen": 838998016 }, { "epoch": 2.05, "learning_rate": 0.00037667001003009027, "loss": 2.9652, "theoretical_loss": 3.7116515987872267, "tokens_seen": 839063552 }, { "epoch": 2.05, "learning_rate": 0.0003766599799398195, "loss": 2.9224, "theoretical_loss": 3.711623109371665, "tokens_seen": 839129088 }, { "epoch": 2.05, "learning_rate": 0.00037664994984954863, "loss": 2.8854, "theoretical_loss": 3.711594622803991, "tokens_seen": 839194624 }, { "epoch": 2.05, "learning_rate": 0.00037663991975927786, "loss": 3.0607, "theoretical_loss": 3.7115661390836987, "tokens_seen": 839260160 }, { "epoch": 2.05, "learning_rate": 0.00037662988966900705, "loss": 3.013, "theoretical_loss": 3.7115376582102804, "tokens_seen": 839325696 }, { "epoch": 2.05, "learning_rate": 0.0003766198595787362, "loss": 2.8071, "theoretical_loss": 3.71150918018323, "tokens_seen": 839391232 }, { "epoch": 2.05, "learning_rate": 0.0003766098294884654, "loss": 3.059, "theoretical_loss": 3.7114807050020406, "tokens_seen": 839456768 }, { "epoch": 2.05, "learning_rate": 0.0003765997993981946, "loss": 2.8915, "theoretical_loss": 3.7114522326662054, "tokens_seen": 839522304 }, { "epoch": 2.05, "learning_rate": 0.00037658976930792377, "loss": 2.9475, "theoretical_loss": 3.711423763175218, "tokens_seen": 839587840 }, { "epoch": 2.05, "learning_rate": 0.000376579739217653, "loss": 2.801, "theoretical_loss": 3.711395296528573, "tokens_seen": 839653376 }, { "epoch": 2.05, "learning_rate": 0.00037656970912738213, "loss": 2.8893, "theoretical_loss": 3.711366832725763, "tokens_seen": 839718912 }, { "epoch": 2.05, "learning_rate": 0.00037655967903711137, "loss": 2.9541, "theoretical_loss": 3.7113383717662822, "tokens_seen": 839784448 }, { "epoch": 2.05, "learning_rate": 0.0003765496489468405, "loss": 2.9477, "theoretical_loss": 3.7113099136496257, "tokens_seen": 839849984 }, { "epoch": 2.05, "learning_rate": 0.00037653961885656973, "loss": 3.0833, "theoretical_loss": 3.7112814583752867, "tokens_seen": 839915520 }, { "epoch": 2.05, "learning_rate": 0.0003765295887662989, "loss": 3.0073, "theoretical_loss": 3.71125300594276, "tokens_seen": 839981056 }, { "epoch": 2.05, "learning_rate": 0.0003765195586760281, "loss": 2.9244, "theoretical_loss": 3.7112245563515405, "tokens_seen": 840046592 }, { "epoch": 2.05, "learning_rate": 0.0003765095285857573, "loss": 2.9102, "theoretical_loss": 3.711196109601122, "tokens_seen": 840112128 }, { "epoch": 2.05, "learning_rate": 0.00037649949849548645, "loss": 2.8426, "theoretical_loss": 3.7111676656909998, "tokens_seen": 840177664 }, { "epoch": 2.05, "learning_rate": 0.00037648946840521564, "loss": 2.94, "theoretical_loss": 3.7111392246206685, "tokens_seen": 840243200 }, { "epoch": 2.05, "learning_rate": 0.00037647943831494487, "loss": 2.8428, "theoretical_loss": 3.7111107863896233, "tokens_seen": 840308736 }, { "epoch": 2.05, "learning_rate": 0.000376469408224674, "loss": 2.8867, "theoretical_loss": 3.7110823509973594, "tokens_seen": 840374272 }, { "epoch": 2.05, "learning_rate": 0.00037645937813440323, "loss": 3.0804, "theoretical_loss": 3.7110539184433717, "tokens_seen": 840439808 }, { "epoch": 2.05, "objective/train/docs_used": 1355828, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.721681594848633, "objective/train/theoretical_loss": 3.711039703230574, "objective/train/tokens_used": 860932576, "theoretical_loss": 3.711039703230574, "tokens_seen": 840472576 }, { "epoch": 2.05, "learning_rate": 0.0003764493480441324, "loss": 2.9378, "theoretical_loss": 3.7110254887271563, "tokens_seen": 840505344 }, { "epoch": 2.05, "learning_rate": 0.0003764393179538616, "loss": 2.9046, "theoretical_loss": 3.710997061848208, "tokens_seen": 840570880 }, { "epoch": 2.05, "learning_rate": 0.0003764292878635908, "loss": 2.999, "theoretical_loss": 3.7109686378060234, "tokens_seen": 840636416 }, { "epoch": 2.05, "learning_rate": 0.00037641925777331996, "loss": 2.9695, "theoretical_loss": 3.7109402166000973, "tokens_seen": 840701952 }, { "epoch": 2.05, "learning_rate": 0.00037640922768304914, "loss": 2.8427, "theoretical_loss": 3.710911798229926, "tokens_seen": 840767488 }, { "epoch": 2.05, "learning_rate": 0.0003763991975927784, "loss": 2.8646, "theoretical_loss": 3.7108833826950054, "tokens_seen": 840833024 }, { "epoch": 2.05, "learning_rate": 0.0003763891675025075, "loss": 2.961, "theoretical_loss": 3.7108549699948323, "tokens_seen": 840898560 }, { "epoch": 2.05, "learning_rate": 0.00037637913741223674, "loss": 2.9745, "theoretical_loss": 3.7108265601289023, "tokens_seen": 840964096 }, { "epoch": 2.05, "learning_rate": 0.00037636910732196586, "loss": 2.9753, "theoretical_loss": 3.7107981530967122, "tokens_seen": 841029632 }, { "epoch": 2.05, "learning_rate": 0.0003763590772316951, "loss": 2.9445, "theoretical_loss": 3.7107697488977585, "tokens_seen": 841095168 }, { "epoch": 2.05, "learning_rate": 0.0003763490471414243, "loss": 2.8244, "theoretical_loss": 3.710741347531538, "tokens_seen": 841160704 }, { "epoch": 2.05, "learning_rate": 0.00037633901705115346, "loss": 2.842, "theoretical_loss": 3.710712948997547, "tokens_seen": 841226240 }, { "epoch": 2.05, "learning_rate": 0.00037632898696088264, "loss": 2.9429, "theoretical_loss": 3.710684553295283, "tokens_seen": 841291776 }, { "epoch": 2.05, "learning_rate": 0.0003763189568706119, "loss": 3.0081, "theoretical_loss": 3.710656160424243, "tokens_seen": 841357312 }, { "epoch": 2.05, "learning_rate": 0.000376308926780341, "loss": 2.8671, "theoretical_loss": 3.7106277703839243, "tokens_seen": 841422848 }, { "epoch": 2.05, "learning_rate": 0.00037629889669007024, "loss": 3.0888, "theoretical_loss": 3.7105993831738235, "tokens_seen": 841488384 }, { "epoch": 2.05, "learning_rate": 0.00037628886659979937, "loss": 2.7587, "theoretical_loss": 3.7105709987934388, "tokens_seen": 841553920 }, { "epoch": 2.05, "learning_rate": 0.0003762788365095286, "loss": 3.0854, "theoretical_loss": 3.710542617242268, "tokens_seen": 841619456 }, { "epoch": 2.05, "learning_rate": 0.0003762688064192578, "loss": 2.9776, "theoretical_loss": 3.710514238519808, "tokens_seen": 841684992 }, { "epoch": 2.05, "learning_rate": 0.00037625877632898696, "loss": 2.8243, "theoretical_loss": 3.710485862625558, "tokens_seen": 841750528 }, { "epoch": 2.05, "learning_rate": 0.00037624874623871615, "loss": 3.0465, "theoretical_loss": 3.7104574895590146, "tokens_seen": 841816064 }, { "epoch": 2.05, "learning_rate": 0.0003762387161484453, "loss": 2.8238, "theoretical_loss": 3.710429119319676, "tokens_seen": 841881600 }, { "epoch": 2.05, "learning_rate": 0.0003762286860581745, "loss": 2.844, "theoretical_loss": 3.7104007519070414, "tokens_seen": 841947136 }, { "epoch": 2.05, "learning_rate": 0.00037621865596790374, "loss": 2.9652, "theoretical_loss": 3.710372387320608, "tokens_seen": 842012672 }, { "epoch": 2.05, "learning_rate": 0.00037620862587763287, "loss": 2.9558, "theoretical_loss": 3.710344025559875, "tokens_seen": 842078208 }, { "epoch": 2.05, "objective/train/docs_used": 1358700, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.019672155380249, "objective/train/theoretical_loss": 3.7103298457389897, "objective/train/tokens_used": 862570976, "theoretical_loss": 3.7103298457389897, "tokens_seen": 842110976 }, { "epoch": 2.05, "learning_rate": 0.0003761985957873621, "loss": 2.9954, "theoretical_loss": 3.710315666624341, "tokens_seen": 842143744 }, { "epoch": 2.05, "learning_rate": 0.00037618856569709123, "loss": 2.9683, "theoretical_loss": 3.710287310513505, "tokens_seen": 842209280 }, { "epoch": 2.05, "learning_rate": 0.00037617853560682047, "loss": 3.0024, "theoretical_loss": 3.7102589572268654, "tokens_seen": 842274816 }, { "epoch": 2.05, "learning_rate": 0.00037616850551654965, "loss": 2.9407, "theoretical_loss": 3.710230606763921, "tokens_seen": 842340352 }, { "epoch": 2.05, "learning_rate": 0.00037615847542627883, "loss": 2.8004, "theoretical_loss": 3.7102022591241717, "tokens_seen": 842405888 }, { "epoch": 2.05, "learning_rate": 0.000376148445336008, "loss": 3.1118, "theoretical_loss": 3.7101739143071164, "tokens_seen": 842471424 }, { "epoch": 2.05, "learning_rate": 0.00037613841524573725, "loss": 2.8909, "theoretical_loss": 3.710145572312254, "tokens_seen": 842536960 }, { "epoch": 2.05, "learning_rate": 0.00037612838515546637, "loss": 2.9011, "theoretical_loss": 3.7101172331390844, "tokens_seen": 842602496 }, { "epoch": 2.05, "learning_rate": 0.0003761183550651956, "loss": 2.9942, "theoretical_loss": 3.7100888967871075, "tokens_seen": 842668032 }, { "epoch": 2.05, "learning_rate": 0.0003761083249749248, "loss": 2.8523, "theoretical_loss": 3.710060563255823, "tokens_seen": 842733568 }, { "epoch": 2.05, "learning_rate": 0.00037609829488465397, "loss": 2.8646, "theoretical_loss": 3.71003223254473, "tokens_seen": 842799104 }, { "epoch": 2.05, "learning_rate": 0.0003760882647943832, "loss": 2.8726, "theoretical_loss": 3.71000390465333, "tokens_seen": 842864640 }, { "epoch": 2.05, "learning_rate": 0.00037607823470411233, "loss": 2.9054, "theoretical_loss": 3.709975579581122, "tokens_seen": 842930176 }, { "epoch": 2.05, "learning_rate": 0.00037606820461384157, "loss": 3.0536, "theoretical_loss": 3.7099472573276064, "tokens_seen": 842995712 }, { "epoch": 2.05, "learning_rate": 0.0003760581745235707, "loss": 2.899, "theoretical_loss": 3.709918937892284, "tokens_seen": 843061248 }, { "epoch": 2.05, "learning_rate": 0.00037604814443329993, "loss": 2.8971, "theoretical_loss": 3.709890621274655, "tokens_seen": 843126784 }, { "epoch": 2.05, "learning_rate": 0.0003760381143430291, "loss": 2.9699, "theoretical_loss": 3.7098623074742205, "tokens_seen": 843192320 }, { "epoch": 2.05, "learning_rate": 0.0003760280842527583, "loss": 3.0144, "theoretical_loss": 3.7098339964904805, "tokens_seen": 843257856 }, { "epoch": 2.05, "learning_rate": 0.0003760180541624875, "loss": 2.8803, "theoretical_loss": 3.7098056883229367, "tokens_seen": 843323392 }, { "epoch": 2.05, "learning_rate": 0.00037600802407221665, "loss": 3.0477, "theoretical_loss": 3.70977738297109, "tokens_seen": 843388928 }, { "epoch": 2.05, "learning_rate": 0.00037599799398194584, "loss": 2.8424, "theoretical_loss": 3.709749080434441, "tokens_seen": 843454464 }, { "epoch": 2.05, "learning_rate": 0.00037598796389167507, "loss": 2.8871, "theoretical_loss": 3.709720780712492, "tokens_seen": 843520000 }, { "epoch": 2.05, "learning_rate": 0.0003759779338014042, "loss": 2.9623, "theoretical_loss": 3.7096924838047434, "tokens_seen": 843585536 }, { "epoch": 2.05, "learning_rate": 0.00037596790371113343, "loss": 2.9034, "theoretical_loss": 3.7096641897106974, "tokens_seen": 843651072 }, { "epoch": 2.05, "learning_rate": 0.0003759578736208626, "loss": 2.9872, "theoretical_loss": 3.7096358984298554, "tokens_seen": 843716608 }, { "epoch": 2.05, "objective/train/docs_used": 1361841, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0454537868499756, "objective/train/theoretical_loss": 3.7096217538442304, "objective/train/tokens_used": 864209376, "theoretical_loss": 3.7096217538442304, "tokens_seen": 843749376 }, { "epoch": 2.05, "learning_rate": 0.0003759478435305918, "loss": 2.9925, "theoretical_loss": 3.7096076099617195, "tokens_seen": 843782144 }, { "epoch": 2.05, "learning_rate": 0.000375937813440321, "loss": 2.8278, "theoretical_loss": 3.7095793243057913, "tokens_seen": 843847680 }, { "epoch": 2.05, "learning_rate": 0.00037592778335005016, "loss": 3.0335, "theoretical_loss": 3.709551041461573, "tokens_seen": 843913216 }, { "epoch": 2.05, "learning_rate": 0.00037591775325977934, "loss": 2.9677, "theoretical_loss": 3.7095227614285666, "tokens_seen": 843978752 }, { "epoch": 2.05, "learning_rate": 0.0003759077231695086, "loss": 2.9009, "theoretical_loss": 3.709494484206275, "tokens_seen": 844044288 }, { "epoch": 2.05, "learning_rate": 0.0003758976930792377, "loss": 3.0898, "theoretical_loss": 3.7094662097941997, "tokens_seen": 844109824 }, { "epoch": 2.05, "learning_rate": 0.00037588766298896694, "loss": 2.9966, "theoretical_loss": 3.7094379381918445, "tokens_seen": 844175360 }, { "epoch": 2.05, "learning_rate": 0.00037587763289869606, "loss": 3.0857, "theoretical_loss": 3.7094096693987106, "tokens_seen": 844240896 }, { "epoch": 2.05, "learning_rate": 0.0003758676028084253, "loss": 2.9837, "theoretical_loss": 3.7093814034143024, "tokens_seen": 844306432 }, { "epoch": 2.05, "learning_rate": 0.0003758575727181545, "loss": 2.9455, "theoretical_loss": 3.7093531402381217, "tokens_seen": 844371968 }, { "epoch": 2.05, "learning_rate": 0.00037584754262788366, "loss": 2.9364, "theoretical_loss": 3.7093248798696714, "tokens_seen": 844437504 }, { "epoch": 2.05, "learning_rate": 0.00037583751253761284, "loss": 2.9169, "theoretical_loss": 3.709296622308456, "tokens_seen": 844503040 }, { "epoch": 2.05, "learning_rate": 0.0003758274824473421, "loss": 2.9261, "theoretical_loss": 3.7092683675539777, "tokens_seen": 844568576 }, { "epoch": 2.05, "learning_rate": 0.0003758174523570712, "loss": 2.8535, "theoretical_loss": 3.7092401156057404, "tokens_seen": 844634112 }, { "epoch": 2.05, "learning_rate": 0.00037580742226680044, "loss": 2.9897, "theoretical_loss": 3.7092118664632476, "tokens_seen": 844699648 }, { "epoch": 2.05, "learning_rate": 0.00037579739217652957, "loss": 2.9245, "theoretical_loss": 3.709183620126003, "tokens_seen": 844765184 }, { "epoch": 2.05, "learning_rate": 0.0003757873620862588, "loss": 2.8964, "theoretical_loss": 3.70915537659351, "tokens_seen": 844830720 }, { "epoch": 2.05, "learning_rate": 0.000375777331995988, "loss": 3.1252, "theoretical_loss": 3.7091271358652738, "tokens_seen": 844896256 }, { "epoch": 2.05, "learning_rate": 0.00037576730190571716, "loss": 2.934, "theoretical_loss": 3.7090988979407973, "tokens_seen": 844961792 }, { "epoch": 2.05, "learning_rate": 0.00037575727181544635, "loss": 2.9349, "theoretical_loss": 3.7090706628195846, "tokens_seen": 845027328 }, { "epoch": 2.05, "learning_rate": 0.0003757472417251755, "loss": 3.0569, "theoretical_loss": 3.7090424305011416, "tokens_seen": 845092864 }, { "epoch": 2.05, "learning_rate": 0.0003757372116349047, "loss": 3.0141, "theoretical_loss": 3.709014200984971, "tokens_seen": 845158400 }, { "epoch": 2.05, "learning_rate": 0.00037572718154463394, "loss": 3.0253, "theoretical_loss": 3.708985974270578, "tokens_seen": 845223936 }, { "epoch": 2.05, "learning_rate": 0.00037571715145436307, "loss": 2.9379, "theoretical_loss": 3.7089577503574676, "tokens_seen": 845289472 }, { "epoch": 2.05, "learning_rate": 0.0003757071213640923, "loss": 2.8867, "theoretical_loss": 3.7089295292451445, "tokens_seen": 845355008 }, { "epoch": 2.05, "objective/train/docs_used": 1364607, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.970616102218628, "objective/train/theoretical_loss": 3.7089154197391236, "objective/train/tokens_used": 865847776, "theoretical_loss": 3.7089154197391236, "tokens_seen": 845387776 }, { "epoch": 2.05, "learning_rate": 0.00037569709127382143, "loss": 3.1248, "theoretical_loss": 3.7089013109331135, "tokens_seen": 845420544 }, { "epoch": 2.05, "learning_rate": 0.00037568706118355067, "loss": 2.927, "theoretical_loss": 3.7088730954208797, "tokens_seen": 845486080 }, { "epoch": 2.05, "learning_rate": 0.00037567703109327985, "loss": 2.9122, "theoretical_loss": 3.7088448827079485, "tokens_seen": 845551616 }, { "epoch": 2.05, "learning_rate": 0.00037566700100300903, "loss": 3.0011, "theoretical_loss": 3.708816672793825, "tokens_seen": 845617152 }, { "epoch": 2.05, "learning_rate": 0.0003756569709127382, "loss": 3.0429, "theoretical_loss": 3.7087884656780155, "tokens_seen": 845682688 }, { "epoch": 2.05, "learning_rate": 0.00037564694082246745, "loss": 2.8527, "theoretical_loss": 3.708760261360024, "tokens_seen": 845748224 }, { "epoch": 2.05, "learning_rate": 0.0003756369107321966, "loss": 3.0262, "theoretical_loss": 3.708732059839358, "tokens_seen": 845813760 }, { "epoch": 2.05, "learning_rate": 0.0003756268806419258, "loss": 2.9287, "theoretical_loss": 3.708703861115522, "tokens_seen": 845879296 }, { "epoch": 2.05, "learning_rate": 0.00037561685055165494, "loss": 3.0069, "theoretical_loss": 3.7086756651880224, "tokens_seen": 845944832 }, { "epoch": 2.05, "learning_rate": 0.00037560682046138417, "loss": 3.0261, "theoretical_loss": 3.7086474720563656, "tokens_seen": 846010368 }, { "epoch": 2.05, "learning_rate": 0.00037559679037111335, "loss": 2.9414, "theoretical_loss": 3.708619281720057, "tokens_seen": 846075904 }, { "epoch": 2.05, "learning_rate": 0.00037558676028084253, "loss": 3.045, "theoretical_loss": 3.708591094178604, "tokens_seen": 846141440 }, { "epoch": 2.05, "learning_rate": 0.0003755767301905717, "loss": 2.9473, "theoretical_loss": 3.708562909431513, "tokens_seen": 846206976 }, { "epoch": 2.05, "learning_rate": 0.0003755667001003009, "loss": 2.8011, "theoretical_loss": 3.7085347274782894, "tokens_seen": 846272512 }, { "epoch": 2.05, "learning_rate": 0.0003755566700100301, "loss": 3.0149, "theoretical_loss": 3.708506548318441, "tokens_seen": 846338048 }, { "epoch": 2.05, "learning_rate": 0.0003755466399197593, "loss": 3.0727, "theoretical_loss": 3.708478371951475, "tokens_seen": 846403584 }, { "epoch": 2.05, "learning_rate": 0.00037553660982948844, "loss": 2.9486, "theoretical_loss": 3.708450198376897, "tokens_seen": 846469120 }, { "epoch": 2.05, "learning_rate": 0.0003755265797392177, "loss": 3.0753, "theoretical_loss": 3.708422027594215, "tokens_seen": 846534656 }, { "epoch": 2.05, "learning_rate": 0.0003755165496489468, "loss": 2.803, "theoretical_loss": 3.7083938596029364, "tokens_seen": 846600192 }, { "epoch": 2.05, "learning_rate": 0.00037550651955867604, "loss": 2.892, "theoretical_loss": 3.708365694402568, "tokens_seen": 846665728 }, { "epoch": 2.05, "learning_rate": 0.0003754964894684052, "loss": 3.017, "theoretical_loss": 3.7083375319926177, "tokens_seen": 846731264 }, { "epoch": 2.05, "learning_rate": 0.0003754864593781344, "loss": 2.8924, "theoretical_loss": 3.708309372372593, "tokens_seen": 846796800 }, { "epoch": 2.05, "learning_rate": 0.0003754764292878636, "loss": 2.9535, "theoretical_loss": 3.708281215542001, "tokens_seen": 846862336 }, { "epoch": 2.05, "learning_rate": 0.0003754663991975928, "loss": 2.9386, "theoretical_loss": 3.7082530615003506, "tokens_seen": 846927872 }, { "epoch": 2.05, "learning_rate": 0.00037545636910732194, "loss": 2.9471, "theoretical_loss": 3.708224910247149, "tokens_seen": 846993408 }, { "epoch": 2.05, "objective/train/docs_used": 1366742, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5778250694274902, "objective/train/theoretical_loss": 3.708210835666063, "objective/train/tokens_used": 867486176, "theoretical_loss": 3.708210835666063, "tokens_seen": 847026176 }, { "epoch": 2.05, "learning_rate": 0.0003754463390170512, "loss": 2.9334, "theoretical_loss": 3.7081967617819047, "tokens_seen": 847058944 }, { "epoch": 2.05, "learning_rate": 0.0003754363089267803, "loss": 2.8898, "theoretical_loss": 3.7081686161041256, "tokens_seen": 847124480 }, { "epoch": 2.05, "learning_rate": 0.00037542627883650954, "loss": 2.9801, "theoretical_loss": 3.7081404732133203, "tokens_seen": 847190016 }, { "epoch": 2.05, "learning_rate": 0.0003754162487462387, "loss": 2.9463, "theoretical_loss": 3.7081123331089976, "tokens_seen": 847255552 }, { "epoch": 2.05, "learning_rate": 0.0003754062186559679, "loss": 3.0765, "theoretical_loss": 3.7080841957906654, "tokens_seen": 847321088 }, { "epoch": 2.05, "learning_rate": 0.0003753961885656971, "loss": 2.872, "theoretical_loss": 3.708056061257833, "tokens_seen": 847386624 }, { "epoch": 2.05, "learning_rate": 0.00037538615847542626, "loss": 2.9247, "theoretical_loss": 3.7080279295100085, "tokens_seen": 847452160 }, { "epoch": 2.05, "learning_rate": 0.00037537612838515544, "loss": 3.0174, "theoretical_loss": 3.707999800546702, "tokens_seen": 847517696 }, { "epoch": 2.05, "learning_rate": 0.0003753660982948847, "loss": 2.8985, "theoretical_loss": 3.7079716743674216, "tokens_seen": 847583232 }, { "epoch": 2.05, "learning_rate": 0.00037535606820461386, "loss": 2.9473, "theoretical_loss": 3.707943550971677, "tokens_seen": 847648768 }, { "epoch": 2.05, "learning_rate": 0.00037534603811434304, "loss": 2.9746, "theoretical_loss": 3.707915430358977, "tokens_seen": 847714304 }, { "epoch": 2.05, "learning_rate": 0.0003753360080240723, "loss": 2.7685, "theoretical_loss": 3.707887312528832, "tokens_seen": 847779840 }, { "epoch": 2.05, "learning_rate": 0.0003753259779338014, "loss": 3.0006, "theoretical_loss": 3.707859197480751, "tokens_seen": 847845376 }, { "epoch": 2.05, "learning_rate": 0.00037531594784353064, "loss": 2.946, "theoretical_loss": 3.7078310852142438, "tokens_seen": 847910912 }, { "epoch": 2.05, "learning_rate": 0.00037530591775325977, "loss": 2.9693, "theoretical_loss": 3.7078029757288204, "tokens_seen": 847976448 }, { "epoch": 2.05, "learning_rate": 0.000375295887662989, "loss": 2.9344, "theoretical_loss": 3.7077748690239902, "tokens_seen": 848041984 }, { "epoch": 2.05, "learning_rate": 0.0003752858575727182, "loss": 2.8962, "theoretical_loss": 3.707746765099264, "tokens_seen": 848107520 }, { "epoch": 2.05, "learning_rate": 0.00037527582748244736, "loss": 2.9225, "theoretical_loss": 3.707718663954152, "tokens_seen": 848173056 }, { "epoch": 2.05, "learning_rate": 0.00037526579739217655, "loss": 2.9996, "theoretical_loss": 3.7076905655881642, "tokens_seen": 848238592 }, { "epoch": 2.05, "learning_rate": 0.0003752557673019057, "loss": 2.9367, "theoretical_loss": 3.7076624700008107, "tokens_seen": 848304128 }, { "epoch": 2.05, "learning_rate": 0.0003752457372116349, "loss": 3.0556, "theoretical_loss": 3.707634377191603, "tokens_seen": 848369664 }, { "epoch": 2.05, "learning_rate": 0.00037523570712136414, "loss": 2.8704, "theoretical_loss": 3.7076062871600515, "tokens_seen": 848435200 }, { "epoch": 2.05, "learning_rate": 0.00037522567703109327, "loss": 3.0365, "theoretical_loss": 3.7075781999056665, "tokens_seen": 848500736 }, { "epoch": 2.05, "learning_rate": 0.0003752156469408225, "loss": 2.9385, "theoretical_loss": 3.70755011542796, "tokens_seen": 848566272 }, { "epoch": 2.05, "learning_rate": 0.00037520561685055163, "loss": 2.8222, "theoretical_loss": 3.707522033726441, "tokens_seen": 848631808 }, { "epoch": 2.05, "objective/train/docs_used": 1369647, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0693740844726562, "objective/train/theoretical_loss": 3.7075079939166002, "objective/train/tokens_used": 869124576, "theoretical_loss": 3.7075079939166002, "tokens_seen": 848664576 }, { "epoch": 2.05, "learning_rate": 0.00037519558676028087, "loss": 2.9107, "theoretical_loss": 3.707493954800624, "tokens_seen": 848697344 }, { "epoch": 2.05, "learning_rate": 0.00037518555667001005, "loss": 2.9847, "theoretical_loss": 3.707465878650017, "tokens_seen": 848762880 }, { "epoch": 2.05, "learning_rate": 0.00037517552657973923, "loss": 2.6607, "theoretical_loss": 3.707437805274134, "tokens_seen": 848828416 }, { "epoch": 2.05, "learning_rate": 0.0003751654964894684, "loss": 2.9318, "theoretical_loss": 3.707409734672485, "tokens_seen": 848893952 }, { "epoch": 2.05, "learning_rate": 0.00037515546639919765, "loss": 2.8844, "theoretical_loss": 3.707381666844582, "tokens_seen": 848959488 }, { "epoch": 2.05, "learning_rate": 0.0003751454363089268, "loss": 2.9055, "theoretical_loss": 3.7073536017899373, "tokens_seen": 849025024 }, { "epoch": 2.05, "learning_rate": 0.000375135406218656, "loss": 2.9979, "theoretical_loss": 3.707325539508063, "tokens_seen": 849090560 }, { "epoch": 2.05, "learning_rate": 0.00037512537612838514, "loss": 3.0102, "theoretical_loss": 3.7072974799984704, "tokens_seen": 849156096 }, { "epoch": 2.05, "learning_rate": 0.00037511534603811437, "loss": 2.9244, "theoretical_loss": 3.707269423260672, "tokens_seen": 849221632 }, { "epoch": 2.05, "learning_rate": 0.00037510531594784355, "loss": 2.9999, "theoretical_loss": 3.7072413692941804, "tokens_seen": 849287168 }, { "epoch": 2.05, "learning_rate": 0.00037509528585757273, "loss": 3.1312, "theoretical_loss": 3.707213318098508, "tokens_seen": 849352704 }, { "epoch": 2.05, "learning_rate": 0.0003750852557673019, "loss": 2.9819, "theoretical_loss": 3.707185269673167, "tokens_seen": 849418240 }, { "epoch": 2.05, "learning_rate": 0.0003750752256770311, "loss": 3.0716, "theoretical_loss": 3.7071572240176702, "tokens_seen": 849483776 }, { "epoch": 2.05, "learning_rate": 0.0003750651955867603, "loss": 2.9168, "theoretical_loss": 3.7071291811315303, "tokens_seen": 849549312 }, { "epoch": 2.05, "learning_rate": 0.0003750551654964895, "loss": 2.8925, "theoretical_loss": 3.7071011410142614, "tokens_seen": 849614848 }, { "epoch": 2.05, "learning_rate": 0.00037504513540621864, "loss": 2.9842, "theoretical_loss": 3.7070731036653743, "tokens_seen": 849680384 }, { "epoch": 2.05, "learning_rate": 0.0003750351053159479, "loss": 2.9832, "theoretical_loss": 3.7070450690843844, "tokens_seen": 849745920 }, { "epoch": 2.05, "learning_rate": 0.000375025075225677, "loss": 2.8114, "theoretical_loss": 3.7070170372708042, "tokens_seen": 849811456 }, { "epoch": 2.05, "learning_rate": 0.00037501504513540624, "loss": 3.004, "theoretical_loss": 3.706989008224147, "tokens_seen": 849876992 }, { "epoch": 2.05, "learning_rate": 0.0003750050150451354, "loss": 3.0096, "theoretical_loss": 3.7069609819439258, "tokens_seen": 849942528 }, { "epoch": 2.05, "learning_rate": 0.0003749949849548646, "loss": 2.9183, "theoretical_loss": 3.7069329584296553, "tokens_seen": 850008064 }, { "epoch": 2.05, "learning_rate": 0.0003749849548645938, "loss": 2.9039, "theoretical_loss": 3.706904937680849, "tokens_seen": 850073600 }, { "epoch": 2.05, "learning_rate": 0.000374974924774323, "loss": 2.7896, "theoretical_loss": 3.7068769196970206, "tokens_seen": 850139136 }, { "epoch": 2.05, "learning_rate": 0.00037496489468405214, "loss": 2.9453, "theoretical_loss": 3.706848904477684, "tokens_seen": 850204672 }, { "epoch": 2.05, "learning_rate": 0.0003749548645937814, "loss": 2.9269, "theoretical_loss": 3.706820892022354, "tokens_seen": 850270208 }, { "epoch": 2.05, "objective/train/docs_used": 1372309, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.758476972579956, "objective/train/theoretical_loss": 3.7068068868310395, "objective/train/tokens_used": 870762976, "theoretical_loss": 3.7068068868310395, "tokens_seen": 850302976 }, { "epoch": 2.05, "learning_rate": 0.0003749448345035105, "loss": 2.932, "theoretical_loss": 3.7067928823305443, "tokens_seen": 850335744 }, { "epoch": 2.05, "learning_rate": 0.00037493480441323974, "loss": 2.8765, "theoretical_loss": 3.7067648754017695, "tokens_seen": 850401280 }, { "epoch": 2.05, "learning_rate": 0.0003749247743229689, "loss": 2.8535, "theoretical_loss": 3.7067368712355444, "tokens_seen": 850466816 }, { "epoch": 2.05, "learning_rate": 0.0003749147442326981, "loss": 2.9973, "theoretical_loss": 3.7067088698313837, "tokens_seen": 850532352 }, { "epoch": 2.05, "learning_rate": 0.0003749047141424273, "loss": 2.8944, "theoretical_loss": 3.7066808711888006, "tokens_seen": 850597888 }, { "epoch": 2.05, "learning_rate": 0.00037489468405215646, "loss": 2.9359, "theoretical_loss": 3.7066528753073125, "tokens_seen": 850663424 }, { "epoch": 2.05, "learning_rate": 0.00037488465396188564, "loss": 3.0541, "theoretical_loss": 3.706624882186433, "tokens_seen": 850728960 }, { "epoch": 2.05, "learning_rate": 0.0003748746238716149, "loss": 3.0819, "theoretical_loss": 3.7065968918256775, "tokens_seen": 850794496 }, { "epoch": 2.05, "learning_rate": 0.000374864593781344, "loss": 2.9874, "theoretical_loss": 3.706568904224561, "tokens_seen": 850860032 }, { "epoch": 2.05, "learning_rate": 0.00037485456369107324, "loss": 3.0571, "theoretical_loss": 3.7065409193825998, "tokens_seen": 850925568 }, { "epoch": 2.05, "learning_rate": 0.00037484453360080237, "loss": 3.0466, "theoretical_loss": 3.706512937299308, "tokens_seen": 850991104 }, { "epoch": 2.05, "learning_rate": 0.0003748345035105316, "loss": 2.8426, "theoretical_loss": 3.7064849579742027, "tokens_seen": 851056640 }, { "epoch": 2.05, "learning_rate": 0.0003748244734202608, "loss": 2.8585, "theoretical_loss": 3.706456981406798, "tokens_seen": 851122176 }, { "epoch": 2.05, "learning_rate": 0.00037481444332998997, "loss": 3.0356, "theoretical_loss": 3.7064290075966113, "tokens_seen": 851187712 }, { "epoch": 2.05, "learning_rate": 0.00037480441323971915, "loss": 2.9978, "theoretical_loss": 3.7064010365431583, "tokens_seen": 851253248 }, { "epoch": 2.05, "learning_rate": 0.0003747943831494484, "loss": 2.9881, "theoretical_loss": 3.706373068245955, "tokens_seen": 851318784 }, { "epoch": 2.05, "learning_rate": 0.0003747843530591775, "loss": 2.9635, "theoretical_loss": 3.7063451027045176, "tokens_seen": 851384320 }, { "epoch": 2.05, "learning_rate": 0.00037477432296890675, "loss": 2.8945, "theoretical_loss": 3.7063171399183616, "tokens_seen": 851449856 }, { "epoch": 2.05, "learning_rate": 0.00037476429287863587, "loss": 2.8113, "theoretical_loss": 3.706289179887005, "tokens_seen": 851515392 }, { "epoch": 2.05, "learning_rate": 0.0003747542627883651, "loss": 2.9472, "theoretical_loss": 3.706261222609964, "tokens_seen": 851580928 }, { "epoch": 2.05, "learning_rate": 0.0003747442326980943, "loss": 3.0293, "theoretical_loss": 3.7062332680867542, "tokens_seen": 851646464 }, { "epoch": 2.05, "learning_rate": 0.00037473420260782347, "loss": 2.8972, "theoretical_loss": 3.7062053163168933, "tokens_seen": 851712000 }, { "epoch": 2.05, "learning_rate": 0.00037472417251755265, "loss": 2.8248, "theoretical_loss": 3.706177367299899, "tokens_seen": 851777536 }, { "epoch": 2.05, "learning_rate": 0.00037471414242728183, "loss": 2.9088, "theoretical_loss": 3.7061494210352874, "tokens_seen": 851843072 }, { "epoch": 2.05, "learning_rate": 0.000374704112337011, "loss": 3.0539, "theoretical_loss": 3.7061214775225766, "tokens_seen": 851908608 }, { "epoch": 2.05, "objective/train/docs_used": 1375128, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1001739501953125, "objective/train/theoretical_loss": 3.7061075067980327, "objective/train/tokens_used": 872401376, "theoretical_loss": 3.7061075067980327, "tokens_seen": 851941376 }, { "epoch": 2.05, "learning_rate": 0.00037469408224674025, "loss": 2.9548, "theoretical_loss": 3.7060935367612826, "tokens_seen": 851974144 }, { "epoch": 2.05, "learning_rate": 0.0003746840521564694, "loss": 3.0884, "theoretical_loss": 3.7060655987509246, "tokens_seen": 852039680 }, { "epoch": 2.05, "learning_rate": 0.0003746740220661986, "loss": 2.9058, "theoretical_loss": 3.706037663491019, "tokens_seen": 852105216 }, { "epoch": 2.05, "learning_rate": 0.00037466399197592774, "loss": 2.8357, "theoretical_loss": 3.706009730981083, "tokens_seen": 852170752 }, { "epoch": 2.05, "learning_rate": 0.000374653961885657, "loss": 2.9353, "theoretical_loss": 3.705981801220636, "tokens_seen": 852236288 }, { "epoch": 2.05, "learning_rate": 0.00037464393179538615, "loss": 3.0792, "theoretical_loss": 3.705953874209195, "tokens_seen": 852301824 }, { "epoch": 2.05, "learning_rate": 0.00037463390170511534, "loss": 2.871, "theoretical_loss": 3.7059259499462778, "tokens_seen": 852367360 }, { "epoch": 2.05, "learning_rate": 0.00037462387161484457, "loss": 2.9127, "theoretical_loss": 3.705898028431404, "tokens_seen": 852432896 }, { "epoch": 2.05, "learning_rate": 0.00037461384152457375, "loss": 2.8091, "theoretical_loss": 3.7058701096640903, "tokens_seen": 852498432 }, { "epoch": 2.05, "learning_rate": 0.00037460381143430293, "loss": 2.9363, "theoretical_loss": 3.7058421936438557, "tokens_seen": 852563968 }, { "epoch": 2.05, "learning_rate": 0.0003745937813440321, "loss": 2.8962, "theoretical_loss": 3.7058142803702188, "tokens_seen": 852629504 }, { "epoch": 2.05, "learning_rate": 0.0003745837512537613, "loss": 2.9883, "theoretical_loss": 3.7057863698426994, "tokens_seen": 852695040 }, { "epoch": 2.05, "learning_rate": 0.0003745737211634905, "loss": 2.9311, "theoretical_loss": 3.705758462060814, "tokens_seen": 852760576 }, { "epoch": 2.05, "learning_rate": 0.0003745636910732197, "loss": 2.9668, "theoretical_loss": 3.7057305570240837, "tokens_seen": 852826112 }, { "epoch": 2.05, "learning_rate": 0.00037455366098294884, "loss": 2.9184, "theoretical_loss": 3.7057026547320264, "tokens_seen": 852891648 }, { "epoch": 2.05, "learning_rate": 0.0003745436308926781, "loss": 2.9619, "theoretical_loss": 3.7056747551841616, "tokens_seen": 852957184 }, { "epoch": 2.05, "learning_rate": 0.0003745336008024072, "loss": 3.0129, "theoretical_loss": 3.705646858380008, "tokens_seen": 853022720 }, { "epoch": 2.05, "learning_rate": 0.00037452357071213644, "loss": 2.9244, "theoretical_loss": 3.705618964319086, "tokens_seen": 853088256 }, { "epoch": 2.05, "learning_rate": 0.0003745135406218656, "loss": 3.1081, "theoretical_loss": 3.7055910730009147, "tokens_seen": 853153792 }, { "epoch": 2.05, "learning_rate": 0.0003745035105315948, "loss": 3.0658, "theoretical_loss": 3.7055631844250136, "tokens_seen": 853219328 }, { "epoch": 2.05, "learning_rate": 0.000374493480441324, "loss": 3.0411, "theoretical_loss": 3.7055352985909025, "tokens_seen": 853284864 }, { "epoch": 2.05, "learning_rate": 0.0003744834503510532, "loss": 3.0142, "theoretical_loss": 3.7055074154981016, "tokens_seen": 853350400 }, { "epoch": 2.05, "learning_rate": 0.00037447342026078234, "loss": 3.04, "theoretical_loss": 3.705479535146131, "tokens_seen": 853415936 }, { "epoch": 2.05, "learning_rate": 0.0003744633901705116, "loss": 2.7573, "theoretical_loss": 3.70545165753451, "tokens_seen": 853481472 }, { "epoch": 2.05, "learning_rate": 0.0003744533600802407, "loss": 3.0389, "theoretical_loss": 3.7054237826627596, "tokens_seen": 853547008 }, { "epoch": 2.05, "objective/train/docs_used": 1376507, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.497962474822998, "objective/train/theoretical_loss": 3.7054098462541862, "objective/train/tokens_used": 874039776, "theoretical_loss": 3.7054098462541862, "tokens_seen": 853579776 }, { "epoch": 2.05, "learning_rate": 0.00037444332998996994, "loss": 2.9151, "theoretical_loss": 3.7053959105304006, "tokens_seen": 853612544 }, { "epoch": 2.05, "learning_rate": 0.0003744332998996991, "loss": 3.0944, "theoretical_loss": 3.705368041136952, "tokens_seen": 853678080 }, { "epoch": 2.05, "learning_rate": 0.0003744232698094283, "loss": 3.0121, "theoretical_loss": 3.7053401744819356, "tokens_seen": 853743616 }, { "epoch": 2.05, "learning_rate": 0.0003744132397191575, "loss": 2.9429, "theoretical_loss": 3.7053123105648718, "tokens_seen": 853809152 }, { "epoch": 2.05, "learning_rate": 0.00037440320962888666, "loss": 3.0417, "theoretical_loss": 3.7052844493852817, "tokens_seen": 853874688 }, { "epoch": 2.05, "learning_rate": 0.00037439317953861585, "loss": 2.9849, "theoretical_loss": 3.7052565909426862, "tokens_seen": 853940224 }, { "epoch": 2.05, "learning_rate": 0.0003743831494483451, "loss": 2.9365, "theoretical_loss": 3.7052287352366067, "tokens_seen": 854005760 }, { "epoch": 2.05, "learning_rate": 0.0003743731193580742, "loss": 3.039, "theoretical_loss": 3.7052008822665634, "tokens_seen": 854071296 }, { "epoch": 2.05, "learning_rate": 0.00037436308926780344, "loss": 2.9727, "theoretical_loss": 3.7051730320320786, "tokens_seen": 854136832 }, { "epoch": 2.05, "learning_rate": 0.00037435305917753257, "loss": 3.0464, "theoretical_loss": 3.7051451845326735, "tokens_seen": 854202368 }, { "epoch": 2.05, "learning_rate": 0.0003743430290872618, "loss": 2.9476, "theoretical_loss": 3.7051173397678694, "tokens_seen": 854267904 }, { "epoch": 2.05, "learning_rate": 0.000374332998996991, "loss": 3.0137, "theoretical_loss": 3.7050894977371884, "tokens_seen": 854333440 }, { "epoch": 2.05, "learning_rate": 0.00037432296890672017, "loss": 2.9405, "theoretical_loss": 3.705061658440152, "tokens_seen": 854398976 }, { "epoch": 2.05, "learning_rate": 0.00037431293881644935, "loss": 2.9151, "theoretical_loss": 3.705033821876283, "tokens_seen": 854464512 }, { "epoch": 2.05, "learning_rate": 0.0003743029087261786, "loss": 2.8686, "theoretical_loss": 3.7050059880451025, "tokens_seen": 854530048 }, { "epoch": 2.05, "learning_rate": 0.0003742928786359077, "loss": 2.9743, "theoretical_loss": 3.704978156946132, "tokens_seen": 854595584 }, { "epoch": 2.05, "learning_rate": 0.00037428284854563695, "loss": 2.9171, "theoretical_loss": 3.704950328578896, "tokens_seen": 854661120 }, { "epoch": 2.05, "learning_rate": 0.00037427281845536607, "loss": 3.0468, "theoretical_loss": 3.7049225029429156, "tokens_seen": 854726656 }, { "epoch": 2.05, "learning_rate": 0.0003742627883650953, "loss": 3.08, "theoretical_loss": 3.704894680037713, "tokens_seen": 854792192 }, { "epoch": 2.05, "learning_rate": 0.0003742527582748245, "loss": 2.8881, "theoretical_loss": 3.704866859862812, "tokens_seen": 854857728 }, { "epoch": 2.05, "learning_rate": 0.00037424272818455367, "loss": 2.9588, "theoretical_loss": 3.704839042417734, "tokens_seen": 854923264 }, { "epoch": 2.05, "learning_rate": 0.00037423269809428285, "loss": 2.9798, "theoretical_loss": 3.7048112277020033, "tokens_seen": 854988800 }, { "epoch": 2.05, "learning_rate": 0.00037422266800401203, "loss": 2.9877, "theoretical_loss": 3.704783415715142, "tokens_seen": 855054336 }, { "epoch": 2.05, "learning_rate": 0.0003742126379137412, "loss": 2.9579, "theoretical_loss": 3.7047556064566733, "tokens_seen": 855119872 }, { "epoch": 2.05, "learning_rate": 0.00037420260782347045, "loss": 2.8351, "theoretical_loss": 3.704727799926121, "tokens_seen": 855185408 }, { "epoch": 2.05, "objective/train/docs_used": 1380476, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.944342613220215, "objective/train/theoretical_loss": 3.704713897683664, "objective/train/tokens_used": 875678176, "theoretical_loss": 3.704713897683664, "tokens_seen": 855218176 }, { "epoch": 2.05, "learning_rate": 0.0003741925777331996, "loss": 2.9494, "theoretical_loss": 3.704699996123008, "tokens_seen": 855250944 }, { "epoch": 2.05, "learning_rate": 0.0003741825476429288, "loss": 2.9989, "theoretical_loss": 3.704672195046858, "tokens_seen": 855316480 }, { "epoch": 2.05, "learning_rate": 0.00037417251755265794, "loss": 3.2413, "theoretical_loss": 3.704644396697195, "tokens_seen": 855382016 }, { "epoch": 2.05, "learning_rate": 0.0003741624874623872, "loss": 2.8964, "theoretical_loss": 3.7046166010735417, "tokens_seen": 855447552 }, { "epoch": 2.05, "learning_rate": 0.00037415245737211635, "loss": 2.984, "theoretical_loss": 3.7045888081754237, "tokens_seen": 855513088 }, { "epoch": 2.05, "learning_rate": 0.00037414242728184554, "loss": 2.883, "theoretical_loss": 3.704561018002363, "tokens_seen": 855578624 }, { "epoch": 2.05, "learning_rate": 0.0003741323971915747, "loss": 2.94, "theoretical_loss": 3.704533230553885, "tokens_seen": 855644160 }, { "epoch": 2.05, "learning_rate": 0.00037412236710130395, "loss": 2.8503, "theoretical_loss": 3.704505445829513, "tokens_seen": 855709696 }, { "epoch": 2.05, "learning_rate": 0.0003741123370110331, "loss": 2.7301, "theoretical_loss": 3.704477663828773, "tokens_seen": 855775232 }, { "epoch": 2.05, "learning_rate": 0.0003741023069207623, "loss": 2.9035, "theoretical_loss": 3.7044498845511873, "tokens_seen": 855840768 }, { "epoch": 2.05, "learning_rate": 0.00037409227683049144, "loss": 3.0722, "theoretical_loss": 3.7044221079962822, "tokens_seen": 855906304 }, { "epoch": 2.05, "learning_rate": 0.0003740822467402207, "loss": 2.9196, "theoretical_loss": 3.7043943341635814, "tokens_seen": 855971840 }, { "epoch": 2.05, "learning_rate": 0.00037407221664994986, "loss": 2.982, "theoretical_loss": 3.70436656305261, "tokens_seen": 856037376 }, { "epoch": 2.05, "learning_rate": 0.00037406218655967904, "loss": 3.0089, "theoretical_loss": 3.7043387946628936, "tokens_seen": 856102912 }, { "epoch": 2.05, "learning_rate": 0.0003740521564694082, "loss": 3.0217, "theoretical_loss": 3.7043110289939563, "tokens_seen": 856168448 }, { "epoch": 2.05, "learning_rate": 0.0003740421263791374, "loss": 3.0398, "theoretical_loss": 3.7042832660453238, "tokens_seen": 856233984 }, { "epoch": 2.05, "learning_rate": 0.0003740320962888666, "loss": 3.0638, "theoretical_loss": 3.7042555058165214, "tokens_seen": 856299520 }, { "epoch": 2.05, "learning_rate": 0.0003740220661985958, "loss": 2.8904, "theoretical_loss": 3.7042277483070745, "tokens_seen": 856365056 }, { "epoch": 2.05, "learning_rate": 0.00037401203610832494, "loss": 2.8277, "theoretical_loss": 3.704199993516508, "tokens_seen": 856430592 }, { "epoch": 2.05, "learning_rate": 0.0003740020060180542, "loss": 2.9428, "theoretical_loss": 3.7041722414443483, "tokens_seen": 856496128 }, { "epoch": 2.05, "learning_rate": 0.00037399197592778336, "loss": 2.7785, "theoretical_loss": 3.704144492090121, "tokens_seen": 856561664 }, { "epoch": 2.05, "learning_rate": 0.00037398194583751254, "loss": 2.8236, "theoretical_loss": 3.704116745453352, "tokens_seen": 856627200 }, { "epoch": 2.05, "learning_rate": 0.0003739719157472417, "loss": 3.0168, "theoretical_loss": 3.7040890015335672, "tokens_seen": 856692736 }, { "epoch": 2.05, "learning_rate": 0.0003739618856569709, "loss": 2.9977, "theoretical_loss": 3.7040612603302927, "tokens_seen": 856758272 }, { "epoch": 2.05, "learning_rate": 0.0003739518555667001, "loss": 2.9957, "theoretical_loss": 3.7040335218430553, "tokens_seen": 856823808 }, { "epoch": 2.05, "objective/train/docs_used": 1381916, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.91264009475708, "objective/train/theoretical_loss": 3.7040196536178023, "objective/train/tokens_used": 877316576, "theoretical_loss": 3.7040196536178023, "tokens_seen": 856856576 }, { "epoch": 2.05, "learning_rate": 0.0003739418254764293, "loss": 2.9975, "theoretical_loss": 3.7040057860713804, "tokens_seen": 856889344 }, { "epoch": 2.05, "learning_rate": 0.00037393179538615845, "loss": 3.0068, "theoretical_loss": 3.7039780530147954, "tokens_seen": 856954880 }, { "epoch": 2.05, "learning_rate": 0.0003739217652958877, "loss": 2.9967, "theoretical_loss": 3.703950322672826, "tokens_seen": 857020416 }, { "epoch": 2.05, "learning_rate": 0.0003739117352056168, "loss": 2.9243, "theoretical_loss": 3.7039225950450003, "tokens_seen": 857085952 }, { "epoch": 2.05, "learning_rate": 0.00037390170511534605, "loss": 3.0077, "theoretical_loss": 3.703894870130844, "tokens_seen": 857151488 }, { "epoch": 2.05, "learning_rate": 0.0003738916750250752, "loss": 3.1213, "theoretical_loss": 3.703867147929884, "tokens_seen": 857217024 }, { "epoch": 2.05, "learning_rate": 0.0003738816449348044, "loss": 3.0326, "theoretical_loss": 3.7038394284416483, "tokens_seen": 857282560 }, { "epoch": 2.05, "learning_rate": 0.00037387161484453364, "loss": 2.7395, "theoretical_loss": 3.7038117116656633, "tokens_seen": 857348096 }, { "epoch": 2.05, "learning_rate": 0.00037386158475426277, "loss": 2.8732, "theoretical_loss": 3.7037839976014566, "tokens_seen": 857413632 }, { "epoch": 2.05, "learning_rate": 0.000373851554663992, "loss": 2.8139, "theoretical_loss": 3.703756286248556, "tokens_seen": 857479168 }, { "epoch": 2.05, "learning_rate": 0.0003738415245737212, "loss": 2.9869, "theoretical_loss": 3.703728577606488, "tokens_seen": 857544704 }, { "epoch": 2.05, "learning_rate": 0.00037383149448345037, "loss": 3.1375, "theoretical_loss": 3.7037008716747812, "tokens_seen": 857610240 }, { "epoch": 2.05, "learning_rate": 0.00037382146439317955, "loss": 2.9442, "theoretical_loss": 3.703673168452963, "tokens_seen": 857675776 }, { "epoch": 2.05, "learning_rate": 0.0003738114343029088, "loss": 3.0713, "theoretical_loss": 3.7036454679405617, "tokens_seen": 857741312 }, { "epoch": 2.05, "learning_rate": 0.0003738014042126379, "loss": 2.9036, "theoretical_loss": 3.7036177701371056, "tokens_seen": 857806848 }, { "epoch": 2.05, "learning_rate": 0.00037379137412236715, "loss": 2.9459, "theoretical_loss": 3.703590075042121, "tokens_seen": 857872384 }, { "epoch": 2.05, "learning_rate": 0.00037378134403209627, "loss": 3.0104, "theoretical_loss": 3.703562382655139, "tokens_seen": 857937920 }, { "epoch": 2.05, "learning_rate": 0.0003737713139418255, "loss": 2.9428, "theoretical_loss": 3.7035346929756856, "tokens_seen": 858003456 }, { "epoch": 2.05, "learning_rate": 0.0003737612838515547, "loss": 2.9782, "theoretical_loss": 3.7035070060032904, "tokens_seen": 858068992 }, { "epoch": 2.05, "learning_rate": 0.00037375125376128387, "loss": 2.9073, "theoretical_loss": 3.703479321737482, "tokens_seen": 858134528 }, { "epoch": 2.05, "learning_rate": 0.00037374122367101305, "loss": 3.0504, "theoretical_loss": 3.703451640177789, "tokens_seen": 858200064 }, { "epoch": 2.05, "learning_rate": 0.00037373119358074223, "loss": 2.9303, "theoretical_loss": 3.7034239613237396, "tokens_seen": 858265600 }, { "epoch": 2.05, "learning_rate": 0.0003737211634904714, "loss": 2.9978, "theoretical_loss": 3.703396285174864, "tokens_seen": 858331136 }, { "epoch": 2.05, "learning_rate": 0.00037371113340020065, "loss": 2.9449, "theoretical_loss": 3.7033686117306908, "tokens_seen": 858396672 }, { "epoch": 2.05, "learning_rate": 0.0003737011033099298, "loss": 2.8833, "theoretical_loss": 3.703340940990749, "tokens_seen": 858462208 }, { "epoch": 2.05, "objective/train/docs_used": 1384663, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.033069133758545, "objective/train/theoretical_loss": 3.7033271066347178, "objective/train/tokens_used": 878954976, "theoretical_loss": 3.7033271066347178, "tokens_seen": 858494976 }, { "epoch": 2.05, "learning_rate": 0.000373691073219659, "loss": 3.004, "theoretical_loss": 3.703313272954568, "tokens_seen": 858527744 }, { "epoch": 2.05, "learning_rate": 0.00037368104312938814, "loss": 3.0327, "theoretical_loss": 3.7032856076216767, "tokens_seen": 858593280 }, { "epoch": 2.05, "learning_rate": 0.0003736710130391174, "loss": 2.9424, "theoretical_loss": 3.7032579449916065, "tokens_seen": 858658816 }, { "epoch": 2.05, "learning_rate": 0.00037366098294884655, "loss": 2.8812, "theoretical_loss": 3.703230285063885, "tokens_seen": 858724352 }, { "epoch": 2.05, "learning_rate": 0.00037365095285857574, "loss": 2.9682, "theoretical_loss": 3.7032026278380425, "tokens_seen": 858789888 }, { "epoch": 2.05, "learning_rate": 0.0003736409227683049, "loss": 2.9599, "theoretical_loss": 3.70317497331361, "tokens_seen": 858855424 }, { "epoch": 2.05, "learning_rate": 0.00037363089267803415, "loss": 2.9688, "theoretical_loss": 3.7031473214901167, "tokens_seen": 858920960 }, { "epoch": 2.05, "learning_rate": 0.0003736208625877633, "loss": 3.0942, "theoretical_loss": 3.7031196723670923, "tokens_seen": 858986496 }, { "epoch": 2.05, "learning_rate": 0.0003736108324974925, "loss": 2.8822, "theoretical_loss": 3.703092025944068, "tokens_seen": 859052032 }, { "epoch": 2.05, "learning_rate": 0.00037360080240722164, "loss": 2.9297, "theoretical_loss": 3.7030643822205738, "tokens_seen": 859117568 }, { "epoch": 2.05, "learning_rate": 0.0003735907723169509, "loss": 3.0343, "theoretical_loss": 3.70303674119614, "tokens_seen": 859183104 }, { "epoch": 2.05, "learning_rate": 0.00037358074222668006, "loss": 2.8751, "theoretical_loss": 3.703009102870298, "tokens_seen": 859248640 }, { "epoch": 2.05, "learning_rate": 0.00037357071213640924, "loss": 2.9396, "theoretical_loss": 3.702981467242578, "tokens_seen": 859314176 }, { "epoch": 2.05, "learning_rate": 0.0003735606820461384, "loss": 2.902, "theoretical_loss": 3.7029538343125106, "tokens_seen": 859379712 }, { "epoch": 2.05, "learning_rate": 0.0003735506519558676, "loss": 2.9556, "theoretical_loss": 3.7029262040796267, "tokens_seen": 859445248 }, { "epoch": 2.05, "learning_rate": 0.0003735406218655968, "loss": 2.8945, "theoretical_loss": 3.7028985765434577, "tokens_seen": 859510784 }, { "epoch": 2.05, "learning_rate": 0.000373530591775326, "loss": 2.959, "theoretical_loss": 3.7028709517035354, "tokens_seen": 859576320 }, { "epoch": 2.05, "learning_rate": 0.00037352056168505514, "loss": 3.0282, "theoretical_loss": 3.70284332955939, "tokens_seen": 859641856 }, { "epoch": 2.05, "learning_rate": 0.0003735105315947844, "loss": 3.0301, "theoretical_loss": 3.7028157101105537, "tokens_seen": 859707392 }, { "epoch": 2.05, "learning_rate": 0.00037350050150451356, "loss": 3.0188, "theoretical_loss": 3.702788093356558, "tokens_seen": 859772928 }, { "epoch": 2.05, "learning_rate": 0.00037349047141424274, "loss": 3.0028, "theoretical_loss": 3.702760479296934, "tokens_seen": 859838464 }, { "epoch": 2.05, "learning_rate": 0.0003734804413239719, "loss": 2.9461, "theoretical_loss": 3.702732867931214, "tokens_seen": 859904000 }, { "epoch": 2.05, "learning_rate": 0.0003734704112337011, "loss": 2.7862, "theoretical_loss": 3.7027052592589307, "tokens_seen": 859969536 }, { "epoch": 2.05, "learning_rate": 0.0003734603811434303, "loss": 3.0456, "theoretical_loss": 3.702677653279615, "tokens_seen": 860035072 }, { "epoch": 2.05, "learning_rate": 0.0003734503510531595, "loss": 2.9969, "theoretical_loss": 3.7026500499927986, "tokens_seen": 860100608 }, { "epoch": 2.05, "objective/train/docs_used": 1387487, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1300666332244873, "objective/train/theoretical_loss": 3.7026362493589318, "objective/train/tokens_used": 880593376, "theoretical_loss": 3.7026362493589318, "tokens_seen": 860133376 }, { "epoch": 2.05, "learning_rate": 0.00037344032096288865, "loss": 3.0407, "theoretical_loss": 3.702622449398015, "tokens_seen": 860166144 }, { "epoch": 2.05, "learning_rate": 0.0003734302908726179, "loss": 3.0111, "theoretical_loss": 3.702594851494796, "tokens_seen": 860231680 }, { "epoch": 2.05, "learning_rate": 0.000373420260782347, "loss": 2.9773, "theoretical_loss": 3.7025672562826744, "tokens_seen": 860297216 }, { "epoch": 2.05, "learning_rate": 0.00037341023069207625, "loss": 2.9001, "theoretical_loss": 3.702539663761182, "tokens_seen": 860362752 }, { "epoch": 2.05, "learning_rate": 0.0003734002006018054, "loss": 2.801, "theoretical_loss": 3.702512073929853, "tokens_seen": 860428288 }, { "epoch": 2.05, "learning_rate": 0.0003733901705115346, "loss": 3.1189, "theoretical_loss": 3.7024844867882187, "tokens_seen": 860493824 }, { "epoch": 2.05, "learning_rate": 0.0003733801404212638, "loss": 2.9056, "theoretical_loss": 3.702456902335813, "tokens_seen": 860559360 }, { "epoch": 2.05, "learning_rate": 0.00037337011033099297, "loss": 2.9587, "theoretical_loss": 3.7024293205721683, "tokens_seen": 860624896 }, { "epoch": 2.05, "learning_rate": 0.00037336008024072215, "loss": 3.0176, "theoretical_loss": 3.7024017414968187, "tokens_seen": 860690432 }, { "epoch": 2.05, "learning_rate": 0.0003733500501504514, "loss": 2.9008, "theoretical_loss": 3.7023741651092967, "tokens_seen": 860755968 }, { "epoch": 2.05, "learning_rate": 0.0003733400200601805, "loss": 2.954, "theoretical_loss": 3.7023465914091362, "tokens_seen": 860821504 }, { "epoch": 2.05, "learning_rate": 0.00037332998996990975, "loss": 2.9337, "theoretical_loss": 3.702319020395871, "tokens_seen": 860887040 }, { "epoch": 2.05, "learning_rate": 0.00037331995987963893, "loss": 3.0237, "theoretical_loss": 3.7022914520690344, "tokens_seen": 860952576 }, { "epoch": 2.05, "learning_rate": 0.0003733099297893681, "loss": 2.9121, "theoretical_loss": 3.7022638864281596, "tokens_seen": 861018112 }, { "epoch": 2.05, "learning_rate": 0.0003732998996990973, "loss": 2.9352, "theoretical_loss": 3.702236323472781, "tokens_seen": 861083648 }, { "epoch": 2.05, "learning_rate": 0.0003732898696088265, "loss": 3.163, "theoretical_loss": 3.7022087632024334, "tokens_seen": 861149184 }, { "epoch": 2.05, "learning_rate": 0.00037327983951855565, "loss": 2.9139, "theoretical_loss": 3.7021812056166494, "tokens_seen": 861214720 }, { "epoch": 2.05, "learning_rate": 0.0003732698094282849, "loss": 3.0634, "theoretical_loss": 3.7021536507149646, "tokens_seen": 861280256 }, { "epoch": 2.05, "learning_rate": 0.000373259779338014, "loss": 2.9135, "theoretical_loss": 3.7021260984969127, "tokens_seen": 861345792 }, { "epoch": 2.05, "learning_rate": 0.00037324974924774325, "loss": 3.0075, "theoretical_loss": 3.702098548962028, "tokens_seen": 861411328 }, { "epoch": 2.05, "learning_rate": 0.0003732397191574724, "loss": 2.939, "theoretical_loss": 3.7020710021098457, "tokens_seen": 861476864 }, { "epoch": 2.05, "learning_rate": 0.0003732296890672016, "loss": 2.9919, "theoretical_loss": 3.7020434579399004, "tokens_seen": 861542400 }, { "epoch": 2.05, "learning_rate": 0.0003732196589769308, "loss": 2.811, "theoretical_loss": 3.7020159164517263, "tokens_seen": 861607936 }, { "epoch": 2.05, "learning_rate": 0.00037320962888666, "loss": 2.9575, "theoretical_loss": 3.701988377644859, "tokens_seen": 861673472 }, { "epoch": 2.05, "learning_rate": 0.00037319959879638916, "loss": 3.1545, "theoretical_loss": 3.701960841518833, "tokens_seen": 861739008 }, { "epoch": 2.05, "objective/train/docs_used": 1390370, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8483493328094482, "objective/train/theoretical_loss": 3.701947074460991, "objective/train/tokens_used": 882231776, "theoretical_loss": 3.701947074460991, "tokens_seen": 861771776 }, { "epoch": 2.05, "learning_rate": 0.00037318956870611834, "loss": 2.9655, "theoretical_loss": 3.701933308073184, "tokens_seen": 861804544 }, { "epoch": 2.05, "learning_rate": 0.0003731795386158475, "loss": 2.9702, "theoretical_loss": 3.7019057773074473, "tokens_seen": 861870080 }, { "epoch": 2.05, "learning_rate": 0.00037316950852557675, "loss": 2.9604, "theoretical_loss": 3.701878249221158, "tokens_seen": 861935616 }, { "epoch": 2.05, "learning_rate": 0.0003731594784353059, "loss": 2.9808, "theoretical_loss": 3.701850723813852, "tokens_seen": 862001152 }, { "epoch": 2.05, "learning_rate": 0.0003731494483450351, "loss": 2.8956, "theoretical_loss": 3.7018232010850642, "tokens_seen": 862066688 }, { "epoch": 2.05, "learning_rate": 0.0003731394182547643, "loss": 2.9861, "theoretical_loss": 3.701795681034331, "tokens_seen": 862132224 }, { "epoch": 2.05, "learning_rate": 0.0003731293881644935, "loss": 2.9702, "theoretical_loss": 3.701768163661188, "tokens_seen": 862197760 }, { "epoch": 2.05, "learning_rate": 0.0003731193580742227, "loss": 2.7179, "theoretical_loss": 3.7017406489651714, "tokens_seen": 862263296 }, { "epoch": 2.05, "learning_rate": 0.00037310932798395184, "loss": 2.7801, "theoretical_loss": 3.701713136945817, "tokens_seen": 862328832 }, { "epoch": 2.05, "learning_rate": 0.0003730992978936811, "loss": 2.8045, "theoretical_loss": 3.7016856276026613, "tokens_seen": 862394368 }, { "epoch": 2.05, "learning_rate": 0.00037308926780341026, "loss": 2.9371, "theoretical_loss": 3.7016581209352406, "tokens_seen": 862459904 }, { "epoch": 2.05, "learning_rate": 0.00037307923771313944, "loss": 3.0083, "theoretical_loss": 3.701630616943091, "tokens_seen": 862525440 }, { "epoch": 2.05, "learning_rate": 0.0003730692076228686, "loss": 2.9602, "theoretical_loss": 3.7016031156257494, "tokens_seen": 862590976 }, { "epoch": 2.05, "learning_rate": 0.0003730591775325978, "loss": 2.9646, "theoretical_loss": 3.701575616982753, "tokens_seen": 862656512 }, { "epoch": 2.05, "learning_rate": 0.000373049147442327, "loss": 2.8918, "theoretical_loss": 3.701548121013637, "tokens_seen": 862722048 }, { "epoch": 2.05, "learning_rate": 0.0003730391173520562, "loss": 3.0106, "theoretical_loss": 3.70152062771794, "tokens_seen": 862787584 }, { "epoch": 2.05, "learning_rate": 0.00037302908726178534, "loss": 2.8895, "theoretical_loss": 3.7014931370951984, "tokens_seen": 862853120 }, { "epoch": 2.05, "learning_rate": 0.0003730190571715146, "loss": 3.0793, "theoretical_loss": 3.701465649144949, "tokens_seen": 862918656 }, { "epoch": 2.05, "learning_rate": 0.00037300902708124376, "loss": 2.9687, "theoretical_loss": 3.70143816386673, "tokens_seen": 862984192 }, { "epoch": 2.05, "learning_rate": 0.00037299899699097294, "loss": 2.9818, "theoretical_loss": 3.7014106812600778, "tokens_seen": 863049728 }, { "epoch": 2.05, "learning_rate": 0.0003729889669007021, "loss": 3.0437, "theoretical_loss": 3.70138320132453, "tokens_seen": 863115264 }, { "epoch": 2.05, "learning_rate": 0.0003729789368104313, "loss": 3.0121, "theoretical_loss": 3.701355724059624, "tokens_seen": 863180800 }, { "epoch": 2.05, "learning_rate": 0.0003729689067201605, "loss": 2.8796, "theoretical_loss": 3.7013282494648987, "tokens_seen": 863246336 }, { "epoch": 2.05, "learning_rate": 0.0003729588766298897, "loss": 3.0371, "theoretical_loss": 3.701300777539891, "tokens_seen": 863311872 }, { "epoch": 2.05, "learning_rate": 0.00037294884653961885, "loss": 2.9143, "theoretical_loss": 3.701273308284139, "tokens_seen": 863377408 }, { "epoch": 2.05, "objective/train/docs_used": 1392731, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9459803104400635, "objective/train/theoretical_loss": 3.7012595746570893, "objective/train/tokens_used": 883870176, "theoretical_loss": 3.7012595746570893, "tokens_seen": 863410176 }, { "epoch": 2.05, "learning_rate": 0.0003729388164493481, "loss": 2.9372, "theoretical_loss": 3.7012458416971805, "tokens_seen": 863442944 }, { "epoch": 2.05, "learning_rate": 0.0003729287863590772, "loss": 3.0572, "theoretical_loss": 3.701218377778554, "tokens_seen": 863508480 }, { "epoch": 2.05, "learning_rate": 0.00037291875626880645, "loss": 2.9955, "theoretical_loss": 3.701190916527798, "tokens_seen": 863574016 }, { "epoch": 2.05, "learning_rate": 0.0003729087261785356, "loss": 2.8925, "theoretical_loss": 3.7011634579444506, "tokens_seen": 863639552 }, { "epoch": 2.05, "learning_rate": 0.0003728986960882648, "loss": 2.9947, "theoretical_loss": 3.70113600202805, "tokens_seen": 863705088 }, { "epoch": 2.05, "learning_rate": 0.000372888665997994, "loss": 2.9706, "theoretical_loss": 3.7011085487781354, "tokens_seen": 863770624 }, { "epoch": 2.05, "learning_rate": 0.00037287863590772317, "loss": 2.8067, "theoretical_loss": 3.701081098194246, "tokens_seen": 863836160 }, { "epoch": 2.05, "learning_rate": 0.00037286860581745235, "loss": 2.8462, "theoretical_loss": 3.7010536502759193, "tokens_seen": 863901696 }, { "epoch": 2.05, "learning_rate": 0.0003728585757271816, "loss": 3.0407, "theoretical_loss": 3.7010262050226954, "tokens_seen": 863967232 }, { "epoch": 2.05, "learning_rate": 0.0003728485456369107, "loss": 2.993, "theoretical_loss": 3.7009987624341125, "tokens_seen": 864032768 }, { "epoch": 2.05, "learning_rate": 0.00037283851554663995, "loss": 3.0799, "theoretical_loss": 3.7009713225097105, "tokens_seen": 864098304 }, { "epoch": 2.05, "learning_rate": 0.00037282848545636913, "loss": 2.896, "theoretical_loss": 3.7009438852490284, "tokens_seen": 864163840 }, { "epoch": 2.05, "learning_rate": 0.0003728184553660983, "loss": 2.8227, "theoretical_loss": 3.7009164506516066, "tokens_seen": 864229376 }, { "epoch": 2.05, "learning_rate": 0.0003728084252758275, "loss": 2.9313, "theoretical_loss": 3.7008890187169827, "tokens_seen": 864294912 }, { "epoch": 2.05, "learning_rate": 0.0003727983951855567, "loss": 2.9452, "theoretical_loss": 3.700861589444698, "tokens_seen": 864360448 }, { "epoch": 2.05, "learning_rate": 0.00037278836509528585, "loss": 2.8955, "theoretical_loss": 3.7008341628342922, "tokens_seen": 864425984 }, { "epoch": 2.05, "learning_rate": 0.0003727783350050151, "loss": 3.0674, "theoretical_loss": 3.7008067388853045, "tokens_seen": 864491520 }, { "epoch": 2.05, "learning_rate": 0.0003727683049147442, "loss": 2.8105, "theoretical_loss": 3.7007793175972754, "tokens_seen": 864557056 }, { "epoch": 2.05, "learning_rate": 0.00037275827482447345, "loss": 3.0123, "theoretical_loss": 3.7007518989697443, "tokens_seen": 864622592 }, { "epoch": 2.05, "learning_rate": 0.0003727482447342026, "loss": 3.0327, "theoretical_loss": 3.7007244830022525, "tokens_seen": 864688128 }, { "epoch": 2.05, "learning_rate": 0.0003727382146439318, "loss": 2.9524, "theoretical_loss": 3.700697069694339, "tokens_seen": 864753664 }, { "epoch": 2.05, "learning_rate": 0.000372728184553661, "loss": 2.9942, "theoretical_loss": 3.700669659045546, "tokens_seen": 864819200 }, { "epoch": 2.05, "learning_rate": 0.0003727181544633902, "loss": 2.8353, "theoretical_loss": 3.700642251055413, "tokens_seen": 864884736 }, { "epoch": 2.06, "learning_rate": 0.00037270812437311936, "loss": 2.8715, "theoretical_loss": 3.7006148457234804, "tokens_seen": 864950272 }, { "epoch": 2.06, "learning_rate": 0.00037269809428284854, "loss": 2.9697, "theoretical_loss": 3.70058744304929, "tokens_seen": 865015808 }, { "epoch": 2.06, "objective/train/docs_used": 1395602, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9205071926116943, "objective/train/theoretical_loss": 3.7005737427087046, "objective/train/tokens_used": 885508576, "theoretical_loss": 3.7005737427087046, "tokens_seen": 865048576 }, { "epoch": 2.06, "learning_rate": 0.0003726880641925777, "loss": 2.9159, "theoretical_loss": 3.7005600430323824, "tokens_seen": 865081344 }, { "epoch": 2.06, "learning_rate": 0.00037267803410230696, "loss": 2.8841, "theoretical_loss": 3.700532645672298, "tokens_seen": 865146880 }, { "epoch": 2.06, "learning_rate": 0.0003726680040120361, "loss": 2.9261, "theoretical_loss": 3.700505250968578, "tokens_seen": 865212416 }, { "epoch": 2.06, "learning_rate": 0.0003726579739217653, "loss": 2.8048, "theoretical_loss": 3.700477858920765, "tokens_seen": 865277952 }, { "epoch": 2.06, "learning_rate": 0.0003726479438314945, "loss": 2.7952, "theoretical_loss": 3.7004504695283984, "tokens_seen": 865343488 }, { "epoch": 2.06, "learning_rate": 0.0003726379137412237, "loss": 2.9344, "theoretical_loss": 3.700423082791022, "tokens_seen": 865409024 }, { "epoch": 2.06, "learning_rate": 0.00037262788365095286, "loss": 2.8917, "theoretical_loss": 3.700395698708175, "tokens_seen": 865474560 }, { "epoch": 2.06, "learning_rate": 0.00037261785356068204, "loss": 3.006, "theoretical_loss": 3.700368317279401, "tokens_seen": 865540096 }, { "epoch": 2.06, "learning_rate": 0.0003726078234704112, "loss": 2.9923, "theoretical_loss": 3.700340938504241, "tokens_seen": 865605632 }, { "epoch": 2.06, "learning_rate": 0.00037259779338014046, "loss": 2.8245, "theoretical_loss": 3.700313562382237, "tokens_seen": 865671168 }, { "epoch": 2.06, "learning_rate": 0.0003725877632898696, "loss": 3.0032, "theoretical_loss": 3.7002861889129313, "tokens_seen": 865736704 }, { "epoch": 2.06, "learning_rate": 0.0003725777331995988, "loss": 3.0705, "theoretical_loss": 3.7002588180958655, "tokens_seen": 865802240 }, { "epoch": 2.06, "learning_rate": 0.00037256770310932795, "loss": 3.0216, "theoretical_loss": 3.7002314499305826, "tokens_seen": 865867776 }, { "epoch": 2.06, "learning_rate": 0.0003725576730190572, "loss": 2.8982, "theoretical_loss": 3.700204084416625, "tokens_seen": 865933312 }, { "epoch": 2.06, "learning_rate": 0.00037254764292878636, "loss": 3.0743, "theoretical_loss": 3.7001767215535346, "tokens_seen": 865998848 }, { "epoch": 2.06, "learning_rate": 0.00037253761283851554, "loss": 2.9412, "theoretical_loss": 3.7001493613408547, "tokens_seen": 866064384 }, { "epoch": 2.06, "learning_rate": 0.0003725275827482447, "loss": 2.8619, "theoretical_loss": 3.700122003778127, "tokens_seen": 866129920 }, { "epoch": 2.06, "learning_rate": 0.00037251755265797396, "loss": 2.8901, "theoretical_loss": 3.700094648864896, "tokens_seen": 866195456 }, { "epoch": 2.06, "learning_rate": 0.0003725075225677031, "loss": 3.0627, "theoretical_loss": 3.700067296600703, "tokens_seen": 866260992 }, { "epoch": 2.06, "learning_rate": 0.0003724974924774323, "loss": 2.9126, "theoretical_loss": 3.7000399469850924, "tokens_seen": 866326528 }, { "epoch": 2.06, "learning_rate": 0.00037248746238716145, "loss": 2.985, "theoretical_loss": 3.7000126000176063, "tokens_seen": 866392064 }, { "epoch": 2.06, "learning_rate": 0.0003724774322968907, "loss": 2.7643, "theoretical_loss": 3.699985255697789, "tokens_seen": 866457600 }, { "epoch": 2.06, "learning_rate": 0.00037246740220661987, "loss": 2.9656, "theoretical_loss": 3.699957914025183, "tokens_seen": 866523136 }, { "epoch": 2.06, "learning_rate": 0.00037245737211634905, "loss": 2.8372, "theoretical_loss": 3.6999305749993328, "tokens_seen": 866588672 }, { "epoch": 2.06, "learning_rate": 0.00037244734202607823, "loss": 3.0466, "theoretical_loss": 3.6999032386197817, "tokens_seen": 866654208 }, { "epoch": 2.06, "objective/train/docs_used": 1398454, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.907623767852783, "objective/train/theoretical_loss": 3.6998895714222253, "objective/train/tokens_used": 887146976, "theoretical_loss": 3.6998895714222253, "tokens_seen": 866686976 }, { "epoch": 2.06, "learning_rate": 0.0003724373119358074, "loss": 2.8923, "theoretical_loss": 3.699875904886073, "tokens_seen": 866719744 }, { "epoch": 2.06, "learning_rate": 0.0003724272818455366, "loss": 2.9941, "theoretical_loss": 3.699848573797751, "tokens_seen": 866785280 }, { "epoch": 2.06, "learning_rate": 0.0003724172517552658, "loss": 3.0421, "theoretical_loss": 3.6998212453543595, "tokens_seen": 866850816 }, { "epoch": 2.06, "learning_rate": 0.00037240722166499495, "loss": 3.0376, "theoretical_loss": 3.699793919555443, "tokens_seen": 866916352 }, { "epoch": 2.06, "learning_rate": 0.0003723971915747242, "loss": 2.8673, "theoretical_loss": 3.6997665964005453, "tokens_seen": 866981888 }, { "epoch": 2.06, "learning_rate": 0.0003723871614844533, "loss": 2.9513, "theoretical_loss": 3.6997392758892107, "tokens_seen": 867047424 }, { "epoch": 2.06, "learning_rate": 0.00037237713139418255, "loss": 2.896, "theoretical_loss": 3.699711958020984, "tokens_seen": 867112960 }, { "epoch": 2.06, "learning_rate": 0.0003723671013039118, "loss": 2.9023, "theoretical_loss": 3.6996846427954093, "tokens_seen": 867178496 }, { "epoch": 2.06, "learning_rate": 0.0003723570712136409, "loss": 3.0523, "theoretical_loss": 3.699657330212032, "tokens_seen": 867244032 }, { "epoch": 2.06, "learning_rate": 0.00037234704112337015, "loss": 3.0119, "theoretical_loss": 3.699630020270396, "tokens_seen": 867309568 }, { "epoch": 2.06, "learning_rate": 0.00037233701103309933, "loss": 3.0049, "theoretical_loss": 3.699602712970047, "tokens_seen": 867375104 }, { "epoch": 2.06, "learning_rate": 0.0003723269809428285, "loss": 2.9424, "theoretical_loss": 3.6995754083105297, "tokens_seen": 867440640 }, { "epoch": 2.06, "learning_rate": 0.0003723169508525577, "loss": 3.0926, "theoretical_loss": 3.699548106291389, "tokens_seen": 867506176 }, { "epoch": 2.06, "learning_rate": 0.0003723069207622869, "loss": 2.9237, "theoretical_loss": 3.6995208069121706, "tokens_seen": 867571712 }, { "epoch": 2.06, "learning_rate": 0.00037229689067201605, "loss": 3.0115, "theoretical_loss": 3.699493510172419, "tokens_seen": 867637248 }, { "epoch": 2.06, "learning_rate": 0.0003722868605817453, "loss": 2.7778, "theoretical_loss": 3.6994662160716807, "tokens_seen": 867702784 }, { "epoch": 2.06, "learning_rate": 0.0003722768304914744, "loss": 2.7363, "theoretical_loss": 3.6994389246095, "tokens_seen": 867768320 }, { "epoch": 2.06, "learning_rate": 0.00037226680040120365, "loss": 2.9761, "theoretical_loss": 3.699411635785424, "tokens_seen": 867833856 }, { "epoch": 2.06, "learning_rate": 0.0003722567703109328, "loss": 2.9283, "theoretical_loss": 3.699384349598998, "tokens_seen": 867899392 }, { "epoch": 2.06, "learning_rate": 0.000372246740220662, "loss": 2.9616, "theoretical_loss": 3.6993570660497674, "tokens_seen": 867964928 }, { "epoch": 2.06, "learning_rate": 0.0003722367101303912, "loss": 3.0476, "theoretical_loss": 3.6993297851372784, "tokens_seen": 868030464 }, { "epoch": 2.06, "learning_rate": 0.0003722266800401204, "loss": 2.8554, "theoretical_loss": 3.699302506861078, "tokens_seen": 868096000 }, { "epoch": 2.06, "learning_rate": 0.00037221664994984956, "loss": 2.9664, "theoretical_loss": 3.699275231220711, "tokens_seen": 868161536 }, { "epoch": 2.06, "learning_rate": 0.00037220661985957874, "loss": 2.9733, "theoretical_loss": 3.699247958215725, "tokens_seen": 868227072 }, { "epoch": 2.06, "learning_rate": 0.0003721965897693079, "loss": 2.9641, "theoretical_loss": 3.699220687845666, "tokens_seen": 868292608 }, { "epoch": 2.06, "objective/train/docs_used": 1401149, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1106722354888916, "objective/train/theoretical_loss": 3.6992070536485926, "objective/train/tokens_used": 888785376, "theoretical_loss": 3.6992070536485926, "tokens_seen": 868325376 }, { "epoch": 2.06, "learning_rate": 0.00037218655967903716, "loss": 3.0338, "theoretical_loss": 3.6991934201100807, "tokens_seen": 868358144 }, { "epoch": 2.06, "learning_rate": 0.0003721765295887663, "loss": 2.8003, "theoretical_loss": 3.6991661550085153, "tokens_seen": 868423680 }, { "epoch": 2.06, "learning_rate": 0.0003721664994984955, "loss": 2.9713, "theoretical_loss": 3.6991388925405175, "tokens_seen": 868489216 }, { "epoch": 2.06, "learning_rate": 0.0003721564694082247, "loss": 2.8885, "theoretical_loss": 3.6991116327056335, "tokens_seen": 868554752 }, { "epoch": 2.06, "learning_rate": 0.0003721464393179539, "loss": 3.1439, "theoretical_loss": 3.6990843755034106, "tokens_seen": 868620288 }, { "epoch": 2.06, "learning_rate": 0.00037213640922768306, "loss": 3.0445, "theoretical_loss": 3.6990571209333956, "tokens_seen": 868685824 }, { "epoch": 2.06, "learning_rate": 0.00037212637913741224, "loss": 2.9412, "theoretical_loss": 3.6990298689951366, "tokens_seen": 868751360 }, { "epoch": 2.06, "learning_rate": 0.0003721163490471414, "loss": 2.8852, "theoretical_loss": 3.69900261968818, "tokens_seen": 868816896 }, { "epoch": 2.06, "learning_rate": 0.00037210631895687066, "loss": 2.9915, "theoretical_loss": 3.6989753730120736, "tokens_seen": 868882432 }, { "epoch": 2.06, "learning_rate": 0.0003720962888665998, "loss": 2.9701, "theoretical_loss": 3.6989481289663653, "tokens_seen": 868947968 }, { "epoch": 2.06, "learning_rate": 0.000372086258776329, "loss": 3.0483, "theoretical_loss": 3.698920887550603, "tokens_seen": 869013504 }, { "epoch": 2.06, "learning_rate": 0.00037207622868605815, "loss": 2.8239, "theoretical_loss": 3.698893648764334, "tokens_seen": 869079040 }, { "epoch": 2.06, "learning_rate": 0.0003720661985957874, "loss": 2.9272, "theoretical_loss": 3.6988664126071056, "tokens_seen": 869144576 }, { "epoch": 2.06, "learning_rate": 0.00037205616850551656, "loss": 2.8569, "theoretical_loss": 3.698839179078467, "tokens_seen": 869210112 }, { "epoch": 2.06, "learning_rate": 0.00037204613841524575, "loss": 2.97, "theoretical_loss": 3.6988119481779664, "tokens_seen": 869275648 }, { "epoch": 2.06, "learning_rate": 0.0003720361083249749, "loss": 2.9146, "theoretical_loss": 3.6987847199051513, "tokens_seen": 869341184 }, { "epoch": 2.06, "learning_rate": 0.00037202607823470416, "loss": 3.0561, "theoretical_loss": 3.6987574942595702, "tokens_seen": 869406720 }, { "epoch": 2.06, "learning_rate": 0.0003720160481444333, "loss": 3.024, "theoretical_loss": 3.6987302712407715, "tokens_seen": 869472256 }, { "epoch": 2.06, "learning_rate": 0.0003720060180541625, "loss": 2.7929, "theoretical_loss": 3.698703050848305, "tokens_seen": 869537792 }, { "epoch": 2.06, "learning_rate": 0.00037199598796389165, "loss": 3.0914, "theoretical_loss": 3.698675833081718, "tokens_seen": 869603328 }, { "epoch": 2.06, "learning_rate": 0.0003719859578736209, "loss": 2.9858, "theoretical_loss": 3.6986486179405595, "tokens_seen": 869668864 }, { "epoch": 2.06, "learning_rate": 0.00037197592778335007, "loss": 3.1283, "theoretical_loss": 3.698621405424379, "tokens_seen": 869734400 }, { "epoch": 2.06, "learning_rate": 0.00037196589769307925, "loss": 2.9031, "theoretical_loss": 3.698594195532726, "tokens_seen": 869799936 }, { "epoch": 2.06, "learning_rate": 0.00037195586760280843, "loss": 2.8883, "theoretical_loss": 3.6985669882651475, "tokens_seen": 869865472 }, { "epoch": 2.06, "learning_rate": 0.0003719458375125376, "loss": 2.8782, "theoretical_loss": 3.6985397836211953, "tokens_seen": 869931008 }, { "epoch": 2.06, "objective/train/docs_used": 1402530, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9196035861968994, "objective/train/theoretical_loss": 3.698526182282938, "objective/train/tokens_used": 890423776, "theoretical_loss": 3.698526182282938, "tokens_seen": 869963776 }, { "epoch": 2.06, "learning_rate": 0.0003719358074222668, "loss": 2.8557, "theoretical_loss": 3.6985125816004176, "tokens_seen": 869996544 }, { "epoch": 2.06, "learning_rate": 0.000371925777331996, "loss": 2.8729, "theoretical_loss": 3.698485382202364, "tokens_seen": 870062080 }, { "epoch": 2.06, "learning_rate": 0.00037191574724172515, "loss": 2.9714, "theoretical_loss": 3.698458185426583, "tokens_seen": 870127616 }, { "epoch": 2.06, "learning_rate": 0.0003719057171514544, "loss": 2.9418, "theoretical_loss": 3.6984309912726268, "tokens_seen": 870193152 }, { "epoch": 2.06, "learning_rate": 0.0003718956870611835, "loss": 2.8483, "theoretical_loss": 3.698403799740043, "tokens_seen": 870258688 }, { "epoch": 2.06, "learning_rate": 0.00037188565697091275, "loss": 2.7373, "theoretical_loss": 3.6983766108283826, "tokens_seen": 870324224 }, { "epoch": 2.06, "learning_rate": 0.00037187562688064193, "loss": 3.0311, "theoretical_loss": 3.6983494245371955, "tokens_seen": 870389760 }, { "epoch": 2.06, "learning_rate": 0.0003718655967903711, "loss": 2.9533, "theoretical_loss": 3.6983222408660317, "tokens_seen": 870455296 }, { "epoch": 2.06, "learning_rate": 0.0003718555667001003, "loss": 2.9211, "theoretical_loss": 3.6982950598144413, "tokens_seen": 870520832 }, { "epoch": 2.06, "learning_rate": 0.00037184553660982953, "loss": 3.0861, "theoretical_loss": 3.6982678813819754, "tokens_seen": 870586368 }, { "epoch": 2.06, "learning_rate": 0.00037183550651955866, "loss": 2.995, "theoretical_loss": 3.6982407055681836, "tokens_seen": 870651904 }, { "epoch": 2.06, "learning_rate": 0.0003718254764292879, "loss": 2.9393, "theoretical_loss": 3.698213532372617, "tokens_seen": 870717440 }, { "epoch": 2.06, "learning_rate": 0.000371815446339017, "loss": 3.0252, "theoretical_loss": 3.6981863617948263, "tokens_seen": 870782976 }, { "epoch": 2.06, "learning_rate": 0.00037180541624874625, "loss": 2.8757, "theoretical_loss": 3.6981591938343623, "tokens_seen": 870848512 }, { "epoch": 2.06, "learning_rate": 0.00037179538615847544, "loss": 2.8607, "theoretical_loss": 3.6981320284907757, "tokens_seen": 870914048 }, { "epoch": 2.06, "learning_rate": 0.0003717853560682046, "loss": 2.8898, "theoretical_loss": 3.698104865763618, "tokens_seen": 870979584 }, { "epoch": 2.06, "learning_rate": 0.0003717753259779338, "loss": 3.068, "theoretical_loss": 3.69807770565244, "tokens_seen": 871045120 }, { "epoch": 2.06, "learning_rate": 0.000371765295887663, "loss": 2.9026, "theoretical_loss": 3.698050548156793, "tokens_seen": 871110656 }, { "epoch": 2.06, "learning_rate": 0.00037175526579739216, "loss": 2.9895, "theoretical_loss": 3.698023393276228, "tokens_seen": 871176192 }, { "epoch": 2.06, "learning_rate": 0.0003717452357071214, "loss": 2.7674, "theoretical_loss": 3.6979962410102973, "tokens_seen": 871241728 }, { "epoch": 2.06, "learning_rate": 0.0003717352056168505, "loss": 2.9425, "theoretical_loss": 3.6979690913585523, "tokens_seen": 871307264 }, { "epoch": 2.06, "learning_rate": 0.00037172517552657976, "loss": 2.927, "theoretical_loss": 3.6979419443205446, "tokens_seen": 871372800 }, { "epoch": 2.06, "learning_rate": 0.0003717151454363089, "loss": 2.9693, "theoretical_loss": 3.697914799895825, "tokens_seen": 871438336 }, { "epoch": 2.06, "learning_rate": 0.0003717051153460381, "loss": 2.9354, "theoretical_loss": 3.6978876580839475, "tokens_seen": 871503872 }, { "epoch": 2.06, "learning_rate": 0.0003716950852557673, "loss": 2.9839, "theoretical_loss": 3.697860518884463, "tokens_seen": 871569408 }, { "epoch": 2.06, "objective/train/docs_used": 1405377, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.81257700920105, "objective/train/theoretical_loss": 3.6978469502642275, "objective/train/tokens_used": 892062176, "theoretical_loss": 3.6978469502642275, "tokens_seen": 871602176 }, { "epoch": 2.06, "learning_rate": 0.0003716850551654965, "loss": 2.9412, "theoretical_loss": 3.697833382296923, "tokens_seen": 871634944 }, { "epoch": 2.06, "learning_rate": 0.00037167502507522566, "loss": 3.0233, "theoretical_loss": 3.6978062483208807, "tokens_seen": 871700480 }, { "epoch": 2.06, "learning_rate": 0.0003716649949849549, "loss": 3.0619, "theoretical_loss": 3.6977791169558882, "tokens_seen": 871766016 }, { "epoch": 2.06, "learning_rate": 0.000371654964894684, "loss": 2.8978, "theoretical_loss": 3.6977519882014978, "tokens_seen": 871831552 }, { "epoch": 2.06, "learning_rate": 0.00037164493480441326, "loss": 3.0562, "theoretical_loss": 3.697724862057263, "tokens_seen": 871897088 }, { "epoch": 2.06, "learning_rate": 0.0003716349047141424, "loss": 3.0876, "theoretical_loss": 3.6976977385227348, "tokens_seen": 871962624 }, { "epoch": 2.06, "learning_rate": 0.0003716248746238716, "loss": 3.0675, "theoretical_loss": 3.6976706175974674, "tokens_seen": 872028160 }, { "epoch": 2.06, "learning_rate": 0.00037161484453360086, "loss": 2.831, "theoretical_loss": 3.6976434992810137, "tokens_seen": 872093696 }, { "epoch": 2.06, "learning_rate": 0.00037160481444333, "loss": 3.0141, "theoretical_loss": 3.697616383572926, "tokens_seen": 872159232 }, { "epoch": 2.06, "learning_rate": 0.0003715947843530592, "loss": 2.9276, "theoretical_loss": 3.6975892704727578, "tokens_seen": 872224768 }, { "epoch": 2.06, "learning_rate": 0.00037158475426278835, "loss": 2.8375, "theoretical_loss": 3.6975621599800625, "tokens_seen": 872290304 }, { "epoch": 2.06, "learning_rate": 0.0003715747241725176, "loss": 2.9718, "theoretical_loss": 3.697535052094393, "tokens_seen": 872355840 }, { "epoch": 2.06, "learning_rate": 0.00037156469408224676, "loss": 3.0193, "theoretical_loss": 3.6975079468153034, "tokens_seen": 872421376 }, { "epoch": 2.06, "learning_rate": 0.00037155466399197595, "loss": 3.0204, "theoretical_loss": 3.697480844142347, "tokens_seen": 872486912 }, { "epoch": 2.06, "learning_rate": 0.0003715446339017051, "loss": 3.0115, "theoretical_loss": 3.6974537440750774, "tokens_seen": 872552448 }, { "epoch": 2.06, "learning_rate": 0.00037153460381143436, "loss": 2.8312, "theoretical_loss": 3.6974266466130485, "tokens_seen": 872617984 }, { "epoch": 2.06, "learning_rate": 0.0003715245737211635, "loss": 2.871, "theoretical_loss": 3.6973995517558143, "tokens_seen": 872683520 }, { "epoch": 2.06, "learning_rate": 0.0003715145436308927, "loss": 2.9578, "theoretical_loss": 3.6973724595029287, "tokens_seen": 872749056 }, { "epoch": 2.06, "learning_rate": 0.00037150451354062185, "loss": 2.9914, "theoretical_loss": 3.697345369853946, "tokens_seen": 872814592 }, { "epoch": 2.06, "learning_rate": 0.0003714944834503511, "loss": 2.8241, "theoretical_loss": 3.6973182828084203, "tokens_seen": 872880128 }, { "epoch": 2.06, "learning_rate": 0.00037148445336008027, "loss": 2.9901, "theoretical_loss": 3.697291198365906, "tokens_seen": 872945664 }, { "epoch": 2.06, "learning_rate": 0.00037147442326980945, "loss": 2.9316, "theoretical_loss": 3.6972641165259574, "tokens_seen": 873011200 }, { "epoch": 2.06, "learning_rate": 0.00037146439317953863, "loss": 2.965, "theoretical_loss": 3.6972370372881285, "tokens_seen": 873076736 }, { "epoch": 2.06, "learning_rate": 0.0003714543630892678, "loss": 3.1174, "theoretical_loss": 3.6972099606519757, "tokens_seen": 873142272 }, { "epoch": 2.06, "learning_rate": 0.000371444332998997, "loss": 3.0111, "theoretical_loss": 3.6971828866170524, "tokens_seen": 873207808 }, { "epoch": 2.06, "objective/train/docs_used": 1408001, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9662058353424072, "objective/train/theoretical_loss": 3.697169350574913, "objective/train/tokens_used": 893700576, "theoretical_loss": 3.697169350574913, "tokens_seen": 873240576 }, { "epoch": 2.06, "learning_rate": 0.00037143430290872623, "loss": 2.9456, "theoretical_loss": 3.6971558151829136, "tokens_seen": 873273344 }, { "epoch": 2.06, "learning_rate": 0.00037142427281845535, "loss": 3.1002, "theoretical_loss": 3.697128746349115, "tokens_seen": 873338880 }, { "epoch": 2.06, "learning_rate": 0.0003714142427281846, "loss": 2.9309, "theoretical_loss": 3.6971016801152112, "tokens_seen": 873404416 }, { "epoch": 2.06, "learning_rate": 0.0003714042126379137, "loss": 3.1124, "theoretical_loss": 3.6970746164807573, "tokens_seen": 873469952 }, { "epoch": 2.06, "learning_rate": 0.00037139418254764295, "loss": 2.8169, "theoretical_loss": 3.6970475554453093, "tokens_seen": 873535488 }, { "epoch": 2.06, "learning_rate": 0.00037138415245737213, "loss": 2.9269, "theoretical_loss": 3.697020497008422, "tokens_seen": 873601024 }, { "epoch": 2.06, "learning_rate": 0.0003713741223671013, "loss": 2.9499, "theoretical_loss": 3.6969934411696515, "tokens_seen": 873666560 }, { "epoch": 2.06, "learning_rate": 0.0003713640922768305, "loss": 2.995, "theoretical_loss": 3.696966387928553, "tokens_seen": 873732096 }, { "epoch": 2.06, "learning_rate": 0.00037135406218655973, "loss": 2.8705, "theoretical_loss": 3.6969393372846824, "tokens_seen": 873797632 }, { "epoch": 2.06, "learning_rate": 0.00037134403209628886, "loss": 2.9581, "theoretical_loss": 3.6969122892375954, "tokens_seen": 873863168 }, { "epoch": 2.06, "learning_rate": 0.0003713340020060181, "loss": 2.9878, "theoretical_loss": 3.6968852437868485, "tokens_seen": 873928704 }, { "epoch": 2.06, "learning_rate": 0.0003713239719157472, "loss": 2.8739, "theoretical_loss": 3.696858200931997, "tokens_seen": 873994240 }, { "epoch": 2.06, "learning_rate": 0.00037131394182547645, "loss": 2.8907, "theoretical_loss": 3.6968311606725983, "tokens_seen": 874059776 }, { "epoch": 2.06, "learning_rate": 0.00037130391173520564, "loss": 3.0651, "theoretical_loss": 3.696804123008208, "tokens_seen": 874125312 }, { "epoch": 2.06, "learning_rate": 0.0003712938816449348, "loss": 2.9598, "theoretical_loss": 3.696777087938382, "tokens_seen": 874190848 }, { "epoch": 2.06, "learning_rate": 0.000371283851554664, "loss": 2.9079, "theoretical_loss": 3.696750055462678, "tokens_seen": 874256384 }, { "epoch": 2.06, "learning_rate": 0.0003712738214643932, "loss": 2.8735, "theoretical_loss": 3.6967230255806522, "tokens_seen": 874321920 }, { "epoch": 2.06, "learning_rate": 0.00037126379137412236, "loss": 2.9504, "theoretical_loss": 3.696695998291861, "tokens_seen": 874387456 }, { "epoch": 2.06, "learning_rate": 0.0003712537612838516, "loss": 2.9875, "theoretical_loss": 3.6966689735958616, "tokens_seen": 874452992 }, { "epoch": 2.06, "learning_rate": 0.0003712437311935807, "loss": 3.0559, "theoretical_loss": 3.696641951492211, "tokens_seen": 874518528 }, { "epoch": 2.06, "learning_rate": 0.00037123370110330996, "loss": 2.9762, "theoretical_loss": 3.696614931980466, "tokens_seen": 874584064 }, { "epoch": 2.06, "learning_rate": 0.0003712236710130391, "loss": 2.9033, "theoretical_loss": 3.696587915060184, "tokens_seen": 874649600 }, { "epoch": 2.06, "learning_rate": 0.0003712136409227683, "loss": 2.9533, "theoretical_loss": 3.6965609007309226, "tokens_seen": 874715136 }, { "epoch": 2.06, "learning_rate": 0.0003712036108324975, "loss": 2.9687, "theoretical_loss": 3.6965338889922386, "tokens_seen": 874780672 }, { "epoch": 2.06, "learning_rate": 0.0003711935807422267, "loss": 3.0522, "theoretical_loss": 3.6965068798436898, "tokens_seen": 874846208 }, { "epoch": 2.06, "objective/train/docs_used": 1410704, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.067448854446411, "objective/train/theoretical_loss": 3.6964933762405776, "objective/train/tokens_used": 895338976, "theoretical_loss": 3.6964933762405776, "tokens_seen": 874878976 }, { "epoch": 2.06, "learning_rate": 0.00037118355065195586, "loss": 2.9443, "theoretical_loss": 3.696479873284834, "tokens_seen": 874911744 }, { "epoch": 2.06, "learning_rate": 0.0003711735205616851, "loss": 2.9262, "theoretical_loss": 3.6964528693152285, "tokens_seen": 874977280 }, { "epoch": 2.06, "learning_rate": 0.0003711634904714142, "loss": 2.8484, "theoretical_loss": 3.6964258679344315, "tokens_seen": 875042816 }, { "epoch": 2.06, "learning_rate": 0.00037115346038114346, "loss": 2.9139, "theoretical_loss": 3.6963988691420013, "tokens_seen": 875108352 }, { "epoch": 2.06, "learning_rate": 0.0003711434302908726, "loss": 2.9718, "theoretical_loss": 3.6963718729374957, "tokens_seen": 875173888 }, { "epoch": 2.06, "learning_rate": 0.0003711334002006018, "loss": 2.9779, "theoretical_loss": 3.6963448793204723, "tokens_seen": 875239424 }, { "epoch": 2.06, "learning_rate": 0.000371123370110331, "loss": 2.7421, "theoretical_loss": 3.6963178882904897, "tokens_seen": 875304960 }, { "epoch": 2.06, "learning_rate": 0.0003711133400200602, "loss": 2.7714, "theoretical_loss": 3.6962908998471065, "tokens_seen": 875370496 }, { "epoch": 2.06, "learning_rate": 0.00037110330992978937, "loss": 2.919, "theoretical_loss": 3.6962639139898816, "tokens_seen": 875436032 }, { "epoch": 2.06, "learning_rate": 0.00037109327983951855, "loss": 2.7879, "theoretical_loss": 3.696236930718373, "tokens_seen": 875501568 }, { "epoch": 2.06, "learning_rate": 0.00037108324974924773, "loss": 2.981, "theoretical_loss": 3.6962099500321393, "tokens_seen": 875567104 }, { "epoch": 2.06, "learning_rate": 0.00037107321965897696, "loss": 2.8966, "theoretical_loss": 3.6961829719307397, "tokens_seen": 875632640 }, { "epoch": 2.06, "learning_rate": 0.0003710631895687061, "loss": 3.027, "theoretical_loss": 3.696155996413733, "tokens_seen": 875698176 }, { "epoch": 2.06, "learning_rate": 0.0003710531594784353, "loss": 2.8294, "theoretical_loss": 3.696129023480678, "tokens_seen": 875763712 }, { "epoch": 2.06, "learning_rate": 0.00037104312938816445, "loss": 2.871, "theoretical_loss": 3.696102053131134, "tokens_seen": 875829248 }, { "epoch": 2.06, "learning_rate": 0.0003710330992978937, "loss": 2.9644, "theoretical_loss": 3.6960750853646607, "tokens_seen": 875894784 }, { "epoch": 2.06, "learning_rate": 0.00037102306920762287, "loss": 2.7624, "theoretical_loss": 3.696048120180817, "tokens_seen": 875960320 }, { "epoch": 2.06, "learning_rate": 0.00037101303911735205, "loss": 2.8897, "theoretical_loss": 3.696021157579162, "tokens_seen": 876025856 }, { "epoch": 2.06, "learning_rate": 0.00037100300902708123, "loss": 3.0026, "theoretical_loss": 3.6959941975592567, "tokens_seen": 876091392 }, { "epoch": 2.06, "learning_rate": 0.00037099297893681047, "loss": 2.9292, "theoretical_loss": 3.69596724012066, "tokens_seen": 876156928 }, { "epoch": 2.06, "learning_rate": 0.0003709829488465396, "loss": 2.9698, "theoretical_loss": 3.69594028526293, "tokens_seen": 876222464 }, { "epoch": 2.06, "learning_rate": 0.00037097291875626883, "loss": 2.8822, "theoretical_loss": 3.6959133329856293, "tokens_seen": 876288000 }, { "epoch": 2.06, "learning_rate": 0.00037096288866599796, "loss": 2.7565, "theoretical_loss": 3.695886383288317, "tokens_seen": 876353536 }, { "epoch": 2.06, "learning_rate": 0.0003709528585757272, "loss": 3.0434, "theoretical_loss": 3.6958594361705526, "tokens_seen": 876419072 }, { "epoch": 2.06, "learning_rate": 0.0003709428284854564, "loss": 3.0542, "theoretical_loss": 3.6958324916318963, "tokens_seen": 876484608 }, { "epoch": 2.06, "objective/train/docs_used": 1413309, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7716903686523438, "objective/train/theoretical_loss": 3.695819020329597, "objective/train/tokens_used": 896977376, "theoretical_loss": 3.695819020329597, "tokens_seen": 876517376 }, { "epoch": 2.06, "learning_rate": 0.00037093279839518555, "loss": 2.7988, "theoretical_loss": 3.6958055496719098, "tokens_seen": 876550144 }, { "epoch": 2.06, "learning_rate": 0.00037092276830491474, "loss": 2.98, "theoretical_loss": 3.6957786102901515, "tokens_seen": 876615680 }, { "epoch": 2.06, "learning_rate": 0.0003709127382146439, "loss": 3.039, "theoretical_loss": 3.6957516734861837, "tokens_seen": 876681216 }, { "epoch": 2.06, "learning_rate": 0.0003709027081243731, "loss": 3.0279, "theoretical_loss": 3.6957247392595667, "tokens_seen": 876746752 }, { "epoch": 2.06, "learning_rate": 0.00037089267803410233, "loss": 2.953, "theoretical_loss": 3.6956978076098608, "tokens_seen": 876812288 }, { "epoch": 2.06, "learning_rate": 0.00037088264794383146, "loss": 2.7375, "theoretical_loss": 3.695670878536627, "tokens_seen": 876877824 }, { "epoch": 2.06, "learning_rate": 0.0003708726178535607, "loss": 2.7967, "theoretical_loss": 3.6956439520394264, "tokens_seen": 876943360 }, { "epoch": 2.06, "learning_rate": 0.00037086258776328993, "loss": 2.9624, "theoretical_loss": 3.6956170281178196, "tokens_seen": 877008896 }, { "epoch": 2.06, "learning_rate": 0.00037085255767301906, "loss": 2.7879, "theoretical_loss": 3.695590106771369, "tokens_seen": 877074432 }, { "epoch": 2.06, "learning_rate": 0.0003708425275827483, "loss": 2.9647, "theoretical_loss": 3.6955631879996345, "tokens_seen": 877139968 }, { "epoch": 2.06, "learning_rate": 0.0003708324974924774, "loss": 2.6194, "theoretical_loss": 3.6955362718021783, "tokens_seen": 877205504 }, { "epoch": 2.06, "learning_rate": 0.00037082246740220665, "loss": 2.786, "theoretical_loss": 3.695509358178562, "tokens_seen": 877271040 }, { "epoch": 2.06, "learning_rate": 0.00037081243731193584, "loss": 2.8655, "theoretical_loss": 3.695482447128347, "tokens_seen": 877336576 }, { "epoch": 2.06, "learning_rate": 0.000370802407221665, "loss": 2.8397, "theoretical_loss": 3.695455538651095, "tokens_seen": 877402112 }, { "epoch": 2.06, "learning_rate": 0.0003707923771313942, "loss": 2.9645, "theoretical_loss": 3.695428632746368, "tokens_seen": 877467648 }, { "epoch": 2.06, "learning_rate": 0.0003707823470411234, "loss": 2.77, "theoretical_loss": 3.6954017294137276, "tokens_seen": 877533184 }, { "epoch": 2.06, "learning_rate": 0.00037077231695085256, "loss": 2.8822, "theoretical_loss": 3.695374828652736, "tokens_seen": 877598720 }, { "epoch": 2.06, "learning_rate": 0.0003707622868605818, "loss": 2.9795, "theoretical_loss": 3.695347930462956, "tokens_seen": 877664256 }, { "epoch": 2.06, "learning_rate": 0.0003707522567703109, "loss": 2.9203, "theoretical_loss": 3.695321034843949, "tokens_seen": 877729792 }, { "epoch": 2.06, "learning_rate": 0.00037074222668004016, "loss": 2.9934, "theoretical_loss": 3.695294141795278, "tokens_seen": 877795328 }, { "epoch": 2.06, "learning_rate": 0.0003707321965897693, "loss": 2.8521, "theoretical_loss": 3.695267251316505, "tokens_seen": 877860864 }, { "epoch": 2.06, "learning_rate": 0.0003707221664994985, "loss": 2.9256, "theoretical_loss": 3.6952403634071924, "tokens_seen": 877926400 }, { "epoch": 2.06, "learning_rate": 0.0003707121364092277, "loss": 3.0181, "theoretical_loss": 3.695213478066904, "tokens_seen": 877991936 }, { "epoch": 2.06, "learning_rate": 0.0003707021063189569, "loss": 2.9482, "theoretical_loss": 3.695186595295201, "tokens_seen": 878057472 }, { "epoch": 2.06, "learning_rate": 0.00037069207622868606, "loss": 2.8583, "theoretical_loss": 3.6951597150916484, "tokens_seen": 878123008 }, { "epoch": 2.06, "objective/train/docs_used": 1416301, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2770228385925293, "objective/train/theoretical_loss": 3.695146275952791, "objective/train/tokens_used": 898615776, "theoretical_loss": 3.695146275952791, "tokens_seen": 878155776 }, { "epoch": 2.06, "learning_rate": 0.0003706820461384153, "loss": 2.9462, "theoretical_loss": 3.6951328374558075, "tokens_seen": 878188544 }, { "epoch": 2.06, "learning_rate": 0.0003706720160481444, "loss": 2.9753, "theoretical_loss": 3.6951059623872418, "tokens_seen": 878254080 }, { "epoch": 2.06, "learning_rate": 0.00037066198595787366, "loss": 2.9423, "theoretical_loss": 3.695079089885515, "tokens_seen": 878319616 }, { "epoch": 2.06, "learning_rate": 0.0003706519558676028, "loss": 3.0241, "theoretical_loss": 3.69505221995019, "tokens_seen": 878385152 }, { "epoch": 2.06, "learning_rate": 0.000370641925777332, "loss": 2.8884, "theoretical_loss": 3.6950253525808305, "tokens_seen": 878450688 }, { "epoch": 2.06, "learning_rate": 0.0003706318956870612, "loss": 2.8575, "theoretical_loss": 3.694998487777, "tokens_seen": 878516224 }, { "epoch": 2.06, "learning_rate": 0.0003706218655967904, "loss": 2.9008, "theoretical_loss": 3.6949716255382623, "tokens_seen": 878581760 }, { "epoch": 2.06, "learning_rate": 0.00037061183550651957, "loss": 2.7981, "theoretical_loss": 3.6949447658641805, "tokens_seen": 878647296 }, { "epoch": 2.06, "learning_rate": 0.00037060180541624875, "loss": 3.0428, "theoretical_loss": 3.6949179087543196, "tokens_seen": 878712832 }, { "epoch": 2.06, "learning_rate": 0.00037059177532597793, "loss": 2.8275, "theoretical_loss": 3.6948910542082425, "tokens_seen": 878778368 }, { "epoch": 2.06, "learning_rate": 0.00037058174523570716, "loss": 2.9015, "theoretical_loss": 3.6948642022255136, "tokens_seen": 878843904 }, { "epoch": 2.06, "learning_rate": 0.0003705717151454363, "loss": 2.8908, "theoretical_loss": 3.694837352805698, "tokens_seen": 878909440 }, { "epoch": 2.06, "learning_rate": 0.0003705616850551655, "loss": 2.7597, "theoretical_loss": 3.6948105059483587, "tokens_seen": 878974976 }, { "epoch": 2.06, "learning_rate": 0.00037055165496489465, "loss": 3.0358, "theoretical_loss": 3.694783661653061, "tokens_seen": 879040512 }, { "epoch": 2.06, "learning_rate": 0.0003705416248746239, "loss": 2.9675, "theoretical_loss": 3.6947568199193688, "tokens_seen": 879106048 }, { "epoch": 2.06, "learning_rate": 0.00037053159478435307, "loss": 2.8756, "theoretical_loss": 3.694729980746847, "tokens_seen": 879171584 }, { "epoch": 2.06, "learning_rate": 0.00037052156469408225, "loss": 2.8398, "theoretical_loss": 3.694703144135061, "tokens_seen": 879237120 }, { "epoch": 2.06, "learning_rate": 0.00037051153460381143, "loss": 2.9997, "theoretical_loss": 3.6946763100835742, "tokens_seen": 879302656 }, { "epoch": 2.06, "learning_rate": 0.00037050150451354067, "loss": 2.9141, "theoretical_loss": 3.694649478591952, "tokens_seen": 879368192 }, { "epoch": 2.06, "learning_rate": 0.0003704914744232698, "loss": 3.086, "theoretical_loss": 3.6946226496597605, "tokens_seen": 879433728 }, { "epoch": 2.06, "learning_rate": 0.00037048144433299903, "loss": 2.9233, "theoretical_loss": 3.694595823286564, "tokens_seen": 879499264 }, { "epoch": 2.06, "learning_rate": 0.00037047141424272816, "loss": 3.0209, "theoretical_loss": 3.6945689994719277, "tokens_seen": 879564800 }, { "epoch": 2.06, "learning_rate": 0.0003704613841524574, "loss": 2.9991, "theoretical_loss": 3.6945421782154177, "tokens_seen": 879630336 }, { "epoch": 2.06, "learning_rate": 0.0003704513540621866, "loss": 2.9302, "theoretical_loss": 3.6945153595165983, "tokens_seen": 879695872 }, { "epoch": 2.06, "learning_rate": 0.00037044132397191575, "loss": 2.9779, "theoretical_loss": 3.694488543375036, "tokens_seen": 879761408 }, { "epoch": 2.06, "objective/train/docs_used": 1419099, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.205845832824707, "objective/train/theoretical_loss": 3.6944751362630903, "objective/train/tokens_used": 900254176, "theoretical_loss": 3.6944751362630903, "tokens_seen": 879794176 }, { "epoch": 2.06, "learning_rate": 0.00037043129388164494, "loss": 3.1867, "theoretical_loss": 3.694461729790296, "tokens_seen": 879826944 }, { "epoch": 2.06, "learning_rate": 0.0003704212637913741, "loss": 3.0179, "theoretical_loss": 3.694434918761944, "tokens_seen": 879892480 }, { "epoch": 2.06, "learning_rate": 0.0003704112337011033, "loss": 2.9511, "theoretical_loss": 3.6944081102895474, "tokens_seen": 879958016 }, { "epoch": 2.06, "learning_rate": 0.00037040120361083253, "loss": 3.1424, "theoretical_loss": 3.6943813043726696, "tokens_seen": 880023552 }, { "epoch": 2.06, "learning_rate": 0.00037039117352056166, "loss": 2.747, "theoretical_loss": 3.6943545010108787, "tokens_seen": 880089088 }, { "epoch": 2.06, "learning_rate": 0.0003703811434302909, "loss": 3.0707, "theoretical_loss": 3.6943277002037407, "tokens_seen": 880154624 }, { "epoch": 2.06, "learning_rate": 0.0003703711133400201, "loss": 3.2187, "theoretical_loss": 3.694300901950821, "tokens_seen": 880220160 }, { "epoch": 2.06, "learning_rate": 0.00037036108324974926, "loss": 2.971, "theoretical_loss": 3.6942741062516866, "tokens_seen": 880285696 }, { "epoch": 2.06, "learning_rate": 0.00037035105315947844, "loss": 2.9561, "theoretical_loss": 3.6942473131059037, "tokens_seen": 880351232 }, { "epoch": 2.06, "learning_rate": 0.0003703410230692076, "loss": 2.8775, "theoretical_loss": 3.69422052251304, "tokens_seen": 880416768 }, { "epoch": 2.06, "learning_rate": 0.0003703309929789368, "loss": 2.8942, "theoretical_loss": 3.694193734472661, "tokens_seen": 880482304 }, { "epoch": 2.06, "learning_rate": 0.00037032096288866604, "loss": 2.8688, "theoretical_loss": 3.6941669489843343, "tokens_seen": 880547840 }, { "epoch": 2.06, "learning_rate": 0.00037031093279839516, "loss": 2.9783, "theoretical_loss": 3.6941401660476263, "tokens_seen": 880613376 }, { "epoch": 2.06, "learning_rate": 0.0003703009027081244, "loss": 2.8782, "theoretical_loss": 3.6941133856621047, "tokens_seen": 880678912 }, { "epoch": 2.06, "learning_rate": 0.0003702908726178535, "loss": 2.8395, "theoretical_loss": 3.6940866078273364, "tokens_seen": 880744448 }, { "epoch": 2.06, "learning_rate": 0.00037028084252758276, "loss": 2.847, "theoretical_loss": 3.694059832542888, "tokens_seen": 880809984 }, { "epoch": 2.06, "learning_rate": 0.00037027081243731194, "loss": 2.9973, "theoretical_loss": 3.694033059808328, "tokens_seen": 880875520 }, { "epoch": 2.06, "learning_rate": 0.0003702607823470411, "loss": 2.9598, "theoretical_loss": 3.694006289623223, "tokens_seen": 880941056 }, { "epoch": 2.06, "learning_rate": 0.0003702507522567703, "loss": 2.8209, "theoretical_loss": 3.693979521987141, "tokens_seen": 881006592 }, { "epoch": 2.06, "learning_rate": 0.0003702407221664995, "loss": 3.022, "theoretical_loss": 3.69395275689965, "tokens_seen": 881072128 }, { "epoch": 2.06, "learning_rate": 0.00037023069207622867, "loss": 2.9508, "theoretical_loss": 3.6939259943603173, "tokens_seen": 881137664 }, { "epoch": 2.06, "learning_rate": 0.0003702206619859579, "loss": 2.9712, "theoretical_loss": 3.693899234368711, "tokens_seen": 881203200 }, { "epoch": 2.06, "learning_rate": 0.00037021063189568703, "loss": 2.9667, "theoretical_loss": 3.693872476924399, "tokens_seen": 881268736 }, { "epoch": 2.06, "learning_rate": 0.00037020060180541626, "loss": 2.9485, "theoretical_loss": 3.6938457220269494, "tokens_seen": 881334272 }, { "epoch": 2.06, "learning_rate": 0.00037019057171514544, "loss": 2.8993, "theoretical_loss": 3.693818969675931, "tokens_seen": 881399808 }, { "epoch": 2.06, "objective/train/docs_used": 1421904, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2515792846679688, "objective/train/theoretical_loss": 3.693805594455198, "objective/train/tokens_used": 901892576, "theoretical_loss": 3.693805594455198, "tokens_seen": 881432576 }, { "epoch": 2.06, "learning_rate": 0.0003701805416248746, "loss": 3.1534, "theoretical_loss": 3.693792219870911, "tokens_seen": 881465344 }, { "epoch": 2.06, "learning_rate": 0.0003701705115346038, "loss": 2.8729, "theoretical_loss": 3.693765472611459, "tokens_seen": 881530880 }, { "epoch": 2.06, "learning_rate": 0.000370160481444333, "loss": 2.715, "theoretical_loss": 3.6937387278971423, "tokens_seen": 881596416 }, { "epoch": 2.06, "learning_rate": 0.00037015045135406217, "loss": 2.9695, "theoretical_loss": 3.693711985727531, "tokens_seen": 881661952 }, { "epoch": 2.06, "learning_rate": 0.0003701404212637914, "loss": 2.9615, "theoretical_loss": 3.693685246102193, "tokens_seen": 881727488 }, { "epoch": 2.06, "learning_rate": 0.00037013039117352053, "loss": 2.9437, "theoretical_loss": 3.693658509020697, "tokens_seen": 881793024 }, { "epoch": 2.06, "learning_rate": 0.00037012036108324977, "loss": 2.956, "theoretical_loss": 3.693631774482612, "tokens_seen": 881858560 }, { "epoch": 2.06, "learning_rate": 0.00037011033099297895, "loss": 3.1483, "theoretical_loss": 3.693605042487508, "tokens_seen": 881924096 }, { "epoch": 2.06, "learning_rate": 0.00037010030090270813, "loss": 2.9268, "theoretical_loss": 3.693578313034953, "tokens_seen": 881989632 }, { "epoch": 2.06, "learning_rate": 0.00037009027081243736, "loss": 2.9678, "theoretical_loss": 3.693551586124517, "tokens_seen": 882055168 }, { "epoch": 2.06, "learning_rate": 0.0003700802407221665, "loss": 2.9715, "theoretical_loss": 3.693524861755769, "tokens_seen": 882120704 }, { "epoch": 2.06, "learning_rate": 0.0003700702106318957, "loss": 3.1106, "theoretical_loss": 3.6934981399282787, "tokens_seen": 882186240 }, { "epoch": 2.06, "learning_rate": 0.00037006018054162485, "loss": 2.9982, "theoretical_loss": 3.6934714206416155, "tokens_seen": 882251776 }, { "epoch": 2.06, "learning_rate": 0.0003700501504513541, "loss": 2.9614, "theoretical_loss": 3.693444703895349, "tokens_seen": 882317312 }, { "epoch": 2.06, "learning_rate": 0.00037004012036108327, "loss": 2.9169, "theoretical_loss": 3.693417989689049, "tokens_seen": 882382848 }, { "epoch": 2.06, "learning_rate": 0.00037003009027081245, "loss": 2.9264, "theoretical_loss": 3.693391278022286, "tokens_seen": 882448384 }, { "epoch": 2.06, "learning_rate": 0.00037002006018054163, "loss": 2.7926, "theoretical_loss": 3.69336456889463, "tokens_seen": 882513920 }, { "epoch": 2.06, "learning_rate": 0.00037001003009027087, "loss": 2.9351, "theoretical_loss": 3.6933378623056505, "tokens_seen": 882579456 }, { "epoch": 2.06, "learning_rate": 0.00037, "loss": 3.0045, "theoretical_loss": 3.6933111582549176, "tokens_seen": 882644992 }, { "epoch": 2.06, "learning_rate": 0.00036998996990972923, "loss": 3.0436, "theoretical_loss": 3.693284456742002, "tokens_seen": 882710528 }, { "epoch": 2.06, "learning_rate": 0.00036997993981945836, "loss": 2.8608, "theoretical_loss": 3.693257757766474, "tokens_seen": 882776064 }, { "epoch": 2.06, "learning_rate": 0.0003699699097291876, "loss": 2.949, "theoretical_loss": 3.6932310613279045, "tokens_seen": 882841600 }, { "epoch": 2.06, "learning_rate": 0.0003699598796389168, "loss": 2.9118, "theoretical_loss": 3.6932043674258637, "tokens_seen": 882907136 }, { "epoch": 2.06, "learning_rate": 0.00036994984954864595, "loss": 2.8669, "theoretical_loss": 3.693177676059922, "tokens_seen": 882972672 }, { "epoch": 2.06, "learning_rate": 0.00036993981945837514, "loss": 2.9145, "theoretical_loss": 3.693150987229652, "tokens_seen": 883038208 }, { "epoch": 2.06, "objective/train/docs_used": 1423225, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2154409885406494, "objective/train/theoretical_loss": 3.693137643765259, "objective/train/tokens_used": 903530976, "theoretical_loss": 3.693137643765259, "tokens_seen": 883070976 }, { "epoch": 2.06, "learning_rate": 0.0003699297893681043, "loss": 2.9684, "theoretical_loss": 3.6931243009346226, "tokens_seen": 883103744 }, { "epoch": 2.06, "learning_rate": 0.0003699197592778335, "loss": 2.8582, "theoretical_loss": 3.6930976171744057, "tokens_seen": 883169280 }, { "epoch": 2.06, "learning_rate": 0.00036990972918756273, "loss": 2.9017, "theoretical_loss": 3.6930709359485725, "tokens_seen": 883234816 }, { "epoch": 2.06, "learning_rate": 0.00036989969909729186, "loss": 2.9624, "theoretical_loss": 3.693044257256694, "tokens_seen": 883300352 }, { "epoch": 2.06, "learning_rate": 0.0003698896690070211, "loss": 2.9433, "theoretical_loss": 3.693017581098342, "tokens_seen": 883365888 }, { "epoch": 2.06, "learning_rate": 0.0003698796389167503, "loss": 2.7925, "theoretical_loss": 3.6929909074730873, "tokens_seen": 883431424 }, { "epoch": 2.06, "learning_rate": 0.00036986960882647946, "loss": 2.9124, "theoretical_loss": 3.692964236380502, "tokens_seen": 883496960 }, { "epoch": 2.06, "learning_rate": 0.00036985957873620864, "loss": 2.9468, "theoretical_loss": 3.692937567820158, "tokens_seen": 883562496 }, { "epoch": 2.06, "learning_rate": 0.0003698495486459378, "loss": 2.8624, "theoretical_loss": 3.6929109017916266, "tokens_seen": 883628032 }, { "epoch": 2.06, "learning_rate": 0.000369839518555667, "loss": 2.8538, "theoretical_loss": 3.6928842382944795, "tokens_seen": 883693568 }, { "epoch": 2.06, "learning_rate": 0.00036982948846539624, "loss": 2.987, "theoretical_loss": 3.6928575773282897, "tokens_seen": 883759104 }, { "epoch": 2.06, "learning_rate": 0.00036981945837512536, "loss": 2.7366, "theoretical_loss": 3.6928309188926285, "tokens_seen": 883824640 }, { "epoch": 2.06, "learning_rate": 0.0003698094282848546, "loss": 2.9504, "theoretical_loss": 3.692804262987068, "tokens_seen": 883890176 }, { "epoch": 2.06, "learning_rate": 0.0003697993981945837, "loss": 2.9104, "theoretical_loss": 3.6927776096111806, "tokens_seen": 883955712 }, { "epoch": 2.06, "learning_rate": 0.00036978936810431296, "loss": 2.9336, "theoretical_loss": 3.692750958764539, "tokens_seen": 884021248 }, { "epoch": 2.06, "learning_rate": 0.00036977933801404214, "loss": 2.8665, "theoretical_loss": 3.6927243104467156, "tokens_seen": 884086784 }, { "epoch": 2.06, "learning_rate": 0.0003697693079237713, "loss": 3.0079, "theoretical_loss": 3.692697664657283, "tokens_seen": 884152320 }, { "epoch": 2.06, "learning_rate": 0.0003697592778335005, "loss": 2.9578, "theoretical_loss": 3.692671021395814, "tokens_seen": 884217856 }, { "epoch": 2.06, "learning_rate": 0.0003697492477432297, "loss": 2.9072, "theoretical_loss": 3.692644380661881, "tokens_seen": 884283392 }, { "epoch": 2.06, "learning_rate": 0.00036973921765295887, "loss": 2.9885, "theoretical_loss": 3.6926177424550573, "tokens_seen": 884348928 }, { "epoch": 2.06, "learning_rate": 0.0003697291875626881, "loss": 2.8389, "theoretical_loss": 3.692591106774916, "tokens_seen": 884414464 }, { "epoch": 2.06, "learning_rate": 0.00036971915747241723, "loss": 3.0959, "theoretical_loss": 3.6925644736210304, "tokens_seen": 884480000 }, { "epoch": 2.06, "learning_rate": 0.00036970912738214646, "loss": 2.8863, "theoretical_loss": 3.692537842992973, "tokens_seen": 884545536 }, { "epoch": 2.06, "learning_rate": 0.00036969909729187565, "loss": 2.8707, "theoretical_loss": 3.6925112148903176, "tokens_seen": 884611072 }, { "epoch": 2.06, "learning_rate": 0.0003696890672016048, "loss": 3.0101, "theoretical_loss": 3.692484589312638, "tokens_seen": 884676608 }, { "epoch": 2.06, "objective/train/docs_used": 1426111, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0977673530578613, "objective/train/theoretical_loss": 3.692471277470531, "objective/train/tokens_used": 905169376, "theoretical_loss": 3.692471277470531, "tokens_seen": 884709376 }, { "epoch": 2.06, "learning_rate": 0.000369679037111334, "loss": 3.0318, "theoretical_loss": 3.6924579662595076, "tokens_seen": 884742144 }, { "epoch": 2.06, "learning_rate": 0.0003696690070210632, "loss": 2.7704, "theoretical_loss": 3.6924313457304994, "tokens_seen": 884807680 }, { "epoch": 2.06, "learning_rate": 0.00036965897693079237, "loss": 2.9077, "theoretical_loss": 3.692404727725188, "tokens_seen": 884873216 }, { "epoch": 2.06, "learning_rate": 0.0003696489468405216, "loss": 2.8248, "theoretical_loss": 3.692378112243146, "tokens_seen": 884938752 }, { "epoch": 2.06, "learning_rate": 0.00036963891675025073, "loss": 2.9359, "theoretical_loss": 3.6923514992839497, "tokens_seen": 885004288 }, { "epoch": 2.06, "learning_rate": 0.00036962888665997997, "loss": 3.0671, "theoretical_loss": 3.6923248888471707, "tokens_seen": 885069824 }, { "epoch": 2.06, "learning_rate": 0.0003696188565697091, "loss": 3.0483, "theoretical_loss": 3.692298280932384, "tokens_seen": 885135360 }, { "epoch": 2.06, "learning_rate": 0.00036960882647943833, "loss": 2.9822, "theoretical_loss": 3.692271675539165, "tokens_seen": 885200896 }, { "epoch": 2.06, "learning_rate": 0.0003695987963891675, "loss": 2.7909, "theoretical_loss": 3.6922450726670863, "tokens_seen": 885266432 }, { "epoch": 2.06, "learning_rate": 0.0003695887662988967, "loss": 3.0407, "theoretical_loss": 3.692218472315724, "tokens_seen": 885331968 }, { "epoch": 2.06, "learning_rate": 0.00036957873620862587, "loss": 2.9689, "theoretical_loss": 3.692191874484651, "tokens_seen": 885397504 }, { "epoch": 2.06, "learning_rate": 0.00036956870611835505, "loss": 3.0543, "theoretical_loss": 3.6921652791734436, "tokens_seen": 885463040 }, { "epoch": 2.06, "learning_rate": 0.00036955867602808424, "loss": 2.9146, "theoretical_loss": 3.692138686381676, "tokens_seen": 885528576 }, { "epoch": 2.06, "learning_rate": 0.00036954864593781347, "loss": 3.0027, "theoretical_loss": 3.6921120961089224, "tokens_seen": 885594112 }, { "epoch": 2.06, "learning_rate": 0.0003695386158475426, "loss": 2.8419, "theoretical_loss": 3.6920855083547583, "tokens_seen": 885659648 }, { "epoch": 2.06, "learning_rate": 0.00036952858575727183, "loss": 2.7723, "theoretical_loss": 3.6920589231187595, "tokens_seen": 885725184 }, { "epoch": 2.06, "learning_rate": 0.000369518555667001, "loss": 2.7897, "theoretical_loss": 3.6920323404005, "tokens_seen": 885790720 }, { "epoch": 2.06, "learning_rate": 0.0003695085255767302, "loss": 2.6649, "theoretical_loss": 3.6920057601995566, "tokens_seen": 885856256 }, { "epoch": 2.06, "learning_rate": 0.0003694984954864594, "loss": 3.0458, "theoretical_loss": 3.691979182515503, "tokens_seen": 885921792 }, { "epoch": 2.06, "learning_rate": 0.00036948846539618856, "loss": 2.8669, "theoretical_loss": 3.6919526073479156, "tokens_seen": 885987328 }, { "epoch": 2.06, "learning_rate": 0.00036947843530591774, "loss": 2.7972, "theoretical_loss": 3.6919260346963703, "tokens_seen": 886052864 }, { "epoch": 2.06, "learning_rate": 0.000369468405215647, "loss": 2.9162, "theoretical_loss": 3.691899464560443, "tokens_seen": 886118400 }, { "epoch": 2.06, "learning_rate": 0.0003694583751253761, "loss": 2.9739, "theoretical_loss": 3.691872896939708, "tokens_seen": 886183936 }, { "epoch": 2.06, "learning_rate": 0.00036944834503510534, "loss": 2.8604, "theoretical_loss": 3.6918463318337422, "tokens_seen": 886249472 }, { "epoch": 2.06, "learning_rate": 0.00036943831494483446, "loss": 2.9408, "theoretical_loss": 3.691819769242122, "tokens_seen": 886315008 }, { "epoch": 2.06, "objective/train/docs_used": 1428912, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8572142124176025, "objective/train/theoretical_loss": 3.6918064888890587, "objective/train/tokens_used": 906807776, "theoretical_loss": 3.6918064888890587, "tokens_seen": 886347776 }, { "epoch": 2.06, "learning_rate": 0.0003694282848545637, "loss": 2.824, "theoretical_loss": 3.691793209164423, "tokens_seen": 886380544 }, { "epoch": 2.06, "learning_rate": 0.0003694182547642929, "loss": 2.9006, "theoretical_loss": 3.691766651600222, "tokens_seen": 886446080 }, { "epoch": 2.06, "learning_rate": 0.00036940822467402206, "loss": 2.965, "theoretical_loss": 3.6917400965490943, "tokens_seen": 886511616 }, { "epoch": 2.06, "learning_rate": 0.00036939819458375124, "loss": 2.8821, "theoretical_loss": 3.691713544010618, "tokens_seen": 886577152 }, { "epoch": 2.06, "learning_rate": 0.0003693881644934805, "loss": 3.0146, "theoretical_loss": 3.6916869939843675, "tokens_seen": 886642688 }, { "epoch": 2.06, "learning_rate": 0.0003693781344032096, "loss": 2.9331, "theoretical_loss": 3.691660446469921, "tokens_seen": 886708224 }, { "epoch": 2.06, "learning_rate": 0.00036936810431293884, "loss": 3.0226, "theoretical_loss": 3.6916339014668553, "tokens_seen": 886773760 }, { "epoch": 2.06, "learning_rate": 0.000369358074222668, "loss": 2.8712, "theoretical_loss": 3.6916073589747462, "tokens_seen": 886839296 }, { "epoch": 2.06, "learning_rate": 0.0003693480441323972, "loss": 2.9559, "theoretical_loss": 3.6915808189931716, "tokens_seen": 886904832 }, { "epoch": 2.06, "learning_rate": 0.00036933801404212644, "loss": 2.8554, "theoretical_loss": 3.691554281521708, "tokens_seen": 886970368 }, { "epoch": 2.06, "learning_rate": 0.00036932798395185556, "loss": 2.8854, "theoretical_loss": 3.6915277465599328, "tokens_seen": 887035904 }, { "epoch": 2.06, "learning_rate": 0.0003693179538615848, "loss": 2.9997, "theoretical_loss": 3.691501214107423, "tokens_seen": 887101440 }, { "epoch": 2.06, "learning_rate": 0.0003693079237713139, "loss": 3.0112, "theoretical_loss": 3.6914746841637562, "tokens_seen": 887166976 }, { "epoch": 2.06, "learning_rate": 0.00036929789368104316, "loss": 2.7911, "theoretical_loss": 3.6914481567285105, "tokens_seen": 887232512 }, { "epoch": 2.06, "learning_rate": 0.00036928786359077234, "loss": 2.9611, "theoretical_loss": 3.691421631801262, "tokens_seen": 887298048 }, { "epoch": 2.06, "learning_rate": 0.0003692778335005015, "loss": 2.8233, "theoretical_loss": 3.6913951093815895, "tokens_seen": 887363584 }, { "epoch": 2.06, "learning_rate": 0.0003692678034102307, "loss": 2.891, "theoretical_loss": 3.69136858946907, "tokens_seen": 887429120 }, { "epoch": 2.06, "learning_rate": 0.0003692577733199599, "loss": 2.9916, "theoretical_loss": 3.691342072063282, "tokens_seen": 887494656 }, { "epoch": 2.06, "learning_rate": 0.00036924774322968907, "loss": 2.8974, "theoretical_loss": 3.691315557163804, "tokens_seen": 887560192 }, { "epoch": 2.06, "learning_rate": 0.0003692377131394183, "loss": 2.9074, "theoretical_loss": 3.6912890447702127, "tokens_seen": 887625728 }, { "epoch": 2.06, "learning_rate": 0.00036922768304914743, "loss": 2.9453, "theoretical_loss": 3.691262534882087, "tokens_seen": 887691264 }, { "epoch": 2.06, "learning_rate": 0.00036921765295887666, "loss": 2.9239, "theoretical_loss": 3.6912360274990057, "tokens_seen": 887756800 }, { "epoch": 2.06, "learning_rate": 0.00036920762286860585, "loss": 2.9116, "theoretical_loss": 3.6912095226205457, "tokens_seen": 887822336 }, { "epoch": 2.06, "learning_rate": 0.000369197592778335, "loss": 3.1156, "theoretical_loss": 3.691183020246287, "tokens_seen": 887887872 }, { "epoch": 2.06, "learning_rate": 0.0003691875626880642, "loss": 2.9032, "theoretical_loss": 3.6911565203758077, "tokens_seen": 887953408 }, { "epoch": 2.06, "objective/train/docs_used": 1431723, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8236136436462402, "objective/train/theoretical_loss": 3.691143271379353, "objective/train/tokens_used": 908446176, "theoretical_loss": 3.691143271379353, "tokens_seen": 887986176 }, { "epoch": 2.06, "learning_rate": 0.0003691775325977934, "loss": 2.935, "theoretical_loss": 3.691130023008686, "tokens_seen": 888018944 }, { "epoch": 2.06, "learning_rate": 0.00036916750250752257, "loss": 3.0264, "theoretical_loss": 3.691103528144501, "tokens_seen": 888084480 }, { "epoch": 2.06, "learning_rate": 0.0003691574724172518, "loss": 2.8128, "theoretical_loss": 3.6910770357828318, "tokens_seen": 888150016 }, { "epoch": 2.06, "learning_rate": 0.00036914744232698093, "loss": 2.8753, "theoretical_loss": 3.6910505459232574, "tokens_seen": 888215552 }, { "epoch": 2.06, "learning_rate": 0.00036913741223671017, "loss": 2.9174, "theoretical_loss": 3.6910240585653566, "tokens_seen": 888281088 }, { "epoch": 2.06, "learning_rate": 0.0003691273821464393, "loss": 2.8194, "theoretical_loss": 3.6909975737087093, "tokens_seen": 888346624 }, { "epoch": 2.06, "learning_rate": 0.00036911735205616853, "loss": 3.0096, "theoretical_loss": 3.690971091352894, "tokens_seen": 888412160 }, { "epoch": 2.06, "learning_rate": 0.0003691073219658977, "loss": 2.8292, "theoretical_loss": 3.69094461149749, "tokens_seen": 888477696 }, { "epoch": 2.06, "learning_rate": 0.0003690972918756269, "loss": 2.9879, "theoretical_loss": 3.690918134142078, "tokens_seen": 888543232 }, { "epoch": 2.06, "learning_rate": 0.0003690872617853561, "loss": 2.9816, "theoretical_loss": 3.690891659286236, "tokens_seen": 888608768 }, { "epoch": 2.06, "learning_rate": 0.00036907723169508525, "loss": 3.0042, "theoretical_loss": 3.690865186929545, "tokens_seen": 888674304 }, { "epoch": 2.06, "learning_rate": 0.00036906720160481444, "loss": 2.9876, "theoretical_loss": 3.6908387170715837, "tokens_seen": 888739840 }, { "epoch": 2.06, "learning_rate": 0.00036905717151454367, "loss": 2.8212, "theoretical_loss": 3.690812249711933, "tokens_seen": 888805376 }, { "epoch": 2.06, "learning_rate": 0.0003690471414242728, "loss": 3.0647, "theoretical_loss": 3.690785784850173, "tokens_seen": 888870912 }, { "epoch": 2.06, "learning_rate": 0.00036903711133400203, "loss": 2.8044, "theoretical_loss": 3.690759322485883, "tokens_seen": 888936448 }, { "epoch": 2.06, "learning_rate": 0.0003690270812437312, "loss": 2.9106, "theoretical_loss": 3.6907328626186438, "tokens_seen": 889001984 }, { "epoch": 2.06, "learning_rate": 0.0003690170511534604, "loss": 2.8078, "theoretical_loss": 3.6907064052480347, "tokens_seen": 889067520 }, { "epoch": 2.06, "learning_rate": 0.0003690070210631896, "loss": 2.9573, "theoretical_loss": 3.690679950373638, "tokens_seen": 889133056 }, { "epoch": 2.06, "learning_rate": 0.00036899699097291876, "loss": 2.9022, "theoretical_loss": 3.690653497995032, "tokens_seen": 889198592 }, { "epoch": 2.06, "learning_rate": 0.00036898696088264794, "loss": 2.9539, "theoretical_loss": 3.690627048111799, "tokens_seen": 889264128 }, { "epoch": 2.06, "learning_rate": 0.0003689769307923772, "loss": 2.8744, "theoretical_loss": 3.6906006007235197, "tokens_seen": 889329664 }, { "epoch": 2.06, "learning_rate": 0.0003689669007021063, "loss": 2.8791, "theoretical_loss": 3.6905741558297738, "tokens_seen": 889395200 }, { "epoch": 2.06, "learning_rate": 0.00036895687061183554, "loss": 2.902, "theoretical_loss": 3.690547713430143, "tokens_seen": 889460736 }, { "epoch": 2.06, "learning_rate": 0.00036894684052156466, "loss": 3.059, "theoretical_loss": 3.6905212735242086, "tokens_seen": 889526272 }, { "epoch": 2.06, "learning_rate": 0.0003689368104312939, "loss": 2.8981, "theoretical_loss": 3.690494836111551, "tokens_seen": 889591808 }, { "epoch": 2.06, "objective/train/docs_used": 1434324, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8619227409362793, "objective/train/theoretical_loss": 3.69048161834007, "objective/train/tokens_used": 910084576, "theoretical_loss": 3.69048161834007, "tokens_seen": 889624576 }, { "epoch": 2.06, "learning_rate": 0.0003689267803410231, "loss": 2.9228, "theoretical_loss": 3.690468401191751, "tokens_seen": 889657344 }, { "epoch": 2.06, "learning_rate": 0.00036891675025075226, "loss": 2.843, "theoretical_loss": 3.6904419687643917, "tokens_seen": 889722880 }, { "epoch": 2.06, "learning_rate": 0.00036890672016048144, "loss": 2.978, "theoretical_loss": 3.690415538829053, "tokens_seen": 889788416 }, { "epoch": 2.06, "learning_rate": 0.0003688966900702107, "loss": 3.0078, "theoretical_loss": 3.6903891113853176, "tokens_seen": 889853952 }, { "epoch": 2.06, "learning_rate": 0.0003688866599799398, "loss": 3.0133, "theoretical_loss": 3.6903626864327657, "tokens_seen": 889919488 }, { "epoch": 2.06, "learning_rate": 0.00036887662988966904, "loss": 2.9005, "theoretical_loss": 3.69033626397098, "tokens_seen": 889985024 }, { "epoch": 2.06, "learning_rate": 0.00036886659979939817, "loss": 2.9216, "theoretical_loss": 3.6903098439995423, "tokens_seen": 890050560 }, { "epoch": 2.06, "learning_rate": 0.0003688565697091274, "loss": 3.0663, "theoretical_loss": 3.690283426518034, "tokens_seen": 890116096 }, { "epoch": 2.06, "learning_rate": 0.0003688465396188566, "loss": 3.0088, "theoretical_loss": 3.690257011526038, "tokens_seen": 890181632 }, { "epoch": 2.06, "learning_rate": 0.00036883650952858576, "loss": 2.8662, "theoretical_loss": 3.6902305990231357, "tokens_seen": 890247168 }, { "epoch": 2.06, "learning_rate": 0.00036882647943831494, "loss": 2.9074, "theoretical_loss": 3.69020418900891, "tokens_seen": 890312704 }, { "epoch": 2.06, "learning_rate": 0.0003688164493480441, "loss": 2.866, "theoretical_loss": 3.6901777814829426, "tokens_seen": 890378240 }, { "epoch": 2.06, "learning_rate": 0.0003688064192577733, "loss": 2.8844, "theoretical_loss": 3.690151376444816, "tokens_seen": 890443776 }, { "epoch": 2.06, "learning_rate": 0.00036879638916750254, "loss": 2.8595, "theoretical_loss": 3.690124973894113, "tokens_seen": 890509312 }, { "epoch": 2.06, "learning_rate": 0.00036878635907723167, "loss": 2.9372, "theoretical_loss": 3.690098573830417, "tokens_seen": 890574848 }, { "epoch": 2.06, "learning_rate": 0.0003687763289869609, "loss": 3.048, "theoretical_loss": 3.6900721762533086, "tokens_seen": 890640384 }, { "epoch": 2.06, "learning_rate": 0.00036876629889669003, "loss": 2.9182, "theoretical_loss": 3.6900457811623726, "tokens_seen": 890705920 }, { "epoch": 2.06, "learning_rate": 0.00036875626880641927, "loss": 2.9787, "theoretical_loss": 3.6900193885571917, "tokens_seen": 890771456 }, { "epoch": 2.06, "learning_rate": 0.00036874623871614845, "loss": 2.9641, "theoretical_loss": 3.689992998437348, "tokens_seen": 890836992 }, { "epoch": 2.06, "learning_rate": 0.00036873620862587763, "loss": 3.0907, "theoretical_loss": 3.6899666108024256, "tokens_seen": 890902528 }, { "epoch": 2.06, "learning_rate": 0.0003687261785356068, "loss": 2.9745, "theoretical_loss": 3.6899402256520073, "tokens_seen": 890968064 }, { "epoch": 2.06, "learning_rate": 0.00036871614844533605, "loss": 2.9089, "theoretical_loss": 3.6899138429856766, "tokens_seen": 891033600 }, { "epoch": 2.06, "learning_rate": 0.00036870611835506517, "loss": 2.7548, "theoretical_loss": 3.6898874628030165, "tokens_seen": 891099136 }, { "epoch": 2.06, "learning_rate": 0.0003686960882647944, "loss": 2.9899, "theoretical_loss": 3.689861085103611, "tokens_seen": 891164672 }, { "epoch": 2.06, "learning_rate": 0.00036868605817452353, "loss": 2.8655, "theoretical_loss": 3.6898347098870437, "tokens_seen": 891230208 }, { "epoch": 2.06, "objective/train/docs_used": 1437051, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1915030479431152, "objective/train/theoretical_loss": 3.6898215232096945, "objective/train/tokens_used": 911722976, "theoretical_loss": 3.6898215232096945, "tokens_seen": 891262976 }, { "epoch": 2.06, "learning_rate": 0.00036867602808425277, "loss": 2.9884, "theoretical_loss": 3.6898083371528987, "tokens_seen": 891295744 }, { "epoch": 2.06, "learning_rate": 0.00036866599799398195, "loss": 2.8499, "theoretical_loss": 3.6897819669007594, "tokens_seen": 891361280 }, { "epoch": 2.06, "learning_rate": 0.00036865596790371113, "loss": 3.0194, "theoretical_loss": 3.6897555991302093, "tokens_seen": 891426816 }, { "epoch": 2.06, "learning_rate": 0.0003686459378134403, "loss": 2.8369, "theoretical_loss": 3.6897292338408336, "tokens_seen": 891492352 }, { "epoch": 2.06, "learning_rate": 0.0003686359077231695, "loss": 2.9375, "theoretical_loss": 3.6897028710322157, "tokens_seen": 891557888 }, { "epoch": 2.06, "learning_rate": 0.0003686258776328987, "loss": 3.0197, "theoretical_loss": 3.6896765107039395, "tokens_seen": 891623424 }, { "epoch": 2.06, "learning_rate": 0.0003686158475426279, "loss": 2.978, "theoretical_loss": 3.6896501528555907, "tokens_seen": 891688960 }, { "epoch": 2.06, "learning_rate": 0.0003686058174523571, "loss": 3.017, "theoretical_loss": 3.6896237974867523, "tokens_seen": 891754496 }, { "epoch": 2.06, "learning_rate": 0.0003685957873620863, "loss": 2.9162, "theoretical_loss": 3.689597444597009, "tokens_seen": 891820032 }, { "epoch": 2.06, "learning_rate": 0.00036858575727181545, "loss": 2.7792, "theoretical_loss": 3.6895710941859465, "tokens_seen": 891885568 }, { "epoch": 2.06, "learning_rate": 0.00036857572718154464, "loss": 2.937, "theoretical_loss": 3.6895447462531488, "tokens_seen": 891951104 }, { "epoch": 2.06, "learning_rate": 0.00036856569709127387, "loss": 3.0171, "theoretical_loss": 3.6895184007982014, "tokens_seen": 892016640 }, { "epoch": 2.06, "learning_rate": 0.000368555667001003, "loss": 2.9892, "theoretical_loss": 3.6894920578206882, "tokens_seen": 892082176 }, { "epoch": 2.06, "learning_rate": 0.00036854563691073223, "loss": 2.8373, "theoretical_loss": 3.689465717320195, "tokens_seen": 892147712 }, { "epoch": 2.06, "learning_rate": 0.0003685356068204614, "loss": 2.7441, "theoretical_loss": 3.6894393792963065, "tokens_seen": 892213248 }, { "epoch": 2.06, "learning_rate": 0.0003685255767301906, "loss": 2.9679, "theoretical_loss": 3.6894130437486083, "tokens_seen": 892278784 }, { "epoch": 2.06, "learning_rate": 0.0003685155466399198, "loss": 2.9665, "theoretical_loss": 3.689386710676686, "tokens_seen": 892344320 }, { "epoch": 2.06, "learning_rate": 0.00036850551654964896, "loss": 2.8787, "theoretical_loss": 3.689360380080125, "tokens_seen": 892409856 }, { "epoch": 2.06, "learning_rate": 0.00036849548645937814, "loss": 2.9918, "theoretical_loss": 3.68933405195851, "tokens_seen": 892475392 }, { "epoch": 2.06, "learning_rate": 0.0003684854563691074, "loss": 2.9679, "theoretical_loss": 3.6893077263114273, "tokens_seen": 892540928 }, { "epoch": 2.06, "learning_rate": 0.0003684754262788365, "loss": 2.9472, "theoretical_loss": 3.6892814031384624, "tokens_seen": 892606464 }, { "epoch": 2.06, "learning_rate": 0.00036846539618856574, "loss": 3.1022, "theoretical_loss": 3.689255082439202, "tokens_seen": 892672000 }, { "epoch": 2.06, "learning_rate": 0.00036845536609829486, "loss": 2.9091, "theoretical_loss": 3.6892287642132313, "tokens_seen": 892737536 }, { "epoch": 2.06, "learning_rate": 0.0003684453360080241, "loss": 3.0437, "theoretical_loss": 3.6892024484601356, "tokens_seen": 892803072 }, { "epoch": 2.06, "learning_rate": 0.0003684353059177533, "loss": 2.9701, "theoretical_loss": 3.689176135179503, "tokens_seen": 892868608 }, { "epoch": 2.06, "objective/train/docs_used": 1438457, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.074929714202881, "objective/train/theoretical_loss": 3.6891629794662304, "objective/train/tokens_used": 913361376, "theoretical_loss": 3.6891629794662304, "tokens_seen": 892901376 }, { "epoch": 2.06, "learning_rate": 0.00036842527582748246, "loss": 2.9849, "theoretical_loss": 3.6891498243709178, "tokens_seen": 892934144 }, { "epoch": 2.06, "learning_rate": 0.00036841524573721164, "loss": 2.8059, "theoretical_loss": 3.689123516033968, "tokens_seen": 892999680 }, { "epoch": 2.06, "learning_rate": 0.0003684052156469409, "loss": 3.0785, "theoretical_loss": 3.6890972101682387, "tokens_seen": 893065216 }, { "epoch": 2.06, "learning_rate": 0.00036839518555667, "loss": 2.9313, "theoretical_loss": 3.689070906773317, "tokens_seen": 893130752 }, { "epoch": 2.06, "learning_rate": 0.00036838515546639924, "loss": 3.0723, "theoretical_loss": 3.6890446058487893, "tokens_seen": 893196288 }, { "epoch": 2.06, "learning_rate": 0.00036837512537612837, "loss": 3.0335, "theoretical_loss": 3.6890183073942433, "tokens_seen": 893261824 }, { "epoch": 2.06, "learning_rate": 0.0003683650952858576, "loss": 2.9741, "theoretical_loss": 3.6889920114092645, "tokens_seen": 893327360 }, { "epoch": 2.06, "learning_rate": 0.0003683550651955868, "loss": 2.9393, "theoretical_loss": 3.6889657178934407, "tokens_seen": 893392896 }, { "epoch": 2.06, "learning_rate": 0.00036834503510531596, "loss": 2.9864, "theoretical_loss": 3.6889394268463587, "tokens_seen": 893458432 }, { "epoch": 2.06, "learning_rate": 0.00036833500501504514, "loss": 3.027, "theoretical_loss": 3.6889131382676057, "tokens_seen": 893523968 }, { "epoch": 2.06, "learning_rate": 0.0003683249749247743, "loss": 2.8747, "theoretical_loss": 3.6888868521567693, "tokens_seen": 893589504 }, { "epoch": 2.06, "learning_rate": 0.0003683149448345035, "loss": 2.9896, "theoretical_loss": 3.6888605685134364, "tokens_seen": 893655040 }, { "epoch": 2.06, "learning_rate": 0.00036830491474423274, "loss": 2.9737, "theoretical_loss": 3.688834287337194, "tokens_seen": 893720576 }, { "epoch": 2.06, "learning_rate": 0.00036829488465396187, "loss": 2.9138, "theoretical_loss": 3.688808008627631, "tokens_seen": 893786112 }, { "epoch": 2.06, "learning_rate": 0.0003682848545636911, "loss": 2.8252, "theoretical_loss": 3.6887817323843333, "tokens_seen": 893851648 }, { "epoch": 2.06, "learning_rate": 0.00036827482447342023, "loss": 2.9564, "theoretical_loss": 3.6887554586068902, "tokens_seen": 893917184 }, { "epoch": 2.06, "learning_rate": 0.00036826479438314947, "loss": 2.9635, "theoretical_loss": 3.6887291872948884, "tokens_seen": 893982720 }, { "epoch": 2.06, "learning_rate": 0.00036825476429287865, "loss": 3.0854, "theoretical_loss": 3.688702918447917, "tokens_seen": 894048256 }, { "epoch": 2.06, "learning_rate": 0.00036824473420260783, "loss": 2.6266, "theoretical_loss": 3.688676652065563, "tokens_seen": 894113792 }, { "epoch": 2.06, "learning_rate": 0.000368234704112337, "loss": 2.9294, "theoretical_loss": 3.688650388147415, "tokens_seen": 894179328 }, { "epoch": 2.06, "learning_rate": 0.00036822467402206625, "loss": 2.771, "theoretical_loss": 3.6886241266930613, "tokens_seen": 894244864 }, { "epoch": 2.06, "learning_rate": 0.00036821464393179537, "loss": 2.9511, "theoretical_loss": 3.6885978677020894, "tokens_seen": 894310400 }, { "epoch": 2.06, "learning_rate": 0.0003682046138415246, "loss": 3.0382, "theoretical_loss": 3.688571611174089, "tokens_seen": 894375936 }, { "epoch": 2.06, "learning_rate": 0.00036819458375125373, "loss": 3.1226, "theoretical_loss": 3.688545357108648, "tokens_seen": 894441472 }, { "epoch": 2.06, "learning_rate": 0.00036818455366098297, "loss": 2.9034, "theoretical_loss": 3.688519105505355, "tokens_seen": 894507008 }, { "epoch": 2.06, "objective/train/docs_used": 1441363, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.813544273376465, "objective/train/theoretical_loss": 3.6885059806268856, "objective/train/tokens_used": 914999776, "theoretical_loss": 3.6885059806268856, "tokens_seen": 894539776 }, { "epoch": 2.06, "learning_rate": 0.00036817452357071215, "loss": 2.9578, "theoretical_loss": 3.688492856363799, "tokens_seen": 894572544 }, { "epoch": 2.06, "learning_rate": 0.00036816449348044133, "loss": 3.018, "theoretical_loss": 3.6884666096835685, "tokens_seen": 894638080 }, { "epoch": 2.06, "learning_rate": 0.0003681544633901705, "loss": 2.8982, "theoretical_loss": 3.6884403654642526, "tokens_seen": 894703616 }, { "epoch": 2.06, "learning_rate": 0.0003681444332998997, "loss": 3.0127, "theoretical_loss": 3.6884141237054404, "tokens_seen": 894769152 }, { "epoch": 2.06, "learning_rate": 0.0003681344032096289, "loss": 2.9449, "theoretical_loss": 3.688387884406721, "tokens_seen": 894834688 }, { "epoch": 2.06, "learning_rate": 0.0003681243731193581, "loss": 2.9986, "theoretical_loss": 3.6883616475676835, "tokens_seen": 894900224 }, { "epoch": 2.06, "learning_rate": 0.00036811434302908724, "loss": 3.0164, "theoretical_loss": 3.688335413187917, "tokens_seen": 894965760 }, { "epoch": 2.06, "learning_rate": 0.0003681043129388165, "loss": 2.9485, "theoretical_loss": 3.6883091812670115, "tokens_seen": 895031296 }, { "epoch": 2.06, "learning_rate": 0.0003680942828485456, "loss": 2.7085, "theoretical_loss": 3.6882829518045566, "tokens_seen": 895096832 }, { "epoch": 2.06, "learning_rate": 0.00036808425275827484, "loss": 2.8525, "theoretical_loss": 3.6882567248001408, "tokens_seen": 895162368 }, { "epoch": 2.06, "learning_rate": 0.000368074222668004, "loss": 2.8376, "theoretical_loss": 3.688230500253355, "tokens_seen": 895227904 }, { "epoch": 2.06, "learning_rate": 0.0003680641925777332, "loss": 2.9396, "theoretical_loss": 3.6882042781637887, "tokens_seen": 895293440 }, { "epoch": 2.06, "learning_rate": 0.0003680541624874624, "loss": 3.0097, "theoretical_loss": 3.6881780585310313, "tokens_seen": 895358976 }, { "epoch": 2.06, "learning_rate": 0.0003680441323971916, "loss": 2.7188, "theoretical_loss": 3.6881518413546734, "tokens_seen": 895424512 }, { "epoch": 2.06, "learning_rate": 0.00036803410230692074, "loss": 2.8806, "theoretical_loss": 3.6881256266343057, "tokens_seen": 895490048 }, { "epoch": 2.06, "learning_rate": 0.00036802407221665, "loss": 2.7957, "theoretical_loss": 3.6880994143695167, "tokens_seen": 895555584 }, { "epoch": 2.06, "learning_rate": 0.0003680140421263791, "loss": 3.0851, "theoretical_loss": 3.688073204559898, "tokens_seen": 895621120 }, { "epoch": 2.06, "learning_rate": 0.00036800401203610834, "loss": 2.892, "theoretical_loss": 3.6880469972050394, "tokens_seen": 895686656 }, { "epoch": 2.06, "learning_rate": 0.0003679939819458375, "loss": 2.885, "theoretical_loss": 3.688020792304532, "tokens_seen": 895752192 }, { "epoch": 2.06, "learning_rate": 0.0003679839518555667, "loss": 2.7738, "theoretical_loss": 3.687994589857966, "tokens_seen": 895817728 }, { "epoch": 2.06, "learning_rate": 0.0003679739217652959, "loss": 2.9693, "theoretical_loss": 3.687968389864932, "tokens_seen": 895883264 }, { "epoch": 2.06, "learning_rate": 0.00036796389167502506, "loss": 2.8055, "theoretical_loss": 3.6879421923250213, "tokens_seen": 895948800 }, { "epoch": 2.06, "learning_rate": 0.00036795386158475424, "loss": 2.8784, "theoretical_loss": 3.687915997237824, "tokens_seen": 896014336 }, { "epoch": 2.06, "learning_rate": 0.0003679438314944835, "loss": 2.7516, "theoretical_loss": 3.687889804602932, "tokens_seen": 896079872 }, { "epoch": 2.06, "learning_rate": 0.0003679338014042126, "loss": 2.9091, "theoretical_loss": 3.6878636144199355, "tokens_seen": 896145408 }, { "epoch": 2.06, "objective/train/docs_used": 1444166, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.187595844268799, "objective/train/theoretical_loss": 3.6878505202477707, "objective/train/tokens_used": 916638176, "theoretical_loss": 3.6878505202477707, "tokens_seen": 896178176 }, { "epoch": 2.06, "learning_rate": 0.00036792377131394184, "loss": 3.0037, "theoretical_loss": 3.687837426688427, "tokens_seen": 896210944 }, { "epoch": 2.06, "learning_rate": 0.00036791374122367097, "loss": 2.823, "theoretical_loss": 3.6878112414079958, "tokens_seen": 896276480 }, { "epoch": 2.06, "learning_rate": 0.0003679037111334002, "loss": 2.7132, "theoretical_loss": 3.6877850585782355, "tokens_seen": 896342016 }, { "epoch": 2.06, "learning_rate": 0.0003678936810431294, "loss": 3.1426, "theoretical_loss": 3.687758878198736, "tokens_seen": 896407552 }, { "epoch": 2.06, "learning_rate": 0.00036788365095285857, "loss": 3.0125, "theoretical_loss": 3.6877327002690894, "tokens_seen": 896473088 }, { "epoch": 2.06, "learning_rate": 0.00036787362086258775, "loss": 2.8931, "theoretical_loss": 3.6877065247888874, "tokens_seen": 896538624 }, { "epoch": 2.06, "learning_rate": 0.000367863590772317, "loss": 2.9293, "theoretical_loss": 3.687680351757722, "tokens_seen": 896604160 }, { "epoch": 2.06, "learning_rate": 0.00036785356068204616, "loss": 3.1325, "theoretical_loss": 3.687654181175185, "tokens_seen": 896669696 }, { "epoch": 2.06, "learning_rate": 0.00036784353059177535, "loss": 2.8668, "theoretical_loss": 3.687628013040868, "tokens_seen": 896735232 }, { "epoch": 2.06, "learning_rate": 0.0003678335005015045, "loss": 2.8513, "theoretical_loss": 3.6876018473543635, "tokens_seen": 896800768 }, { "epoch": 2.06, "learning_rate": 0.0003678234704112337, "loss": 2.9631, "theoretical_loss": 3.687575684115263, "tokens_seen": 896866304 }, { "epoch": 2.06, "learning_rate": 0.00036781344032096294, "loss": 3.0009, "theoretical_loss": 3.6875495233231597, "tokens_seen": 896931840 }, { "epoch": 2.06, "learning_rate": 0.00036780341023069207, "loss": 3.0591, "theoretical_loss": 3.687523364977645, "tokens_seen": 896997376 }, { "epoch": 2.06, "learning_rate": 0.0003677933801404213, "loss": 2.7537, "theoretical_loss": 3.6874972090783125, "tokens_seen": 897062912 }, { "epoch": 2.06, "learning_rate": 0.00036778335005015043, "loss": 3.011, "theoretical_loss": 3.687471055624754, "tokens_seen": 897128448 }, { "epoch": 2.06, "learning_rate": 0.00036777331995987967, "loss": 2.9947, "theoretical_loss": 3.6874449046165623, "tokens_seen": 897193984 }, { "epoch": 2.06, "learning_rate": 0.00036776328986960885, "loss": 2.9118, "theoretical_loss": 3.6874187560533302, "tokens_seen": 897259520 }, { "epoch": 2.06, "learning_rate": 0.00036775325977933803, "loss": 3.0379, "theoretical_loss": 3.6873926099346503, "tokens_seen": 897325056 }, { "epoch": 2.06, "learning_rate": 0.0003677432296890672, "loss": 2.9522, "theoretical_loss": 3.6873664662601158, "tokens_seen": 897390592 }, { "epoch": 2.06, "learning_rate": 0.00036773319959879645, "loss": 2.9292, "theoretical_loss": 3.6873403250293197, "tokens_seen": 897456128 }, { "epoch": 2.06, "learning_rate": 0.00036772316950852557, "loss": 2.9114, "theoretical_loss": 3.6873141862418555, "tokens_seen": 897521664 }, { "epoch": 2.06, "learning_rate": 0.0003677131394182548, "loss": 3.0703, "theoretical_loss": 3.687288049897316, "tokens_seen": 897587200 }, { "epoch": 2.06, "learning_rate": 0.00036770310932798393, "loss": 2.9426, "theoretical_loss": 3.6872619159952946, "tokens_seen": 897652736 }, { "epoch": 2.06, "learning_rate": 0.00036769307923771317, "loss": 2.8942, "theoretical_loss": 3.687235784535385, "tokens_seen": 897718272 }, { "epoch": 2.06, "learning_rate": 0.00036768304914744235, "loss": 2.9175, "theoretical_loss": 3.6872096555171803, "tokens_seen": 897783808 }, { "epoch": 2.06, "objective/train/docs_used": 1447083, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9949562549591064, "objective/train/theoretical_loss": 3.6871965919235903, "objective/train/tokens_used": 918276576, "theoretical_loss": 3.6871965919235903, "tokens_seen": 897816576 }, { "epoch": 2.06, "learning_rate": 0.00036767301905717153, "loss": 2.9953, "theoretical_loss": 3.6871835289402743, "tokens_seen": 897849344 }, { "epoch": 2.06, "learning_rate": 0.0003676629889669007, "loss": 2.7488, "theoretical_loss": 3.6871574048042612, "tokens_seen": 897914880 }, { "epoch": 2.07, "learning_rate": 0.0003676529588766299, "loss": 2.9014, "theoretical_loss": 3.687131283108734, "tokens_seen": 897980416 }, { "epoch": 2.07, "learning_rate": 0.0003676429287863591, "loss": 3.0331, "theoretical_loss": 3.6871051638532872, "tokens_seen": 898045952 }, { "epoch": 2.07, "learning_rate": 0.0003676328986960883, "loss": 2.9142, "theoretical_loss": 3.687079047037515, "tokens_seen": 898111488 }, { "epoch": 2.07, "learning_rate": 0.00036762286860581744, "loss": 2.853, "theoretical_loss": 3.687052932661011, "tokens_seen": 898177024 }, { "epoch": 2.07, "learning_rate": 0.0003676128385155467, "loss": 2.8958, "theoretical_loss": 3.6870268207233696, "tokens_seen": 898242560 }, { "epoch": 2.07, "learning_rate": 0.0003676028084252758, "loss": 2.8854, "theoretical_loss": 3.687000711224185, "tokens_seen": 898308096 }, { "epoch": 2.07, "learning_rate": 0.00036759277833500504, "loss": 3.0161, "theoretical_loss": 3.6869746041630522, "tokens_seen": 898373632 }, { "epoch": 2.07, "learning_rate": 0.0003675827482447342, "loss": 2.8616, "theoretical_loss": 3.6869484995395654, "tokens_seen": 898439168 }, { "epoch": 2.07, "learning_rate": 0.0003675727181544634, "loss": 2.9606, "theoretical_loss": 3.6869223973533183, "tokens_seen": 898504704 }, { "epoch": 2.07, "learning_rate": 0.0003675626880641926, "loss": 3.0016, "theoretical_loss": 3.686896297603907, "tokens_seen": 898570240 }, { "epoch": 2.07, "learning_rate": 0.0003675526579739218, "loss": 2.8504, "theoretical_loss": 3.686870200290925, "tokens_seen": 898635776 }, { "epoch": 2.07, "learning_rate": 0.00036754262788365094, "loss": 2.8516, "theoretical_loss": 3.686844105413969, "tokens_seen": 898701312 }, { "epoch": 2.07, "learning_rate": 0.0003675325977933802, "loss": 3.0082, "theoretical_loss": 3.686818012972632, "tokens_seen": 898766848 }, { "epoch": 2.07, "learning_rate": 0.0003675225677031093, "loss": 3.0034, "theoretical_loss": 3.6867919229665103, "tokens_seen": 898832384 }, { "epoch": 2.07, "learning_rate": 0.00036751253761283854, "loss": 3.0443, "theoretical_loss": 3.6867658353951986, "tokens_seen": 898897920 }, { "epoch": 2.07, "learning_rate": 0.0003675025075225677, "loss": 2.8023, "theoretical_loss": 3.686739750258292, "tokens_seen": 898963456 }, { "epoch": 2.07, "learning_rate": 0.0003674924774322969, "loss": 2.6755, "theoretical_loss": 3.686713667555387, "tokens_seen": 899028992 }, { "epoch": 2.07, "learning_rate": 0.0003674824473420261, "loss": 2.8952, "theoretical_loss": 3.686687587286078, "tokens_seen": 899094528 }, { "epoch": 2.07, "learning_rate": 0.00036747241725175526, "loss": 2.8462, "theoretical_loss": 3.686661509449961, "tokens_seen": 899160064 }, { "epoch": 2.07, "learning_rate": 0.00036746238716148444, "loss": 3.0084, "theoretical_loss": 3.686635434046631, "tokens_seen": 899225600 }, { "epoch": 2.07, "learning_rate": 0.0003674523570712137, "loss": 3.0226, "theoretical_loss": 3.6866093610756847, "tokens_seen": 899291136 }, { "epoch": 2.07, "learning_rate": 0.0003674423269809428, "loss": 2.9458, "theoretical_loss": 3.686583290536718, "tokens_seen": 899356672 }, { "epoch": 2.07, "learning_rate": 0.00036743229689067204, "loss": 2.9865, "theoretical_loss": 3.6865572224293257, "tokens_seen": 899422208 }, { "epoch": 2.07, "objective/train/docs_used": 1449594, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9679436683654785, "objective/train/theoretical_loss": 3.686544189287344, "objective/train/tokens_used": 919914976, "theoretical_loss": 3.686544189287344, "tokens_seen": 899454976 }, { "epoch": 2.07, "learning_rate": 0.00036742226680040117, "loss": 3.0053, "theoretical_loss": 3.686531156753105, "tokens_seen": 899487744 }, { "epoch": 2.07, "learning_rate": 0.0003674122367101304, "loss": 2.9599, "theoretical_loss": 3.6865050935076518, "tokens_seen": 899553280 }, { "epoch": 2.07, "learning_rate": 0.0003674022066198596, "loss": 2.9439, "theoretical_loss": 3.686479032692562, "tokens_seen": 899618816 }, { "epoch": 2.07, "learning_rate": 0.00036739217652958877, "loss": 2.9609, "theoretical_loss": 3.686452974307432, "tokens_seen": 899684352 }, { "epoch": 2.07, "learning_rate": 0.00036738214643931795, "loss": 2.9588, "theoretical_loss": 3.6864269183518585, "tokens_seen": 899749888 }, { "epoch": 2.07, "learning_rate": 0.0003673721163490472, "loss": 3.1194, "theoretical_loss": 3.686400864825438, "tokens_seen": 899815424 }, { "epoch": 2.07, "learning_rate": 0.0003673620862587763, "loss": 3.0064, "theoretical_loss": 3.6863748137277668, "tokens_seen": 899880960 }, { "epoch": 2.07, "learning_rate": 0.00036735205616850555, "loss": 2.9723, "theoretical_loss": 3.6863487650584426, "tokens_seen": 899946496 }, { "epoch": 2.07, "learning_rate": 0.00036734202607823467, "loss": 2.9832, "theoretical_loss": 3.686322718817061, "tokens_seen": 900012032 }, { "epoch": 2.07, "learning_rate": 0.0003673319959879639, "loss": 3.0063, "theoretical_loss": 3.6862966750032196, "tokens_seen": 900077568 }, { "epoch": 2.07, "learning_rate": 0.0003673219658976931, "loss": 2.9106, "theoretical_loss": 3.686270633616515, "tokens_seen": 900143104 }, { "epoch": 2.07, "learning_rate": 0.00036731193580742227, "loss": 2.9958, "theoretical_loss": 3.6862445946565456, "tokens_seen": 900208640 }, { "epoch": 2.07, "learning_rate": 0.00036730190571715145, "loss": 2.7376, "theoretical_loss": 3.6862185581229063, "tokens_seen": 900274176 }, { "epoch": 2.07, "learning_rate": 0.00036729187562688063, "loss": 2.877, "theoretical_loss": 3.6861925240151967, "tokens_seen": 900339712 }, { "epoch": 2.07, "learning_rate": 0.0003672818455366098, "loss": 2.9084, "theoretical_loss": 3.6861664923330126, "tokens_seen": 900405248 }, { "epoch": 2.07, "learning_rate": 0.00036727181544633905, "loss": 2.8932, "theoretical_loss": 3.6861404630759527, "tokens_seen": 900470784 }, { "epoch": 2.07, "learning_rate": 0.0003672617853560682, "loss": 2.9689, "theoretical_loss": 3.686114436243614, "tokens_seen": 900536320 }, { "epoch": 2.07, "learning_rate": 0.0003672517552657974, "loss": 3.0359, "theoretical_loss": 3.686088411835594, "tokens_seen": 900601856 }, { "epoch": 2.07, "learning_rate": 0.00036724172517552654, "loss": 2.9956, "theoretical_loss": 3.686062389851491, "tokens_seen": 900667392 }, { "epoch": 2.07, "learning_rate": 0.00036723169508525577, "loss": 2.9161, "theoretical_loss": 3.686036370290902, "tokens_seen": 900732928 }, { "epoch": 2.07, "learning_rate": 0.00036722166499498495, "loss": 2.9285, "theoretical_loss": 3.686010353153426, "tokens_seen": 900798464 }, { "epoch": 2.07, "learning_rate": 0.00036721163490471414, "loss": 2.8448, "theoretical_loss": 3.685984338438661, "tokens_seen": 900864000 }, { "epoch": 2.07, "learning_rate": 0.0003672016048144433, "loss": 2.7831, "theoretical_loss": 3.6859583261462046, "tokens_seen": 900929536 }, { "epoch": 2.07, "learning_rate": 0.00036719157472417255, "loss": 2.8818, "theoretical_loss": 3.685932316275655, "tokens_seen": 900995072 }, { "epoch": 2.07, "learning_rate": 0.0003671815446339017, "loss": 2.7992, "theoretical_loss": 3.6859063088266115, "tokens_seen": 901060608 }, { "epoch": 2.07, "objective/train/docs_used": 1452400, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.751866579055786, "objective/train/theoretical_loss": 3.6858933060100285, "objective/train/tokens_used": 921553376, "theoretical_loss": 3.6858933060100285, "tokens_seen": 901093376 }, { "epoch": 2.07, "learning_rate": 0.0003671715145436309, "loss": 2.9505, "theoretical_loss": 3.6858803037986716, "tokens_seen": 901126144 }, { "epoch": 2.07, "learning_rate": 0.00036716148445336004, "loss": 2.9287, "theoretical_loss": 3.685854301191434, "tokens_seen": 901191680 }, { "epoch": 2.07, "learning_rate": 0.0003671514543630893, "loss": 2.8064, "theoretical_loss": 3.6858283010044977, "tokens_seen": 901257216 }, { "epoch": 2.07, "learning_rate": 0.00036714142427281846, "loss": 2.9187, "theoretical_loss": 3.6858023032374616, "tokens_seen": 901322752 }, { "epoch": 2.07, "learning_rate": 0.00036713139418254764, "loss": 3.0634, "theoretical_loss": 3.685776307889924, "tokens_seen": 901388288 }, { "epoch": 2.07, "learning_rate": 0.0003671213640922768, "loss": 2.9063, "theoretical_loss": 3.6857503149614845, "tokens_seen": 901453824 }, { "epoch": 2.07, "learning_rate": 0.000367111334002006, "loss": 2.9003, "theoretical_loss": 3.6857243244517415, "tokens_seen": 901519360 }, { "epoch": 2.07, "learning_rate": 0.00036710130391173524, "loss": 2.9255, "theoretical_loss": 3.6856983363602946, "tokens_seen": 901584896 }, { "epoch": 2.07, "learning_rate": 0.0003670912738214644, "loss": 2.7676, "theoretical_loss": 3.685672350686742, "tokens_seen": 901650432 }, { "epoch": 2.07, "learning_rate": 0.0003670812437311936, "loss": 2.9566, "theoretical_loss": 3.6856463674306847, "tokens_seen": 901715968 }, { "epoch": 2.07, "learning_rate": 0.0003670712136409228, "loss": 2.9757, "theoretical_loss": 3.6856203865917205, "tokens_seen": 901781504 }, { "epoch": 2.07, "learning_rate": 0.000367061183550652, "loss": 2.8746, "theoretical_loss": 3.6855944081694503, "tokens_seen": 901847040 }, { "epoch": 2.07, "learning_rate": 0.00036705115346038114, "loss": 2.9321, "theoretical_loss": 3.685568432163473, "tokens_seen": 901912576 }, { "epoch": 2.07, "learning_rate": 0.0003670411233701104, "loss": 2.8973, "theoretical_loss": 3.685542458573388, "tokens_seen": 901978112 }, { "epoch": 2.07, "learning_rate": 0.0003670310932798395, "loss": 3.0295, "theoretical_loss": 3.685516487398796, "tokens_seen": 902043648 }, { "epoch": 2.07, "learning_rate": 0.00036702106318956874, "loss": 2.982, "theoretical_loss": 3.6854905186392957, "tokens_seen": 902109184 }, { "epoch": 2.07, "learning_rate": 0.0003670110330992979, "loss": 2.9582, "theoretical_loss": 3.685464552294488, "tokens_seen": 902174720 }, { "epoch": 2.07, "learning_rate": 0.0003670010030090271, "loss": 2.9573, "theoretical_loss": 3.685438588363973, "tokens_seen": 902240256 }, { "epoch": 2.07, "learning_rate": 0.0003669909729187563, "loss": 2.9444, "theoretical_loss": 3.685412626847351, "tokens_seen": 902305792 }, { "epoch": 2.07, "learning_rate": 0.00036698094282848546, "loss": 2.9394, "theoretical_loss": 3.685386667744221, "tokens_seen": 902371328 }, { "epoch": 2.07, "learning_rate": 0.00036697091273821464, "loss": 2.8858, "theoretical_loss": 3.685360711054185, "tokens_seen": 902436864 }, { "epoch": 2.07, "learning_rate": 0.0003669608826479439, "loss": 2.9558, "theoretical_loss": 3.685334756776842, "tokens_seen": 902502400 }, { "epoch": 2.07, "learning_rate": 0.000366950852557673, "loss": 2.9661, "theoretical_loss": 3.685308804911794, "tokens_seen": 902567936 }, { "epoch": 2.07, "learning_rate": 0.00036694082246740224, "loss": 2.8922, "theoretical_loss": 3.6852828554586408, "tokens_seen": 902633472 }, { "epoch": 2.07, "learning_rate": 0.00036693079237713137, "loss": 3.1437, "theoretical_loss": 3.6852569084169833, "tokens_seen": 902699008 }, { "epoch": 2.07, "objective/train/docs_used": 1455166, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2362396717071533, "objective/train/theoretical_loss": 3.685243935800341, "objective/train/tokens_used": 923191776, "theoretical_loss": 3.685243935800341, "tokens_seen": 902731776 }, { "epoch": 2.07, "learning_rate": 0.0003669207622868606, "loss": 2.9693, "theoretical_loss": 3.6852309637864225, "tokens_seen": 902764544 }, { "epoch": 2.07, "learning_rate": 0.0003669107321965898, "loss": 2.9953, "theoretical_loss": 3.685205021566559, "tokens_seen": 902830080 }, { "epoch": 2.07, "learning_rate": 0.00036690070210631897, "loss": 2.9072, "theoretical_loss": 3.685179081756994, "tokens_seen": 902895616 }, { "epoch": 2.07, "learning_rate": 0.00036689067201604815, "loss": 2.9132, "theoretical_loss": 3.6851531443573293, "tokens_seen": 902961152 }, { "epoch": 2.07, "learning_rate": 0.0003668806419257774, "loss": 2.8964, "theoretical_loss": 3.685127209367165, "tokens_seen": 903026688 }, { "epoch": 2.07, "learning_rate": 0.0003668706118355065, "loss": 2.6622, "theoretical_loss": 3.685101276786103, "tokens_seen": 903092224 }, { "epoch": 2.07, "learning_rate": 0.00036686058174523575, "loss": 3.0839, "theoretical_loss": 3.6850753466137443, "tokens_seen": 903157760 }, { "epoch": 2.07, "learning_rate": 0.00036685055165496487, "loss": 2.8927, "theoretical_loss": 3.685049418849691, "tokens_seen": 903223296 }, { "epoch": 2.07, "learning_rate": 0.0003668405215646941, "loss": 2.9978, "theoretical_loss": 3.685023493493545, "tokens_seen": 903288832 }, { "epoch": 2.07, "learning_rate": 0.0003668304914744233, "loss": 2.831, "theoretical_loss": 3.6849975705449065, "tokens_seen": 903354368 }, { "epoch": 2.07, "learning_rate": 0.00036682046138415247, "loss": 2.8001, "theoretical_loss": 3.684971650003379, "tokens_seen": 903419904 }, { "epoch": 2.07, "learning_rate": 0.00036681043129388165, "loss": 2.9795, "theoretical_loss": 3.684945731868564, "tokens_seen": 903485440 }, { "epoch": 2.07, "learning_rate": 0.00036680040120361083, "loss": 2.9793, "theoretical_loss": 3.6849198161400625, "tokens_seen": 903550976 }, { "epoch": 2.07, "learning_rate": 0.00036679037111334, "loss": 2.9472, "theoretical_loss": 3.6848939028174774, "tokens_seen": 903616512 }, { "epoch": 2.07, "learning_rate": 0.00036678034102306925, "loss": 2.7691, "theoretical_loss": 3.6848679919004104, "tokens_seen": 903682048 }, { "epoch": 2.07, "learning_rate": 0.0003667703109327984, "loss": 3.0297, "theoretical_loss": 3.6848420833884648, "tokens_seen": 903747584 }, { "epoch": 2.07, "learning_rate": 0.0003667602808425276, "loss": 3.0357, "theoretical_loss": 3.684816177281242, "tokens_seen": 903813120 }, { "epoch": 2.07, "learning_rate": 0.00036675025075225674, "loss": 2.9683, "theoretical_loss": 3.684790273578344, "tokens_seen": 903878656 }, { "epoch": 2.07, "learning_rate": 0.000366740220661986, "loss": 2.8675, "theoretical_loss": 3.684764372279375, "tokens_seen": 903944192 }, { "epoch": 2.07, "learning_rate": 0.00036673019057171515, "loss": 2.9851, "theoretical_loss": 3.684738473383936, "tokens_seen": 904009728 }, { "epoch": 2.07, "learning_rate": 0.00036672016048144434, "loss": 2.8118, "theoretical_loss": 3.6847125768916307, "tokens_seen": 904075264 }, { "epoch": 2.07, "learning_rate": 0.0003667101303911735, "loss": 2.9359, "theoretical_loss": 3.684686682802062, "tokens_seen": 904140800 }, { "epoch": 2.07, "learning_rate": 0.00036670010030090275, "loss": 2.8889, "theoretical_loss": 3.6846607911148324, "tokens_seen": 904206336 }, { "epoch": 2.07, "learning_rate": 0.0003666900702106319, "loss": 3.0199, "theoretical_loss": 3.6846349018295443, "tokens_seen": 904271872 }, { "epoch": 2.07, "learning_rate": 0.0003666800401203611, "loss": 2.952, "theoretical_loss": 3.684609014945803, "tokens_seen": 904337408 }, { "epoch": 2.07, "objective/train/docs_used": 1457960, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.265071392059326, "objective/train/theoretical_loss": 3.6845960724043874, "objective/train/tokens_used": 924830176, "theoretical_loss": 3.6845960724043874, "tokens_seen": 904370176 }, { "epoch": 2.07, "learning_rate": 0.00036667001003009024, "loss": 2.9755, "theoretical_loss": 3.684583130463209, "tokens_seen": 904402944 }, { "epoch": 2.07, "learning_rate": 0.0003666599799398195, "loss": 2.8207, "theoretical_loss": 3.6845572483813678, "tokens_seen": 904468480 }, { "epoch": 2.07, "learning_rate": 0.00036664994984954866, "loss": 3.0006, "theoretical_loss": 3.684531368699881, "tokens_seen": 904534016 }, { "epoch": 2.07, "learning_rate": 0.00036663991975927784, "loss": 2.89, "theoretical_loss": 3.6845054914183537, "tokens_seen": 904599552 }, { "epoch": 2.07, "learning_rate": 0.000366629889669007, "loss": 3.0177, "theoretical_loss": 3.6844796165363887, "tokens_seen": 904665088 }, { "epoch": 2.07, "learning_rate": 0.0003666198595787362, "loss": 2.9471, "theoretical_loss": 3.68445374405359, "tokens_seen": 904730624 }, { "epoch": 2.07, "learning_rate": 0.0003666098294884654, "loss": 2.8909, "theoretical_loss": 3.6844278739695606, "tokens_seen": 904796160 }, { "epoch": 2.07, "learning_rate": 0.0003665997993981946, "loss": 3.0656, "theoretical_loss": 3.6844020062839054, "tokens_seen": 904861696 }, { "epoch": 2.07, "learning_rate": 0.00036658976930792374, "loss": 2.7531, "theoretical_loss": 3.6843761409962275, "tokens_seen": 904927232 }, { "epoch": 2.07, "learning_rate": 0.000366579739217653, "loss": 3.1004, "theoretical_loss": 3.6843502781061312, "tokens_seen": 904992768 }, { "epoch": 2.07, "learning_rate": 0.00036656970912738216, "loss": 3.0257, "theoretical_loss": 3.6843244176132215, "tokens_seen": 905058304 }, { "epoch": 2.07, "learning_rate": 0.00036655967903711134, "loss": 2.988, "theoretical_loss": 3.684298559517102, "tokens_seen": 905123840 }, { "epoch": 2.07, "learning_rate": 0.0003665496489468405, "loss": 3.0009, "theoretical_loss": 3.6842727038173764, "tokens_seen": 905189376 }, { "epoch": 2.07, "learning_rate": 0.0003665396188565697, "loss": 2.9325, "theoretical_loss": 3.68424685051365, "tokens_seen": 905254912 }, { "epoch": 2.07, "learning_rate": 0.0003665295887662989, "loss": 2.9073, "theoretical_loss": 3.684220999605527, "tokens_seen": 905320448 }, { "epoch": 2.07, "learning_rate": 0.0003665195586760281, "loss": 2.9651, "theoretical_loss": 3.684195151092612, "tokens_seen": 905385984 }, { "epoch": 2.07, "learning_rate": 0.00036650952858575725, "loss": 2.9708, "theoretical_loss": 3.6841693049745103, "tokens_seen": 905451520 }, { "epoch": 2.07, "learning_rate": 0.0003664994984954865, "loss": 2.686, "theoretical_loss": 3.684143461250826, "tokens_seen": 905517056 }, { "epoch": 2.07, "learning_rate": 0.0003664894684052156, "loss": 2.8964, "theoretical_loss": 3.6841176199211643, "tokens_seen": 905582592 }, { "epoch": 2.07, "learning_rate": 0.00036647943831494484, "loss": 3.022, "theoretical_loss": 3.6840917809851303, "tokens_seen": 905648128 }, { "epoch": 2.07, "learning_rate": 0.000366469408224674, "loss": 2.9405, "theoretical_loss": 3.6840659444423287, "tokens_seen": 905713664 }, { "epoch": 2.07, "learning_rate": 0.0003664593781344032, "loss": 3.0481, "theoretical_loss": 3.684040110292365, "tokens_seen": 905779200 }, { "epoch": 2.07, "learning_rate": 0.0003664493480441324, "loss": 2.9393, "theoretical_loss": 3.6840142785348444, "tokens_seen": 905844736 }, { "epoch": 2.07, "learning_rate": 0.00036643931795386157, "loss": 2.9642, "theoretical_loss": 3.6839884491693726, "tokens_seen": 905910272 }, { "epoch": 2.07, "learning_rate": 0.00036642928786359075, "loss": 2.9978, "theoretical_loss": 3.683962622195555, "tokens_seen": 905975808 }, { "epoch": 2.07, "objective/train/docs_used": 1459482, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.085421562194824, "objective/train/theoretical_loss": 3.6839497096053924, "objective/train/tokens_used": 926468576, "theoretical_loss": 3.6839497096053924, "tokens_seen": 906008576 }, { "epoch": 2.07, "learning_rate": 0.00036641925777332, "loss": 2.9277, "theoretical_loss": 3.683936797612996, "tokens_seen": 906041344 }, { "epoch": 2.07, "learning_rate": 0.0003664092276830491, "loss": 2.9026, "theoretical_loss": 3.683910975421303, "tokens_seen": 906106880 }, { "epoch": 2.07, "learning_rate": 0.00036639919759277835, "loss": 2.7583, "theoretical_loss": 3.6838851556200805, "tokens_seen": 906172416 }, { "epoch": 2.07, "learning_rate": 0.00036638916750250753, "loss": 2.9002, "theoretical_loss": 3.683859338208935, "tokens_seen": 906237952 }, { "epoch": 2.07, "learning_rate": 0.0003663791374122367, "loss": 2.9641, "theoretical_loss": 3.683833523187472, "tokens_seen": 906303488 }, { "epoch": 2.07, "learning_rate": 0.0003663691073219659, "loss": 2.9738, "theoretical_loss": 3.683807710555298, "tokens_seen": 906369024 }, { "epoch": 2.07, "learning_rate": 0.00036635907723169507, "loss": 2.9451, "theoretical_loss": 3.683781900312019, "tokens_seen": 906434560 }, { "epoch": 2.07, "learning_rate": 0.0003663490471414243, "loss": 2.9077, "theoretical_loss": 3.683756092457241, "tokens_seen": 906500096 }, { "epoch": 2.07, "learning_rate": 0.0003663390170511535, "loss": 2.8867, "theoretical_loss": 3.6837302869905706, "tokens_seen": 906565632 }, { "epoch": 2.07, "learning_rate": 0.00036632898696088267, "loss": 2.8571, "theoretical_loss": 3.6837044839116135, "tokens_seen": 906631168 }, { "epoch": 2.07, "learning_rate": 0.00036631895687061185, "loss": 2.9032, "theoretical_loss": 3.683678683219977, "tokens_seen": 906696704 }, { "epoch": 2.07, "learning_rate": 0.00036630892678034103, "loss": 3.0164, "theoretical_loss": 3.6836528849152677, "tokens_seen": 906762240 }, { "epoch": 2.07, "learning_rate": 0.0003662988966900702, "loss": 2.7719, "theoretical_loss": 3.683627088997092, "tokens_seen": 906827776 }, { "epoch": 2.07, "learning_rate": 0.00036628886659979945, "loss": 2.9402, "theoretical_loss": 3.6836012954650563, "tokens_seen": 906893312 }, { "epoch": 2.07, "learning_rate": 0.0003662788365095286, "loss": 2.9619, "theoretical_loss": 3.683575504318768, "tokens_seen": 906958848 }, { "epoch": 2.07, "learning_rate": 0.0003662688064192578, "loss": 2.9401, "theoretical_loss": 3.6835497155578336, "tokens_seen": 907024384 }, { "epoch": 2.07, "learning_rate": 0.00036625877632898694, "loss": 2.9502, "theoretical_loss": 3.6835239291818613, "tokens_seen": 907089920 }, { "epoch": 2.07, "learning_rate": 0.0003662487462387162, "loss": 2.8182, "theoretical_loss": 3.6834981451904563, "tokens_seen": 907155456 }, { "epoch": 2.07, "learning_rate": 0.00036623871614844535, "loss": 2.6658, "theoretical_loss": 3.6834723635832276, "tokens_seen": 907220992 }, { "epoch": 2.07, "learning_rate": 0.00036622868605817454, "loss": 2.9706, "theoretical_loss": 3.6834465843597823, "tokens_seen": 907286528 }, { "epoch": 2.07, "learning_rate": 0.0003662186559679037, "loss": 2.9123, "theoretical_loss": 3.683420807519727, "tokens_seen": 907352064 }, { "epoch": 2.07, "learning_rate": 0.00036620862587763295, "loss": 3.0148, "theoretical_loss": 3.68339503306267, "tokens_seen": 907417600 }, { "epoch": 2.07, "learning_rate": 0.0003661985957873621, "loss": 2.8923, "theoretical_loss": 3.683369260988218, "tokens_seen": 907483136 }, { "epoch": 2.07, "learning_rate": 0.0003661885656970913, "loss": 2.805, "theoretical_loss": 3.6833434912959797, "tokens_seen": 907548672 }, { "epoch": 2.07, "learning_rate": 0.00036617853560682044, "loss": 3.0124, "theoretical_loss": 3.683317723985562, "tokens_seen": 907614208 }, { "epoch": 2.07, "objective/train/docs_used": 1461872, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9048843383789062, "objective/train/theoretical_loss": 3.6833048412234137, "objective/train/tokens_used": 928106976, "theoretical_loss": 3.6833048412234137, "tokens_seen": 907646976 }, { "epoch": 2.07, "learning_rate": 0.0003661685055165497, "loss": 2.7609, "theoretical_loss": 3.683291959056574, "tokens_seen": 907679744 }, { "epoch": 2.07, "learning_rate": 0.00036615847542627886, "loss": 2.7234, "theoretical_loss": 3.6832661965086224, "tokens_seen": 907745280 }, { "epoch": 2.07, "learning_rate": 0.00036614844533600804, "loss": 2.8331, "theoretical_loss": 3.683240436341316, "tokens_seen": 907810816 }, { "epoch": 2.07, "learning_rate": 0.0003661384152457372, "loss": 3.0043, "theoretical_loss": 3.683214678554263, "tokens_seen": 907876352 }, { "epoch": 2.07, "learning_rate": 0.0003661283851554664, "loss": 2.9319, "theoretical_loss": 3.683188923147071, "tokens_seen": 907941888 }, { "epoch": 2.07, "learning_rate": 0.0003661183550651956, "loss": 2.9774, "theoretical_loss": 3.6831631701193492, "tokens_seen": 908007424 }, { "epoch": 2.07, "learning_rate": 0.0003661083249749248, "loss": 2.9262, "theoretical_loss": 3.683137419470706, "tokens_seen": 908072960 }, { "epoch": 2.07, "learning_rate": 0.00036609829488465394, "loss": 2.9271, "theoretical_loss": 3.6831116712007494, "tokens_seen": 908138496 }, { "epoch": 2.07, "learning_rate": 0.0003660882647943832, "loss": 2.8807, "theoretical_loss": 3.683085925309088, "tokens_seen": 908204032 }, { "epoch": 2.07, "learning_rate": 0.00036607823470411236, "loss": 2.934, "theoretical_loss": 3.683060181795331, "tokens_seen": 908269568 }, { "epoch": 2.07, "learning_rate": 0.00036606820461384154, "loss": 2.8823, "theoretical_loss": 3.683034440659087, "tokens_seen": 908335104 }, { "epoch": 2.07, "learning_rate": 0.0003660581745235707, "loss": 3.0721, "theoretical_loss": 3.6830087018999653, "tokens_seen": 908400640 }, { "epoch": 2.07, "learning_rate": 0.0003660481444332999, "loss": 2.9089, "theoretical_loss": 3.6829829655175743, "tokens_seen": 908466176 }, { "epoch": 2.07, "learning_rate": 0.0003660381143430291, "loss": 2.7568, "theoretical_loss": 3.6829572315115238, "tokens_seen": 908531712 }, { "epoch": 2.07, "learning_rate": 0.0003660280842527583, "loss": 3.0296, "theoretical_loss": 3.6829314998814215, "tokens_seen": 908597248 }, { "epoch": 2.07, "learning_rate": 0.00036601805416248745, "loss": 2.8307, "theoretical_loss": 3.6829057706268786, "tokens_seen": 908662784 }, { "epoch": 2.07, "learning_rate": 0.0003660080240722167, "loss": 2.8787, "theoretical_loss": 3.6828800437475033, "tokens_seen": 908728320 }, { "epoch": 2.07, "learning_rate": 0.0003659979939819458, "loss": 2.9948, "theoretical_loss": 3.6828543192429057, "tokens_seen": 908793856 }, { "epoch": 2.07, "learning_rate": 0.00036598796389167504, "loss": 2.97, "theoretical_loss": 3.6828285971126946, "tokens_seen": 908859392 }, { "epoch": 2.07, "learning_rate": 0.0003659779338014042, "loss": 3.0069, "theoretical_loss": 3.6828028773564805, "tokens_seen": 908924928 }, { "epoch": 2.07, "learning_rate": 0.0003659679037111334, "loss": 3.0085, "theoretical_loss": 3.682777159973872, "tokens_seen": 908990464 }, { "epoch": 2.07, "learning_rate": 0.0003659578736208626, "loss": 3.1465, "theoretical_loss": 3.6827514449644805, "tokens_seen": 909056000 }, { "epoch": 2.07, "learning_rate": 0.00036594784353059177, "loss": 2.9862, "theoretical_loss": 3.682725732327915, "tokens_seen": 909121536 }, { "epoch": 2.07, "learning_rate": 0.00036593781344032095, "loss": 2.7593, "theoretical_loss": 3.682700022063785, "tokens_seen": 909187072 }, { "epoch": 2.07, "learning_rate": 0.0003659277833500502, "loss": 2.8152, "theoretical_loss": 3.6826743141717015, "tokens_seen": 909252608 }, { "epoch": 2.07, "objective/train/docs_used": 1464834, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.920927047729492, "objective/train/theoretical_loss": 3.682661461115056, "objective/train/tokens_used": 929745376, "theoretical_loss": 3.682661461115056, "tokens_seen": 909285376 }, { "epoch": 2.07, "learning_rate": 0.0003659177532597793, "loss": 2.921, "theoretical_loss": 3.6826486086512746, "tokens_seen": 909318144 }, { "epoch": 2.07, "learning_rate": 0.00036590772316950855, "loss": 2.8906, "theoretical_loss": 3.6826229055021145, "tokens_seen": 909383680 }, { "epoch": 2.07, "learning_rate": 0.00036589769307923773, "loss": 2.9188, "theoretical_loss": 3.682597204723832, "tokens_seen": 909449216 }, { "epoch": 2.07, "learning_rate": 0.0003658876629889669, "loss": 2.837, "theoretical_loss": 3.6825715063160365, "tokens_seen": 909514752 }, { "epoch": 2.07, "learning_rate": 0.0003658776328986961, "loss": 2.974, "theoretical_loss": 3.6825458102783397, "tokens_seen": 909580288 }, { "epoch": 2.07, "learning_rate": 0.00036586760280842527, "loss": 2.9067, "theoretical_loss": 3.682520116610351, "tokens_seen": 909645824 }, { "epoch": 2.07, "learning_rate": 0.00036585757271815445, "loss": 2.9438, "theoretical_loss": 3.6824944253116825, "tokens_seen": 909711360 }, { "epoch": 2.07, "learning_rate": 0.0003658475426278837, "loss": 2.8538, "theoretical_loss": 3.6824687363819444, "tokens_seen": 909776896 }, { "epoch": 2.07, "learning_rate": 0.0003658375125376128, "loss": 2.8477, "theoretical_loss": 3.6824430498207477, "tokens_seen": 909842432 }, { "epoch": 2.07, "learning_rate": 0.00036582748244734205, "loss": 2.9301, "theoretical_loss": 3.6824173656277033, "tokens_seen": 909907968 }, { "epoch": 2.07, "learning_rate": 0.0003658174523570712, "loss": 2.8092, "theoretical_loss": 3.6823916838024227, "tokens_seen": 909973504 }, { "epoch": 2.07, "learning_rate": 0.0003658074222668004, "loss": 2.906, "theoretical_loss": 3.6823660043445168, "tokens_seen": 910039040 }, { "epoch": 2.07, "learning_rate": 0.0003657973921765296, "loss": 2.9263, "theoretical_loss": 3.6823403272535975, "tokens_seen": 910104576 }, { "epoch": 2.07, "learning_rate": 0.0003657873620862588, "loss": 2.7563, "theoretical_loss": 3.682314652529275, "tokens_seen": 910170112 }, { "epoch": 2.07, "learning_rate": 0.00036577733199598796, "loss": 2.8328, "theoretical_loss": 3.682288980171162, "tokens_seen": 910235648 }, { "epoch": 2.07, "learning_rate": 0.00036576730190571714, "loss": 3.0186, "theoretical_loss": 3.6822633101788695, "tokens_seen": 910301184 }, { "epoch": 2.07, "learning_rate": 0.0003657572718154463, "loss": 2.9158, "theoretical_loss": 3.6822376425520096, "tokens_seen": 910366720 }, { "epoch": 2.07, "learning_rate": 0.00036574724172517555, "loss": 2.905, "theoretical_loss": 3.6822119772901933, "tokens_seen": 910432256 }, { "epoch": 2.07, "learning_rate": 0.0003657372116349047, "loss": 2.8254, "theoretical_loss": 3.682186314393033, "tokens_seen": 910497792 }, { "epoch": 2.07, "learning_rate": 0.0003657271815446339, "loss": 2.9706, "theoretical_loss": 3.682160653860141, "tokens_seen": 910563328 }, { "epoch": 2.07, "learning_rate": 0.0003657171514543631, "loss": 2.9755, "theoretical_loss": 3.6821349956911282, "tokens_seen": 910628864 }, { "epoch": 2.07, "learning_rate": 0.0003657071213640923, "loss": 2.9099, "theoretical_loss": 3.6821093398856077, "tokens_seen": 910694400 }, { "epoch": 2.07, "learning_rate": 0.00036569709127382146, "loss": 2.9888, "theoretical_loss": 3.6820836864431916, "tokens_seen": 910759936 }, { "epoch": 2.07, "learning_rate": 0.00036568706118355064, "loss": 2.8246, "theoretical_loss": 3.682058035363492, "tokens_seen": 910825472 }, { "epoch": 2.07, "learning_rate": 0.0003656770310932798, "loss": 2.9491, "theoretical_loss": 3.6820323866461218, "tokens_seen": 910891008 }, { "epoch": 2.07, "objective/train/docs_used": 1467634, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9951467514038086, "objective/train/theoretical_loss": 3.6820195631731885, "objective/train/tokens_used": 931383776, "theoretical_loss": 3.6820195631731885, "tokens_seen": 910923776 }, { "epoch": 2.07, "learning_rate": 0.00036566700100300906, "loss": 2.7713, "theoretical_loss": 3.6820067402906926, "tokens_seen": 910956544 }, { "epoch": 2.07, "learning_rate": 0.0003656569709127382, "loss": 2.8837, "theoretical_loss": 3.681981096296818, "tokens_seen": 911022080 }, { "epoch": 2.07, "learning_rate": 0.0003656469408224674, "loss": 2.8418, "theoretical_loss": 3.68195545466411, "tokens_seen": 911087616 }, { "epoch": 2.07, "learning_rate": 0.00036563691073219655, "loss": 2.8436, "theoretical_loss": 3.6819298153921816, "tokens_seen": 911153152 }, { "epoch": 2.07, "learning_rate": 0.0003656268806419258, "loss": 2.9043, "theoretical_loss": 3.6819041784806457, "tokens_seen": 911218688 }, { "epoch": 2.07, "learning_rate": 0.00036561685055165496, "loss": 2.81, "theoretical_loss": 3.681878543929115, "tokens_seen": 911284224 }, { "epoch": 2.07, "learning_rate": 0.00036560682046138414, "loss": 3.0725, "theoretical_loss": 3.6818529117372023, "tokens_seen": 911349760 }, { "epoch": 2.07, "learning_rate": 0.0003655967903711134, "loss": 2.7597, "theoretical_loss": 3.681827281904522, "tokens_seen": 911415296 }, { "epoch": 2.07, "learning_rate": 0.00036558676028084256, "loss": 2.8311, "theoretical_loss": 3.6818016544306866, "tokens_seen": 911480832 }, { "epoch": 2.07, "learning_rate": 0.00036557673019057174, "loss": 3.0047, "theoretical_loss": 3.6817760293153086, "tokens_seen": 911546368 }, { "epoch": 2.07, "learning_rate": 0.0003655667001003009, "loss": 2.9307, "theoretical_loss": 3.6817504065580025, "tokens_seen": 911611904 }, { "epoch": 2.07, "learning_rate": 0.0003655566700100301, "loss": 2.8667, "theoretical_loss": 3.681724786158382, "tokens_seen": 911677440 }, { "epoch": 2.07, "learning_rate": 0.0003655466399197593, "loss": 2.9051, "theoretical_loss": 3.68169916811606, "tokens_seen": 911742976 }, { "epoch": 2.07, "learning_rate": 0.0003655366098294885, "loss": 2.9179, "theoretical_loss": 3.6816735524306496, "tokens_seen": 911808512 }, { "epoch": 2.07, "learning_rate": 0.00036552657973921765, "loss": 2.9527, "theoretical_loss": 3.681647939101766, "tokens_seen": 911874048 }, { "epoch": 2.07, "learning_rate": 0.0003655165496489469, "loss": 2.8972, "theoretical_loss": 3.681622328129022, "tokens_seen": 911939584 }, { "epoch": 2.07, "learning_rate": 0.000365506519558676, "loss": 2.8071, "theoretical_loss": 3.6815967195120325, "tokens_seen": 912005120 }, { "epoch": 2.07, "learning_rate": 0.00036549648946840525, "loss": 2.9906, "theoretical_loss": 3.6815711132504108, "tokens_seen": 912070656 }, { "epoch": 2.07, "learning_rate": 0.0003654864593781344, "loss": 2.9704, "theoretical_loss": 3.681545509343771, "tokens_seen": 912136192 }, { "epoch": 2.07, "learning_rate": 0.0003654764292878636, "loss": 2.9512, "theoretical_loss": 3.6815199077917278, "tokens_seen": 912201728 }, { "epoch": 2.07, "learning_rate": 0.0003654663991975928, "loss": 2.9879, "theoretical_loss": 3.6814943085938956, "tokens_seen": 912267264 }, { "epoch": 2.07, "learning_rate": 0.00036545636910732197, "loss": 3.0115, "theoretical_loss": 3.6814687117498885, "tokens_seen": 912332800 }, { "epoch": 2.07, "learning_rate": 0.00036544633901705115, "loss": 2.8963, "theoretical_loss": 3.6814431172593207, "tokens_seen": 912398336 }, { "epoch": 2.07, "learning_rate": 0.0003654363089267804, "loss": 2.896, "theoretical_loss": 3.6814175251218075, "tokens_seen": 912463872 }, { "epoch": 2.07, "learning_rate": 0.0003654262788365095, "loss": 2.899, "theoretical_loss": 3.681391935336963, "tokens_seen": 912529408 }, { "epoch": 2.07, "objective/train/docs_used": 1470449, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.960554599761963, "objective/train/theoretical_loss": 3.6813791413266714, "objective/train/tokens_used": 933022176, "theoretical_loss": 3.6813791413266714, "tokens_seen": 912562176 }, { "epoch": 2.07, "learning_rate": 0.00036541624874623875, "loss": 2.8607, "theoretical_loss": 3.6813663479044028, "tokens_seen": 912594944 }, { "epoch": 2.07, "learning_rate": 0.00036540621865596793, "loss": 2.9438, "theoretical_loss": 3.6813407628237407, "tokens_seen": 912660480 }, { "epoch": 2.07, "learning_rate": 0.0003653961885656971, "loss": 2.9142, "theoretical_loss": 3.6813151800945922, "tokens_seen": 912726016 }, { "epoch": 2.07, "learning_rate": 0.0003653861584754263, "loss": 2.6748, "theoretical_loss": 3.681289599716572, "tokens_seen": 912791552 }, { "epoch": 2.07, "learning_rate": 0.00036537612838515547, "loss": 2.8708, "theoretical_loss": 3.6812640216892962, "tokens_seen": 912857088 }, { "epoch": 2.07, "learning_rate": 0.00036536609829488465, "loss": 2.787, "theoretical_loss": 3.6812384460123786, "tokens_seen": 912922624 }, { "epoch": 2.07, "learning_rate": 0.0003653560682046139, "loss": 2.7915, "theoretical_loss": 3.6812128726854363, "tokens_seen": 912988160 }, { "epoch": 2.07, "learning_rate": 0.000365346038114343, "loss": 3.0535, "theoretical_loss": 3.681187301708083, "tokens_seen": 913053696 }, { "epoch": 2.07, "learning_rate": 0.00036533600802407225, "loss": 3.0415, "theoretical_loss": 3.681161733079935, "tokens_seen": 913119232 }, { "epoch": 2.07, "learning_rate": 0.0003653259779338014, "loss": 2.9508, "theoretical_loss": 3.681136166800608, "tokens_seen": 913184768 }, { "epoch": 2.07, "learning_rate": 0.0003653159478435306, "loss": 3.0148, "theoretical_loss": 3.681110602869717, "tokens_seen": 913250304 }, { "epoch": 2.07, "learning_rate": 0.0003653059177532598, "loss": 2.9231, "theoretical_loss": 3.681085041286879, "tokens_seen": 913315840 }, { "epoch": 2.07, "learning_rate": 0.000365295887662989, "loss": 2.6805, "theoretical_loss": 3.681059482051709, "tokens_seen": 913381376 }, { "epoch": 2.07, "learning_rate": 0.00036528585757271816, "loss": 2.9824, "theoretical_loss": 3.681033925163823, "tokens_seen": 913446912 }, { "epoch": 2.07, "learning_rate": 0.00036527582748244734, "loss": 3.0789, "theoretical_loss": 3.6810083706228376, "tokens_seen": 913512448 }, { "epoch": 2.07, "learning_rate": 0.0003652657973921765, "loss": 2.7454, "theoretical_loss": 3.680982818428368, "tokens_seen": 913577984 }, { "epoch": 2.07, "learning_rate": 0.00036525576730190575, "loss": 2.9258, "theoretical_loss": 3.680957268580031, "tokens_seen": 913643520 }, { "epoch": 2.07, "learning_rate": 0.0003652457372116349, "loss": 3.0787, "theoretical_loss": 3.680931721077443, "tokens_seen": 913709056 }, { "epoch": 2.07, "learning_rate": 0.0003652357071213641, "loss": 2.8321, "theoretical_loss": 3.68090617592022, "tokens_seen": 913774592 }, { "epoch": 2.07, "learning_rate": 0.0003652256770310933, "loss": 2.7825, "theoretical_loss": 3.6808806331079786, "tokens_seen": 913840128 }, { "epoch": 2.07, "learning_rate": 0.0003652156469408225, "loss": 2.9344, "theoretical_loss": 3.6808550926403365, "tokens_seen": 913905664 }, { "epoch": 2.07, "learning_rate": 0.00036520561685055166, "loss": 2.8599, "theoretical_loss": 3.6808295545169085, "tokens_seen": 913971200 }, { "epoch": 2.07, "learning_rate": 0.00036519558676028084, "loss": 2.9951, "theoretical_loss": 3.6808040187373123, "tokens_seen": 914036736 }, { "epoch": 2.07, "learning_rate": 0.00036518555667001, "loss": 2.7779, "theoretical_loss": 3.6807784853011656, "tokens_seen": 914102272 }, { "epoch": 2.07, "learning_rate": 0.00036517552657973926, "loss": 2.9882, "theoretical_loss": 3.6807529542080837, "tokens_seen": 914167808 }, { "epoch": 2.07, "objective/train/docs_used": 1473205, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5724422931671143, "objective/train/theoretical_loss": 3.6807401895400726, "objective/train/tokens_used": 934660576, "theoretical_loss": 3.6807401895400726, "tokens_seen": 914200576 }, { "epoch": 2.07, "learning_rate": 0.0003651654964894684, "loss": 2.8025, "theoretical_loss": 3.6807274254576843, "tokens_seen": 914233344 }, { "epoch": 2.07, "learning_rate": 0.0003651554663991976, "loss": 2.9109, "theoretical_loss": 3.680701899049585, "tokens_seen": 914298880 }, { "epoch": 2.07, "learning_rate": 0.00036514543630892675, "loss": 2.8611, "theoretical_loss": 3.680676374983403, "tokens_seen": 914364416 }, { "epoch": 2.07, "learning_rate": 0.000365135406218656, "loss": 2.872, "theoretical_loss": 3.680650853258755, "tokens_seen": 914429952 }, { "epoch": 2.07, "learning_rate": 0.00036512537612838516, "loss": 2.8308, "theoretical_loss": 3.680625333875259, "tokens_seen": 914495488 }, { "epoch": 2.07, "learning_rate": 0.00036511534603811434, "loss": 2.7967, "theoretical_loss": 3.680599816832532, "tokens_seen": 914561024 }, { "epoch": 2.07, "learning_rate": 0.0003651053159478435, "loss": 2.8362, "theoretical_loss": 3.680574302130192, "tokens_seen": 914626560 }, { "epoch": 2.07, "learning_rate": 0.00036509528585757276, "loss": 2.7805, "theoretical_loss": 3.680548789767856, "tokens_seen": 914692096 }, { "epoch": 2.07, "learning_rate": 0.0003650852557673019, "loss": 2.904, "theoretical_loss": 3.680523279745142, "tokens_seen": 914757632 }, { "epoch": 2.07, "learning_rate": 0.0003650752256770311, "loss": 2.8707, "theoretical_loss": 3.6804977720616687, "tokens_seen": 914823168 }, { "epoch": 2.07, "learning_rate": 0.00036506519558676025, "loss": 3.1425, "theoretical_loss": 3.680472266717053, "tokens_seen": 914888704 }, { "epoch": 2.07, "learning_rate": 0.0003650551654964895, "loss": 3.0712, "theoretical_loss": 3.6804467637109135, "tokens_seen": 914954240 }, { "epoch": 2.07, "learning_rate": 0.00036504513540621867, "loss": 2.9927, "theoretical_loss": 3.6804212630428683, "tokens_seen": 915019776 }, { "epoch": 2.07, "learning_rate": 0.00036503510531594785, "loss": 2.8292, "theoretical_loss": 3.680395764712535, "tokens_seen": 915085312 }, { "epoch": 2.07, "learning_rate": 0.00036502507522567703, "loss": 2.7881, "theoretical_loss": 3.6803702687195328, "tokens_seen": 915150848 }, { "epoch": 2.07, "learning_rate": 0.0003650150451354062, "loss": 2.9356, "theoretical_loss": 3.6803447750634795, "tokens_seen": 915216384 }, { "epoch": 2.07, "learning_rate": 0.0003650050150451354, "loss": 2.8725, "theoretical_loss": 3.6803192837439935, "tokens_seen": 915281920 }, { "epoch": 2.07, "learning_rate": 0.0003649949849548646, "loss": 2.9671, "theoretical_loss": 3.6802937947606935, "tokens_seen": 915347456 }, { "epoch": 2.07, "learning_rate": 0.00036498495486459375, "loss": 2.8575, "theoretical_loss": 3.680268308113199, "tokens_seen": 915412992 }, { "epoch": 2.07, "learning_rate": 0.000364974924774323, "loss": 2.9806, "theoretical_loss": 3.680242823801127, "tokens_seen": 915478528 }, { "epoch": 2.07, "learning_rate": 0.0003649648946840521, "loss": 2.8687, "theoretical_loss": 3.6802173418240978, "tokens_seen": 915544064 }, { "epoch": 2.07, "learning_rate": 0.00036495486459378135, "loss": 2.9047, "theoretical_loss": 3.68019186218173, "tokens_seen": 915609600 }, { "epoch": 2.07, "learning_rate": 0.00036494483450351053, "loss": 2.8617, "theoretical_loss": 3.680166384873642, "tokens_seen": 915675136 }, { "epoch": 2.07, "learning_rate": 0.0003649348044132397, "loss": 2.9748, "theoretical_loss": 3.6801409098994537, "tokens_seen": 915740672 }, { "epoch": 2.07, "learning_rate": 0.0003649247743229689, "loss": 2.9173, "theoretical_loss": 3.6801154372587837, "tokens_seen": 915806208 }, { "epoch": 2.07, "objective/train/docs_used": 1475506, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8323848247528076, "objective/train/theoretical_loss": 3.680102701813399, "objective/train/tokens_used": 936298976, "theoretical_loss": 3.680102701813399, "tokens_seen": 915838976 }, { "epoch": 2.07, "learning_rate": 0.00036491474423269813, "loss": 3.0126, "theoretical_loss": 3.680089966951252, "tokens_seen": 915871744 }, { "epoch": 2.07, "learning_rate": 0.00036490471414242726, "loss": 2.8132, "theoretical_loss": 3.680064498976477, "tokens_seen": 915937280 }, { "epoch": 2.07, "learning_rate": 0.0003648946840521565, "loss": 2.9003, "theoretical_loss": 3.6800390333340793, "tokens_seen": 916002816 }, { "epoch": 2.07, "learning_rate": 0.0003648846539618856, "loss": 3.0977, "theoretical_loss": 3.680013570023677, "tokens_seen": 916068352 }, { "epoch": 2.07, "learning_rate": 0.00036487462387161485, "loss": 2.8774, "theoretical_loss": 3.679988109044891, "tokens_seen": 916133888 }, { "epoch": 2.07, "learning_rate": 0.0003648645937813441, "loss": 2.9677, "theoretical_loss": 3.6799626503973406, "tokens_seen": 916199424 }, { "epoch": 2.07, "learning_rate": 0.0003648545636910732, "loss": 3.0244, "theoretical_loss": 3.6799371940806456, "tokens_seen": 916264960 }, { "epoch": 2.07, "learning_rate": 0.00036484453360080245, "loss": 3.0049, "theoretical_loss": 3.679911740094426, "tokens_seen": 916330496 }, { "epoch": 2.07, "learning_rate": 0.0003648345035105316, "loss": 3.0064, "theoretical_loss": 3.6798862884383015, "tokens_seen": 916396032 }, { "epoch": 2.07, "learning_rate": 0.0003648244734202608, "loss": 2.78, "theoretical_loss": 3.6798608391118925, "tokens_seen": 916461568 }, { "epoch": 2.07, "learning_rate": 0.00036481444332999, "loss": 2.9492, "theoretical_loss": 3.6798353921148195, "tokens_seen": 916527104 }, { "epoch": 2.07, "learning_rate": 0.0003648044132397192, "loss": 3.026, "theoretical_loss": 3.6798099474467016, "tokens_seen": 916592640 }, { "epoch": 2.07, "learning_rate": 0.00036479438314944836, "loss": 2.8676, "theoretical_loss": 3.6797845051071603, "tokens_seen": 916658176 }, { "epoch": 2.07, "learning_rate": 0.00036478435305917754, "loss": 2.992, "theoretical_loss": 3.6797590650958156, "tokens_seen": 916723712 }, { "epoch": 2.07, "learning_rate": 0.0003647743229689067, "loss": 2.9475, "theoretical_loss": 3.679733627412288, "tokens_seen": 916789248 }, { "epoch": 2.07, "learning_rate": 0.00036476429287863595, "loss": 2.9701, "theoretical_loss": 3.6797081920561983, "tokens_seen": 916854784 }, { "epoch": 2.07, "learning_rate": 0.0003647542627883651, "loss": 2.8971, "theoretical_loss": 3.679682759027167, "tokens_seen": 916920320 }, { "epoch": 2.07, "learning_rate": 0.0003647442326980943, "loss": 2.9823, "theoretical_loss": 3.679657328324815, "tokens_seen": 916985856 }, { "epoch": 2.07, "learning_rate": 0.0003647342026078235, "loss": 2.8898, "theoretical_loss": 3.679631899948763, "tokens_seen": 917051392 }, { "epoch": 2.07, "learning_rate": 0.0003647241725175527, "loss": 2.8776, "theoretical_loss": 3.679606473898632, "tokens_seen": 917116928 }, { "epoch": 2.07, "learning_rate": 0.00036471414242728186, "loss": 3.0015, "theoretical_loss": 3.679581050174044, "tokens_seen": 917182464 }, { "epoch": 2.07, "learning_rate": 0.00036470411233701104, "loss": 2.707, "theoretical_loss": 3.6795556287746187, "tokens_seen": 917248000 }, { "epoch": 2.07, "learning_rate": 0.0003646940822467402, "loss": 2.9949, "theoretical_loss": 3.679530209699978, "tokens_seen": 917313536 }, { "epoch": 2.07, "learning_rate": 0.00036468405215646946, "loss": 2.8159, "theoretical_loss": 3.6795047929497438, "tokens_seen": 917379072 }, { "epoch": 2.07, "learning_rate": 0.0003646740220661986, "loss": 2.8358, "theoretical_loss": 3.6794793785235367, "tokens_seen": 917444608 }, { "epoch": 2.07, "objective/train/docs_used": 1478457, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1988391876220703, "objective/train/theoretical_loss": 3.6794666721818245, "objective/train/tokens_used": 937937376, "theoretical_loss": 3.6794666721818245, "tokens_seen": 917477376 }, { "epoch": 2.07, "learning_rate": 0.0003646639919759278, "loss": 2.9685, "theoretical_loss": 3.6794539664209784, "tokens_seen": 917510144 }, { "epoch": 2.07, "learning_rate": 0.00036465396188565695, "loss": 2.9086, "theoretical_loss": 3.6794285566416907, "tokens_seen": 917575680 }, { "epoch": 2.07, "learning_rate": 0.0003646439317953862, "loss": 2.7936, "theoretical_loss": 3.679403149185295, "tokens_seen": 917641216 }, { "epoch": 2.07, "learning_rate": 0.00036463390170511536, "loss": 2.8732, "theoretical_loss": 3.679377744051413, "tokens_seen": 917706752 }, { "epoch": 2.07, "learning_rate": 0.00036462387161484454, "loss": 2.9492, "theoretical_loss": 3.6793523412396674, "tokens_seen": 917772288 }, { "epoch": 2.07, "learning_rate": 0.0003646138415245737, "loss": 2.9561, "theoretical_loss": 3.6793269407496787, "tokens_seen": 917837824 }, { "epoch": 2.07, "learning_rate": 0.00036460381143430296, "loss": 2.8121, "theoretical_loss": 3.6793015425810704, "tokens_seen": 917903360 }, { "epoch": 2.07, "learning_rate": 0.0003645937813440321, "loss": 2.9202, "theoretical_loss": 3.6792761467334643, "tokens_seen": 917968896 }, { "epoch": 2.07, "learning_rate": 0.0003645837512537613, "loss": 3.0296, "theoretical_loss": 3.6792507532064818, "tokens_seen": 918034432 }, { "epoch": 2.07, "learning_rate": 0.00036457372116349045, "loss": 2.8947, "theoretical_loss": 3.679225361999746, "tokens_seen": 918099968 }, { "epoch": 2.07, "learning_rate": 0.0003645636910732197, "loss": 3.0129, "theoretical_loss": 3.679199973112879, "tokens_seen": 918165504 }, { "epoch": 2.07, "learning_rate": 0.00036455366098294887, "loss": 2.8413, "theoretical_loss": 3.6791745865455026, "tokens_seen": 918231040 }, { "epoch": 2.07, "learning_rate": 0.00036454363089267805, "loss": 2.9194, "theoretical_loss": 3.679149202297241, "tokens_seen": 918296576 }, { "epoch": 2.07, "learning_rate": 0.00036453360080240723, "loss": 2.913, "theoretical_loss": 3.6791238203677157, "tokens_seen": 918362112 }, { "epoch": 2.07, "learning_rate": 0.0003645235707121364, "loss": 2.9886, "theoretical_loss": 3.6790984407565497, "tokens_seen": 918427648 }, { "epoch": 2.07, "learning_rate": 0.0003645135406218656, "loss": 3.0006, "theoretical_loss": 3.679073063463366, "tokens_seen": 918493184 }, { "epoch": 2.07, "learning_rate": 0.0003645035105315948, "loss": 2.7815, "theoretical_loss": 3.6790476884877865, "tokens_seen": 918558720 }, { "epoch": 2.07, "learning_rate": 0.00036449348044132395, "loss": 2.8628, "theoretical_loss": 3.679022315829436, "tokens_seen": 918624256 }, { "epoch": 2.07, "learning_rate": 0.0003644834503510532, "loss": 2.7827, "theoretical_loss": 3.6789969454879365, "tokens_seen": 918689792 }, { "epoch": 2.07, "learning_rate": 0.0003644734202607823, "loss": 2.7243, "theoretical_loss": 3.678971577462911, "tokens_seen": 918755328 }, { "epoch": 2.07, "learning_rate": 0.00036446339017051155, "loss": 2.8466, "theoretical_loss": 3.678946211753983, "tokens_seen": 918820864 }, { "epoch": 2.07, "learning_rate": 0.00036445336008024073, "loss": 2.9191, "theoretical_loss": 3.6789208483607765, "tokens_seen": 918886400 }, { "epoch": 2.07, "learning_rate": 0.0003644433299899699, "loss": 2.7906, "theoretical_loss": 3.6788954872829143, "tokens_seen": 918951936 }, { "epoch": 2.07, "learning_rate": 0.0003644332998996991, "loss": 3.0401, "theoretical_loss": 3.6788701285200203, "tokens_seen": 919017472 }, { "epoch": 2.07, "learning_rate": 0.00036442326980942833, "loss": 2.9598, "theoretical_loss": 3.6788447720717175, "tokens_seen": 919083008 }, { "epoch": 2.07, "objective/train/docs_used": 1479799, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.983149766921997, "objective/train/theoretical_loss": 3.6788320947154203, "objective/train/tokens_used": 939575776, "theoretical_loss": 3.6788320947154203, "tokens_seen": 919115776 }, { "epoch": 2.07, "learning_rate": 0.00036441323971915746, "loss": 2.9642, "theoretical_loss": 3.67881941793763, "tokens_seen": 919148544 }, { "epoch": 2.07, "learning_rate": 0.0003644032096288867, "loss": 2.8834, "theoretical_loss": 3.6787940661173817, "tokens_seen": 919214080 }, { "epoch": 2.07, "learning_rate": 0.0003643931795386158, "loss": 2.921, "theoretical_loss": 3.6787687166105965, "tokens_seen": 919279616 }, { "epoch": 2.07, "learning_rate": 0.00036438314944834505, "loss": 2.9807, "theoretical_loss": 3.6787433694168987, "tokens_seen": 919345152 }, { "epoch": 2.07, "learning_rate": 0.00036437311935807424, "loss": 2.9332, "theoretical_loss": 3.6787180245359115, "tokens_seen": 919410688 }, { "epoch": 2.07, "learning_rate": 0.0003643630892678034, "loss": 2.862, "theoretical_loss": 3.6786926819672594, "tokens_seen": 919476224 }, { "epoch": 2.07, "learning_rate": 0.0003643530591775326, "loss": 2.9449, "theoretical_loss": 3.6786673417105673, "tokens_seen": 919541760 }, { "epoch": 2.07, "learning_rate": 0.0003643430290872618, "loss": 2.9299, "theoretical_loss": 3.678642003765459, "tokens_seen": 919607296 }, { "epoch": 2.07, "learning_rate": 0.00036433299899699096, "loss": 2.8494, "theoretical_loss": 3.6786166681315584, "tokens_seen": 919672832 }, { "epoch": 2.07, "learning_rate": 0.0003643229689067202, "loss": 3.0048, "theoretical_loss": 3.6785913348084907, "tokens_seen": 919738368 }, { "epoch": 2.07, "learning_rate": 0.0003643129388164493, "loss": 2.8508, "theoretical_loss": 3.6785660037958805, "tokens_seen": 919803904 }, { "epoch": 2.07, "learning_rate": 0.00036430290872617856, "loss": 2.9431, "theoretical_loss": 3.6785406750933527, "tokens_seen": 919869440 }, { "epoch": 2.07, "learning_rate": 0.0003642928786359077, "loss": 2.8696, "theoretical_loss": 3.6785153487005307, "tokens_seen": 919934976 }, { "epoch": 2.07, "learning_rate": 0.0003642828485456369, "loss": 3.0483, "theoretical_loss": 3.678490024617041, "tokens_seen": 920000512 }, { "epoch": 2.07, "learning_rate": 0.0003642728184553661, "loss": 2.8182, "theoretical_loss": 3.6784647028425077, "tokens_seen": 920066048 }, { "epoch": 2.07, "learning_rate": 0.0003642627883650953, "loss": 2.7898, "theoretical_loss": 3.6784393833765563, "tokens_seen": 920131584 }, { "epoch": 2.07, "learning_rate": 0.00036425275827482446, "loss": 2.7607, "theoretical_loss": 3.678414066218812, "tokens_seen": 920197120 }, { "epoch": 2.07, "learning_rate": 0.0003642427281845537, "loss": 2.9704, "theoretical_loss": 3.6783887513688986, "tokens_seen": 920262656 }, { "epoch": 2.07, "learning_rate": 0.0003642326980942828, "loss": 2.8635, "theoretical_loss": 3.6783634388264432, "tokens_seen": 920328192 }, { "epoch": 2.07, "learning_rate": 0.00036422266800401206, "loss": 2.8994, "theoretical_loss": 3.6783381285910703, "tokens_seen": 920393728 }, { "epoch": 2.07, "learning_rate": 0.0003642126379137412, "loss": 3.0063, "theoretical_loss": 3.6783128206624056, "tokens_seen": 920459264 }, { "epoch": 2.07, "learning_rate": 0.0003642026078234704, "loss": 2.8413, "theoretical_loss": 3.6782875150400742, "tokens_seen": 920524800 }, { "epoch": 2.07, "learning_rate": 0.0003641925777331996, "loss": 2.9173, "theoretical_loss": 3.6782622117237027, "tokens_seen": 920590336 }, { "epoch": 2.07, "learning_rate": 0.0003641825476429288, "loss": 2.9166, "theoretical_loss": 3.678236910712916, "tokens_seen": 920655872 }, { "epoch": 2.07, "learning_rate": 0.00036417251755265797, "loss": 2.8755, "theoretical_loss": 3.6782116120073396, "tokens_seen": 920721408 }, { "epoch": 2.07, "objective/train/docs_used": 1482422, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6098666191101074, "objective/train/theoretical_loss": 3.6781989635188888, "objective/train/tokens_used": 941214176, "theoretical_loss": 3.6781989635188888, "tokens_seen": 920754176 }, { "epoch": 2.07, "learning_rate": 0.00036416248746238715, "loss": 2.8738, "theoretical_loss": 3.6781863156066006, "tokens_seen": 920786944 }, { "epoch": 2.07, "learning_rate": 0.00036415245737211633, "loss": 3.0248, "theoretical_loss": 3.6781610215103244, "tokens_seen": 920852480 }, { "epoch": 2.07, "learning_rate": 0.00036414242728184556, "loss": 2.9055, "theoretical_loss": 3.6781357297181367, "tokens_seen": 920918016 }, { "epoch": 2.07, "learning_rate": 0.0003641323971915747, "loss": 2.997, "theoretical_loss": 3.6781104402296645, "tokens_seen": 920983552 }, { "epoch": 2.07, "learning_rate": 0.0003641223671013039, "loss": 3.0614, "theoretical_loss": 3.6780851530445333, "tokens_seen": 921049088 }, { "epoch": 2.07, "learning_rate": 0.00036411233701103316, "loss": 2.7983, "theoretical_loss": 3.6780598681623697, "tokens_seen": 921114624 }, { "epoch": 2.07, "learning_rate": 0.0003641023069207623, "loss": 2.96, "theoretical_loss": 3.6780345855828003, "tokens_seen": 921180160 }, { "epoch": 2.07, "learning_rate": 0.0003640922768304915, "loss": 2.7424, "theoretical_loss": 3.6780093053054514, "tokens_seen": 921245696 }, { "epoch": 2.07, "learning_rate": 0.00036408224674022065, "loss": 2.9309, "theoretical_loss": 3.67798402732995, "tokens_seen": 921311232 }, { "epoch": 2.07, "learning_rate": 0.0003640722166499499, "loss": 2.8515, "theoretical_loss": 3.677958751655922, "tokens_seen": 921376768 }, { "epoch": 2.07, "learning_rate": 0.00036406218655967907, "loss": 3.0261, "theoretical_loss": 3.677933478282995, "tokens_seen": 921442304 }, { "epoch": 2.07, "learning_rate": 0.00036405215646940825, "loss": 2.9516, "theoretical_loss": 3.6779082072107956, "tokens_seen": 921507840 }, { "epoch": 2.07, "learning_rate": 0.00036404212637913743, "loss": 2.9286, "theoretical_loss": 3.67788293843895, "tokens_seen": 921573376 }, { "epoch": 2.07, "learning_rate": 0.0003640320962888666, "loss": 2.7584, "theoretical_loss": 3.677857671967087, "tokens_seen": 921638912 }, { "epoch": 2.07, "learning_rate": 0.0003640220661985958, "loss": 2.8422, "theoretical_loss": 3.677832407794832, "tokens_seen": 921704448 }, { "epoch": 2.07, "learning_rate": 0.000364012036108325, "loss": 2.9642, "theoretical_loss": 3.677807145921813, "tokens_seen": 921769984 }, { "epoch": 2.07, "learning_rate": 0.00036400200601805415, "loss": 2.8711, "theoretical_loss": 3.677781886347657, "tokens_seen": 921835520 }, { "epoch": 2.07, "learning_rate": 0.0003639919759277834, "loss": 2.8473, "theoretical_loss": 3.677756629071992, "tokens_seen": 921901056 }, { "epoch": 2.07, "learning_rate": 0.0003639819458375125, "loss": 2.9143, "theoretical_loss": 3.677731374094445, "tokens_seen": 921966592 }, { "epoch": 2.07, "learning_rate": 0.00036397191574724175, "loss": 2.8791, "theoretical_loss": 3.6777061214146434, "tokens_seen": 922032128 }, { "epoch": 2.07, "learning_rate": 0.00036396188565697093, "loss": 2.8397, "theoretical_loss": 3.677680871032215, "tokens_seen": 922097664 }, { "epoch": 2.07, "learning_rate": 0.0003639518555667001, "loss": 2.9451, "theoretical_loss": 3.677655622946787, "tokens_seen": 922163200 }, { "epoch": 2.07, "learning_rate": 0.0003639418254764293, "loss": 2.848, "theoretical_loss": 3.677630377157989, "tokens_seen": 922228736 }, { "epoch": 2.07, "learning_rate": 0.00036393179538615853, "loss": 3.0625, "theoretical_loss": 3.6776051336654474, "tokens_seen": 922294272 }, { "epoch": 2.07, "learning_rate": 0.00036392176529588766, "loss": 2.8241, "theoretical_loss": 3.67757989246879, "tokens_seen": 922359808 }, { "epoch": 2.07, "objective/train/docs_used": 1485252, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.905611515045166, "objective/train/theoretical_loss": 3.6775672727313022, "objective/train/tokens_used": 942852576, "theoretical_loss": 3.6775672727313022, "tokens_seen": 922392576 }, { "epoch": 2.07, "learning_rate": 0.0003639117352056169, "loss": 2.7919, "theoretical_loss": 3.677554653567646, "tokens_seen": 922425344 }, { "epoch": 2.07, "learning_rate": 0.000363901705115346, "loss": 2.9881, "theoretical_loss": 3.677529416961643, "tokens_seen": 922490880 }, { "epoch": 2.07, "learning_rate": 0.00036389167502507525, "loss": 2.9549, "theoretical_loss": 3.6775041826504093, "tokens_seen": 922556416 }, { "epoch": 2.07, "learning_rate": 0.00036388164493480444, "loss": 2.8875, "theoretical_loss": 3.6774789506335726, "tokens_seen": 922621952 }, { "epoch": 2.07, "learning_rate": 0.0003638716148445336, "loss": 2.9656, "theoretical_loss": 3.6774537209107625, "tokens_seen": 922687488 }, { "epoch": 2.07, "learning_rate": 0.0003638615847542628, "loss": 3.0058, "theoretical_loss": 3.6774284934816066, "tokens_seen": 922753024 }, { "epoch": 2.07, "learning_rate": 0.000363851554663992, "loss": 2.7907, "theoretical_loss": 3.677403268345734, "tokens_seen": 922818560 }, { "epoch": 2.07, "learning_rate": 0.00036384152457372116, "loss": 2.9933, "theoretical_loss": 3.677378045502773, "tokens_seen": 922884096 }, { "epoch": 2.07, "learning_rate": 0.0003638314944834504, "loss": 2.8649, "theoretical_loss": 3.677352824952353, "tokens_seen": 922949632 }, { "epoch": 2.07, "learning_rate": 0.0003638214643931795, "loss": 2.9023, "theoretical_loss": 3.677327606694102, "tokens_seen": 923015168 }, { "epoch": 2.07, "learning_rate": 0.00036381143430290876, "loss": 2.8118, "theoretical_loss": 3.6773023907276503, "tokens_seen": 923080704 }, { "epoch": 2.07, "learning_rate": 0.0003638014042126379, "loss": 2.947, "theoretical_loss": 3.677277177052625, "tokens_seen": 923146240 }, { "epoch": 2.07, "learning_rate": 0.0003637913741223671, "loss": 2.9311, "theoretical_loss": 3.677251965668657, "tokens_seen": 923211776 }, { "epoch": 2.07, "learning_rate": 0.0003637813440320963, "loss": 2.7118, "theoretical_loss": 3.6772267565753745, "tokens_seen": 923277312 }, { "epoch": 2.07, "learning_rate": 0.0003637713139418255, "loss": 2.7248, "theoretical_loss": 3.6772015497724073, "tokens_seen": 923342848 }, { "epoch": 2.07, "learning_rate": 0.00036376128385155466, "loss": 2.9041, "theoretical_loss": 3.677176345259384, "tokens_seen": 923408384 }, { "epoch": 2.07, "learning_rate": 0.0003637512537612839, "loss": 2.997, "theoretical_loss": 3.6771511430359354, "tokens_seen": 923473920 }, { "epoch": 2.07, "learning_rate": 0.000363741223671013, "loss": 3.0162, "theoretical_loss": 3.6771259431016894, "tokens_seen": 923539456 }, { "epoch": 2.07, "learning_rate": 0.00036373119358074226, "loss": 2.9378, "theoretical_loss": 3.6771007454562774, "tokens_seen": 923604992 }, { "epoch": 2.07, "learning_rate": 0.0003637211634904714, "loss": 2.8624, "theoretical_loss": 3.677075550099328, "tokens_seen": 923670528 }, { "epoch": 2.07, "learning_rate": 0.0003637111334002006, "loss": 3.101, "theoretical_loss": 3.677050357030471, "tokens_seen": 923736064 }, { "epoch": 2.07, "learning_rate": 0.0003637011033099298, "loss": 2.9588, "theoretical_loss": 3.6770251662493365, "tokens_seen": 923801600 }, { "epoch": 2.07, "learning_rate": 0.000363691073219659, "loss": 2.8707, "theoretical_loss": 3.676999977755555, "tokens_seen": 923867136 }, { "epoch": 2.07, "learning_rate": 0.00036368104312938817, "loss": 3.0122, "theoretical_loss": 3.676974791548756, "tokens_seen": 923932672 }, { "epoch": 2.07, "learning_rate": 0.00036367101303911735, "loss": 2.9101, "theoretical_loss": 3.6769496076285693, "tokens_seen": 923998208 }, { "epoch": 2.07, "objective/train/docs_used": 1487983, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.559403896331787, "objective/train/theoretical_loss": 3.6769370165258404, "objective/train/tokens_used": 944490976, "theoretical_loss": 3.6769370165258404, "tokens_seen": 924030976 }, { "epoch": 2.07, "learning_rate": 0.00036366098294884653, "loss": 2.8288, "theoretical_loss": 3.676924425994626, "tokens_seen": 924063744 }, { "epoch": 2.07, "learning_rate": 0.00036365095285857576, "loss": 2.9049, "theoretical_loss": 3.676899246646556, "tokens_seen": 924129280 }, { "epoch": 2.07, "learning_rate": 0.0003636409227683049, "loss": 2.9409, "theoretical_loss": 3.67687406958399, "tokens_seen": 924194816 }, { "epoch": 2.07, "learning_rate": 0.0003636308926780341, "loss": 2.807, "theoretical_loss": 3.676848894806558, "tokens_seen": 924260352 }, { "epoch": 2.07, "learning_rate": 0.00036362086258776325, "loss": 2.9124, "theoretical_loss": 3.676823722313891, "tokens_seen": 924325888 }, { "epoch": 2.07, "learning_rate": 0.0003636108324974925, "loss": 2.9092, "theoretical_loss": 3.6767985521056192, "tokens_seen": 924391424 }, { "epoch": 2.07, "learning_rate": 0.00036360080240722167, "loss": 2.9569, "theoretical_loss": 3.6767733841813737, "tokens_seen": 924456960 }, { "epoch": 2.07, "learning_rate": 0.00036359077231695085, "loss": 2.9446, "theoretical_loss": 3.676748218540786, "tokens_seen": 924522496 }, { "epoch": 2.07, "learning_rate": 0.00036358074222668003, "loss": 2.8383, "theoretical_loss": 3.6767230551834857, "tokens_seen": 924588032 }, { "epoch": 2.07, "learning_rate": 0.00036357071213640927, "loss": 2.9517, "theoretical_loss": 3.676697894109105, "tokens_seen": 924653568 }, { "epoch": 2.07, "learning_rate": 0.0003635606820461384, "loss": 2.8858, "theoretical_loss": 3.6766727353172737, "tokens_seen": 924719104 }, { "epoch": 2.07, "learning_rate": 0.00036355065195586763, "loss": 2.9463, "theoretical_loss": 3.6766475788076245, "tokens_seen": 924784640 }, { "epoch": 2.07, "learning_rate": 0.00036354062186559676, "loss": 2.8633, "theoretical_loss": 3.6766224245797874, "tokens_seen": 924850176 }, { "epoch": 2.07, "learning_rate": 0.000363530591775326, "loss": 2.887, "theoretical_loss": 3.6765972726333946, "tokens_seen": 924915712 }, { "epoch": 2.07, "learning_rate": 0.00036352056168505517, "loss": 2.9766, "theoretical_loss": 3.6765721229680772, "tokens_seen": 924981248 }, { "epoch": 2.07, "learning_rate": 0.00036351053159478435, "loss": 2.8173, "theoretical_loss": 3.6765469755834665, "tokens_seen": 925046784 }, { "epoch": 2.07, "learning_rate": 0.00036350050150451353, "loss": 2.9032, "theoretical_loss": 3.676521830479195, "tokens_seen": 925112320 }, { "epoch": 2.07, "learning_rate": 0.0003634904714142427, "loss": 2.8758, "theoretical_loss": 3.6764966876548932, "tokens_seen": 925177856 }, { "epoch": 2.07, "learning_rate": 0.0003634804413239719, "loss": 2.8255, "theoretical_loss": 3.676471547110194, "tokens_seen": 925243392 }, { "epoch": 2.07, "learning_rate": 0.00036347041123370113, "loss": 2.9067, "theoretical_loss": 3.676446408844728, "tokens_seen": 925308928 }, { "epoch": 2.07, "learning_rate": 0.00036346038114343026, "loss": 2.8437, "theoretical_loss": 3.6764212728581285, "tokens_seen": 925374464 }, { "epoch": 2.07, "learning_rate": 0.0003634503510531595, "loss": 2.9315, "theoretical_loss": 3.676396139150027, "tokens_seen": 925440000 }, { "epoch": 2.07, "learning_rate": 0.0003634403209628886, "loss": 2.931, "theoretical_loss": 3.6763710077200553, "tokens_seen": 925505536 }, { "epoch": 2.07, "learning_rate": 0.00036343029087261786, "loss": 2.7352, "theoretical_loss": 3.6763458785678456, "tokens_seen": 925571072 }, { "epoch": 2.07, "learning_rate": 0.00036342026078234704, "loss": 2.861, "theoretical_loss": 3.676320751693031, "tokens_seen": 925636608 }, { "epoch": 2.07, "objective/train/docs_used": 1490702, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.833678722381592, "objective/train/theoretical_loss": 3.6763081891095317, "objective/train/tokens_used": 946129376, "theoretical_loss": 3.6763081891095317, "tokens_seen": 925669376 }, { "epoch": 2.07, "learning_rate": 0.0003634102306920762, "loss": 3.0328, "theoretical_loss": 3.676295627095243, "tokens_seen": 925702144 }, { "epoch": 2.07, "learning_rate": 0.0003634002006018054, "loss": 2.7127, "theoretical_loss": 3.6762705047741147, "tokens_seen": 925767680 }, { "epoch": 2.07, "learning_rate": 0.00036339017051153464, "loss": 3.0051, "theoretical_loss": 3.6762453847292784, "tokens_seen": 925833216 }, { "epoch": 2.07, "learning_rate": 0.00036338014042126376, "loss": 2.9528, "theoretical_loss": 3.676220266960367, "tokens_seen": 925898752 }, { "epoch": 2.07, "learning_rate": 0.000363370110330993, "loss": 2.9068, "theoretical_loss": 3.6761951514670126, "tokens_seen": 925964288 }, { "epoch": 2.07, "learning_rate": 0.0003633600802407222, "loss": 2.9812, "theoretical_loss": 3.6761700382488485, "tokens_seen": 926029824 }, { "epoch": 2.07, "learning_rate": 0.00036335005015045136, "loss": 2.9321, "theoretical_loss": 3.6761449273055073, "tokens_seen": 926095360 }, { "epoch": 2.07, "learning_rate": 0.0003633400200601806, "loss": 3.086, "theoretical_loss": 3.6761198186366224, "tokens_seen": 926160896 }, { "epoch": 2.07, "learning_rate": 0.0003633299899699097, "loss": 2.8576, "theoretical_loss": 3.6760947122418264, "tokens_seen": 926226432 }, { "epoch": 2.07, "learning_rate": 0.00036331995987963896, "loss": 2.8618, "theoretical_loss": 3.676069608120753, "tokens_seen": 926291968 }, { "epoch": 2.07, "learning_rate": 0.0003633099297893681, "loss": 2.9384, "theoretical_loss": 3.6760445062730347, "tokens_seen": 926357504 }, { "epoch": 2.07, "learning_rate": 0.0003632998996990973, "loss": 2.7644, "theoretical_loss": 3.6760194066983054, "tokens_seen": 926423040 }, { "epoch": 2.07, "learning_rate": 0.0003632898696088265, "loss": 2.9855, "theoretical_loss": 3.6759943093961986, "tokens_seen": 926488576 }, { "epoch": 2.07, "learning_rate": 0.0003632798395185557, "loss": 2.6604, "theoretical_loss": 3.675969214366347, "tokens_seen": 926554112 }, { "epoch": 2.07, "learning_rate": 0.00036326980942828486, "loss": 3.023, "theoretical_loss": 3.6759441216083855, "tokens_seen": 926619648 }, { "epoch": 2.07, "learning_rate": 0.0003632597793380141, "loss": 3.1457, "theoretical_loss": 3.6759190311219467, "tokens_seen": 926685184 }, { "epoch": 2.07, "learning_rate": 0.0003632497492477432, "loss": 3.0266, "theoretical_loss": 3.6758939429066646, "tokens_seen": 926750720 }, { "epoch": 2.07, "learning_rate": 0.00036323971915747246, "loss": 2.8931, "theoretical_loss": 3.6758688569621727, "tokens_seen": 926816256 }, { "epoch": 2.07, "learning_rate": 0.0003632296890672016, "loss": 2.8165, "theoretical_loss": 3.6758437732881055, "tokens_seen": 926881792 }, { "epoch": 2.07, "learning_rate": 0.0003632196589769308, "loss": 2.7825, "theoretical_loss": 3.675818691884097, "tokens_seen": 926947328 }, { "epoch": 2.07, "learning_rate": 0.00036320962888666, "loss": 2.934, "theoretical_loss": 3.675793612749781, "tokens_seen": 927012864 }, { "epoch": 2.07, "learning_rate": 0.0003631995987963892, "loss": 2.8976, "theoretical_loss": 3.6757685358847914, "tokens_seen": 927078400 }, { "epoch": 2.07, "learning_rate": 0.00036318956870611837, "loss": 2.5901, "theoretical_loss": 3.6757434612887634, "tokens_seen": 927143936 }, { "epoch": 2.07, "learning_rate": 0.00036317953861584755, "loss": 2.9372, "theoretical_loss": 3.6757183889613305, "tokens_seen": 927209472 }, { "epoch": 2.07, "learning_rate": 0.00036316950852557673, "loss": 3.0035, "theoretical_loss": 3.675693318902127, "tokens_seen": 927275008 }, { "epoch": 2.07, "objective/train/docs_used": 1493732, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.034885883331299, "objective/train/theoretical_loss": 3.6756807847229975, "objective/train/tokens_used": 947767776, "theoretical_loss": 3.6756807847229975, "tokens_seen": 927307776 }, { "epoch": 2.07, "learning_rate": 0.00036315947843530596, "loss": 2.9152, "theoretical_loss": 3.675668251110788, "tokens_seen": 927340544 }, { "epoch": 2.07, "learning_rate": 0.0003631494483450351, "loss": 2.77, "theoretical_loss": 3.6756431855869485, "tokens_seen": 927406080 }, { "epoch": 2.07, "learning_rate": 0.0003631394182547643, "loss": 2.7346, "theoretical_loss": 3.675618122330242, "tokens_seen": 927471616 }, { "epoch": 2.07, "learning_rate": 0.00036312938816449345, "loss": 2.8873, "theoretical_loss": 3.6755930613403036, "tokens_seen": 927537152 }, { "epoch": 2.07, "learning_rate": 0.0003631193580742227, "loss": 3.0375, "theoretical_loss": 3.6755680026167683, "tokens_seen": 927602688 }, { "epoch": 2.07, "learning_rate": 0.00036310932798395187, "loss": 3.0664, "theoretical_loss": 3.675542946159272, "tokens_seen": 927668224 }, { "epoch": 2.07, "learning_rate": 0.00036309929789368105, "loss": 2.9528, "theoretical_loss": 3.6755178919674476, "tokens_seen": 927733760 }, { "epoch": 2.07, "learning_rate": 0.00036308926780341023, "loss": 2.9474, "theoretical_loss": 3.675492840040932, "tokens_seen": 927799296 }, { "epoch": 2.07, "learning_rate": 0.00036307923771313947, "loss": 2.9468, "theoretical_loss": 3.6754677903793604, "tokens_seen": 927864832 }, { "epoch": 2.07, "learning_rate": 0.0003630692076228686, "loss": 2.785, "theoretical_loss": 3.675442742982367, "tokens_seen": 927930368 }, { "epoch": 2.07, "learning_rate": 0.00036305917753259783, "loss": 2.6869, "theoretical_loss": 3.6754176978495874, "tokens_seen": 927995904 }, { "epoch": 2.07, "learning_rate": 0.00036304914744232696, "loss": 2.8211, "theoretical_loss": 3.6753926549806577, "tokens_seen": 928061440 }, { "epoch": 2.07, "learning_rate": 0.0003630391173520562, "loss": 2.776, "theoretical_loss": 3.675367614375213, "tokens_seen": 928126976 }, { "epoch": 2.07, "learning_rate": 0.00036302908726178537, "loss": 2.9286, "theoretical_loss": 3.6753425760328886, "tokens_seen": 928192512 }, { "epoch": 2.07, "learning_rate": 0.00036301905717151455, "loss": 2.9934, "theoretical_loss": 3.675317539953321, "tokens_seen": 928258048 }, { "epoch": 2.07, "learning_rate": 0.00036300902708124374, "loss": 2.9791, "theoretical_loss": 3.675292506136145, "tokens_seen": 928323584 }, { "epoch": 2.07, "learning_rate": 0.0003629989969909729, "loss": 2.872, "theoretical_loss": 3.6752674745809975, "tokens_seen": 928389120 }, { "epoch": 2.07, "learning_rate": 0.0003629889669007021, "loss": 2.9699, "theoretical_loss": 3.6752424452875134, "tokens_seen": 928454656 }, { "epoch": 2.07, "learning_rate": 0.00036297893681043133, "loss": 2.823, "theoretical_loss": 3.67521741825533, "tokens_seen": 928520192 }, { "epoch": 2.07, "learning_rate": 0.00036296890672016046, "loss": 2.8088, "theoretical_loss": 3.6751923934840818, "tokens_seen": 928585728 }, { "epoch": 2.07, "learning_rate": 0.0003629588766298897, "loss": 2.8466, "theoretical_loss": 3.6751673709734067, "tokens_seen": 928651264 }, { "epoch": 2.07, "learning_rate": 0.0003629488465396188, "loss": 2.8562, "theoretical_loss": 3.6751423507229397, "tokens_seen": 928716800 }, { "epoch": 2.07, "learning_rate": 0.00036293881644934806, "loss": 2.9347, "theoretical_loss": 3.6751173327323174, "tokens_seen": 928782336 }, { "epoch": 2.07, "learning_rate": 0.00036292878635907724, "loss": 2.9578, "theoretical_loss": 3.6750923170011767, "tokens_seen": 928847872 }, { "epoch": 2.07, "learning_rate": 0.0003629187562688064, "loss": 3.0678, "theoretical_loss": 3.675067303529154, "tokens_seen": 928913408 }, { "epoch": 2.07, "objective/train/docs_used": 1495096, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2447121143341064, "objective/train/theoretical_loss": 3.6750547976401986, "objective/train/tokens_used": 949406176, "theoretical_loss": 3.6750547976401986, "tokens_seen": 928946176 }, { "epoch": 2.07, "learning_rate": 0.0003629087261785356, "loss": 2.8979, "theoretical_loss": 3.6750422923158856, "tokens_seen": 928978944 }, { "epoch": 2.07, "learning_rate": 0.00036289869608826484, "loss": 3.0582, "theoretical_loss": 3.675017283361009, "tokens_seen": 929044480 }, { "epoch": 2.07, "learning_rate": 0.00036288866599799396, "loss": 2.9471, "theoretical_loss": 3.6749922766641596, "tokens_seen": 929110016 }, { "epoch": 2.07, "learning_rate": 0.0003628786359077232, "loss": 2.9859, "theoretical_loss": 3.6749672722249755, "tokens_seen": 929175552 }, { "epoch": 2.07, "learning_rate": 0.0003628686058174523, "loss": 3.0822, "theoretical_loss": 3.674942270043094, "tokens_seen": 929241088 }, { "epoch": 2.07, "learning_rate": 0.00036285857572718156, "loss": 2.9929, "theoretical_loss": 3.6749172701181507, "tokens_seen": 929306624 }, { "epoch": 2.07, "learning_rate": 0.00036284854563691074, "loss": 3.0332, "theoretical_loss": 3.6748922724497834, "tokens_seen": 929372160 }, { "epoch": 2.07, "learning_rate": 0.0003628385155466399, "loss": 2.8959, "theoretical_loss": 3.674867277037629, "tokens_seen": 929437696 }, { "epoch": 2.07, "learning_rate": 0.0003628284854563691, "loss": 2.7861, "theoretical_loss": 3.6748422838813255, "tokens_seen": 929503232 }, { "epoch": 2.07, "learning_rate": 0.0003628184553660983, "loss": 2.8301, "theoretical_loss": 3.6748172929805105, "tokens_seen": 929568768 }, { "epoch": 2.07, "learning_rate": 0.00036280842527582747, "loss": 2.8093, "theoretical_loss": 3.6747923043348205, "tokens_seen": 929634304 }, { "epoch": 2.07, "learning_rate": 0.0003627983951855567, "loss": 2.8922, "theoretical_loss": 3.674767317943893, "tokens_seen": 929699840 }, { "epoch": 2.07, "learning_rate": 0.00036278836509528583, "loss": 2.9532, "theoretical_loss": 3.6747423338073664, "tokens_seen": 929765376 }, { "epoch": 2.07, "learning_rate": 0.00036277833500501506, "loss": 2.8056, "theoretical_loss": 3.674717351924878, "tokens_seen": 929830912 }, { "epoch": 2.07, "learning_rate": 0.00036276830491474424, "loss": 2.8616, "theoretical_loss": 3.674692372296066, "tokens_seen": 929896448 }, { "epoch": 2.07, "learning_rate": 0.0003627582748244734, "loss": 2.8233, "theoretical_loss": 3.6746673949205677, "tokens_seen": 929961984 }, { "epoch": 2.07, "learning_rate": 0.0003627482447342026, "loss": 3.0571, "theoretical_loss": 3.6746424197980208, "tokens_seen": 930027520 }, { "epoch": 2.07, "learning_rate": 0.0003627382146439318, "loss": 3.035, "theoretical_loss": 3.6746174469280644, "tokens_seen": 930093056 }, { "epoch": 2.07, "learning_rate": 0.00036272818455366097, "loss": 2.9948, "theoretical_loss": 3.6745924763103357, "tokens_seen": 930158592 }, { "epoch": 2.07, "learning_rate": 0.0003627181544633902, "loss": 2.916, "theoretical_loss": 3.6745675079444737, "tokens_seen": 930224128 }, { "epoch": 2.07, "learning_rate": 0.00036270812437311933, "loss": 3.0427, "theoretical_loss": 3.6745425418301165, "tokens_seen": 930289664 }, { "epoch": 2.07, "learning_rate": 0.00036269809428284857, "loss": 2.888, "theoretical_loss": 3.6745175779669017, "tokens_seen": 930355200 }, { "epoch": 2.07, "learning_rate": 0.0003626880641925777, "loss": 2.7529, "theoretical_loss": 3.6744926163544687, "tokens_seen": 930420736 }, { "epoch": 2.07, "learning_rate": 0.00036267803410230693, "loss": 2.978, "theoretical_loss": 3.6744676569924555, "tokens_seen": 930486272 }, { "epoch": 2.07, "learning_rate": 0.0003626680040120361, "loss": 2.644, "theoretical_loss": 3.674442699880501, "tokens_seen": 930551808 }, { "epoch": 2.07, "objective/train/docs_used": 1497971, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.074779987335205, "objective/train/theoretical_loss": 3.6744302221681826, "objective/train/tokens_used": 951044576, "theoretical_loss": 3.6744302221681826, "tokens_seen": 930584576 }, { "epoch": 2.07, "learning_rate": 0.0003626579739217653, "loss": 2.8272, "theoretical_loss": 3.674417745018244, "tokens_seen": 930617344 }, { "epoch": 2.07, "learning_rate": 0.00036264794383149447, "loss": 2.8926, "theoretical_loss": 3.674392792405323, "tokens_seen": 930682880 }, { "epoch": 2.07, "learning_rate": 0.00036263791374122365, "loss": 2.8956, "theoretical_loss": 3.6743678420413772, "tokens_seen": 930748416 }, { "epoch": 2.07, "learning_rate": 0.00036262788365095283, "loss": 3.0257, "theoretical_loss": 3.674342893926045, "tokens_seen": 930813952 }, { "epoch": 2.07, "learning_rate": 0.00036261785356068207, "loss": 3.015, "theoretical_loss": 3.674317948058966, "tokens_seen": 930879488 }, { "epoch": 2.08, "learning_rate": 0.00036260782347041125, "loss": 2.9407, "theoretical_loss": 3.6742930044397797, "tokens_seen": 930945024 }, { "epoch": 2.08, "learning_rate": 0.00036259779338014043, "loss": 2.8448, "theoretical_loss": 3.6742680630681237, "tokens_seen": 931010560 }, { "epoch": 2.08, "learning_rate": 0.00036258776328986967, "loss": 2.8856, "theoretical_loss": 3.6742431239436395, "tokens_seen": 931076096 }, { "epoch": 2.08, "learning_rate": 0.0003625777331995988, "loss": 3.0729, "theoretical_loss": 3.6742181870659647, "tokens_seen": 931141632 }, { "epoch": 2.08, "learning_rate": 0.00036256770310932803, "loss": 2.8569, "theoretical_loss": 3.6741932524347396, "tokens_seen": 931207168 }, { "epoch": 2.08, "learning_rate": 0.00036255767301905716, "loss": 2.9063, "theoretical_loss": 3.6741683200496036, "tokens_seen": 931272704 }, { "epoch": 2.08, "learning_rate": 0.0003625476429287864, "loss": 2.8566, "theoretical_loss": 3.6741433899101965, "tokens_seen": 931338240 }, { "epoch": 2.08, "learning_rate": 0.0003625376128385156, "loss": 2.9448, "theoretical_loss": 3.6741184620161578, "tokens_seen": 931403776 }, { "epoch": 2.08, "learning_rate": 0.00036252758274824475, "loss": 2.7692, "theoretical_loss": 3.674093536367127, "tokens_seen": 931469312 }, { "epoch": 2.08, "learning_rate": 0.00036251755265797394, "loss": 2.9109, "theoretical_loss": 3.6740686129627447, "tokens_seen": 931534848 }, { "epoch": 2.08, "learning_rate": 0.0003625075225677031, "loss": 2.867, "theoretical_loss": 3.674043691802651, "tokens_seen": 931600384 }, { "epoch": 2.08, "learning_rate": 0.0003624974924774323, "loss": 2.8416, "theoretical_loss": 3.6740187728864844, "tokens_seen": 931665920 }, { "epoch": 2.08, "learning_rate": 0.00036248746238716153, "loss": 2.8971, "theoretical_loss": 3.6739938562138867, "tokens_seen": 931731456 }, { "epoch": 2.08, "learning_rate": 0.00036247743229689066, "loss": 2.8231, "theoretical_loss": 3.673968941784497, "tokens_seen": 931796992 }, { "epoch": 2.08, "learning_rate": 0.0003624674022066199, "loss": 2.7679, "theoretical_loss": 3.6739440295979566, "tokens_seen": 931862528 }, { "epoch": 2.08, "learning_rate": 0.000362457372116349, "loss": 2.6721, "theoretical_loss": 3.673919119653905, "tokens_seen": 931928064 }, { "epoch": 2.08, "learning_rate": 0.00036244734202607826, "loss": 2.8875, "theoretical_loss": 3.673894211951983, "tokens_seen": 931993600 }, { "epoch": 2.08, "learning_rate": 0.00036243731193580744, "loss": 3.1452, "theoretical_loss": 3.6738693064918313, "tokens_seen": 932059136 }, { "epoch": 2.08, "learning_rate": 0.0003624272818455366, "loss": 2.8665, "theoretical_loss": 3.6738444032730904, "tokens_seen": 932124672 }, { "epoch": 2.08, "learning_rate": 0.0003624172517552658, "loss": 2.7186, "theoretical_loss": 3.6738195022954008, "tokens_seen": 932190208 }, { "epoch": 2.08, "objective/train/docs_used": 1500859, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9369304180145264, "objective/train/theoretical_loss": 3.6738070526468385, "objective/train/tokens_used": 952682976, "theoretical_loss": 3.6738070526468385, "tokens_seen": 932222976 }, { "epoch": 2.08, "learning_rate": 0.00036240722166499504, "loss": 2.7594, "theoretical_loss": 3.673794603558404, "tokens_seen": 932255744 }, { "epoch": 2.08, "learning_rate": 0.00036239719157472416, "loss": 2.9877, "theoretical_loss": 3.67376970706174, "tokens_seen": 932321280 }, { "epoch": 2.08, "learning_rate": 0.0003623871614844534, "loss": 2.7318, "theoretical_loss": 3.67374481280505, "tokens_seen": 932386816 }, { "epoch": 2.08, "learning_rate": 0.0003623771313941825, "loss": 2.9536, "theoretical_loss": 3.673719920787976, "tokens_seen": 932452352 }, { "epoch": 2.08, "learning_rate": 0.00036236710130391176, "loss": 2.7592, "theoretical_loss": 3.6736950310101575, "tokens_seen": 932517888 }, { "epoch": 2.08, "learning_rate": 0.00036235707121364094, "loss": 2.8697, "theoretical_loss": 3.6736701434712367, "tokens_seen": 932583424 }, { "epoch": 2.08, "learning_rate": 0.0003623470411233701, "loss": 2.7918, "theoretical_loss": 3.673645258170855, "tokens_seen": 932648960 }, { "epoch": 2.08, "learning_rate": 0.0003623370110330993, "loss": 2.851, "theoretical_loss": 3.673620375108653, "tokens_seen": 932714496 }, { "epoch": 2.08, "learning_rate": 0.0003623269809428285, "loss": 2.8662, "theoretical_loss": 3.673595494284273, "tokens_seen": 932780032 }, { "epoch": 2.08, "learning_rate": 0.00036231695085255767, "loss": 2.853, "theoretical_loss": 3.673570615697356, "tokens_seen": 932845568 }, { "epoch": 2.08, "learning_rate": 0.0003623069207622869, "loss": 3.0449, "theoretical_loss": 3.673545739347544, "tokens_seen": 932911104 }, { "epoch": 2.08, "learning_rate": 0.00036229689067201603, "loss": 2.9634, "theoretical_loss": 3.6735208652344786, "tokens_seen": 932976640 }, { "epoch": 2.08, "learning_rate": 0.00036228686058174526, "loss": 2.8178, "theoretical_loss": 3.6734959933578013, "tokens_seen": 933042176 }, { "epoch": 2.08, "learning_rate": 0.00036227683049147444, "loss": 2.846, "theoretical_loss": 3.673471123717154, "tokens_seen": 933107712 }, { "epoch": 2.08, "learning_rate": 0.0003622668004012036, "loss": 2.9289, "theoretical_loss": 3.673446256312179, "tokens_seen": 933173248 }, { "epoch": 2.08, "learning_rate": 0.0003622567703109328, "loss": 2.8434, "theoretical_loss": 3.6734213911425178, "tokens_seen": 933238784 }, { "epoch": 2.08, "learning_rate": 0.000362246740220662, "loss": 2.9856, "theoretical_loss": 3.673396528207814, "tokens_seen": 933304320 }, { "epoch": 2.08, "learning_rate": 0.00036223671013039117, "loss": 2.791, "theoretical_loss": 3.6733716675077073, "tokens_seen": 933369856 }, { "epoch": 2.08, "learning_rate": 0.0003622266800401204, "loss": 2.7631, "theoretical_loss": 3.673346809041842, "tokens_seen": 933435392 }, { "epoch": 2.08, "learning_rate": 0.00036221664994984953, "loss": 2.9061, "theoretical_loss": 3.67332195280986, "tokens_seen": 933500928 }, { "epoch": 2.08, "learning_rate": 0.00036220661985957877, "loss": 2.8866, "theoretical_loss": 3.6732970988114033, "tokens_seen": 933566464 }, { "epoch": 2.08, "learning_rate": 0.0003621965897693079, "loss": 2.9241, "theoretical_loss": 3.673272247046114, "tokens_seen": 933632000 }, { "epoch": 2.08, "learning_rate": 0.00036218655967903713, "loss": 2.8808, "theoretical_loss": 3.6732473975136366, "tokens_seen": 933697536 }, { "epoch": 2.08, "learning_rate": 0.0003621765295887663, "loss": 2.9543, "theoretical_loss": 3.673222550213612, "tokens_seen": 933763072 }, { "epoch": 2.08, "learning_rate": 0.0003621664994984955, "loss": 2.9253, "theoretical_loss": 3.6731977051456837, "tokens_seen": 933828608 }, { "epoch": 2.08, "objective/train/docs_used": 1503031, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9961791038513184, "objective/train/theoretical_loss": 3.673185283448644, "objective/train/tokens_used": 954321376, "theoretical_loss": 3.673185283448644, "tokens_seen": 933861376 }, { "epoch": 2.08, "learning_rate": 0.00036215646940822467, "loss": 2.878, "theoretical_loss": 3.673172862309494, "tokens_seen": 933894144 }, { "epoch": 2.08, "learning_rate": 0.00036214643931795385, "loss": 2.996, "theoretical_loss": 3.6731480217046872, "tokens_seen": 933959680 }, { "epoch": 2.08, "learning_rate": 0.00036213640922768303, "loss": 2.8864, "theoretical_loss": 3.6731231833309046, "tokens_seen": 934025216 }, { "epoch": 2.08, "learning_rate": 0.00036212637913741227, "loss": 2.775, "theoretical_loss": 3.6730983471877905, "tokens_seen": 934090752 }, { "epoch": 2.08, "learning_rate": 0.0003621163490471414, "loss": 2.9316, "theoretical_loss": 3.6730735132749874, "tokens_seen": 934156288 }, { "epoch": 2.08, "learning_rate": 0.00036210631895687063, "loss": 2.7923, "theoretical_loss": 3.6730486815921393, "tokens_seen": 934221824 }, { "epoch": 2.08, "learning_rate": 0.0003620962888665998, "loss": 2.8774, "theoretical_loss": 3.673023852138889, "tokens_seen": 934287360 }, { "epoch": 2.08, "learning_rate": 0.000362086258776329, "loss": 2.8379, "theoretical_loss": 3.67299902491488, "tokens_seen": 934352896 }, { "epoch": 2.08, "learning_rate": 0.0003620762286860582, "loss": 2.7538, "theoretical_loss": 3.6729741999197554, "tokens_seen": 934418432 }, { "epoch": 2.08, "learning_rate": 0.00036206619859578736, "loss": 3.0089, "theoretical_loss": 3.6729493771531594, "tokens_seen": 934483968 }, { "epoch": 2.08, "learning_rate": 0.00036205616850551654, "loss": 2.8328, "theoretical_loss": 3.6729245566147357, "tokens_seen": 934549504 }, { "epoch": 2.08, "learning_rate": 0.0003620461384152458, "loss": 2.9042, "theoretical_loss": 3.6728997383041277, "tokens_seen": 934615040 }, { "epoch": 2.08, "learning_rate": 0.0003620361083249749, "loss": 3.1102, "theoretical_loss": 3.6728749222209798, "tokens_seen": 934680576 }, { "epoch": 2.08, "learning_rate": 0.00036202607823470414, "loss": 2.9811, "theoretical_loss": 3.672850108364935, "tokens_seen": 934746112 }, { "epoch": 2.08, "learning_rate": 0.00036201604814443326, "loss": 3.0195, "theoretical_loss": 3.6728252967356383, "tokens_seen": 934811648 }, { "epoch": 2.08, "learning_rate": 0.0003620060180541625, "loss": 2.8828, "theoretical_loss": 3.6728004873327333, "tokens_seen": 934877184 }, { "epoch": 2.08, "learning_rate": 0.0003619959879638917, "loss": 2.8819, "theoretical_loss": 3.672775680155864, "tokens_seen": 934942720 }, { "epoch": 2.08, "learning_rate": 0.00036198595787362086, "loss": 2.9643, "theoretical_loss": 3.672750875204675, "tokens_seen": 935008256 }, { "epoch": 2.08, "learning_rate": 0.00036197592778335004, "loss": 2.8659, "theoretical_loss": 3.6727260724788104, "tokens_seen": 935073792 }, { "epoch": 2.08, "learning_rate": 0.0003619658976930792, "loss": 2.8988, "theoretical_loss": 3.672701271977915, "tokens_seen": 935139328 }, { "epoch": 2.08, "learning_rate": 0.0003619558676028084, "loss": 2.936, "theoretical_loss": 3.6726764737016326, "tokens_seen": 935204864 }, { "epoch": 2.08, "learning_rate": 0.00036194583751253764, "loss": 2.9264, "theoretical_loss": 3.6726516776496085, "tokens_seen": 935270400 }, { "epoch": 2.08, "learning_rate": 0.00036193580742226677, "loss": 2.7019, "theoretical_loss": 3.6726268838214873, "tokens_seen": 935335936 }, { "epoch": 2.08, "learning_rate": 0.000361925777331996, "loss": 2.9088, "theoretical_loss": 3.6726020922169136, "tokens_seen": 935401472 }, { "epoch": 2.08, "learning_rate": 0.0003619157472417252, "loss": 2.8215, "theoretical_loss": 3.6725773028355313, "tokens_seen": 935467008 }, { "epoch": 2.08, "objective/train/docs_used": 1505779, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0452187061309814, "objective/train/theoretical_loss": 3.6725649089784262, "objective/train/tokens_used": 955959776, "theoretical_loss": 3.6725649089784262, "tokens_seen": 935499776 }, { "epoch": 2.08, "learning_rate": 0.00036190571715145436, "loss": 2.9355, "theoretical_loss": 3.672552515676987, "tokens_seen": 935532544 }, { "epoch": 2.08, "learning_rate": 0.00036189568706118354, "loss": 2.7148, "theoretical_loss": 3.6725277307409243, "tokens_seen": 935598080 }, { "epoch": 2.08, "learning_rate": 0.0003618856569709127, "loss": 2.7359, "theoretical_loss": 3.6725029480269886, "tokens_seen": 935663616 }, { "epoch": 2.08, "learning_rate": 0.0003618756268806419, "loss": 2.8048, "theoretical_loss": 3.6724781675348255, "tokens_seen": 935729152 }, { "epoch": 2.08, "learning_rate": 0.00036186559679037114, "loss": 2.8042, "theoretical_loss": 3.67245338926408, "tokens_seen": 935794688 }, { "epoch": 2.08, "learning_rate": 0.0003618555667001003, "loss": 2.8573, "theoretical_loss": 3.6724286132143975, "tokens_seen": 935860224 }, { "epoch": 2.08, "learning_rate": 0.0003618455366098295, "loss": 2.976, "theoretical_loss": 3.672403839385423, "tokens_seen": 935925760 }, { "epoch": 2.08, "learning_rate": 0.0003618355065195587, "loss": 2.927, "theoretical_loss": 3.6723790677768022, "tokens_seen": 935991296 }, { "epoch": 2.08, "learning_rate": 0.00036182547642928787, "loss": 2.7611, "theoretical_loss": 3.672354298388181, "tokens_seen": 936056832 }, { "epoch": 2.08, "learning_rate": 0.0003618154463390171, "loss": 2.8416, "theoretical_loss": 3.672329531219205, "tokens_seen": 936122368 }, { "epoch": 2.08, "learning_rate": 0.00036180541624874623, "loss": 2.8096, "theoretical_loss": 3.672304766269519, "tokens_seen": 936187904 }, { "epoch": 2.08, "learning_rate": 0.00036179538615847546, "loss": 2.9064, "theoretical_loss": 3.67228000353877, "tokens_seen": 936253440 }, { "epoch": 2.08, "learning_rate": 0.00036178535606820464, "loss": 2.8831, "theoretical_loss": 3.672255243026603, "tokens_seen": 936318976 }, { "epoch": 2.08, "learning_rate": 0.0003617753259779338, "loss": 2.8769, "theoretical_loss": 3.6722304847326646, "tokens_seen": 936384512 }, { "epoch": 2.08, "learning_rate": 0.000361765295887663, "loss": 2.8887, "theoretical_loss": 3.6722057286566003, "tokens_seen": 936450048 }, { "epoch": 2.08, "learning_rate": 0.0003617552657973922, "loss": 2.7532, "theoretical_loss": 3.6721809747980565, "tokens_seen": 936515584 }, { "epoch": 2.08, "learning_rate": 0.00036174523570712137, "loss": 2.9422, "theoretical_loss": 3.6721562231566796, "tokens_seen": 936581120 }, { "epoch": 2.08, "learning_rate": 0.0003617352056168506, "loss": 2.8445, "theoretical_loss": 3.6721314737321156, "tokens_seen": 936646656 }, { "epoch": 2.08, "learning_rate": 0.00036172517552657973, "loss": 2.887, "theoretical_loss": 3.672106726524011, "tokens_seen": 936712192 }, { "epoch": 2.08, "learning_rate": 0.00036171514543630897, "loss": 3.0021, "theoretical_loss": 3.672081981532012, "tokens_seen": 936777728 }, { "epoch": 2.08, "learning_rate": 0.0003617051153460381, "loss": 2.8561, "theoretical_loss": 3.6720572387557655, "tokens_seen": 936843264 }, { "epoch": 2.08, "learning_rate": 0.00036169508525576733, "loss": 2.9548, "theoretical_loss": 3.672032498194918, "tokens_seen": 936908800 }, { "epoch": 2.08, "learning_rate": 0.0003616850551654965, "loss": 2.9024, "theoretical_loss": 3.672007759849116, "tokens_seen": 936974336 }, { "epoch": 2.08, "learning_rate": 0.0003616750250752257, "loss": 2.9547, "theoretical_loss": 3.6719830237180067, "tokens_seen": 937039872 }, { "epoch": 2.08, "learning_rate": 0.00036166499498495487, "loss": 2.8712, "theoretical_loss": 3.6719582898012364, "tokens_seen": 937105408 }, { "epoch": 2.08, "objective/train/docs_used": 1508733, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1014935970306396, "objective/train/theoretical_loss": 3.6719459236731185, "objective/train/tokens_used": 957598176, "theoretical_loss": 3.6719459236731185, "tokens_seen": 937138176 }, { "epoch": 2.08, "learning_rate": 0.00036165496489468405, "loss": 2.9012, "theoretical_loss": 3.6719335580984525, "tokens_seen": 937170944 }, { "epoch": 2.08, "learning_rate": 0.00036164493480441323, "loss": 2.993, "theoretical_loss": 3.671908828609302, "tokens_seen": 937236480 }, { "epoch": 2.08, "learning_rate": 0.00036163490471414247, "loss": 2.9195, "theoretical_loss": 3.6718841013334313, "tokens_seen": 937302016 }, { "epoch": 2.08, "learning_rate": 0.0003616248746238716, "loss": 2.837, "theoretical_loss": 3.6718593762704885, "tokens_seen": 937367552 }, { "epoch": 2.08, "learning_rate": 0.00036161484453360083, "loss": 2.664, "theoretical_loss": 3.6718346534201203, "tokens_seen": 937433088 }, { "epoch": 2.08, "learning_rate": 0.00036160481444333, "loss": 2.7736, "theoretical_loss": 3.6718099327819744, "tokens_seen": 937498624 }, { "epoch": 2.08, "learning_rate": 0.0003615947843530592, "loss": 2.8577, "theoretical_loss": 3.671785214355698, "tokens_seen": 937564160 }, { "epoch": 2.08, "learning_rate": 0.0003615847542627884, "loss": 2.6467, "theoretical_loss": 3.671760498140939, "tokens_seen": 937629696 }, { "epoch": 2.08, "learning_rate": 0.00036157472417251756, "loss": 2.8699, "theoretical_loss": 3.671735784137344, "tokens_seen": 937695232 }, { "epoch": 2.08, "learning_rate": 0.00036156469408224674, "loss": 2.8909, "theoretical_loss": 3.6717110723445616, "tokens_seen": 937760768 }, { "epoch": 2.08, "learning_rate": 0.000361554663991976, "loss": 2.842, "theoretical_loss": 3.671686362762239, "tokens_seen": 937826304 }, { "epoch": 2.08, "learning_rate": 0.0003615446339017051, "loss": 2.7354, "theoretical_loss": 3.6716616553900248, "tokens_seen": 937891840 }, { "epoch": 2.08, "learning_rate": 0.00036153460381143434, "loss": 2.9486, "theoretical_loss": 3.6716369502275663, "tokens_seen": 937957376 }, { "epoch": 2.08, "learning_rate": 0.00036152457372116346, "loss": 2.7918, "theoretical_loss": 3.671612247274511, "tokens_seen": 938022912 }, { "epoch": 2.08, "learning_rate": 0.0003615145436308927, "loss": 2.919, "theoretical_loss": 3.6715875465305077, "tokens_seen": 938088448 }, { "epoch": 2.08, "learning_rate": 0.0003615045135406219, "loss": 2.9125, "theoretical_loss": 3.671562847995205, "tokens_seen": 938153984 }, { "epoch": 2.08, "learning_rate": 0.00036149448345035106, "loss": 2.8318, "theoretical_loss": 3.67153815166825, "tokens_seen": 938219520 }, { "epoch": 2.08, "learning_rate": 0.00036148445336008024, "loss": 3.0905, "theoretical_loss": 3.671513457549292, "tokens_seen": 938285056 }, { "epoch": 2.08, "learning_rate": 0.0003614744232698094, "loss": 2.8952, "theoretical_loss": 3.6714887656379784, "tokens_seen": 938350592 }, { "epoch": 2.08, "learning_rate": 0.0003614643931795386, "loss": 2.9808, "theoretical_loss": 3.671464075933958, "tokens_seen": 938416128 }, { "epoch": 2.08, "learning_rate": 0.00036145436308926784, "loss": 2.9627, "theoretical_loss": 3.6714393884368803, "tokens_seen": 938481664 }, { "epoch": 2.08, "learning_rate": 0.00036144433299899697, "loss": 2.9948, "theoretical_loss": 3.6714147031463926, "tokens_seen": 938547200 }, { "epoch": 2.08, "learning_rate": 0.0003614343029087262, "loss": 2.8215, "theoretical_loss": 3.6713900200621445, "tokens_seen": 938612736 }, { "epoch": 2.08, "learning_rate": 0.0003614242728184554, "loss": 2.9482, "theoretical_loss": 3.6713653391837844, "tokens_seen": 938678272 }, { "epoch": 2.08, "learning_rate": 0.00036141424272818456, "loss": 2.8947, "theoretical_loss": 3.6713406605109613, "tokens_seen": 938743808 }, { "epoch": 2.08, "objective/train/docs_used": 1511509, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7950587272644043, "objective/train/theoretical_loss": 3.671328322001517, "objective/train/tokens_used": 959236576, "theoretical_loss": 3.671328322001517, "tokens_seen": 938776576 }, { "epoch": 2.08, "learning_rate": 0.00036140421263791374, "loss": 2.8758, "theoretical_loss": 3.671315984043324, "tokens_seen": 938809344 }, { "epoch": 2.08, "learning_rate": 0.0003613941825476429, "loss": 2.8614, "theoretical_loss": 3.671291309780522, "tokens_seen": 938874880 }, { "epoch": 2.08, "learning_rate": 0.0003613841524573721, "loss": 2.9655, "theoretical_loss": 3.671266637722204, "tokens_seen": 938940416 }, { "epoch": 2.08, "learning_rate": 0.00036137412236710134, "loss": 2.9031, "theoretical_loss": 3.671241967868019, "tokens_seen": 939005952 }, { "epoch": 2.08, "learning_rate": 0.00036136409227683047, "loss": 2.7798, "theoretical_loss": 3.6712173002176165, "tokens_seen": 939071488 }, { "epoch": 2.08, "learning_rate": 0.0003613540621865597, "loss": 2.8979, "theoretical_loss": 3.6711926347706463, "tokens_seen": 939137024 }, { "epoch": 2.08, "learning_rate": 0.00036134403209628883, "loss": 2.9563, "theoretical_loss": 3.6711679715267573, "tokens_seen": 939202560 }, { "epoch": 2.08, "learning_rate": 0.00036133400200601807, "loss": 2.9822, "theoretical_loss": 3.671143310485599, "tokens_seen": 939268096 }, { "epoch": 2.08, "learning_rate": 0.00036132397191574725, "loss": 3.0339, "theoretical_loss": 3.6711186516468213, "tokens_seen": 939333632 }, { "epoch": 2.08, "learning_rate": 0.00036131394182547643, "loss": 2.9584, "theoretical_loss": 3.6710939950100734, "tokens_seen": 939399168 }, { "epoch": 2.08, "learning_rate": 0.0003613039117352056, "loss": 2.8869, "theoretical_loss": 3.6710693405750057, "tokens_seen": 939464704 }, { "epoch": 2.08, "learning_rate": 0.00036129388164493484, "loss": 2.8713, "theoretical_loss": 3.6710446883412677, "tokens_seen": 939530240 }, { "epoch": 2.08, "learning_rate": 0.00036128385155466397, "loss": 2.8795, "theoretical_loss": 3.6710200383085096, "tokens_seen": 939595776 }, { "epoch": 2.08, "learning_rate": 0.0003612738214643932, "loss": 2.8923, "theoretical_loss": 3.6709953904763806, "tokens_seen": 939661312 }, { "epoch": 2.08, "learning_rate": 0.00036126379137412233, "loss": 2.9336, "theoretical_loss": 3.670970744844532, "tokens_seen": 939726848 }, { "epoch": 2.08, "learning_rate": 0.00036125376128385157, "loss": 2.8408, "theoretical_loss": 3.670946101412613, "tokens_seen": 939792384 }, { "epoch": 2.08, "learning_rate": 0.00036124373119358075, "loss": 2.9817, "theoretical_loss": 3.6709214601802738, "tokens_seen": 939857920 }, { "epoch": 2.08, "learning_rate": 0.00036123370110330993, "loss": 2.915, "theoretical_loss": 3.6708968211471653, "tokens_seen": 939923456 }, { "epoch": 2.08, "learning_rate": 0.0003612236710130391, "loss": 2.8187, "theoretical_loss": 3.6708721843129375, "tokens_seen": 939988992 }, { "epoch": 2.08, "learning_rate": 0.0003612136409227683, "loss": 2.9154, "theoretical_loss": 3.670847549677241, "tokens_seen": 940054528 }, { "epoch": 2.08, "learning_rate": 0.0003612036108324975, "loss": 2.9299, "theoretical_loss": 3.6708229172397266, "tokens_seen": 940120064 }, { "epoch": 2.08, "learning_rate": 0.0003611935807422267, "loss": 2.9211, "theoretical_loss": 3.6707982870000446, "tokens_seen": 940185600 }, { "epoch": 2.08, "learning_rate": 0.00036118355065195584, "loss": 2.8646, "theoretical_loss": 3.670773658957846, "tokens_seen": 940251136 }, { "epoch": 2.08, "learning_rate": 0.00036117352056168507, "loss": 2.9189, "theoretical_loss": 3.6707490331127812, "tokens_seen": 940316672 }, { "epoch": 2.08, "learning_rate": 0.0003611634904714142, "loss": 2.7889, "theoretical_loss": 3.6707244094645013, "tokens_seen": 940382208 }, { "epoch": 2.08, "objective/train/docs_used": 1514286, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.709304094314575, "objective/train/theoretical_loss": 3.6707120984640467, "objective/train/tokens_used": 960874976, "theoretical_loss": 3.6707120984640467, "tokens_seen": 940414976 }, { "epoch": 2.08, "learning_rate": 0.00036115346038114343, "loss": 2.8605, "theoretical_loss": 3.6706997880126573, "tokens_seen": 940447744 }, { "epoch": 2.08, "learning_rate": 0.0003611434302908726, "loss": 2.8286, "theoretical_loss": 3.6706751687569, "tokens_seen": 940513280 }, { "epoch": 2.08, "learning_rate": 0.0003611334002006018, "loss": 3.0481, "theoretical_loss": 3.6706505516968813, "tokens_seen": 940578816 }, { "epoch": 2.08, "learning_rate": 0.000361123370110331, "loss": 2.7836, "theoretical_loss": 3.670625936832251, "tokens_seen": 940644352 }, { "epoch": 2.08, "learning_rate": 0.0003611133400200602, "loss": 3.0728, "theoretical_loss": 3.6706013241626616, "tokens_seen": 940709888 }, { "epoch": 2.08, "learning_rate": 0.0003611033099297894, "loss": 2.7715, "theoretical_loss": 3.6705767136877645, "tokens_seen": 940775424 }, { "epoch": 2.08, "learning_rate": 0.0003610932798395186, "loss": 2.8197, "theoretical_loss": 3.67055210540721, "tokens_seen": 940840960 }, { "epoch": 2.08, "learning_rate": 0.00036108324974924776, "loss": 2.8226, "theoretical_loss": 3.6705274993206505, "tokens_seen": 940906496 }, { "epoch": 2.08, "learning_rate": 0.00036107321965897694, "loss": 2.9243, "theoretical_loss": 3.670502895427738, "tokens_seen": 940972032 }, { "epoch": 2.08, "learning_rate": 0.0003610631895687062, "loss": 2.8261, "theoretical_loss": 3.670478293728123, "tokens_seen": 941037568 }, { "epoch": 2.08, "learning_rate": 0.0003610531594784353, "loss": 2.7546, "theoretical_loss": 3.6704536942214574, "tokens_seen": 941103104 }, { "epoch": 2.08, "learning_rate": 0.00036104312938816454, "loss": 2.9359, "theoretical_loss": 3.670429096907394, "tokens_seen": 941168640 }, { "epoch": 2.08, "learning_rate": 0.00036103309929789366, "loss": 2.805, "theoretical_loss": 3.670404501785584, "tokens_seen": 941234176 }, { "epoch": 2.08, "learning_rate": 0.0003610230692076229, "loss": 2.8471, "theoretical_loss": 3.6703799088556797, "tokens_seen": 941299712 }, { "epoch": 2.08, "learning_rate": 0.0003610130391173521, "loss": 2.9445, "theoretical_loss": 3.670355318117333, "tokens_seen": 941365248 }, { "epoch": 2.08, "learning_rate": 0.00036100300902708126, "loss": 2.8547, "theoretical_loss": 3.670330729570196, "tokens_seen": 941430784 }, { "epoch": 2.08, "learning_rate": 0.00036099297893681044, "loss": 2.9044, "theoretical_loss": 3.670306143213921, "tokens_seen": 941496320 }, { "epoch": 2.08, "learning_rate": 0.0003609829488465396, "loss": 2.682, "theoretical_loss": 3.6702815590481603, "tokens_seen": 941561856 }, { "epoch": 2.08, "learning_rate": 0.0003609729187562688, "loss": 2.8736, "theoretical_loss": 3.6702569770725666, "tokens_seen": 941627392 }, { "epoch": 2.08, "learning_rate": 0.00036096288866599804, "loss": 2.7802, "theoretical_loss": 3.670232397286792, "tokens_seen": 941692928 }, { "epoch": 2.08, "learning_rate": 0.00036095285857572717, "loss": 2.9253, "theoretical_loss": 3.6702078196904884, "tokens_seen": 941758464 }, { "epoch": 2.08, "learning_rate": 0.0003609428284854564, "loss": 2.9419, "theoretical_loss": 3.67018324428331, "tokens_seen": 941824000 }, { "epoch": 2.08, "learning_rate": 0.0003609327983951856, "loss": 2.8881, "theoretical_loss": 3.6701586710649083, "tokens_seen": 941889536 }, { "epoch": 2.08, "learning_rate": 0.00036092276830491476, "loss": 2.911, "theoretical_loss": 3.670134100034937, "tokens_seen": 941955072 }, { "epoch": 2.08, "learning_rate": 0.00036091273821464394, "loss": 2.8756, "theoretical_loss": 3.670109531193048, "tokens_seen": 942020608 }, { "epoch": 2.08, "objective/train/docs_used": 1516952, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.757025718688965, "objective/train/theoretical_loss": 3.6700972475925258, "objective/train/tokens_used": 962513376, "theoretical_loss": 3.6700972475925258, "tokens_seen": 942053376 }, { "epoch": 2.08, "learning_rate": 0.0003609027081243731, "loss": 2.8359, "theoretical_loss": 3.670084964538894, "tokens_seen": 942086144 }, { "epoch": 2.08, "learning_rate": 0.0003608926780341023, "loss": 2.8302, "theoretical_loss": 3.6700604000721295, "tokens_seen": 942151680 }, { "epoch": 2.08, "learning_rate": 0.00036088264794383154, "loss": 2.9642, "theoretical_loss": 3.6700358377924065, "tokens_seen": 942217216 }, { "epoch": 2.08, "learning_rate": 0.00036087261785356067, "loss": 2.921, "theoretical_loss": 3.6700112776993787, "tokens_seen": 942282752 }, { "epoch": 2.08, "learning_rate": 0.0003608625877632899, "loss": 2.9253, "theoretical_loss": 3.669986719792699, "tokens_seen": 942348288 }, { "epoch": 2.08, "learning_rate": 0.00036085255767301903, "loss": 2.8272, "theoretical_loss": 3.669962164072021, "tokens_seen": 942413824 }, { "epoch": 2.08, "learning_rate": 0.00036084252758274827, "loss": 2.9667, "theoretical_loss": 3.669937610536998, "tokens_seen": 942479360 }, { "epoch": 2.08, "learning_rate": 0.00036083249749247745, "loss": 3.0803, "theoretical_loss": 3.6699130591872833, "tokens_seen": 942544896 }, { "epoch": 2.08, "learning_rate": 0.00036082246740220663, "loss": 2.8474, "theoretical_loss": 3.6698885100225302, "tokens_seen": 942610432 }, { "epoch": 2.08, "learning_rate": 0.0003608124373119358, "loss": 2.9377, "theoretical_loss": 3.6698639630423937, "tokens_seen": 942675968 }, { "epoch": 2.08, "learning_rate": 0.00036080240722166505, "loss": 2.8672, "theoretical_loss": 3.669839418246527, "tokens_seen": 942741504 }, { "epoch": 2.08, "learning_rate": 0.00036079237713139417, "loss": 2.8517, "theoretical_loss": 3.6698148756345823, "tokens_seen": 942807040 }, { "epoch": 2.08, "learning_rate": 0.0003607823470411234, "loss": 2.7395, "theoretical_loss": 3.6697903352062156, "tokens_seen": 942872576 }, { "epoch": 2.08, "learning_rate": 0.00036077231695085253, "loss": 2.9664, "theoretical_loss": 3.6697657969610793, "tokens_seen": 942938112 }, { "epoch": 2.08, "learning_rate": 0.00036076228686058177, "loss": 3.0005, "theoretical_loss": 3.669741260898829, "tokens_seen": 943003648 }, { "epoch": 2.08, "learning_rate": 0.00036075225677031095, "loss": 2.9532, "theoretical_loss": 3.6697167270191176, "tokens_seen": 943069184 }, { "epoch": 2.08, "learning_rate": 0.00036074222668004013, "loss": 2.8194, "theoretical_loss": 3.6696921953215997, "tokens_seen": 943134720 }, { "epoch": 2.08, "learning_rate": 0.0003607321965897693, "loss": 2.7457, "theoretical_loss": 3.66966766580593, "tokens_seen": 943200256 }, { "epoch": 2.08, "learning_rate": 0.0003607221664994985, "loss": 2.9387, "theoretical_loss": 3.669643138471762, "tokens_seen": 943265792 }, { "epoch": 2.08, "learning_rate": 0.0003607121364092277, "loss": 2.8392, "theoretical_loss": 3.669618613318751, "tokens_seen": 943331328 }, { "epoch": 2.08, "learning_rate": 0.0003607021063189569, "loss": 2.8966, "theoretical_loss": 3.6695940903465507, "tokens_seen": 943396864 }, { "epoch": 2.08, "learning_rate": 0.00036069207622868604, "loss": 2.9829, "theoretical_loss": 3.6695695695548167, "tokens_seen": 943462400 }, { "epoch": 2.08, "learning_rate": 0.00036068204613841527, "loss": 3.0076, "theoretical_loss": 3.669545050943203, "tokens_seen": 943527936 }, { "epoch": 2.08, "learning_rate": 0.0003606720160481444, "loss": 2.9752, "theoretical_loss": 3.6695205345113644, "tokens_seen": 943593472 }, { "epoch": 2.08, "learning_rate": 0.00036066198595787364, "loss": 2.9667, "theoretical_loss": 3.6694960202589555, "tokens_seen": 943659008 }, { "epoch": 2.08, "objective/train/docs_used": 1519824, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.811267614364624, "objective/train/theoretical_loss": 3.6694837639499296, "objective/train/tokens_used": 964151776, "theoretical_loss": 3.6694837639499296, "tokens_seen": 943691776 }, { "epoch": 2.08, "learning_rate": 0.0003606519558676028, "loss": 2.848, "theoretical_loss": 3.669471508185632, "tokens_seen": 943724544 }, { "epoch": 2.08, "learning_rate": 0.000360641925777332, "loss": 3.162, "theoretical_loss": 3.6694469982910483, "tokens_seen": 943790080 }, { "epoch": 2.08, "learning_rate": 0.0003606318956870612, "loss": 2.8702, "theoretical_loss": 3.66942249057486, "tokens_seen": 943855616 }, { "epoch": 2.08, "learning_rate": 0.0003606218655967904, "loss": 2.9304, "theoretical_loss": 3.669397985036721, "tokens_seen": 943921152 }, { "epoch": 2.08, "learning_rate": 0.00036061183550651954, "loss": 2.9676, "theoretical_loss": 3.669373481676288, "tokens_seen": 943986688 }, { "epoch": 2.08, "learning_rate": 0.0003606018054162488, "loss": 2.7977, "theoretical_loss": 3.669348980493216, "tokens_seen": 944052224 }, { "epoch": 2.08, "learning_rate": 0.0003605917753259779, "loss": 2.8475, "theoretical_loss": 3.669324481487159, "tokens_seen": 944117760 }, { "epoch": 2.08, "learning_rate": 0.00036058174523570714, "loss": 2.875, "theoretical_loss": 3.6692999846577745, "tokens_seen": 944183296 }, { "epoch": 2.08, "learning_rate": 0.0003605717151454363, "loss": 2.9026, "theoretical_loss": 3.669275490004717, "tokens_seen": 944248832 }, { "epoch": 2.08, "learning_rate": 0.0003605616850551655, "loss": 3.0442, "theoretical_loss": 3.669250997527642, "tokens_seen": 944314368 }, { "epoch": 2.08, "learning_rate": 0.0003605516549648947, "loss": 2.8717, "theoretical_loss": 3.669226507226206, "tokens_seen": 944379904 }, { "epoch": 2.08, "learning_rate": 0.00036054162487462386, "loss": 2.8336, "theoretical_loss": 3.669202019100063, "tokens_seen": 944445440 }, { "epoch": 2.08, "learning_rate": 0.00036053159478435304, "loss": 2.9874, "theoretical_loss": 3.669177533148871, "tokens_seen": 944510976 }, { "epoch": 2.08, "learning_rate": 0.0003605215646940823, "loss": 2.9148, "theoretical_loss": 3.6691530493722855, "tokens_seen": 944576512 }, { "epoch": 2.08, "learning_rate": 0.0003605115346038114, "loss": 2.9715, "theoretical_loss": 3.6691285677699614, "tokens_seen": 944642048 }, { "epoch": 2.08, "learning_rate": 0.00036050150451354064, "loss": 2.755, "theoretical_loss": 3.6691040883415553, "tokens_seen": 944707584 }, { "epoch": 2.08, "learning_rate": 0.00036049147442326977, "loss": 2.8653, "theoretical_loss": 3.6690796110867234, "tokens_seen": 944773120 }, { "epoch": 2.08, "learning_rate": 0.000360481444332999, "loss": 2.9388, "theoretical_loss": 3.669055136005123, "tokens_seen": 944838656 }, { "epoch": 2.08, "learning_rate": 0.0003604714142427282, "loss": 3.0764, "theoretical_loss": 3.6690306630964082, "tokens_seen": 944904192 }, { "epoch": 2.08, "learning_rate": 0.00036046138415245737, "loss": 2.9106, "theoretical_loss": 3.6690061923602375, "tokens_seen": 944969728 }, { "epoch": 2.08, "learning_rate": 0.00036045135406218655, "loss": 3.0156, "theoretical_loss": 3.668981723796266, "tokens_seen": 945035264 }, { "epoch": 2.08, "learning_rate": 0.0003604413239719158, "loss": 2.6893, "theoretical_loss": 3.668957257404151, "tokens_seen": 945100800 }, { "epoch": 2.08, "learning_rate": 0.0003604312938816449, "loss": 3.0585, "theoretical_loss": 3.66893279318355, "tokens_seen": 945166336 }, { "epoch": 2.08, "learning_rate": 0.00036042126379137414, "loss": 2.9372, "theoretical_loss": 3.6689083311341175, "tokens_seen": 945231872 }, { "epoch": 2.08, "learning_rate": 0.00036041123370110327, "loss": 2.7895, "theoretical_loss": 3.668883871255512, "tokens_seen": 945297408 }, { "epoch": 2.08, "objective/train/docs_used": 1521316, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9999074935913086, "objective/train/theoretical_loss": 3.6688716421301617, "objective/train/tokens_used": 965790176, "theoretical_loss": 3.6688716421301617, "tokens_seen": 945330176 }, { "epoch": 2.08, "learning_rate": 0.0003604012036108325, "loss": 2.913, "theoretical_loss": 3.6688594135473895, "tokens_seen": 945362944 }, { "epoch": 2.08, "learning_rate": 0.0003603911735205617, "loss": 2.8694, "theoretical_loss": 3.6688349580094073, "tokens_seen": 945428480 }, { "epoch": 2.08, "learning_rate": 0.00036038114343029087, "loss": 3.0816, "theoretical_loss": 3.6688105046412227, "tokens_seen": 945494016 }, { "epoch": 2.08, "learning_rate": 0.00036037111334002005, "loss": 2.8874, "theoretical_loss": 3.668786053442493, "tokens_seen": 945559552 }, { "epoch": 2.08, "learning_rate": 0.00036036108324974923, "loss": 2.8854, "theoretical_loss": 3.6687616044128744, "tokens_seen": 945625088 }, { "epoch": 2.08, "learning_rate": 0.00036035105315947847, "loss": 2.8942, "theoretical_loss": 3.6687371575520245, "tokens_seen": 945690624 }, { "epoch": 2.08, "learning_rate": 0.00036034102306920765, "loss": 2.9113, "theoretical_loss": 3.668712712859601, "tokens_seen": 945756160 }, { "epoch": 2.08, "learning_rate": 0.00036033099297893683, "loss": 2.9889, "theoretical_loss": 3.6686882703352612, "tokens_seen": 945821696 }, { "epoch": 2.08, "learning_rate": 0.000360320962888666, "loss": 2.8476, "theoretical_loss": 3.668663829978663, "tokens_seen": 945887232 }, { "epoch": 2.08, "learning_rate": 0.00036031093279839525, "loss": 2.9471, "theoretical_loss": 3.668639391789463, "tokens_seen": 945952768 }, { "epoch": 2.08, "learning_rate": 0.00036030090270812437, "loss": 2.9551, "theoretical_loss": 3.66861495576732, "tokens_seen": 946018304 }, { "epoch": 2.08, "learning_rate": 0.0003602908726178536, "loss": 2.7712, "theoretical_loss": 3.6685905219118906, "tokens_seen": 946083840 }, { "epoch": 2.08, "learning_rate": 0.00036028084252758273, "loss": 2.8189, "theoretical_loss": 3.6685660902228334, "tokens_seen": 946149376 }, { "epoch": 2.08, "learning_rate": 0.00036027081243731197, "loss": 2.9091, "theoretical_loss": 3.668541660699806, "tokens_seen": 946214912 }, { "epoch": 2.08, "learning_rate": 0.00036026078234704115, "loss": 2.9169, "theoretical_loss": 3.668517233342466, "tokens_seen": 946280448 }, { "epoch": 2.08, "learning_rate": 0.00036025075225677033, "loss": 2.9325, "theoretical_loss": 3.6684928081504724, "tokens_seen": 946345984 }, { "epoch": 2.08, "learning_rate": 0.0003602407221664995, "loss": 2.8479, "theoretical_loss": 3.6684683851234823, "tokens_seen": 946411520 }, { "epoch": 2.08, "learning_rate": 0.0003602306920762287, "loss": 2.91, "theoretical_loss": 3.668443964261155, "tokens_seen": 946477056 }, { "epoch": 2.08, "learning_rate": 0.0003602206619859579, "loss": 2.9298, "theoretical_loss": 3.6684195455631476, "tokens_seen": 946542592 }, { "epoch": 2.08, "learning_rate": 0.0003602106318956871, "loss": 2.6382, "theoretical_loss": 3.6683951290291192, "tokens_seen": 946608128 }, { "epoch": 2.08, "learning_rate": 0.00036020060180541624, "loss": 2.9897, "theoretical_loss": 3.6683707146587277, "tokens_seen": 946673664 }, { "epoch": 2.08, "learning_rate": 0.0003601905717151455, "loss": 2.9346, "theoretical_loss": 3.6683463024516323, "tokens_seen": 946739200 }, { "epoch": 2.08, "learning_rate": 0.0003601805416248746, "loss": 2.711, "theoretical_loss": 3.6683218924074907, "tokens_seen": 946804736 }, { "epoch": 2.08, "learning_rate": 0.00036017051153460384, "loss": 2.7601, "theoretical_loss": 3.668297484525963, "tokens_seen": 946870272 }, { "epoch": 2.08, "learning_rate": 0.000360160481444333, "loss": 2.8482, "theoretical_loss": 3.668273078806706, "tokens_seen": 946935808 }, { "epoch": 2.08, "objective/train/docs_used": 1524063, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0326390266418457, "objective/train/theoretical_loss": 3.6682608767578233, "objective/train/tokens_used": 967428576, "theoretical_loss": 3.6682608767578233, "tokens_seen": 946968576 }, { "epoch": 2.08, "learning_rate": 0.0003601504513540622, "loss": 3.0548, "theoretical_loss": 3.66824867524938, "tokens_seen": 947001344 }, { "epoch": 2.08, "learning_rate": 0.0003601404212637914, "loss": 2.9548, "theoretical_loss": 3.668224273853644, "tokens_seen": 947066880 }, { "epoch": 2.08, "learning_rate": 0.0003601303911735206, "loss": 2.8881, "theoretical_loss": 3.6681998746191553, "tokens_seen": 947132416 }, { "epoch": 2.08, "learning_rate": 0.00036012036108324974, "loss": 2.9178, "theoretical_loss": 3.668175477545575, "tokens_seen": 947197952 }, { "epoch": 2.08, "learning_rate": 0.000360110330992979, "loss": 2.9106, "theoretical_loss": 3.668151082632561, "tokens_seen": 947263488 }, { "epoch": 2.08, "learning_rate": 0.0003601003009027081, "loss": 3.0052, "theoretical_loss": 3.668126689879773, "tokens_seen": 947329024 }, { "epoch": 2.08, "learning_rate": 0.00036009027081243734, "loss": 2.9675, "theoretical_loss": 3.6681022992868693, "tokens_seen": 947394560 }, { "epoch": 2.08, "learning_rate": 0.0003600802407221665, "loss": 2.8882, "theoretical_loss": 3.6680779108535106, "tokens_seen": 947460096 }, { "epoch": 2.08, "learning_rate": 0.0003600702106318957, "loss": 2.8386, "theoretical_loss": 3.6680535245793555, "tokens_seen": 947525632 }, { "epoch": 2.08, "learning_rate": 0.0003600601805416249, "loss": 2.9873, "theoretical_loss": 3.668029140464064, "tokens_seen": 947591168 }, { "epoch": 2.08, "learning_rate": 0.00036005015045135406, "loss": 2.7943, "theoretical_loss": 3.6680047585072955, "tokens_seen": 947656704 }, { "epoch": 2.08, "learning_rate": 0.00036004012036108324, "loss": 2.8054, "theoretical_loss": 3.6679803787087097, "tokens_seen": 947722240 }, { "epoch": 2.08, "learning_rate": 0.0003600300902708125, "loss": 2.9259, "theoretical_loss": 3.6679560010679664, "tokens_seen": 947787776 }, { "epoch": 2.08, "learning_rate": 0.0003600200601805416, "loss": 2.8775, "theoretical_loss": 3.6679316255847247, "tokens_seen": 947853312 }, { "epoch": 2.08, "learning_rate": 0.00036001003009027084, "loss": 2.9561, "theoretical_loss": 3.6679072522586456, "tokens_seen": 947918848 }, { "epoch": 2.08, "learning_rate": 0.00035999999999999997, "loss": 2.8385, "theoretical_loss": 3.6678828810893886, "tokens_seen": 947984384 }, { "epoch": 2.08, "learning_rate": 0.0003599899699097292, "loss": 2.7439, "theoretical_loss": 3.667858512076614, "tokens_seen": 948049920 }, { "epoch": 2.08, "learning_rate": 0.0003599799398194584, "loss": 2.9654, "theoretical_loss": 3.667834145219981, "tokens_seen": 948115456 }, { "epoch": 2.08, "learning_rate": 0.00035996990972918757, "loss": 2.9278, "theoretical_loss": 3.667809780519151, "tokens_seen": 948180992 }, { "epoch": 2.08, "learning_rate": 0.00035995987963891675, "loss": 2.9594, "theoretical_loss": 3.667785417973784, "tokens_seen": 948246528 }, { "epoch": 2.08, "learning_rate": 0.000359949849548646, "loss": 3.0275, "theoretical_loss": 3.6677610575835393, "tokens_seen": 948312064 }, { "epoch": 2.08, "learning_rate": 0.0003599398194583751, "loss": 2.9092, "theoretical_loss": 3.6677366993480787, "tokens_seen": 948377600 }, { "epoch": 2.08, "learning_rate": 0.00035992978936810434, "loss": 2.8328, "theoretical_loss": 3.6677123432670626, "tokens_seen": 948443136 }, { "epoch": 2.08, "learning_rate": 0.00035991975927783347, "loss": 2.9542, "theoretical_loss": 3.66768798934015, "tokens_seen": 948508672 }, { "epoch": 2.08, "learning_rate": 0.0003599097291875627, "loss": 3.0221, "theoretical_loss": 3.667663637567004, "tokens_seen": 948574208 }, { "epoch": 2.08, "objective/train/docs_used": 1526876, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.930964946746826, "objective/train/theoretical_loss": 3.667651462487987, "objective/train/tokens_used": 969066976, "theoretical_loss": 3.667651462487987, "tokens_seen": 948606976 }, { "epoch": 2.08, "learning_rate": 0.0003598996990972919, "loss": 2.8311, "theoretical_loss": 3.6676392879472837, "tokens_seen": 948639744 }, { "epoch": 2.08, "learning_rate": 0.00035988966900702107, "loss": 2.8571, "theoretical_loss": 3.66761494048065, "tokens_seen": 948705280 }, { "epoch": 2.08, "learning_rate": 0.00035987963891675025, "loss": 3.0449, "theoretical_loss": 3.667590595166765, "tokens_seen": 948770816 }, { "epoch": 2.08, "learning_rate": 0.00035986960882647943, "loss": 2.9925, "theoretical_loss": 3.667566252005288, "tokens_seen": 948836352 }, { "epoch": 2.08, "learning_rate": 0.0003598595787362086, "loss": 2.8265, "theoretical_loss": 3.6675419109958813, "tokens_seen": 948901888 }, { "epoch": 2.08, "learning_rate": 0.00035984954864593785, "loss": 3.0439, "theoretical_loss": 3.6675175721382054, "tokens_seen": 948967424 }, { "epoch": 2.08, "learning_rate": 0.000359839518555667, "loss": 2.9805, "theoretical_loss": 3.6674932354319214, "tokens_seen": 949032960 }, { "epoch": 2.08, "learning_rate": 0.0003598294884653962, "loss": 2.9457, "theoretical_loss": 3.6674689008766914, "tokens_seen": 949098496 }, { "epoch": 2.08, "learning_rate": 0.00035981945837512534, "loss": 2.8306, "theoretical_loss": 3.6674445684721766, "tokens_seen": 949164032 }, { "epoch": 2.08, "learning_rate": 0.00035980942828485457, "loss": 2.8665, "theoretical_loss": 3.667420238218037, "tokens_seen": 949229568 }, { "epoch": 2.08, "learning_rate": 0.00035979939819458375, "loss": 3.1011, "theoretical_loss": 3.667395910113936, "tokens_seen": 949295104 }, { "epoch": 2.08, "learning_rate": 0.00035978936810431293, "loss": 2.9074, "theoretical_loss": 3.6673715841595342, "tokens_seen": 949360640 }, { "epoch": 2.08, "learning_rate": 0.0003597793380140421, "loss": 2.8988, "theoretical_loss": 3.667347260354494, "tokens_seen": 949426176 }, { "epoch": 2.08, "learning_rate": 0.00035976930792377135, "loss": 3.1049, "theoretical_loss": 3.667322938698476, "tokens_seen": 949491712 }, { "epoch": 2.08, "learning_rate": 0.0003597592778335005, "loss": 2.8702, "theoretical_loss": 3.6672986191911425, "tokens_seen": 949557248 }, { "epoch": 2.08, "learning_rate": 0.0003597492477432297, "loss": 2.9368, "theoretical_loss": 3.6672743018321556, "tokens_seen": 949622784 }, { "epoch": 2.08, "learning_rate": 0.00035973921765295884, "loss": 2.9217, "theoretical_loss": 3.6672499866211776, "tokens_seen": 949688320 }, { "epoch": 2.08, "learning_rate": 0.0003597291875626881, "loss": 2.8615, "theoretical_loss": 3.66722567355787, "tokens_seen": 949753856 }, { "epoch": 2.08, "learning_rate": 0.00035971915747241726, "loss": 3.036, "theoretical_loss": 3.6672013626418956, "tokens_seen": 949819392 }, { "epoch": 2.08, "learning_rate": 0.00035970912738214644, "loss": 2.9405, "theoretical_loss": 3.6671770538729156, "tokens_seen": 949884928 }, { "epoch": 2.08, "learning_rate": 0.0003596990972918756, "loss": 2.7308, "theoretical_loss": 3.6671527472505927, "tokens_seen": 949950464 }, { "epoch": 2.08, "learning_rate": 0.0003596890672016048, "loss": 2.8973, "theoretical_loss": 3.6671284427745894, "tokens_seen": 950016000 }, { "epoch": 2.08, "learning_rate": 0.000359679037111334, "loss": 3.0602, "theoretical_loss": 3.667104140444568, "tokens_seen": 950081536 }, { "epoch": 2.08, "learning_rate": 0.0003596690070210632, "loss": 3.2048, "theoretical_loss": 3.667079840260191, "tokens_seen": 950147072 }, { "epoch": 2.08, "learning_rate": 0.00035965897693079234, "loss": 3.0375, "theoretical_loss": 3.6670555422211213, "tokens_seen": 950212608 }, { "epoch": 2.08, "objective/train/docs_used": 1529664, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.94755482673645, "objective/train/theoretical_loss": 3.6670433940059715, "objective/train/tokens_used": 970705376, "theoretical_loss": 3.6670433940059715, "tokens_seen": 950245376 }, { "epoch": 2.08, "learning_rate": 0.0003596489468405216, "loss": 2.9549, "theoretical_loss": 3.6670312463270216, "tokens_seen": 950278144 }, { "epoch": 2.08, "learning_rate": 0.00035963891675025076, "loss": 2.8839, "theoretical_loss": 3.6670069525775535, "tokens_seen": 950343680 }, { "epoch": 2.08, "learning_rate": 0.00035962888665997994, "loss": 2.9053, "theoretical_loss": 3.6669826609723817, "tokens_seen": 950409216 }, { "epoch": 2.08, "learning_rate": 0.0003596188565697091, "loss": 2.8338, "theoretical_loss": 3.6669583715111678, "tokens_seen": 950474752 }, { "epoch": 2.08, "learning_rate": 0.0003596088264794383, "loss": 2.9019, "theoretical_loss": 3.6669340841935747, "tokens_seen": 950540288 }, { "epoch": 2.08, "learning_rate": 0.00035959879638916754, "loss": 2.8467, "theoretical_loss": 3.666909799019266, "tokens_seen": 950605824 }, { "epoch": 2.08, "learning_rate": 0.0003595887662988967, "loss": 2.8945, "theoretical_loss": 3.6668855159879046, "tokens_seen": 950671360 }, { "epoch": 2.08, "learning_rate": 0.0003595787362086259, "loss": 2.9221, "theoretical_loss": 3.6668612350991534, "tokens_seen": 950736896 }, { "epoch": 2.08, "learning_rate": 0.0003595687061183551, "loss": 2.8804, "theoretical_loss": 3.666836956352676, "tokens_seen": 950802432 }, { "epoch": 2.08, "learning_rate": 0.00035955867602808426, "loss": 2.8432, "theoretical_loss": 3.666812679748136, "tokens_seen": 950867968 }, { "epoch": 2.08, "learning_rate": 0.00035954864593781344, "loss": 2.8538, "theoretical_loss": 3.666788405285197, "tokens_seen": 950933504 }, { "epoch": 2.08, "learning_rate": 0.0003595386158475427, "loss": 2.8247, "theoretical_loss": 3.6667641329635217, "tokens_seen": 950999040 }, { "epoch": 2.08, "learning_rate": 0.0003595285857572718, "loss": 2.8152, "theoretical_loss": 3.666739862782774, "tokens_seen": 951064576 }, { "epoch": 2.08, "learning_rate": 0.00035951855566700104, "loss": 2.8394, "theoretical_loss": 3.6667155947426173, "tokens_seen": 951130112 }, { "epoch": 2.08, "learning_rate": 0.00035950852557673017, "loss": 2.9202, "theoretical_loss": 3.666691328842716, "tokens_seen": 951195648 }, { "epoch": 2.08, "learning_rate": 0.0003594984954864594, "loss": 2.8472, "theoretical_loss": 3.6666670650827333, "tokens_seen": 951261184 }, { "epoch": 2.08, "learning_rate": 0.0003594884653961886, "loss": 2.6958, "theoretical_loss": 3.6666428034623335, "tokens_seen": 951326720 }, { "epoch": 2.08, "learning_rate": 0.00035947843530591777, "loss": 3.0004, "theoretical_loss": 3.66661854398118, "tokens_seen": 951392256 }, { "epoch": 2.08, "learning_rate": 0.00035946840521564695, "loss": 3.0575, "theoretical_loss": 3.6665942866389374, "tokens_seen": 951457792 }, { "epoch": 2.08, "learning_rate": 0.0003594583751253762, "loss": 2.8536, "theoretical_loss": 3.6665700314352696, "tokens_seen": 951523328 }, { "epoch": 2.08, "learning_rate": 0.0003594483450351053, "loss": 2.8957, "theoretical_loss": 3.6665457783698407, "tokens_seen": 951588864 }, { "epoch": 2.08, "learning_rate": 0.00035943831494483454, "loss": 3.0509, "theoretical_loss": 3.666521527442315, "tokens_seen": 951654400 }, { "epoch": 2.08, "learning_rate": 0.00035942828485456367, "loss": 2.8413, "theoretical_loss": 3.666497278652357, "tokens_seen": 951719936 }, { "epoch": 2.08, "learning_rate": 0.0003594182547642929, "loss": 2.8578, "theoretical_loss": 3.6664730319996313, "tokens_seen": 951785472 }, { "epoch": 2.08, "learning_rate": 0.0003594082246740221, "loss": 2.8032, "theoretical_loss": 3.6664487874838017, "tokens_seen": 951851008 }, { "epoch": 2.08, "objective/train/docs_used": 1532237, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.927743434906006, "objective/train/theoretical_loss": 3.666436666027118, "objective/train/tokens_used": 972343776, "theoretical_loss": 3.666436666027118, "tokens_seen": 951883776 }, { "epoch": 2.08, "learning_rate": 0.00035939819458375127, "loss": 2.8656, "theoretical_loss": 3.666424545104533, "tokens_seen": 951916544 }, { "epoch": 2.08, "learning_rate": 0.00035938816449348045, "loss": 2.6826, "theoretical_loss": 3.66640030486149, "tokens_seen": 951982080 }, { "epoch": 2.08, "learning_rate": 0.00035937813440320963, "loss": 2.865, "theoretical_loss": 3.6663760667543377, "tokens_seen": 952047616 }, { "epoch": 2.08, "learning_rate": 0.0003593681043129388, "loss": 3.0163, "theoretical_loss": 3.666351830782741, "tokens_seen": 952113152 }, { "epoch": 2.08, "learning_rate": 0.00035935807422266805, "loss": 2.8684, "theoretical_loss": 3.6663275969463633, "tokens_seen": 952178688 }, { "epoch": 2.08, "learning_rate": 0.0003593480441323972, "loss": 2.8704, "theoretical_loss": 3.6663033652448713, "tokens_seen": 952244224 }, { "epoch": 2.08, "learning_rate": 0.0003593380140421264, "loss": 2.889, "theoretical_loss": 3.666279135677929, "tokens_seen": 952309760 }, { "epoch": 2.08, "learning_rate": 0.00035932798395185554, "loss": 2.9327, "theoretical_loss": 3.666254908245202, "tokens_seen": 952375296 }, { "epoch": 2.08, "learning_rate": 0.00035931795386158477, "loss": 2.8858, "theoretical_loss": 3.6662306829463556, "tokens_seen": 952440832 }, { "epoch": 2.08, "learning_rate": 0.00035930792377131395, "loss": 2.8585, "theoretical_loss": 3.666206459781054, "tokens_seen": 952506368 }, { "epoch": 2.08, "learning_rate": 0.00035929789368104313, "loss": 2.8757, "theoretical_loss": 3.6661822387489638, "tokens_seen": 952571904 }, { "epoch": 2.08, "learning_rate": 0.0003592878635907723, "loss": 3.0122, "theoretical_loss": 3.6661580198497496, "tokens_seen": 952637440 }, { "epoch": 2.08, "learning_rate": 0.00035927783350050155, "loss": 2.8535, "theoretical_loss": 3.666133803083077, "tokens_seen": 952702976 }, { "epoch": 2.08, "learning_rate": 0.0003592678034102307, "loss": 2.86, "theoretical_loss": 3.666109588448612, "tokens_seen": 952768512 }, { "epoch": 2.08, "learning_rate": 0.0003592577733199599, "loss": 2.9058, "theoretical_loss": 3.6660853759460195, "tokens_seen": 952834048 }, { "epoch": 2.08, "learning_rate": 0.00035924774322968904, "loss": 3.0865, "theoretical_loss": 3.6660611655749658, "tokens_seen": 952899584 }, { "epoch": 2.08, "learning_rate": 0.0003592377131394183, "loss": 2.9298, "theoretical_loss": 3.666036957335116, "tokens_seen": 952965120 }, { "epoch": 2.08, "learning_rate": 0.00035922768304914746, "loss": 2.669, "theoretical_loss": 3.666012751226137, "tokens_seen": 953030656 }, { "epoch": 2.08, "learning_rate": 0.00035921765295887664, "loss": 2.886, "theoretical_loss": 3.665988547247694, "tokens_seen": 953096192 }, { "epoch": 2.08, "learning_rate": 0.0003592076228686058, "loss": 2.9054, "theoretical_loss": 3.665964345399453, "tokens_seen": 953161728 }, { "epoch": 2.08, "learning_rate": 0.000359197592778335, "loss": 3.0261, "theoretical_loss": 3.66594014568108, "tokens_seen": 953227264 }, { "epoch": 2.08, "learning_rate": 0.0003591875626880642, "loss": 2.8252, "theoretical_loss": 3.665915948092242, "tokens_seen": 953292800 }, { "epoch": 2.08, "learning_rate": 0.0003591775325977934, "loss": 2.7888, "theoretical_loss": 3.665891752632604, "tokens_seen": 953358336 }, { "epoch": 2.08, "learning_rate": 0.00035916750250752254, "loss": 2.8669, "theoretical_loss": 3.665867559301833, "tokens_seen": 953423872 }, { "epoch": 2.08, "learning_rate": 0.0003591574724172518, "loss": 2.9872, "theoretical_loss": 3.665843368099595, "tokens_seen": 953489408 }, { "epoch": 2.08, "objective/train/docs_used": 1535056, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6104745864868164, "objective/train/theoretical_loss": 3.665831273296572, "objective/train/tokens_used": 973982176, "theoretical_loss": 3.665831273296572, "tokens_seen": 953522176 }, { "epoch": 2.08, "learning_rate": 0.00035914744232698096, "loss": 2.7826, "theoretical_loss": 3.665819179025557, "tokens_seen": 953554944 }, { "epoch": 2.08, "learning_rate": 0.00035913741223671014, "loss": 3.0413, "theoretical_loss": 3.665794992079385, "tokens_seen": 953620480 }, { "epoch": 2.08, "learning_rate": 0.0003591273821464393, "loss": 2.8472, "theoretical_loss": 3.665770807260746, "tokens_seen": 953686016 }, { "epoch": 2.08, "learning_rate": 0.0003591173520561685, "loss": 2.8904, "theoretical_loss": 3.6657466245693064, "tokens_seen": 953751552 }, { "epoch": 2.08, "learning_rate": 0.0003591073219658977, "loss": 2.8176, "theoretical_loss": 3.665722444004733, "tokens_seen": 953817088 }, { "epoch": 2.08, "learning_rate": 0.0003590972918756269, "loss": 2.8831, "theoretical_loss": 3.665698265566693, "tokens_seen": 953882624 }, { "epoch": 2.08, "learning_rate": 0.00035908726178535605, "loss": 2.9403, "theoretical_loss": 3.6656740892548525, "tokens_seen": 953948160 }, { "epoch": 2.08, "learning_rate": 0.0003590772316950853, "loss": 3.0042, "theoretical_loss": 3.6656499150688795, "tokens_seen": 954013696 }, { "epoch": 2.08, "learning_rate": 0.0003590672016048144, "loss": 2.8474, "theoretical_loss": 3.66562574300844, "tokens_seen": 954079232 }, { "epoch": 2.08, "learning_rate": 0.00035905717151454364, "loss": 2.9524, "theoretical_loss": 3.665601573073202, "tokens_seen": 954144768 }, { "epoch": 2.08, "learning_rate": 0.0003590471414242728, "loss": 2.8934, "theoretical_loss": 3.665577405262832, "tokens_seen": 954210304 }, { "epoch": 2.08, "learning_rate": 0.000359037111334002, "loss": 2.7594, "theoretical_loss": 3.665553239576998, "tokens_seen": 954275840 }, { "epoch": 2.08, "learning_rate": 0.0003590270812437312, "loss": 2.9434, "theoretical_loss": 3.6655290760153663, "tokens_seen": 954341376 }, { "epoch": 2.08, "learning_rate": 0.00035901705115346037, "loss": 2.8122, "theoretical_loss": 3.6655049145776055, "tokens_seen": 954406912 }, { "epoch": 2.08, "learning_rate": 0.00035900702106318955, "loss": 2.8326, "theoretical_loss": 3.6654807552633826, "tokens_seen": 954472448 }, { "epoch": 2.08, "learning_rate": 0.0003589969909729188, "loss": 2.9666, "theoretical_loss": 3.665456598072365, "tokens_seen": 954537984 }, { "epoch": 2.08, "learning_rate": 0.0003589869608826479, "loss": 2.5319, "theoretical_loss": 3.6654324430042204, "tokens_seen": 954603520 }, { "epoch": 2.08, "learning_rate": 0.00035897693079237715, "loss": 2.7328, "theoretical_loss": 3.665408290058617, "tokens_seen": 954669056 }, { "epoch": 2.08, "learning_rate": 0.00035896690070210633, "loss": 2.7581, "theoretical_loss": 3.665384139235222, "tokens_seen": 954734592 }, { "epoch": 2.08, "learning_rate": 0.0003589568706118355, "loss": 2.9344, "theoretical_loss": 3.6653599905337035, "tokens_seen": 954800128 }, { "epoch": 2.08, "learning_rate": 0.0003589468405215647, "loss": 2.997, "theoretical_loss": 3.6653358439537294, "tokens_seen": 954865664 }, { "epoch": 2.08, "learning_rate": 0.00035893681043129387, "loss": 2.8163, "theoretical_loss": 3.6653116994949677, "tokens_seen": 954931200 }, { "epoch": 2.08, "learning_rate": 0.00035892678034102305, "loss": 2.8882, "theoretical_loss": 3.665287557157087, "tokens_seen": 954996736 }, { "epoch": 2.08, "learning_rate": 0.0003589167502507523, "loss": 2.9913, "theoretical_loss": 3.6652634169397547, "tokens_seen": 955062272 }, { "epoch": 2.08, "learning_rate": 0.0003589067201604814, "loss": 2.8181, "theoretical_loss": 3.6652392788426393, "tokens_seen": 955127808 }, { "epoch": 2.08, "objective/train/docs_used": 1537778, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0604515075683594, "objective/train/theoretical_loss": 3.6652272105890598, "objective/train/tokens_used": 975620576, "theoretical_loss": 3.6652272105890598, "tokens_seen": 955160576 }, { "epoch": 2.08, "learning_rate": 0.00035889669007021065, "loss": 2.9458, "theoretical_loss": 3.66521514286541, "tokens_seen": 955193344 }, { "epoch": 2.08, "learning_rate": 0.0003588866599799398, "loss": 2.9073, "theoretical_loss": 3.6651910090077333, "tokens_seen": 955258880 }, { "epoch": 2.08, "learning_rate": 0.000358876629889669, "loss": 2.9658, "theoretical_loss": 3.6651668772692796, "tokens_seen": 955324416 }, { "epoch": 2.08, "learning_rate": 0.0003588665997993982, "loss": 2.942, "theoretical_loss": 3.6651427476497167, "tokens_seen": 955389952 }, { "epoch": 2.08, "learning_rate": 0.0003588565697091274, "loss": 3.0567, "theoretical_loss": 3.6651186201487125, "tokens_seen": 955455488 }, { "epoch": 2.08, "learning_rate": 0.0003588465396188566, "loss": 2.9097, "theoretical_loss": 3.6650944947659374, "tokens_seen": 955521024 }, { "epoch": 2.08, "learning_rate": 0.00035883650952858574, "loss": 3.0464, "theoretical_loss": 3.6650703715010584, "tokens_seen": 955586560 }, { "epoch": 2.08, "learning_rate": 0.00035882647943831497, "loss": 2.9633, "theoretical_loss": 3.665046250353745, "tokens_seen": 955652096 }, { "epoch": 2.08, "learning_rate": 0.00035881644934804415, "loss": 2.7198, "theoretical_loss": 3.6650221313236666, "tokens_seen": 955717632 }, { "epoch": 2.08, "learning_rate": 0.00035880641925777333, "loss": 2.919, "theoretical_loss": 3.6649980144104917, "tokens_seen": 955783168 }, { "epoch": 2.08, "learning_rate": 0.0003587963891675025, "loss": 2.7491, "theoretical_loss": 3.6649738996138894, "tokens_seen": 955848704 }, { "epoch": 2.08, "learning_rate": 0.00035878635907723175, "loss": 2.9593, "theoretical_loss": 3.664949786933529, "tokens_seen": 955914240 }, { "epoch": 2.08, "learning_rate": 0.0003587763289869609, "loss": 2.8306, "theoretical_loss": 3.664925676369079, "tokens_seen": 955979776 }, { "epoch": 2.08, "learning_rate": 0.0003587662988966901, "loss": 2.8737, "theoretical_loss": 3.6649015679202104, "tokens_seen": 956045312 }, { "epoch": 2.08, "learning_rate": 0.00035875626880641924, "loss": 2.7876, "theoretical_loss": 3.664877461586591, "tokens_seen": 956110848 }, { "epoch": 2.08, "learning_rate": 0.0003587462387161485, "loss": 2.8577, "theoretical_loss": 3.6648533573678908, "tokens_seen": 956176384 }, { "epoch": 2.08, "learning_rate": 0.00035873620862587766, "loss": 2.7784, "theoretical_loss": 3.664829255263779, "tokens_seen": 956241920 }, { "epoch": 2.08, "learning_rate": 0.00035872617853560684, "loss": 2.8191, "theoretical_loss": 3.6648051552739256, "tokens_seen": 956307456 }, { "epoch": 2.08, "learning_rate": 0.000358716148445336, "loss": 2.852, "theoretical_loss": 3.664781057398, "tokens_seen": 956372992 }, { "epoch": 2.08, "learning_rate": 0.0003587061183550652, "loss": 2.8943, "theoretical_loss": 3.664756961635672, "tokens_seen": 956438528 }, { "epoch": 2.08, "learning_rate": 0.0003586960882647944, "loss": 2.7919, "theoretical_loss": 3.6647328679866122, "tokens_seen": 956504064 }, { "epoch": 2.08, "learning_rate": 0.0003586860581745236, "loss": 3.0777, "theoretical_loss": 3.664708776450489, "tokens_seen": 956569600 }, { "epoch": 2.08, "learning_rate": 0.00035867602808425274, "loss": 2.7611, "theoretical_loss": 3.6646846870269734, "tokens_seen": 956635136 }, { "epoch": 2.08, "learning_rate": 0.000358665997993982, "loss": 2.8532, "theoretical_loss": 3.664660599715735, "tokens_seen": 956700672 }, { "epoch": 2.08, "learning_rate": 0.00035865596790371116, "loss": 2.9985, "theoretical_loss": 3.6646365145164435, "tokens_seen": 956766208 }, { "epoch": 2.08, "objective/train/docs_used": 1539261, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8975367546081543, "objective/train/theoretical_loss": 3.6646244727086756, "objective/train/tokens_used": 977258976, "theoretical_loss": 3.6646244727086756, "tokens_seen": 956798976 }, { "epoch": 2.08, "learning_rate": 0.00035864593781344034, "loss": 2.7423, "theoretical_loss": 3.6646124314287705, "tokens_seen": 956831744 }, { "epoch": 2.08, "learning_rate": 0.0003586359077231695, "loss": 2.8981, "theoretical_loss": 3.664588350452385, "tokens_seen": 956897280 }, { "epoch": 2.08, "learning_rate": 0.0003586258776328987, "loss": 2.9175, "theoretical_loss": 3.6645642715869577, "tokens_seen": 956962816 }, { "epoch": 2.08, "learning_rate": 0.0003586158475426279, "loss": 2.8425, "theoretical_loss": 3.664540194832159, "tokens_seen": 957028352 }, { "epoch": 2.08, "learning_rate": 0.0003586058174523571, "loss": 2.8401, "theoretical_loss": 3.6645161201876597, "tokens_seen": 957093888 }, { "epoch": 2.08, "learning_rate": 0.00035859578736208625, "loss": 2.898, "theoretical_loss": 3.6644920476531295, "tokens_seen": 957159424 }, { "epoch": 2.08, "learning_rate": 0.0003585857572718155, "loss": 3.1086, "theoretical_loss": 3.66446797722824, "tokens_seen": 957224960 }, { "epoch": 2.08, "learning_rate": 0.0003585757271815446, "loss": 3.0204, "theoretical_loss": 3.664443908912661, "tokens_seen": 957290496 }, { "epoch": 2.08, "learning_rate": 0.00035856569709127384, "loss": 2.8582, "theoretical_loss": 3.6644198427060646, "tokens_seen": 957356032 }, { "epoch": 2.08, "learning_rate": 0.000358555667001003, "loss": 2.8327, "theoretical_loss": 3.66439577860812, "tokens_seen": 957421568 }, { "epoch": 2.08, "learning_rate": 0.0003585456369107322, "loss": 2.9816, "theoretical_loss": 3.6643717166184993, "tokens_seen": 957487104 }, { "epoch": 2.08, "learning_rate": 0.0003585356068204614, "loss": 3.0361, "theoretical_loss": 3.664347656736873, "tokens_seen": 957552640 }, { "epoch": 2.08, "learning_rate": 0.00035852557673019057, "loss": 2.8426, "theoretical_loss": 3.6643235989629126, "tokens_seen": 957618176 }, { "epoch": 2.08, "learning_rate": 0.00035851554663991975, "loss": 2.9563, "theoretical_loss": 3.6642995432962886, "tokens_seen": 957683712 }, { "epoch": 2.08, "learning_rate": 0.000358505516549649, "loss": 2.8099, "theoretical_loss": 3.6642754897366725, "tokens_seen": 957749248 }, { "epoch": 2.08, "learning_rate": 0.0003584954864593781, "loss": 2.939, "theoretical_loss": 3.664251438283736, "tokens_seen": 957814784 }, { "epoch": 2.08, "learning_rate": 0.00035848545636910735, "loss": 2.8787, "theoretical_loss": 3.66422738893715, "tokens_seen": 957880320 }, { "epoch": 2.08, "learning_rate": 0.00035847542627883653, "loss": 2.8135, "theoretical_loss": 3.6642033416965862, "tokens_seen": 957945856 }, { "epoch": 2.08, "learning_rate": 0.0003584653961885657, "loss": 2.9703, "theoretical_loss": 3.6641792965617155, "tokens_seen": 958011392 }, { "epoch": 2.08, "learning_rate": 0.0003584553660982949, "loss": 2.9024, "theoretical_loss": 3.6641552535322104, "tokens_seen": 958076928 }, { "epoch": 2.08, "learning_rate": 0.00035844533600802407, "loss": 2.9546, "theoretical_loss": 3.664131212607742, "tokens_seen": 958142464 }, { "epoch": 2.08, "learning_rate": 0.00035843530591775325, "loss": 3.0016, "theoretical_loss": 3.664107173787982, "tokens_seen": 958208000 }, { "epoch": 2.08, "learning_rate": 0.0003584252758274825, "loss": 2.9529, "theoretical_loss": 3.664083137072602, "tokens_seen": 958273536 }, { "epoch": 2.08, "learning_rate": 0.0003584152457372116, "loss": 2.8463, "theoretical_loss": 3.664059102461275, "tokens_seen": 958339072 }, { "epoch": 2.08, "learning_rate": 0.00035840521564694085, "loss": 2.8428, "theoretical_loss": 3.6640350699536715, "tokens_seen": 958404608 }, { "epoch": 2.08, "objective/train/docs_used": 1542166, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1807515621185303, "objective/train/theoretical_loss": 3.6640230544886645, "objective/train/tokens_used": 978897376, "theoretical_loss": 3.6640230544886645, "tokens_seen": 958437376 }, { "epoch": 2.08, "learning_rate": 0.00035839518555667, "loss": 2.928, "theoretical_loss": 3.6640110395494645, "tokens_seen": 958470144 }, { "epoch": 2.08, "learning_rate": 0.0003583851554663992, "loss": 2.7526, "theoretical_loss": 3.6639870112483264, "tokens_seen": 958535680 }, { "epoch": 2.08, "learning_rate": 0.0003583751253761284, "loss": 2.8625, "theoretical_loss": 3.6639629850499285, "tokens_seen": 958601216 }, { "epoch": 2.08, "learning_rate": 0.0003583650952858576, "loss": 2.7909, "theoretical_loss": 3.663938960953943, "tokens_seen": 958666752 }, { "epoch": 2.08, "learning_rate": 0.00035835506519558676, "loss": 2.9833, "theoretical_loss": 3.6639149389600427, "tokens_seen": 958732288 }, { "epoch": 2.08, "learning_rate": 0.00035834503510531594, "loss": 2.845, "theoretical_loss": 3.6638909190678994, "tokens_seen": 958797824 }, { "epoch": 2.08, "learning_rate": 0.0003583350050150451, "loss": 3.0776, "theoretical_loss": 3.6638669012771867, "tokens_seen": 958863360 }, { "epoch": 2.08, "learning_rate": 0.00035832497492477435, "loss": 2.6522, "theoretical_loss": 3.6638428855875764, "tokens_seen": 958928896 }, { "epoch": 2.08, "learning_rate": 0.0003583149448345035, "loss": 2.9712, "theoretical_loss": 3.6638188719987417, "tokens_seen": 958994432 }, { "epoch": 2.08, "learning_rate": 0.0003583049147442327, "loss": 2.9583, "theoretical_loss": 3.663794860510354, "tokens_seen": 959059968 }, { "epoch": 2.08, "learning_rate": 0.0003582948846539619, "loss": 2.9341, "theoretical_loss": 3.663770851122087, "tokens_seen": 959125504 }, { "epoch": 2.08, "learning_rate": 0.0003582848545636911, "loss": 2.8899, "theoretical_loss": 3.6637468438336134, "tokens_seen": 959191040 }, { "epoch": 2.08, "learning_rate": 0.00035827482447342026, "loss": 2.8212, "theoretical_loss": 3.663722838644606, "tokens_seen": 959256576 }, { "epoch": 2.08, "learning_rate": 0.00035826479438314944, "loss": 2.8806, "theoretical_loss": 3.6636988355547384, "tokens_seen": 959322112 }, { "epoch": 2.08, "learning_rate": 0.0003582547642928786, "loss": 2.8441, "theoretical_loss": 3.6636748345636825, "tokens_seen": 959387648 }, { "epoch": 2.08, "learning_rate": 0.00035824473420260786, "loss": 2.8124, "theoretical_loss": 3.6636508356711124, "tokens_seen": 959453184 }, { "epoch": 2.08, "learning_rate": 0.000358234704112337, "loss": 2.8743, "theoretical_loss": 3.6636268388767013, "tokens_seen": 959518720 }, { "epoch": 2.08, "learning_rate": 0.0003582246740220662, "loss": 2.9818, "theoretical_loss": 3.663602844180122, "tokens_seen": 959584256 }, { "epoch": 2.08, "learning_rate": 0.00035821464393179535, "loss": 2.8112, "theoretical_loss": 3.6635788515810477, "tokens_seen": 959649792 }, { "epoch": 2.08, "learning_rate": 0.0003582046138415246, "loss": 2.9766, "theoretical_loss": 3.663554861079152, "tokens_seen": 959715328 }, { "epoch": 2.08, "learning_rate": 0.00035819458375125376, "loss": 2.7765, "theoretical_loss": 3.6635308726741087, "tokens_seen": 959780864 }, { "epoch": 2.08, "learning_rate": 0.00035818455366098294, "loss": 3.0198, "theoretical_loss": 3.6635068863655915, "tokens_seen": 959846400 }, { "epoch": 2.08, "learning_rate": 0.0003581745235707121, "loss": 2.9788, "theoretical_loss": 3.663482902153273, "tokens_seen": 959911936 }, { "epoch": 2.08, "learning_rate": 0.00035816449348044136, "loss": 2.9587, "theoretical_loss": 3.6634589200368284, "tokens_seen": 959977472 }, { "epoch": 2.08, "learning_rate": 0.0003581544633901705, "loss": 2.9922, "theoretical_loss": 3.6634349400159305, "tokens_seen": 960043008 }, { "epoch": 2.08, "objective/train/docs_used": 1544419, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8362796306610107, "objective/train/theoretical_loss": 3.6634229507912095, "objective/train/tokens_used": 980535776, "theoretical_loss": 3.6634229507912095, "tokens_seen": 960075776 }, { "epoch": 2.08, "learning_rate": 0.0003581444332998997, "loss": 2.9844, "theoretical_loss": 3.6634109620902535, "tokens_seen": 960108544 }, { "epoch": 2.08, "learning_rate": 0.00035813440320962885, "loss": 2.8158, "theoretical_loss": 3.6633869862594706, "tokens_seen": 960174080 }, { "epoch": 2.08, "learning_rate": 0.0003581243731193581, "loss": 2.8409, "theoretical_loss": 3.663363012523257, "tokens_seen": 960239616 }, { "epoch": 2.08, "learning_rate": 0.00035811434302908727, "loss": 2.7402, "theoretical_loss": 3.663339040881286, "tokens_seen": 960305152 }, { "epoch": 2.08, "learning_rate": 0.00035810431293881645, "loss": 2.9145, "theoretical_loss": 3.6633150713332325, "tokens_seen": 960370688 }, { "epoch": 2.08, "learning_rate": 0.0003580942828485457, "loss": 2.9197, "theoretical_loss": 3.6632911038787697, "tokens_seen": 960436224 }, { "epoch": 2.08, "learning_rate": 0.0003580842527582748, "loss": 2.8378, "theoretical_loss": 3.6632671385175724, "tokens_seen": 960501760 }, { "epoch": 2.08, "learning_rate": 0.00035807422266800404, "loss": 2.8294, "theoretical_loss": 3.663243175249315, "tokens_seen": 960567296 }, { "epoch": 2.08, "learning_rate": 0.0003580641925777332, "loss": 2.8851, "theoretical_loss": 3.663219214073672, "tokens_seen": 960632832 }, { "epoch": 2.08, "learning_rate": 0.0003580541624874624, "loss": 2.8946, "theoretical_loss": 3.6631952549903177, "tokens_seen": 960698368 }, { "epoch": 2.08, "learning_rate": 0.0003580441323971916, "loss": 3.0316, "theoretical_loss": 3.663171297998927, "tokens_seen": 960763904 }, { "epoch": 2.08, "learning_rate": 0.00035803410230692077, "loss": 2.7943, "theoretical_loss": 3.6631473430991743, "tokens_seen": 960829440 }, { "epoch": 2.08, "learning_rate": 0.00035802407221664995, "loss": 3.0094, "theoretical_loss": 3.663123390290734, "tokens_seen": 960894976 }, { "epoch": 2.08, "learning_rate": 0.0003580140421263792, "loss": 2.8892, "theoretical_loss": 3.6630994395732817, "tokens_seen": 960960512 }, { "epoch": 2.08, "learning_rate": 0.0003580040120361083, "loss": 2.773, "theoretical_loss": 3.663075490946492, "tokens_seen": 961026048 }, { "epoch": 2.08, "learning_rate": 0.00035799398194583755, "loss": 3.032, "theoretical_loss": 3.66305154441004, "tokens_seen": 961091584 }, { "epoch": 2.08, "learning_rate": 0.00035798395185556673, "loss": 2.9277, "theoretical_loss": 3.6630275999636, "tokens_seen": 961157120 }, { "epoch": 2.08, "learning_rate": 0.0003579739217652959, "loss": 2.9159, "theoretical_loss": 3.663003657606848, "tokens_seen": 961222656 }, { "epoch": 2.08, "learning_rate": 0.0003579638916750251, "loss": 2.776, "theoretical_loss": 3.662979717339458, "tokens_seen": 961288192 }, { "epoch": 2.08, "learning_rate": 0.00035795386158475427, "loss": 2.7369, "theoretical_loss": 3.6629557791611065, "tokens_seen": 961353728 }, { "epoch": 2.08, "learning_rate": 0.00035794383149448345, "loss": 2.7819, "theoretical_loss": 3.662931843071468, "tokens_seen": 961419264 }, { "epoch": 2.08, "learning_rate": 0.0003579338014042127, "loss": 2.8367, "theoretical_loss": 3.662907909070219, "tokens_seen": 961484800 }, { "epoch": 2.08, "learning_rate": 0.0003579237713139418, "loss": 2.8966, "theoretical_loss": 3.662883977157034, "tokens_seen": 961550336 }, { "epoch": 2.08, "learning_rate": 0.00035791374122367105, "loss": 2.9299, "theoretical_loss": 3.662860047331588, "tokens_seen": 961615872 }, { "epoch": 2.08, "learning_rate": 0.0003579037111334002, "loss": 2.9916, "theoretical_loss": 3.6628361195935577, "tokens_seen": 961681408 }, { "epoch": 2.08, "objective/train/docs_used": 1547059, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.936875581741333, "objective/train/theoretical_loss": 3.662824156507222, "objective/train/tokens_used": 982174176, "theoretical_loss": 3.662824156507222, "tokens_seen": 961714176 }, { "epoch": 2.08, "learning_rate": 0.0003578936810431294, "loss": 2.7832, "theoretical_loss": 3.6628121939426186, "tokens_seen": 961746944 }, { "epoch": 2.08, "learning_rate": 0.0003578836509528586, "loss": 2.8606, "theoretical_loss": 3.6627882703784462, "tokens_seen": 961812480 }, { "epoch": 2.08, "learning_rate": 0.0003578736208625878, "loss": 2.7996, "theoretical_loss": 3.6627643489007164, "tokens_seen": 961878016 }, { "epoch": 2.08, "learning_rate": 0.00035786359077231696, "loss": 2.9348, "theoretical_loss": 3.662740429509105, "tokens_seen": 961943552 }, { "epoch": 2.08, "learning_rate": 0.00035785356068204614, "loss": 2.8311, "theoretical_loss": 3.6627165122032874, "tokens_seen": 962009088 }, { "epoch": 2.08, "learning_rate": 0.0003578435305917753, "loss": 2.9002, "theoretical_loss": 3.662692596982941, "tokens_seen": 962074624 }, { "epoch": 2.08, "learning_rate": 0.00035783350050150455, "loss": 2.812, "theoretical_loss": 3.662668683847741, "tokens_seen": 962140160 }, { "epoch": 2.08, "learning_rate": 0.0003578234704112337, "loss": 2.9007, "theoretical_loss": 3.662644772797364, "tokens_seen": 962205696 }, { "epoch": 2.08, "learning_rate": 0.0003578134403209629, "loss": 2.8294, "theoretical_loss": 3.662620863831486, "tokens_seen": 962271232 }, { "epoch": 2.08, "learning_rate": 0.0003578034102306921, "loss": 2.9905, "theoretical_loss": 3.662596956949783, "tokens_seen": 962336768 }, { "epoch": 2.08, "learning_rate": 0.0003577933801404213, "loss": 2.7614, "theoretical_loss": 3.6625730521519326, "tokens_seen": 962402304 }, { "epoch": 2.08, "learning_rate": 0.00035778335005015046, "loss": 2.926, "theoretical_loss": 3.6625491494376097, "tokens_seen": 962467840 }, { "epoch": 2.08, "learning_rate": 0.00035777331995987964, "loss": 2.9052, "theoretical_loss": 3.6625252488064914, "tokens_seen": 962533376 }, { "epoch": 2.08, "learning_rate": 0.0003577632898696088, "loss": 2.8415, "theoretical_loss": 3.662501350258255, "tokens_seen": 962598912 }, { "epoch": 2.08, "learning_rate": 0.00035775325977933806, "loss": 2.7614, "theoretical_loss": 3.662477453792577, "tokens_seen": 962664448 }, { "epoch": 2.08, "learning_rate": 0.0003577432296890672, "loss": 2.976, "theoretical_loss": 3.6624535594091334, "tokens_seen": 962729984 }, { "epoch": 2.08, "learning_rate": 0.0003577331995987964, "loss": 2.8909, "theoretical_loss": 3.662429667107602, "tokens_seen": 962795520 }, { "epoch": 2.08, "learning_rate": 0.00035772316950852555, "loss": 2.7976, "theoretical_loss": 3.662405776887659, "tokens_seen": 962861056 }, { "epoch": 2.08, "learning_rate": 0.0003577131394182548, "loss": 2.6602, "theoretical_loss": 3.6623818887489814, "tokens_seen": 962926592 }, { "epoch": 2.08, "learning_rate": 0.00035770310932798396, "loss": 3.0172, "theoretical_loss": 3.6623580026912466, "tokens_seen": 962992128 }, { "epoch": 2.08, "learning_rate": 0.00035769307923771314, "loss": 2.8379, "theoretical_loss": 3.6623341187141314, "tokens_seen": 963057664 }, { "epoch": 2.08, "learning_rate": 0.0003576830491474423, "loss": 3.009, "theoretical_loss": 3.6623102368173135, "tokens_seen": 963123200 }, { "epoch": 2.08, "learning_rate": 0.00035767301905717156, "loss": 2.7523, "theoretical_loss": 3.66228635700047, "tokens_seen": 963188736 }, { "epoch": 2.08, "learning_rate": 0.0003576629889669007, "loss": 2.8901, "theoretical_loss": 3.6622624792632776, "tokens_seen": 963254272 }, { "epoch": 2.08, "learning_rate": 0.0003576529588766299, "loss": 2.9816, "theoretical_loss": 3.6622386036054144, "tokens_seen": 963319808 }, { "epoch": 2.08, "objective/train/docs_used": 1550039, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7122764587402344, "objective/train/theoretical_loss": 3.6622266665561307, "objective/train/tokens_used": 983812576, "theoretical_loss": 3.6622266665561307, "tokens_seen": 963352576 }, { "epoch": 2.08, "learning_rate": 0.00035764292878635905, "loss": 2.7542, "theoretical_loss": 3.662214730026558, "tokens_seen": 963385344 }, { "epoch": 2.08, "learning_rate": 0.0003576328986960883, "loss": 2.8501, "theoretical_loss": 3.6621908585263854, "tokens_seen": 963450880 }, { "epoch": 2.08, "learning_rate": 0.00035762286860581747, "loss": 2.9601, "theoretical_loss": 3.6621669891045743, "tokens_seen": 963516416 }, { "epoch": 2.08, "learning_rate": 0.00035761283851554665, "loss": 2.9907, "theoretical_loss": 3.6621431217608027, "tokens_seen": 963581952 }, { "epoch": 2.08, "learning_rate": 0.00035760280842527583, "loss": 2.9747, "theoretical_loss": 3.6621192564947487, "tokens_seen": 963647488 }, { "epoch": 2.08, "learning_rate": 0.000357592778335005, "loss": 2.7951, "theoretical_loss": 3.6620953933060894, "tokens_seen": 963713024 }, { "epoch": 2.08, "learning_rate": 0.0003575827482447342, "loss": 2.8351, "theoretical_loss": 3.6620715321945028, "tokens_seen": 963778560 }, { "epoch": 2.08, "learning_rate": 0.0003575727181544634, "loss": 2.7244, "theoretical_loss": 3.662047673159668, "tokens_seen": 963844096 }, { "epoch": 2.08, "learning_rate": 0.00035756268806419255, "loss": 2.8565, "theoretical_loss": 3.6620238162012613, "tokens_seen": 963909632 }, { "epoch": 2.09, "learning_rate": 0.0003575526579739218, "loss": 2.8586, "theoretical_loss": 3.661999961318962, "tokens_seen": 963975168 }, { "epoch": 2.09, "learning_rate": 0.0003575426278836509, "loss": 2.7938, "theoretical_loss": 3.6619761085124476, "tokens_seen": 964040704 }, { "epoch": 2.09, "learning_rate": 0.00035753259779338015, "loss": 2.9118, "theoretical_loss": 3.6619522577813974, "tokens_seen": 964106240 }, { "epoch": 2.09, "learning_rate": 0.00035752256770310933, "loss": 2.8854, "theoretical_loss": 3.6619284091254887, "tokens_seen": 964171776 }, { "epoch": 2.09, "learning_rate": 0.0003575125376128385, "loss": 2.7953, "theoretical_loss": 3.661904562544401, "tokens_seen": 964237312 }, { "epoch": 2.09, "learning_rate": 0.0003575025075225677, "loss": 2.8644, "theoretical_loss": 3.6618807180378115, "tokens_seen": 964302848 }, { "epoch": 2.09, "learning_rate": 0.00035749247743229693, "loss": 2.8563, "theoretical_loss": 3.6618568756054, "tokens_seen": 964368384 }, { "epoch": 2.09, "learning_rate": 0.00035748244734202606, "loss": 2.7214, "theoretical_loss": 3.661833035246844, "tokens_seen": 964433920 }, { "epoch": 2.09, "learning_rate": 0.0003574724172517553, "loss": 2.9169, "theoretical_loss": 3.6618091969618227, "tokens_seen": 964499456 }, { "epoch": 2.09, "learning_rate": 0.0003574623871614844, "loss": 3.19, "theoretical_loss": 3.6617853607500153, "tokens_seen": 964564992 }, { "epoch": 2.09, "learning_rate": 0.00035745235707121365, "loss": 3.0258, "theoretical_loss": 3.6617615266111, "tokens_seen": 964630528 }, { "epoch": 2.09, "learning_rate": 0.00035744232698094283, "loss": 3.0045, "theoretical_loss": 3.6617376945447564, "tokens_seen": 964696064 }, { "epoch": 2.09, "learning_rate": 0.000357432296890672, "loss": 2.7623, "theoretical_loss": 3.661713864550663, "tokens_seen": 964761600 }, { "epoch": 2.09, "learning_rate": 0.0003574222668004012, "loss": 2.8479, "theoretical_loss": 3.6616900366284986, "tokens_seen": 964827136 }, { "epoch": 2.09, "learning_rate": 0.0003574122367101304, "loss": 2.6557, "theoretical_loss": 3.661666210777943, "tokens_seen": 964892672 }, { "epoch": 2.09, "learning_rate": 0.00035740220661985956, "loss": 2.7597, "theoretical_loss": 3.6616423869986754, "tokens_seen": 964958208 }, { "epoch": 2.09, "objective/train/docs_used": 1552779, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.957716226577759, "objective/train/theoretical_loss": 3.6616304758856737, "objective/train/tokens_used": 985450976, "theoretical_loss": 3.6616304758856737, "tokens_seen": 964990976 }, { "epoch": 2.09, "learning_rate": 0.0003573921765295888, "loss": 3.0026, "theoretical_loss": 3.6616185652903743, "tokens_seen": 965023744 }, { "epoch": 2.09, "learning_rate": 0.0003573821464393179, "loss": 2.7386, "theoretical_loss": 3.6615947456527196, "tokens_seen": 965089280 }, { "epoch": 2.09, "learning_rate": 0.00035737211634904716, "loss": 2.9809, "theoretical_loss": 3.661570928085391, "tokens_seen": 965154816 }, { "epoch": 2.09, "learning_rate": 0.0003573620862587763, "loss": 3.0081, "theoretical_loss": 3.6615471125880674, "tokens_seen": 965220352 }, { "epoch": 2.09, "learning_rate": 0.0003573520561685055, "loss": 2.9899, "theoretical_loss": 3.6615232991604283, "tokens_seen": 965285888 }, { "epoch": 2.09, "learning_rate": 0.00035734202607823475, "loss": 3.0801, "theoretical_loss": 3.6614994878021543, "tokens_seen": 965351424 }, { "epoch": 2.09, "learning_rate": 0.0003573319959879639, "loss": 2.9339, "theoretical_loss": 3.661475678512924, "tokens_seen": 965416960 }, { "epoch": 2.09, "learning_rate": 0.0003573219658976931, "loss": 2.8056, "theoretical_loss": 3.6614518712924173, "tokens_seen": 965482496 }, { "epoch": 2.09, "learning_rate": 0.0003573119358074223, "loss": 2.718, "theoretical_loss": 3.6614280661403154, "tokens_seen": 965548032 }, { "epoch": 2.09, "learning_rate": 0.0003573019057171515, "loss": 2.9569, "theoretical_loss": 3.6614042630562964, "tokens_seen": 965613568 }, { "epoch": 2.09, "learning_rate": 0.00035729187562688066, "loss": 2.9628, "theoretical_loss": 3.661380462040041, "tokens_seen": 965679104 }, { "epoch": 2.09, "learning_rate": 0.00035728184553660984, "loss": 2.8832, "theoretical_loss": 3.6613566630912304, "tokens_seen": 965744640 }, { "epoch": 2.09, "learning_rate": 0.000357271815446339, "loss": 2.7079, "theoretical_loss": 3.661332866209543, "tokens_seen": 965810176 }, { "epoch": 2.09, "learning_rate": 0.00035726178535606826, "loss": 2.8147, "theoretical_loss": 3.661309071394659, "tokens_seen": 965875712 }, { "epoch": 2.09, "learning_rate": 0.0003572517552657974, "loss": 2.9809, "theoretical_loss": 3.6612852786462606, "tokens_seen": 965941248 }, { "epoch": 2.09, "learning_rate": 0.0003572417251755266, "loss": 2.9602, "theoretical_loss": 3.661261487964026, "tokens_seen": 966006784 }, { "epoch": 2.09, "learning_rate": 0.00035723169508525575, "loss": 2.9594, "theoretical_loss": 3.6612376993476374, "tokens_seen": 966072320 }, { "epoch": 2.09, "learning_rate": 0.000357221664994985, "loss": 2.7647, "theoretical_loss": 3.661213912796774, "tokens_seen": 966137856 }, { "epoch": 2.09, "learning_rate": 0.00035721163490471416, "loss": 2.945, "theoretical_loss": 3.661190128311117, "tokens_seen": 966203392 }, { "epoch": 2.09, "learning_rate": 0.00035720160481444334, "loss": 3.0505, "theoretical_loss": 3.6611663458903463, "tokens_seen": 966268928 }, { "epoch": 2.09, "learning_rate": 0.0003571915747241725, "loss": 3.0443, "theoretical_loss": 3.661142565534144, "tokens_seen": 966334464 }, { "epoch": 2.09, "learning_rate": 0.00035718154463390176, "loss": 2.8076, "theoretical_loss": 3.661118787242189, "tokens_seen": 966400000 }, { "epoch": 2.09, "learning_rate": 0.0003571715145436309, "loss": 2.8222, "theoretical_loss": 3.661095011014164, "tokens_seen": 966465536 }, { "epoch": 2.09, "learning_rate": 0.0003571614844533601, "loss": 3.0309, "theoretical_loss": 3.6610712368497484, "tokens_seen": 966531072 }, { "epoch": 2.09, "learning_rate": 0.00035715145436308925, "loss": 2.8189, "theoretical_loss": 3.6610474647486244, "tokens_seen": 966596608 }, { "epoch": 2.09, "objective/train/docs_used": 1555485, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.546677827835083, "objective/train/theoretical_loss": 3.6610355794716964, "objective/train/tokens_used": 987089376, "theoretical_loss": 3.6610355794716964, "tokens_seen": 966629376 }, { "epoch": 2.09, "learning_rate": 0.0003571414242728185, "loss": 2.6851, "theoretical_loss": 3.661023694710472, "tokens_seen": 966662144 }, { "epoch": 2.09, "learning_rate": 0.00035713139418254767, "loss": 2.8656, "theoretical_loss": 3.660999926734973, "tokens_seen": 966727680 }, { "epoch": 2.09, "learning_rate": 0.00035712136409227685, "loss": 2.8367, "theoretical_loss": 3.660976160821809, "tokens_seen": 966793216 }, { "epoch": 2.09, "learning_rate": 0.00035711133400200603, "loss": 2.9751, "theoretical_loss": 3.66095239697066, "tokens_seen": 966858752 }, { "epoch": 2.09, "learning_rate": 0.0003571013039117352, "loss": 2.8834, "theoretical_loss": 3.6609286351812083, "tokens_seen": 966924288 }, { "epoch": 2.09, "learning_rate": 0.0003570912738214644, "loss": 2.8333, "theoretical_loss": 3.6609048754531353, "tokens_seen": 966989824 }, { "epoch": 2.09, "learning_rate": 0.0003570812437311936, "loss": 2.8909, "theoretical_loss": 3.660881117786122, "tokens_seen": 967055360 }, { "epoch": 2.09, "learning_rate": 0.00035707121364092275, "loss": 2.9111, "theoretical_loss": 3.6608573621798506, "tokens_seen": 967120896 }, { "epoch": 2.09, "learning_rate": 0.000357061183550652, "loss": 2.8387, "theoretical_loss": 3.660833608634002, "tokens_seen": 967186432 }, { "epoch": 2.09, "learning_rate": 0.0003570511534603811, "loss": 2.9047, "theoretical_loss": 3.660809857148258, "tokens_seen": 967251968 }, { "epoch": 2.09, "learning_rate": 0.00035704112337011035, "loss": 2.958, "theoretical_loss": 3.6607861077223007, "tokens_seen": 967317504 }, { "epoch": 2.09, "learning_rate": 0.00035703109327983953, "loss": 2.9779, "theoretical_loss": 3.660762360355812, "tokens_seen": 967383040 }, { "epoch": 2.09, "learning_rate": 0.0003570210631895687, "loss": 2.8478, "theoretical_loss": 3.6607386150484738, "tokens_seen": 967448576 }, { "epoch": 2.09, "learning_rate": 0.0003570110330992979, "loss": 2.9308, "theoretical_loss": 3.660714871799968, "tokens_seen": 967514112 }, { "epoch": 2.09, "learning_rate": 0.00035700100300902713, "loss": 2.9821, "theoretical_loss": 3.660691130609976, "tokens_seen": 967579648 }, { "epoch": 2.09, "learning_rate": 0.00035699097291875626, "loss": 2.9907, "theoretical_loss": 3.6606673914781807, "tokens_seen": 967645184 }, { "epoch": 2.09, "learning_rate": 0.0003569809428284855, "loss": 2.9263, "theoretical_loss": 3.6606436544042644, "tokens_seen": 967710720 }, { "epoch": 2.09, "learning_rate": 0.0003569709127382146, "loss": 3.0009, "theoretical_loss": 3.660619919387909, "tokens_seen": 967776256 }, { "epoch": 2.09, "learning_rate": 0.00035696088264794385, "loss": 2.7783, "theoretical_loss": 3.660596186428797, "tokens_seen": 967841792 }, { "epoch": 2.09, "learning_rate": 0.00035695085255767303, "loss": 2.9226, "theoretical_loss": 3.6605724555266104, "tokens_seen": 967907328 }, { "epoch": 2.09, "learning_rate": 0.0003569408224674022, "loss": 2.8979, "theoretical_loss": 3.6605487266810317, "tokens_seen": 967972864 }, { "epoch": 2.09, "learning_rate": 0.0003569307923771314, "loss": 2.9203, "theoretical_loss": 3.6605249998917437, "tokens_seen": 968038400 }, { "epoch": 2.09, "learning_rate": 0.0003569207622868606, "loss": 2.9683, "theoretical_loss": 3.6605012751584294, "tokens_seen": 968103936 }, { "epoch": 2.09, "learning_rate": 0.00035691073219658976, "loss": 2.8684, "theoretical_loss": 3.6604775524807707, "tokens_seen": 968169472 }, { "epoch": 2.09, "learning_rate": 0.000356900702106319, "loss": 2.824, "theoretical_loss": 3.660453831858451, "tokens_seen": 968235008 }, { "epoch": 2.09, "objective/train/docs_used": 1556912, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4819939136505127, "objective/train/theoretical_loss": 3.660441972317944, "objective/train/tokens_used": 988727776, "theoretical_loss": 3.660441972317944, "tokens_seen": 968267776 }, { "epoch": 2.09, "learning_rate": 0.0003568906720160481, "loss": 2.7503, "theoretical_loss": 3.660430113291153, "tokens_seen": 968300544 }, { "epoch": 2.09, "learning_rate": 0.00035688064192577736, "loss": 2.8845, "theoretical_loss": 3.6604063967785594, "tokens_seen": 968366080 }, { "epoch": 2.09, "learning_rate": 0.0003568706118355065, "loss": 2.9593, "theoretical_loss": 3.6603826823203534, "tokens_seen": 968431616 }, { "epoch": 2.09, "learning_rate": 0.0003568605817452357, "loss": 2.8275, "theoretical_loss": 3.6603589699162176, "tokens_seen": 968497152 }, { "epoch": 2.09, "learning_rate": 0.0003568505516549649, "loss": 2.8178, "theoretical_loss": 3.6603352595658354, "tokens_seen": 968562688 }, { "epoch": 2.09, "learning_rate": 0.0003568405215646941, "loss": 2.9202, "theoretical_loss": 3.6603115512688897, "tokens_seen": 968628224 }, { "epoch": 2.09, "learning_rate": 0.00035683049147442326, "loss": 2.9904, "theoretical_loss": 3.6602878450250644, "tokens_seen": 968693760 }, { "epoch": 2.09, "learning_rate": 0.0003568204613841525, "loss": 2.8804, "theoretical_loss": 3.6602641408340424, "tokens_seen": 968759296 }, { "epoch": 2.09, "learning_rate": 0.0003568104312938816, "loss": 3.0311, "theoretical_loss": 3.660240438695507, "tokens_seen": 968824832 }, { "epoch": 2.09, "learning_rate": 0.00035680040120361086, "loss": 2.8204, "theoretical_loss": 3.660216738609142, "tokens_seen": 968890368 }, { "epoch": 2.09, "learning_rate": 0.00035679037111334, "loss": 3.0162, "theoretical_loss": 3.6601930405746304, "tokens_seen": 968955904 }, { "epoch": 2.09, "learning_rate": 0.0003567803410230692, "loss": 2.924, "theoretical_loss": 3.660169344591656, "tokens_seen": 969021440 }, { "epoch": 2.09, "learning_rate": 0.0003567703109327984, "loss": 2.8548, "theoretical_loss": 3.660145650659903, "tokens_seen": 969086976 }, { "epoch": 2.09, "learning_rate": 0.0003567602808425276, "loss": 2.9601, "theoretical_loss": 3.660121958779054, "tokens_seen": 969152512 }, { "epoch": 2.09, "learning_rate": 0.00035675025075225677, "loss": 2.8052, "theoretical_loss": 3.660098268948794, "tokens_seen": 969218048 }, { "epoch": 2.09, "learning_rate": 0.00035674022066198595, "loss": 2.9765, "theoretical_loss": 3.6600745811688062, "tokens_seen": 969283584 }, { "epoch": 2.09, "learning_rate": 0.00035673019057171513, "loss": 2.9345, "theoretical_loss": 3.6600508954387747, "tokens_seen": 969349120 }, { "epoch": 2.09, "learning_rate": 0.00035672016048144436, "loss": 2.8118, "theoretical_loss": 3.6600272117583836, "tokens_seen": 969414656 }, { "epoch": 2.09, "learning_rate": 0.0003567101303911735, "loss": 2.7335, "theoretical_loss": 3.6600035301273177, "tokens_seen": 969480192 }, { "epoch": 2.09, "learning_rate": 0.0003567001003009027, "loss": 2.8341, "theoretical_loss": 3.65997985054526, "tokens_seen": 969545728 }, { "epoch": 2.09, "learning_rate": 0.00035669007021063185, "loss": 2.7865, "theoretical_loss": 3.659956173011895, "tokens_seen": 969611264 }, { "epoch": 2.09, "learning_rate": 0.0003566800401203611, "loss": 2.7395, "theoretical_loss": 3.6599324975269063, "tokens_seen": 969676800 }, { "epoch": 2.09, "learning_rate": 0.00035667001003009027, "loss": 2.9136, "theoretical_loss": 3.65990882408998, "tokens_seen": 969742336 }, { "epoch": 2.09, "learning_rate": 0.00035665997993981945, "loss": 2.9216, "theoretical_loss": 3.6598851527007996, "tokens_seen": 969807872 }, { "epoch": 2.09, "learning_rate": 0.00035664994984954863, "loss": 2.9517, "theoretical_loss": 3.6598614833590495, "tokens_seen": 969873408 }, { "epoch": 2.09, "objective/train/docs_used": 1559557, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9658780097961426, "objective/train/theoretical_loss": 3.6598496494558623, "objective/train/tokens_used": 990366176, "theoretical_loss": 3.6598496494558623, "tokens_seen": 969906176 }, { "epoch": 2.09, "learning_rate": 0.00035663991975927787, "loss": 2.9091, "theoretical_loss": 3.6598378160644147, "tokens_seen": 969938944 }, { "epoch": 2.09, "learning_rate": 0.000356629889669007, "loss": 2.9405, "theoretical_loss": 3.659814150816579, "tokens_seen": 970004480 }, { "epoch": 2.09, "learning_rate": 0.00035661985957873623, "loss": 3.048, "theoretical_loss": 3.6597904876152283, "tokens_seen": 970070016 }, { "epoch": 2.09, "learning_rate": 0.00035660982948846536, "loss": 2.626, "theoretical_loss": 3.6597668264600465, "tokens_seen": 970135552 }, { "epoch": 2.09, "learning_rate": 0.0003565997993981946, "loss": 3.0229, "theoretical_loss": 3.659743167350719, "tokens_seen": 970201088 }, { "epoch": 2.09, "learning_rate": 0.0003565897693079238, "loss": 3.0986, "theoretical_loss": 3.65971951028693, "tokens_seen": 970266624 }, { "epoch": 2.09, "learning_rate": 0.00035657973921765295, "loss": 2.9795, "theoretical_loss": 3.659695855268365, "tokens_seen": 970332160 }, { "epoch": 2.09, "learning_rate": 0.0003565697091273822, "loss": 2.8357, "theoretical_loss": 3.6596722022947095, "tokens_seen": 970397696 }, { "epoch": 2.09, "learning_rate": 0.0003565596790371113, "loss": 2.8979, "theoretical_loss": 3.659648551365648, "tokens_seen": 970463232 }, { "epoch": 2.09, "learning_rate": 0.00035654964894684055, "loss": 2.9757, "theoretical_loss": 3.659624902480866, "tokens_seen": 970528768 }, { "epoch": 2.09, "learning_rate": 0.00035653961885656973, "loss": 2.8855, "theoretical_loss": 3.6596012556400486, "tokens_seen": 970594304 }, { "epoch": 2.09, "learning_rate": 0.0003565295887662989, "loss": 3.0182, "theoretical_loss": 3.6595776108428812, "tokens_seen": 970659840 }, { "epoch": 2.09, "learning_rate": 0.0003565195586760281, "loss": 2.9054, "theoretical_loss": 3.659553968089049, "tokens_seen": 970725376 }, { "epoch": 2.09, "learning_rate": 0.00035650952858575733, "loss": 2.7913, "theoretical_loss": 3.659530327378238, "tokens_seen": 970790912 }, { "epoch": 2.09, "learning_rate": 0.00035649949849548646, "loss": 2.8807, "theoretical_loss": 3.6595066887101337, "tokens_seen": 970856448 }, { "epoch": 2.09, "learning_rate": 0.0003564894684052157, "loss": 2.9635, "theoretical_loss": 3.659483052084421, "tokens_seen": 970921984 }, { "epoch": 2.09, "learning_rate": 0.0003564794383149448, "loss": 2.8219, "theoretical_loss": 3.6594594175007864, "tokens_seen": 970987520 }, { "epoch": 2.09, "learning_rate": 0.00035646940822467405, "loss": 2.9961, "theoretical_loss": 3.659435784958915, "tokens_seen": 971053056 }, { "epoch": 2.09, "learning_rate": 0.00035645937813440323, "loss": 2.9186, "theoretical_loss": 3.6594121544584937, "tokens_seen": 971118592 }, { "epoch": 2.09, "learning_rate": 0.0003564493480441324, "loss": 3.0107, "theoretical_loss": 3.6593885259992067, "tokens_seen": 971184128 }, { "epoch": 2.09, "learning_rate": 0.0003564393179538616, "loss": 2.8764, "theoretical_loss": 3.6593648995807415, "tokens_seen": 971249664 }, { "epoch": 2.09, "learning_rate": 0.0003564292878635908, "loss": 2.8935, "theoretical_loss": 3.659341275202784, "tokens_seen": 971315200 }, { "epoch": 2.09, "learning_rate": 0.00035641925777331996, "loss": 2.8434, "theoretical_loss": 3.6593176528650195, "tokens_seen": 971380736 }, { "epoch": 2.09, "learning_rate": 0.0003564092276830492, "loss": 2.8724, "theoretical_loss": 3.659294032567135, "tokens_seen": 971446272 }, { "epoch": 2.09, "learning_rate": 0.0003563991975927783, "loss": 2.8941, "theoretical_loss": 3.659270414308816, "tokens_seen": 971511808 }, { "epoch": 2.09, "objective/train/docs_used": 1562551, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8056797981262207, "objective/train/theoretical_loss": 3.6592586059443954, "objective/train/tokens_used": 992004576, "theoretical_loss": 3.6592586059443954, "tokens_seen": 971544576 }, { "epoch": 2.09, "learning_rate": 0.00035638916750250756, "loss": 2.9521, "theoretical_loss": 3.6592467980897494, "tokens_seen": 971577344 }, { "epoch": 2.09, "learning_rate": 0.0003563791374122367, "loss": 2.8828, "theoretical_loss": 3.6592231839096208, "tokens_seen": 971642880 }, { "epoch": 2.09, "learning_rate": 0.0003563691073219659, "loss": 2.8026, "theoretical_loss": 3.6591995717681174, "tokens_seen": 971708416 }, { "epoch": 2.09, "learning_rate": 0.0003563590772316951, "loss": 2.8439, "theoretical_loss": 3.659175961664926, "tokens_seen": 971773952 }, { "epoch": 2.09, "learning_rate": 0.0003563490471414243, "loss": 2.852, "theoretical_loss": 3.659152353599733, "tokens_seen": 971839488 }, { "epoch": 2.09, "learning_rate": 0.00035633901705115346, "loss": 2.9405, "theoretical_loss": 3.659128747572224, "tokens_seen": 971905024 }, { "epoch": 2.09, "learning_rate": 0.0003563289869608827, "loss": 2.8414, "theoretical_loss": 3.659105143582087, "tokens_seen": 971970560 }, { "epoch": 2.09, "learning_rate": 0.0003563189568706118, "loss": 2.8005, "theoretical_loss": 3.6590815416290083, "tokens_seen": 972036096 }, { "epoch": 2.09, "learning_rate": 0.00035630892678034106, "loss": 2.9352, "theoretical_loss": 3.6590579417126747, "tokens_seen": 972101632 }, { "epoch": 2.09, "learning_rate": 0.0003562988966900702, "loss": 2.8242, "theoretical_loss": 3.659034343832774, "tokens_seen": 972167168 }, { "epoch": 2.09, "learning_rate": 0.0003562888665997994, "loss": 3.0067, "theoretical_loss": 3.659010747988992, "tokens_seen": 972232704 }, { "epoch": 2.09, "learning_rate": 0.0003562788365095286, "loss": 2.8685, "theoretical_loss": 3.658987154181016, "tokens_seen": 972298240 }, { "epoch": 2.09, "learning_rate": 0.0003562688064192578, "loss": 2.869, "theoretical_loss": 3.6589635624085335, "tokens_seen": 972363776 }, { "epoch": 2.09, "learning_rate": 0.00035625877632898697, "loss": 2.8063, "theoretical_loss": 3.6589399726712317, "tokens_seen": 972429312 }, { "epoch": 2.09, "learning_rate": 0.00035624874623871615, "loss": 2.7468, "theoretical_loss": 3.6589163849687982, "tokens_seen": 972494848 }, { "epoch": 2.09, "learning_rate": 0.00035623871614844533, "loss": 2.7496, "theoretical_loss": 3.6588927993009195, "tokens_seen": 972560384 }, { "epoch": 2.09, "learning_rate": 0.00035622868605817456, "loss": 2.9768, "theoretical_loss": 3.658869215667284, "tokens_seen": 972625920 }, { "epoch": 2.09, "learning_rate": 0.0003562186559679037, "loss": 2.8475, "theoretical_loss": 3.658845634067579, "tokens_seen": 972691456 }, { "epoch": 2.09, "learning_rate": 0.0003562086258776329, "loss": 2.9523, "theoretical_loss": 3.658822054501491, "tokens_seen": 972756992 }, { "epoch": 2.09, "learning_rate": 0.00035619859578736205, "loss": 2.9979, "theoretical_loss": 3.658798476968709, "tokens_seen": 972822528 }, { "epoch": 2.09, "learning_rate": 0.0003561885656970913, "loss": 2.8892, "theoretical_loss": 3.65877490146892, "tokens_seen": 972888064 }, { "epoch": 2.09, "learning_rate": 0.00035617853560682047, "loss": 2.8606, "theoretical_loss": 3.658751328001812, "tokens_seen": 972953600 }, { "epoch": 2.09, "learning_rate": 0.00035616850551654965, "loss": 2.9679, "theoretical_loss": 3.658727756567073, "tokens_seen": 973019136 }, { "epoch": 2.09, "learning_rate": 0.00035615847542627883, "loss": 2.9292, "theoretical_loss": 3.65870418716439, "tokens_seen": 973084672 }, { "epoch": 2.09, "learning_rate": 0.00035614844533600807, "loss": 2.834, "theoretical_loss": 3.658680619793452, "tokens_seen": 973150208 }, { "epoch": 2.09, "objective/train/docs_used": 1565549, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7884864807128906, "objective/train/theoretical_loss": 3.65866883686979, "objective/train/tokens_used": 993642976, "theoretical_loss": 3.65866883686979, "tokens_seen": 973182976 }, { "epoch": 2.09, "learning_rate": 0.0003561384152457372, "loss": 2.9672, "theoretical_loss": 3.658657054453947, "tokens_seen": 973215744 }, { "epoch": 2.09, "learning_rate": 0.00035612838515546643, "loss": 2.7714, "theoretical_loss": 3.6586334911455625, "tokens_seen": 973281280 }, { "epoch": 2.09, "learning_rate": 0.00035611835506519556, "loss": 2.8787, "theoretical_loss": 3.658609929867987, "tokens_seen": 973346816 }, { "epoch": 2.09, "learning_rate": 0.0003561083249749248, "loss": 2.9298, "theoretical_loss": 3.6585863706209087, "tokens_seen": 973412352 }, { "epoch": 2.09, "learning_rate": 0.00035609829488465397, "loss": 2.8534, "theoretical_loss": 3.658562813404016, "tokens_seen": 973477888 }, { "epoch": 2.09, "learning_rate": 0.00035608826479438315, "loss": 2.8293, "theoretical_loss": 3.6585392582169973, "tokens_seen": 973543424 }, { "epoch": 2.09, "learning_rate": 0.00035607823470411233, "loss": 2.9309, "theoretical_loss": 3.6585157050595414, "tokens_seen": 973608960 }, { "epoch": 2.09, "learning_rate": 0.0003560682046138415, "loss": 2.9029, "theoretical_loss": 3.658492153931336, "tokens_seen": 973674496 }, { "epoch": 2.09, "learning_rate": 0.0003560581745235707, "loss": 2.9103, "theoretical_loss": 3.6584686048320707, "tokens_seen": 973740032 }, { "epoch": 2.09, "learning_rate": 0.00035604814443329993, "loss": 2.8001, "theoretical_loss": 3.6584450577614334, "tokens_seen": 973805568 }, { "epoch": 2.09, "learning_rate": 0.00035603811434302906, "loss": 2.9348, "theoretical_loss": 3.658421512719113, "tokens_seen": 973871104 }, { "epoch": 2.09, "learning_rate": 0.0003560280842527583, "loss": 2.9681, "theoretical_loss": 3.658397969704798, "tokens_seen": 973936640 }, { "epoch": 2.09, "learning_rate": 0.0003560180541624874, "loss": 2.8197, "theoretical_loss": 3.6583744287181785, "tokens_seen": 974002176 }, { "epoch": 2.09, "learning_rate": 0.00035600802407221666, "loss": 3.0199, "theoretical_loss": 3.6583508897589425, "tokens_seen": 974067712 }, { "epoch": 2.09, "learning_rate": 0.00035599799398194584, "loss": 3.0307, "theoretical_loss": 3.658327352826779, "tokens_seen": 974133248 }, { "epoch": 2.09, "learning_rate": 0.000355987963891675, "loss": 2.8483, "theoretical_loss": 3.658303817921377, "tokens_seen": 974198784 }, { "epoch": 2.09, "learning_rate": 0.0003559779338014042, "loss": 2.8948, "theoretical_loss": 3.6582802850424256, "tokens_seen": 974264320 }, { "epoch": 2.09, "learning_rate": 0.00035596790371113344, "loss": 2.917, "theoretical_loss": 3.658256754189615, "tokens_seen": 974329856 }, { "epoch": 2.09, "learning_rate": 0.00035595787362086256, "loss": 2.8011, "theoretical_loss": 3.6582332253626335, "tokens_seen": 974395392 }, { "epoch": 2.09, "learning_rate": 0.0003559478435305918, "loss": 2.9358, "theoretical_loss": 3.65820969856117, "tokens_seen": 974460928 }, { "epoch": 2.09, "learning_rate": 0.0003559378134403209, "loss": 2.8621, "theoretical_loss": 3.658186173784916, "tokens_seen": 974526464 }, { "epoch": 2.09, "learning_rate": 0.00035592778335005016, "loss": 2.8745, "theoretical_loss": 3.6581626510335585, "tokens_seen": 974592000 }, { "epoch": 2.09, "learning_rate": 0.00035591775325977934, "loss": 2.9739, "theoretical_loss": 3.6581391303067887, "tokens_seen": 974657536 }, { "epoch": 2.09, "learning_rate": 0.0003559077231695085, "loss": 3.0823, "theoretical_loss": 3.6581156116042957, "tokens_seen": 974723072 }, { "epoch": 2.09, "learning_rate": 0.0003558976930792377, "loss": 2.9694, "theoretical_loss": 3.658092094925769, "tokens_seen": 974788608 }, { "epoch": 2.09, "objective/train/docs_used": 1568455, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9623382091522217, "objective/train/theoretical_loss": 3.658080337345396, "objective/train/tokens_used": 995281376, "theoretical_loss": 3.658080337345396, "tokens_seen": 974821376 }, { "epoch": 2.09, "learning_rate": 0.0003558876629889669, "loss": 2.9377, "theoretical_loss": 3.658068580270899, "tokens_seen": 974854144 }, { "epoch": 2.09, "learning_rate": 0.00035587763289869607, "loss": 2.8722, "theoretical_loss": 3.6580450676393745, "tokens_seen": 974919680 }, { "epoch": 2.09, "learning_rate": 0.0003558676028084253, "loss": 3.0076, "theoretical_loss": 3.6580215570308865, "tokens_seen": 974985216 }, { "epoch": 2.09, "learning_rate": 0.00035585757271815443, "loss": 2.8121, "theoretical_loss": 3.6579980484451244, "tokens_seen": 975050752 }, { "epoch": 2.09, "learning_rate": 0.00035584754262788366, "loss": 2.9526, "theoretical_loss": 3.6579745418817784, "tokens_seen": 975116288 }, { "epoch": 2.09, "learning_rate": 0.0003558375125376129, "loss": 2.8338, "theoretical_loss": 3.6579510373405384, "tokens_seen": 975181824 }, { "epoch": 2.09, "learning_rate": 0.000355827482447342, "loss": 2.9088, "theoretical_loss": 3.657927534821095, "tokens_seen": 975247360 }, { "epoch": 2.09, "learning_rate": 0.00035581745235707126, "loss": 2.9815, "theoretical_loss": 3.6579040343231375, "tokens_seen": 975312896 }, { "epoch": 2.09, "learning_rate": 0.0003558074222668004, "loss": 2.913, "theoretical_loss": 3.6578805358463575, "tokens_seen": 975378432 }, { "epoch": 2.09, "learning_rate": 0.0003557973921765296, "loss": 2.8718, "theoretical_loss": 3.6578570393904446, "tokens_seen": 975443968 }, { "epoch": 2.09, "learning_rate": 0.0003557873620862588, "loss": 2.9272, "theoretical_loss": 3.6578335449550896, "tokens_seen": 975509504 }, { "epoch": 2.09, "learning_rate": 0.000355777331995988, "loss": 2.9208, "theoretical_loss": 3.6578100525399826, "tokens_seen": 975575040 }, { "epoch": 2.09, "learning_rate": 0.00035576730190571717, "loss": 2.8132, "theoretical_loss": 3.657786562144815, "tokens_seen": 975640576 }, { "epoch": 2.09, "learning_rate": 0.00035575727181544635, "loss": 2.8864, "theoretical_loss": 3.657763073769276, "tokens_seen": 975706112 }, { "epoch": 2.09, "learning_rate": 0.00035574724172517553, "loss": 3.015, "theoretical_loss": 3.657739587413057, "tokens_seen": 975771648 }, { "epoch": 2.09, "learning_rate": 0.00035573721163490476, "loss": 2.8921, "theoretical_loss": 3.65771610307585, "tokens_seen": 975837184 }, { "epoch": 2.09, "learning_rate": 0.0003557271815446339, "loss": 2.9199, "theoretical_loss": 3.657692620757344, "tokens_seen": 975902720 }, { "epoch": 2.09, "learning_rate": 0.0003557171514543631, "loss": 2.9558, "theoretical_loss": 3.6576691404572315, "tokens_seen": 975968256 }, { "epoch": 2.09, "learning_rate": 0.00035570712136409225, "loss": 2.8154, "theoretical_loss": 3.6576456621752023, "tokens_seen": 976033792 }, { "epoch": 2.09, "learning_rate": 0.0003556970912738215, "loss": 3.0942, "theoretical_loss": 3.657622185910948, "tokens_seen": 976099328 }, { "epoch": 2.09, "learning_rate": 0.00035568706118355067, "loss": 2.7492, "theoretical_loss": 3.6575987116641597, "tokens_seen": 976164864 }, { "epoch": 2.09, "learning_rate": 0.00035567703109327985, "loss": 2.8845, "theoretical_loss": 3.6575752394345287, "tokens_seen": 976230400 }, { "epoch": 2.09, "learning_rate": 0.00035566700100300903, "loss": 2.9361, "theoretical_loss": 3.6575517692217456, "tokens_seen": 976295936 }, { "epoch": 2.09, "learning_rate": 0.00035565697091273827, "loss": 2.7843, "theoretical_loss": 3.6575283010255033, "tokens_seen": 976361472 }, { "epoch": 2.09, "learning_rate": 0.0003556469408224674, "loss": 3.0309, "theoretical_loss": 3.657504834845491, "tokens_seen": 976427008 }, { "epoch": 2.09, "objective/train/docs_used": 1571348, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4276504516601562, "objective/train/theoretical_loss": 3.6574931025114754, "objective/train/tokens_used": 996919776, "theoretical_loss": 3.6574931025114754, "tokens_seen": 976459776 }, { "epoch": 2.09, "learning_rate": 0.00035563691073219663, "loss": 2.6552, "theoretical_loss": 3.6574813706814018, "tokens_seen": 976492544 }, { "epoch": 2.09, "learning_rate": 0.00035562688064192576, "loss": 2.9534, "theoretical_loss": 3.657457908532927, "tokens_seen": 976558080 }, { "epoch": 2.09, "learning_rate": 0.000355616850551655, "loss": 2.9676, "theoretical_loss": 3.6574344483997576, "tokens_seen": 976623616 }, { "epoch": 2.09, "learning_rate": 0.00035560682046138417, "loss": 2.8386, "theoretical_loss": 3.657410990281586, "tokens_seen": 976689152 }, { "epoch": 2.09, "learning_rate": 0.00035559679037111335, "loss": 3.0276, "theoretical_loss": 3.6573875341781035, "tokens_seen": 976754688 }, { "epoch": 2.09, "learning_rate": 0.00035558676028084253, "loss": 3.0406, "theoretical_loss": 3.6573640800890015, "tokens_seen": 976820224 }, { "epoch": 2.09, "learning_rate": 0.0003555767301905717, "loss": 2.8245, "theoretical_loss": 3.657340628013973, "tokens_seen": 976885760 }, { "epoch": 2.09, "learning_rate": 0.0003555667001003009, "loss": 2.9291, "theoretical_loss": 3.6573171779527085, "tokens_seen": 976951296 }, { "epoch": 2.09, "learning_rate": 0.00035555667001003013, "loss": 2.9587, "theoretical_loss": 3.657293729904902, "tokens_seen": 977016832 }, { "epoch": 2.09, "learning_rate": 0.00035554663991975926, "loss": 2.7862, "theoretical_loss": 3.6572702838702433, "tokens_seen": 977082368 }, { "epoch": 2.09, "learning_rate": 0.0003555366098294885, "loss": 2.9119, "theoretical_loss": 3.6572468398484266, "tokens_seen": 977147904 }, { "epoch": 2.09, "learning_rate": 0.0003555265797392176, "loss": 2.9296, "theoretical_loss": 3.6572233978391426, "tokens_seen": 977213440 }, { "epoch": 2.09, "learning_rate": 0.00035551654964894686, "loss": 3.0387, "theoretical_loss": 3.6571999578420846, "tokens_seen": 977278976 }, { "epoch": 2.09, "learning_rate": 0.00035550651955867604, "loss": 2.785, "theoretical_loss": 3.657176519856944, "tokens_seen": 977344512 }, { "epoch": 2.09, "learning_rate": 0.0003554964894684052, "loss": 2.918, "theoretical_loss": 3.657153083883414, "tokens_seen": 977410048 }, { "epoch": 2.09, "learning_rate": 0.0003554864593781344, "loss": 2.776, "theoretical_loss": 3.657129649921187, "tokens_seen": 977475584 }, { "epoch": 2.09, "learning_rate": 0.00035547642928786364, "loss": 2.6627, "theoretical_loss": 3.6571062179699556, "tokens_seen": 977541120 }, { "epoch": 2.09, "learning_rate": 0.00035546639919759276, "loss": 3.0424, "theoretical_loss": 3.6570827880294114, "tokens_seen": 977606656 }, { "epoch": 2.09, "learning_rate": 0.000355456369107322, "loss": 2.8192, "theoretical_loss": 3.6570593600992485, "tokens_seen": 977672192 }, { "epoch": 2.09, "learning_rate": 0.0003554463390170511, "loss": 2.8775, "theoretical_loss": 3.6570359341791585, "tokens_seen": 977737728 }, { "epoch": 2.09, "learning_rate": 0.00035543630892678036, "loss": 2.8324, "theoretical_loss": 3.657012510268835, "tokens_seen": 977803264 }, { "epoch": 2.09, "learning_rate": 0.00035542627883650954, "loss": 2.9092, "theoretical_loss": 3.656989088367971, "tokens_seen": 977868800 }, { "epoch": 2.09, "learning_rate": 0.0003554162487462387, "loss": 3.0861, "theoretical_loss": 3.656965668476259, "tokens_seen": 977934336 }, { "epoch": 2.09, "learning_rate": 0.0003554062186559679, "loss": 2.7552, "theoretical_loss": 3.656942250593392, "tokens_seen": 977999872 }, { "epoch": 2.09, "learning_rate": 0.0003553961885656971, "loss": 2.8615, "theoretical_loss": 3.6569188347190638, "tokens_seen": 978065408 }, { "epoch": 2.09, "objective/train/docs_used": 1573654, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.027608871459961, "objective/train/theoretical_loss": 3.656907127535005, "objective/train/tokens_used": 998558176, "theoretical_loss": 3.656907127535005, "tokens_seen": 978098176 }, { "epoch": 2.09, "learning_rate": 0.00035538615847542627, "loss": 2.9216, "theoretical_loss": 3.6568954208529663, "tokens_seen": 978130944 }, { "epoch": 2.09, "learning_rate": 0.0003553761283851555, "loss": 2.9217, "theoretical_loss": 3.6568720089947937, "tokens_seen": 978196480 }, { "epoch": 2.09, "learning_rate": 0.00035536609829488463, "loss": 2.9361, "theoretical_loss": 3.656848599144239, "tokens_seen": 978262016 }, { "epoch": 2.09, "learning_rate": 0.00035535606820461386, "loss": 2.8162, "theoretical_loss": 3.656825191300996, "tokens_seen": 978327552 }, { "epoch": 2.09, "learning_rate": 0.00035534603811434304, "loss": 2.6821, "theoretical_loss": 3.656801785464757, "tokens_seen": 978393088 }, { "epoch": 2.09, "learning_rate": 0.0003553360080240722, "loss": 2.9345, "theoretical_loss": 3.656778381635217, "tokens_seen": 978458624 }, { "epoch": 2.09, "learning_rate": 0.0003553259779338014, "loss": 3.0592, "theoretical_loss": 3.656754979812068, "tokens_seen": 978524160 }, { "epoch": 2.09, "learning_rate": 0.0003553159478435306, "loss": 2.96, "theoretical_loss": 3.6567315799950046, "tokens_seen": 978589696 }, { "epoch": 2.09, "learning_rate": 0.00035530591775325977, "loss": 2.9523, "theoretical_loss": 3.656708182183721, "tokens_seen": 978655232 }, { "epoch": 2.09, "learning_rate": 0.000355295887662989, "loss": 2.9621, "theoretical_loss": 3.6566847863779097, "tokens_seen": 978720768 }, { "epoch": 2.09, "learning_rate": 0.00035528585757271813, "loss": 2.8665, "theoretical_loss": 3.6566613925772655, "tokens_seen": 978786304 }, { "epoch": 2.09, "learning_rate": 0.00035527582748244737, "loss": 2.8837, "theoretical_loss": 3.6566380007814816, "tokens_seen": 978851840 }, { "epoch": 2.09, "learning_rate": 0.0003552657973921765, "loss": 2.8698, "theoretical_loss": 3.656614610990253, "tokens_seen": 978917376 }, { "epoch": 2.09, "learning_rate": 0.00035525576730190573, "loss": 2.9446, "theoretical_loss": 3.6565912232032725, "tokens_seen": 978982912 }, { "epoch": 2.09, "learning_rate": 0.0003552457372116349, "loss": 2.8714, "theoretical_loss": 3.6565678374202344, "tokens_seen": 979048448 }, { "epoch": 2.09, "learning_rate": 0.0003552357071213641, "loss": 2.8274, "theoretical_loss": 3.656544453640834, "tokens_seen": 979113984 }, { "epoch": 2.09, "learning_rate": 0.00035522567703109327, "loss": 2.8218, "theoretical_loss": 3.6565210718647645, "tokens_seen": 979179520 }, { "epoch": 2.09, "learning_rate": 0.00035521564694082245, "loss": 2.9607, "theoretical_loss": 3.6564976920917207, "tokens_seen": 979245056 }, { "epoch": 2.09, "learning_rate": 0.00035520561685055163, "loss": 2.8551, "theoretical_loss": 3.6564743143213962, "tokens_seen": 979310592 }, { "epoch": 2.09, "learning_rate": 0.00035519558676028087, "loss": 2.8843, "theoretical_loss": 3.656450938553486, "tokens_seen": 979376128 }, { "epoch": 2.09, "learning_rate": 0.00035518555667001, "loss": 2.7778, "theoretical_loss": 3.656427564787685, "tokens_seen": 979441664 }, { "epoch": 2.09, "learning_rate": 0.00035517552657973923, "loss": 2.8875, "theoretical_loss": 3.6564041930236875, "tokens_seen": 979507200 }, { "epoch": 2.09, "learning_rate": 0.0003551654964894684, "loss": 2.8738, "theoretical_loss": 3.6563808232611876, "tokens_seen": 979572736 }, { "epoch": 2.09, "learning_rate": 0.0003551554663991976, "loss": 2.6981, "theoretical_loss": 3.656357455499881, "tokens_seen": 979638272 }, { "epoch": 2.09, "learning_rate": 0.0003551454363089268, "loss": 2.9627, "theoretical_loss": 3.6563340897394614, "tokens_seen": 979703808 }, { "epoch": 2.09, "objective/train/docs_used": 1575093, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5647804737091064, "objective/train/theoretical_loss": 3.6563224076094887, "objective/train/tokens_used": 1000196576, "theoretical_loss": 3.6563224076094887, "tokens_seen": 979736576 }, { "epoch": 2.09, "learning_rate": 0.00035513540621865596, "loss": 2.8373, "theoretical_loss": 3.6563107259796235, "tokens_seen": 979769344 }, { "epoch": 2.09, "learning_rate": 0.00035512537612838514, "loss": 2.9064, "theoretical_loss": 3.6562873642200637, "tokens_seen": 979834880 }, { "epoch": 2.09, "learning_rate": 0.00035511534603811437, "loss": 2.925, "theoretical_loss": 3.656264004460476, "tokens_seen": 979900416 }, { "epoch": 2.09, "learning_rate": 0.0003551053159478435, "loss": 2.9128, "theoretical_loss": 3.6562406467005557, "tokens_seen": 979965952 }, { "epoch": 2.09, "learning_rate": 0.00035509528585757273, "loss": 2.918, "theoretical_loss": 3.656217290939998, "tokens_seen": 980031488 }, { "epoch": 2.09, "learning_rate": 0.0003550852557673019, "loss": 2.8572, "theoretical_loss": 3.6561939371784975, "tokens_seen": 980097024 }, { "epoch": 2.09, "learning_rate": 0.0003550752256770311, "loss": 2.982, "theoretical_loss": 3.6561705854157496, "tokens_seen": 980162560 }, { "epoch": 2.09, "learning_rate": 0.00035506519558676033, "loss": 2.9277, "theoretical_loss": 3.65614723565145, "tokens_seen": 980228096 }, { "epoch": 2.09, "learning_rate": 0.00035505516549648946, "loss": 3.0478, "theoretical_loss": 3.656123887885294, "tokens_seen": 980293632 }, { "epoch": 2.09, "learning_rate": 0.0003550451354062187, "loss": 2.97, "theoretical_loss": 3.6561005421169765, "tokens_seen": 980359168 }, { "epoch": 2.09, "learning_rate": 0.0003550351053159478, "loss": 2.9136, "theoretical_loss": 3.6560771983461944, "tokens_seen": 980424704 }, { "epoch": 2.09, "learning_rate": 0.00035502507522567706, "loss": 2.9235, "theoretical_loss": 3.6560538565726413, "tokens_seen": 980490240 }, { "epoch": 2.09, "learning_rate": 0.00035501504513540624, "loss": 3.0087, "theoretical_loss": 3.6560305167960143, "tokens_seen": 980555776 }, { "epoch": 2.09, "learning_rate": 0.0003550050150451354, "loss": 2.8189, "theoretical_loss": 3.656007179016009, "tokens_seen": 980621312 }, { "epoch": 2.09, "learning_rate": 0.0003549949849548646, "loss": 2.9326, "theoretical_loss": 3.6559838432323204, "tokens_seen": 980686848 }, { "epoch": 2.09, "learning_rate": 0.00035498495486459384, "loss": 2.9969, "theoretical_loss": 3.655960509444645, "tokens_seen": 980752384 }, { "epoch": 2.09, "learning_rate": 0.00035497492477432296, "loss": 2.7973, "theoretical_loss": 3.655937177652678, "tokens_seen": 980817920 }, { "epoch": 2.09, "learning_rate": 0.0003549648946840522, "loss": 2.9977, "theoretical_loss": 3.655913847856117, "tokens_seen": 980883456 }, { "epoch": 2.09, "learning_rate": 0.0003549548645937813, "loss": 2.8767, "theoretical_loss": 3.6558905200546556, "tokens_seen": 980948992 }, { "epoch": 2.09, "learning_rate": 0.00035494483450351056, "loss": 2.81, "theoretical_loss": 3.655867194247992, "tokens_seen": 981014528 }, { "epoch": 2.09, "learning_rate": 0.00035493480441323974, "loss": 2.9691, "theoretical_loss": 3.655843870435822, "tokens_seen": 981080064 }, { "epoch": 2.09, "learning_rate": 0.0003549247743229689, "loss": 2.8794, "theoretical_loss": 3.655820548617841, "tokens_seen": 981145600 }, { "epoch": 2.09, "learning_rate": 0.0003549147442326981, "loss": 2.7654, "theoretical_loss": 3.6557972287937455, "tokens_seen": 981211136 }, { "epoch": 2.09, "learning_rate": 0.0003549047141424273, "loss": 3.12, "theoretical_loss": 3.655773910963233, "tokens_seen": 981276672 }, { "epoch": 2.09, "learning_rate": 0.00035489468405215647, "loss": 2.973, "theoretical_loss": 3.655750595125998, "tokens_seen": 981342208 }, { "epoch": 2.09, "objective/train/docs_used": 1577929, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9241225719451904, "objective/train/theoretical_loss": 3.6557389379547653, "objective/train/tokens_used": 1001834976, "theoretical_loss": 3.6557389379547653, "tokens_seen": 981374976 }, { "epoch": 2.09, "learning_rate": 0.0003548846539618857, "loss": 2.9138, "theoretical_loss": 3.655727281281739, "tokens_seen": 981407744 }, { "epoch": 2.09, "learning_rate": 0.00035487462387161483, "loss": 2.9501, "theoretical_loss": 3.655703969430151, "tokens_seen": 981473280 }, { "epoch": 2.09, "learning_rate": 0.00035486459378134406, "loss": 2.9708, "theoretical_loss": 3.6556806595709315, "tokens_seen": 981538816 }, { "epoch": 2.09, "learning_rate": 0.00035485456369107324, "loss": 2.9108, "theoretical_loss": 3.655657351703778, "tokens_seen": 981604352 }, { "epoch": 2.09, "learning_rate": 0.0003548445336008024, "loss": 2.9368, "theoretical_loss": 3.655634045828385, "tokens_seen": 981669888 }, { "epoch": 2.09, "learning_rate": 0.0003548345035105316, "loss": 2.9518, "theoretical_loss": 3.655610741944451, "tokens_seen": 981735424 }, { "epoch": 2.09, "learning_rate": 0.0003548244734202608, "loss": 2.9718, "theoretical_loss": 3.655587440051673, "tokens_seen": 981800960 }, { "epoch": 2.09, "learning_rate": 0.00035481444332998997, "loss": 2.9221, "theoretical_loss": 3.6555641401497474, "tokens_seen": 981866496 }, { "epoch": 2.09, "learning_rate": 0.0003548044132397192, "loss": 2.7732, "theoretical_loss": 3.6555408422383713, "tokens_seen": 981932032 }, { "epoch": 2.09, "learning_rate": 0.00035479438314944833, "loss": 2.8498, "theoretical_loss": 3.655517546317242, "tokens_seen": 981997568 }, { "epoch": 2.09, "learning_rate": 0.00035478435305917757, "loss": 2.7824, "theoretical_loss": 3.655494252386056, "tokens_seen": 982063104 }, { "epoch": 2.09, "learning_rate": 0.0003547743229689067, "loss": 3.038, "theoretical_loss": 3.6554709604445117, "tokens_seen": 982128640 }, { "epoch": 2.09, "learning_rate": 0.00035476429287863593, "loss": 2.9998, "theoretical_loss": 3.655447670492306, "tokens_seen": 982194176 }, { "epoch": 2.09, "learning_rate": 0.0003547542627883651, "loss": 2.981, "theoretical_loss": 3.6554243825291355, "tokens_seen": 982259712 }, { "epoch": 2.09, "learning_rate": 0.0003547442326980943, "loss": 3.0067, "theoretical_loss": 3.655401096554699, "tokens_seen": 982325248 }, { "epoch": 2.09, "learning_rate": 0.00035473420260782347, "loss": 2.8096, "theoretical_loss": 3.655377812568693, "tokens_seen": 982390784 }, { "epoch": 2.09, "learning_rate": 0.00035472417251755265, "loss": 2.8086, "theoretical_loss": 3.655354530570815, "tokens_seen": 982456320 }, { "epoch": 2.09, "learning_rate": 0.00035471414242728183, "loss": 2.9417, "theoretical_loss": 3.6553312505607627, "tokens_seen": 982521856 }, { "epoch": 2.09, "learning_rate": 0.00035470411233701107, "loss": 2.8853, "theoretical_loss": 3.6553079725382345, "tokens_seen": 982587392 }, { "epoch": 2.09, "learning_rate": 0.0003546940822467402, "loss": 2.8025, "theoretical_loss": 3.6552846965029273, "tokens_seen": 982652928 }, { "epoch": 2.09, "learning_rate": 0.00035468405215646943, "loss": 2.8923, "theoretical_loss": 3.6552614224545397, "tokens_seen": 982718464 }, { "epoch": 2.09, "learning_rate": 0.0003546740220661986, "loss": 2.863, "theoretical_loss": 3.655238150392769, "tokens_seen": 982784000 }, { "epoch": 2.09, "learning_rate": 0.0003546639919759278, "loss": 2.8169, "theoretical_loss": 3.6552148803173137, "tokens_seen": 982849536 }, { "epoch": 2.09, "learning_rate": 0.000354653961885657, "loss": 2.7883, "theoretical_loss": 3.6551916122278714, "tokens_seen": 982915072 }, { "epoch": 2.09, "learning_rate": 0.00035464393179538616, "loss": 2.6873, "theoretical_loss": 3.6551683461241407, "tokens_seen": 982980608 }, { "debugging/Self-BLEU-5": 0.5801073596540403, "debugging/distinct-1-grams": 0.7375902008327904, "debugging/distinct-2-grams": 0.9237222631162243, "debugging/entropy-1-grams": 6.393198754828559, "debugging/entropy-2-grams": 7.494975577625613, "debugging/length": 495.6551724137931, "debugging/num_segments": 29, "epoch": 2.09, "objective/train/docs_used": 1580721, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.543586492538452, "objective/train/theoretical_loss": 3.6551567138168224, "objective/train/tokens_used": 1003473376, "theoretical_loss": 3.6551567138168224, "tokens_seen": 983013376 }, { "epoch": 2.09, "learning_rate": 0.00035463390170511534, "loss": 2.7352, "theoretical_loss": 3.655145082005819, "tokens_seen": 983046144 }, { "epoch": 2.09, "learning_rate": 0.00035462387161484457, "loss": 2.7758, "theoretical_loss": 3.655121819872605, "tokens_seen": 983111680 }, { "epoch": 2.09, "learning_rate": 0.0003546138415245737, "loss": 3.0057, "theoretical_loss": 3.6550985597241974, "tokens_seen": 983177216 }, { "epoch": 2.09, "learning_rate": 0.00035460381143430293, "loss": 2.9139, "theoretical_loss": 3.6550753015602937, "tokens_seen": 983242752 }, { "epoch": 2.09, "learning_rate": 0.00035459378134403206, "loss": 2.8655, "theoretical_loss": 3.655052045380593, "tokens_seen": 983308288 }, { "epoch": 2.09, "learning_rate": 0.0003545837512537613, "loss": 2.7409, "theoretical_loss": 3.655028791184793, "tokens_seen": 983373824 }, { "epoch": 2.09, "learning_rate": 0.0003545737211634905, "loss": 2.7835, "theoretical_loss": 3.6550055389725937, "tokens_seen": 983439360 }, { "epoch": 2.09, "learning_rate": 0.00035456369107321966, "loss": 2.8853, "theoretical_loss": 3.654982288743692, "tokens_seen": 983504896 }, { "epoch": 2.09, "learning_rate": 0.00035455366098294884, "loss": 2.8986, "theoretical_loss": 3.654959040497788, "tokens_seen": 983570432 }, { "epoch": 2.09, "learning_rate": 0.000354543630892678, "loss": 2.9324, "theoretical_loss": 3.6549357942345795, "tokens_seen": 983635968 }, { "epoch": 2.09, "learning_rate": 0.0003545336008024072, "loss": 2.6946, "theoretical_loss": 3.654912549953766, "tokens_seen": 983701504 }, { "epoch": 2.09, "learning_rate": 0.00035452357071213644, "loss": 2.8571, "theoretical_loss": 3.654889307655046, "tokens_seen": 983767040 }, { "epoch": 2.09, "learning_rate": 0.00035451354062186556, "loss": 2.7506, "theoretical_loss": 3.6548660673381184, "tokens_seen": 983832576 }, { "epoch": 2.09, "learning_rate": 0.0003545035105315948, "loss": 2.8629, "theoretical_loss": 3.654842829002683, "tokens_seen": 983898112 }, { "epoch": 2.09, "learning_rate": 0.000354493480441324, "loss": 2.8307, "theoretical_loss": 3.6548195926484377, "tokens_seen": 983963648 }, { "epoch": 2.09, "learning_rate": 0.00035448345035105316, "loss": 2.9496, "theoretical_loss": 3.654796358275082, "tokens_seen": 984029184 }, { "epoch": 2.09, "learning_rate": 0.00035447342026078234, "loss": 3.0103, "theoretical_loss": 3.6547731258823157, "tokens_seen": 984094720 }, { "epoch": 2.09, "learning_rate": 0.0003544633901705115, "loss": 2.8661, "theoretical_loss": 3.654749895469838, "tokens_seen": 984160256 }, { "epoch": 2.09, "learning_rate": 0.0003544533600802407, "loss": 2.8665, "theoretical_loss": 3.654726667037348, "tokens_seen": 984225792 }, { "epoch": 2.09, "learning_rate": 0.00035444332998996994, "loss": 2.8856, "theoretical_loss": 3.6547034405845444, "tokens_seen": 984291328 }, { "epoch": 2.09, "learning_rate": 0.00035443329989969907, "loss": 2.8966, "theoretical_loss": 3.654680216111128, "tokens_seen": 984356864 }, { "epoch": 2.09, "learning_rate": 0.0003544232698094283, "loss": 2.7901, "theoretical_loss": 3.654656993616798, "tokens_seen": 984422400 }, { "epoch": 2.09, "learning_rate": 0.00035441323971915743, "loss": 2.9631, "theoretical_loss": 3.6546337731012533, "tokens_seen": 984487936 }, { "epoch": 2.09, "learning_rate": 0.00035440320962888667, "loss": 2.9059, "theoretical_loss": 3.654610554564194, "tokens_seen": 984553472 }, { "epoch": 2.09, "learning_rate": 0.00035439317953861585, "loss": 2.8929, "theoretical_loss": 3.65458733800532, "tokens_seen": 984619008 }, { "epoch": 2.09, "objective/train/docs_used": 1583365, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8142967224121094, "objective/train/theoretical_loss": 3.6545757304676085, "objective/train/tokens_used": 1005111776, "theoretical_loss": 3.6545757304676085, "tokens_seen": 984651776 }, { "epoch": 2.09, "learning_rate": 0.00035438314944834503, "loss": 2.7494, "theoretical_loss": 3.654564123424331, "tokens_seen": 984684544 }, { "epoch": 2.09, "learning_rate": 0.0003543731193580742, "loss": 2.811, "theoretical_loss": 3.654540910820927, "tokens_seen": 984750080 }, { "epoch": 2.09, "learning_rate": 0.00035436308926780344, "loss": 2.8952, "theoretical_loss": 3.6545177001948073, "tokens_seen": 984815616 }, { "epoch": 2.09, "learning_rate": 0.0003543530591775326, "loss": 2.8641, "theoretical_loss": 3.654494491545673, "tokens_seen": 984881152 }, { "epoch": 2.09, "learning_rate": 0.0003543430290872618, "loss": 2.8686, "theoretical_loss": 3.6544712848732237, "tokens_seen": 984946688 }, { "epoch": 2.09, "learning_rate": 0.000354332998996991, "loss": 2.9863, "theoretical_loss": 3.6544480801771586, "tokens_seen": 985012224 }, { "epoch": 2.09, "learning_rate": 0.00035432296890672017, "loss": 2.9397, "theoretical_loss": 3.6544248774571795, "tokens_seen": 985077760 }, { "epoch": 2.09, "learning_rate": 0.0003543129388164494, "loss": 2.6119, "theoretical_loss": 3.654401676712985, "tokens_seen": 985143296 }, { "epoch": 2.09, "learning_rate": 0.00035430290872617853, "loss": 2.8891, "theoretical_loss": 3.6543784779442774, "tokens_seen": 985208832 }, { "epoch": 2.09, "learning_rate": 0.00035429287863590777, "loss": 2.8921, "theoretical_loss": 3.654355281150756, "tokens_seen": 985274368 }, { "epoch": 2.09, "learning_rate": 0.0003542828485456369, "loss": 2.7914, "theoretical_loss": 3.6543320863321203, "tokens_seen": 985339904 }, { "epoch": 2.09, "learning_rate": 0.00035427281845536613, "loss": 2.7904, "theoretical_loss": 3.6543088934880723, "tokens_seen": 985405440 }, { "epoch": 2.09, "learning_rate": 0.0003542627883650953, "loss": 2.6763, "theoretical_loss": 3.6542857026183126, "tokens_seen": 985470976 }, { "epoch": 2.09, "learning_rate": 0.0003542527582748245, "loss": 2.9608, "theoretical_loss": 3.654262513722541, "tokens_seen": 985536512 }, { "epoch": 2.09, "learning_rate": 0.00035424272818455367, "loss": 2.882, "theoretical_loss": 3.6542393268004587, "tokens_seen": 985602048 }, { "epoch": 2.09, "learning_rate": 0.00035423269809428285, "loss": 2.8775, "theoretical_loss": 3.6542161418517667, "tokens_seen": 985667584 }, { "epoch": 2.09, "learning_rate": 0.00035422266800401203, "loss": 2.7387, "theoretical_loss": 3.654192958876165, "tokens_seen": 985733120 }, { "epoch": 2.09, "learning_rate": 0.00035421263791374127, "loss": 2.9107, "theoretical_loss": 3.654169777873355, "tokens_seen": 985798656 }, { "epoch": 2.09, "learning_rate": 0.0003542026078234704, "loss": 2.9599, "theoretical_loss": 3.654146598843038, "tokens_seen": 985864192 }, { "epoch": 2.09, "learning_rate": 0.00035419257773319963, "loss": 3.1016, "theoretical_loss": 3.6541234217849152, "tokens_seen": 985929728 }, { "epoch": 2.09, "learning_rate": 0.0003541825476429288, "loss": 2.8869, "theoretical_loss": 3.654100246698687, "tokens_seen": 985995264 }, { "epoch": 2.09, "learning_rate": 0.000354172517552658, "loss": 2.9262, "theoretical_loss": 3.6540770735840553, "tokens_seen": 986060800 }, { "epoch": 2.09, "learning_rate": 0.0003541624874623872, "loss": 2.8841, "theoretical_loss": 3.6540539024407206, "tokens_seen": 986126336 }, { "epoch": 2.09, "learning_rate": 0.00035415245737211636, "loss": 2.8164, "theoretical_loss": 3.6540307332683843, "tokens_seen": 986191872 }, { "epoch": 2.09, "learning_rate": 0.00035414242728184554, "loss": 2.6208, "theoretical_loss": 3.6540075660667486, "tokens_seen": 986257408 }, { "epoch": 2.09, "objective/train/docs_used": 1585914, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8930206298828125, "objective/train/theoretical_loss": 3.6539959832048496, "objective/train/tokens_used": 1006750176, "theoretical_loss": 3.6539959832048496, "tokens_seen": 986290176 }, { "epoch": 2.09, "learning_rate": 0.00035413239719157477, "loss": 2.9266, "theoretical_loss": 3.6539844008355145, "tokens_seen": 986322944 }, { "epoch": 2.09, "learning_rate": 0.0003541223671013039, "loss": 2.9611, "theoretical_loss": 3.6539612375743826, "tokens_seen": 986388480 }, { "epoch": 2.09, "learning_rate": 0.00035411233701103314, "loss": 2.8774, "theoretical_loss": 3.653938076283056, "tokens_seen": 986454016 }, { "epoch": 2.09, "learning_rate": 0.00035410230692076226, "loss": 2.8211, "theoretical_loss": 3.6539149169612353, "tokens_seen": 986519552 }, { "epoch": 2.09, "learning_rate": 0.0003540922768304915, "loss": 2.9442, "theoretical_loss": 3.653891759608623, "tokens_seen": 986585088 }, { "epoch": 2.09, "learning_rate": 0.0003540822467402207, "loss": 2.9005, "theoretical_loss": 3.65386860422492, "tokens_seen": 986650624 }, { "epoch": 2.09, "learning_rate": 0.00035407221664994986, "loss": 2.9632, "theoretical_loss": 3.6538454508098286, "tokens_seen": 986716160 }, { "epoch": 2.09, "learning_rate": 0.00035406218655967904, "loss": 2.9042, "theoretical_loss": 3.6538222993630507, "tokens_seen": 986781696 }, { "epoch": 2.09, "learning_rate": 0.0003540521564694082, "loss": 2.9518, "theoretical_loss": 3.6537991498842883, "tokens_seen": 986847232 }, { "epoch": 2.09, "learning_rate": 0.0003540421263791374, "loss": 2.7447, "theoretical_loss": 3.653776002373243, "tokens_seen": 986912768 }, { "epoch": 2.09, "learning_rate": 0.00035403209628886664, "loss": 2.8992, "theoretical_loss": 3.653752856829618, "tokens_seen": 986978304 }, { "epoch": 2.09, "learning_rate": 0.00035402206619859577, "loss": 2.9168, "theoretical_loss": 3.6537297132531137, "tokens_seen": 987043840 }, { "epoch": 2.09, "learning_rate": 0.000354012036108325, "loss": 2.8775, "theoretical_loss": 3.6537065716434336, "tokens_seen": 987109376 }, { "epoch": 2.09, "learning_rate": 0.0003540020060180542, "loss": 2.855, "theoretical_loss": 3.65368343200028, "tokens_seen": 987174912 }, { "epoch": 2.09, "learning_rate": 0.00035399197592778336, "loss": 2.9907, "theoretical_loss": 3.6536602943233545, "tokens_seen": 987240448 }, { "epoch": 2.09, "learning_rate": 0.00035398194583751254, "loss": 2.8836, "theoretical_loss": 3.6536371586123604, "tokens_seen": 987305984 }, { "epoch": 2.09, "learning_rate": 0.0003539719157472417, "loss": 2.7315, "theoretical_loss": 3.6536140248669993, "tokens_seen": 987371520 }, { "epoch": 2.09, "learning_rate": 0.0003539618856569709, "loss": 2.9045, "theoretical_loss": 3.6535908930869745, "tokens_seen": 987437056 }, { "epoch": 2.09, "learning_rate": 0.00035395185556670014, "loss": 3.0388, "theoretical_loss": 3.653567763271988, "tokens_seen": 987502592 }, { "epoch": 2.09, "learning_rate": 0.00035394182547642927, "loss": 2.9724, "theoretical_loss": 3.6535446354217433, "tokens_seen": 987568128 }, { "epoch": 2.09, "learning_rate": 0.0003539317953861585, "loss": 2.8265, "theoretical_loss": 3.6535215095359423, "tokens_seen": 987633664 }, { "epoch": 2.09, "learning_rate": 0.00035392176529588763, "loss": 2.6129, "theoretical_loss": 3.653498385614288, "tokens_seen": 987699200 }, { "epoch": 2.09, "learning_rate": 0.00035391173520561687, "loss": 2.9211, "theoretical_loss": 3.6534752636564836, "tokens_seen": 987764736 }, { "epoch": 2.09, "learning_rate": 0.00035390170511534605, "loss": 2.6084, "theoretical_loss": 3.653452143662232, "tokens_seen": 987830272 }, { "epoch": 2.09, "learning_rate": 0.00035389167502507523, "loss": 2.8873, "theoretical_loss": 3.653429025631236, "tokens_seen": 987895808 }, { "epoch": 2.09, "objective/train/docs_used": 1587538, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.583278179168701, "objective/train/theoretical_loss": 3.653417467351866, "objective/train/tokens_used": 1008388576, "theoretical_loss": 3.653417467351866, "tokens_seen": 987928576 }, { "epoch": 2.09, "learning_rate": 0.0003538816449348044, "loss": 2.7492, "theoretical_loss": 3.6534059095631983, "tokens_seen": 987961344 }, { "epoch": 2.09, "learning_rate": 0.00035387161484453364, "loss": 2.9024, "theoretical_loss": 3.653382795457823, "tokens_seen": 988026880 }, { "epoch": 2.09, "learning_rate": 0.00035386158475426277, "loss": 2.904, "theoretical_loss": 3.653359683314812, "tokens_seen": 988092416 }, { "epoch": 2.09, "learning_rate": 0.000353851554663992, "loss": 3.0099, "theoretical_loss": 3.6533365731338696, "tokens_seen": 988157952 }, { "epoch": 2.09, "learning_rate": 0.00035384152457372113, "loss": 3.0924, "theoretical_loss": 3.6533134649146985, "tokens_seen": 988223488 }, { "epoch": 2.09, "learning_rate": 0.00035383149448345037, "loss": 2.963, "theoretical_loss": 3.653290358657003, "tokens_seen": 988289024 }, { "epoch": 2.09, "learning_rate": 0.00035382146439317955, "loss": 2.7841, "theoretical_loss": 3.6532672543604856, "tokens_seen": 988354560 }, { "epoch": 2.09, "learning_rate": 0.00035381143430290873, "loss": 2.9042, "theoretical_loss": 3.65324415202485, "tokens_seen": 988420096 }, { "epoch": 2.09, "learning_rate": 0.0003538014042126379, "loss": 2.8193, "theoretical_loss": 3.6532210516498003, "tokens_seen": 988485632 }, { "epoch": 2.09, "learning_rate": 0.0003537913741223671, "loss": 2.9178, "theoretical_loss": 3.65319795323504, "tokens_seen": 988551168 }, { "epoch": 2.09, "learning_rate": 0.0003537813440320963, "loss": 2.8788, "theoretical_loss": 3.6531748567802715, "tokens_seen": 988616704 }, { "epoch": 2.09, "learning_rate": 0.0003537713139418255, "loss": 2.9648, "theoretical_loss": 3.6531517622852006, "tokens_seen": 988682240 }, { "epoch": 2.09, "learning_rate": 0.00035376128385155464, "loss": 3.0248, "theoretical_loss": 3.65312866974953, "tokens_seen": 988747776 }, { "epoch": 2.09, "learning_rate": 0.00035375125376128387, "loss": 2.9383, "theoretical_loss": 3.653105579172964, "tokens_seen": 988813312 }, { "epoch": 2.09, "learning_rate": 0.000353741223671013, "loss": 2.9598, "theoretical_loss": 3.6530824905552057, "tokens_seen": 988878848 }, { "epoch": 2.09, "learning_rate": 0.00035373119358074223, "loss": 2.8699, "theoretical_loss": 3.6530594038959605, "tokens_seen": 988944384 }, { "epoch": 2.09, "learning_rate": 0.0003537211634904714, "loss": 2.9058, "theoretical_loss": 3.6530363191949315, "tokens_seen": 989009920 }, { "epoch": 2.09, "learning_rate": 0.0003537111334002006, "loss": 2.8302, "theoretical_loss": 3.653013236451823, "tokens_seen": 989075456 }, { "epoch": 2.09, "learning_rate": 0.0003537011033099298, "loss": 2.878, "theoretical_loss": 3.65299015566634, "tokens_seen": 989140992 }, { "epoch": 2.09, "learning_rate": 0.000353691073219659, "loss": 2.8319, "theoretical_loss": 3.652967076838186, "tokens_seen": 989206528 }, { "epoch": 2.09, "learning_rate": 0.00035368104312938814, "loss": 2.9385, "theoretical_loss": 3.652943999967065, "tokens_seen": 989272064 }, { "epoch": 2.09, "learning_rate": 0.0003536710130391174, "loss": 2.9579, "theoretical_loss": 3.6529209250526824, "tokens_seen": 989337600 }, { "epoch": 2.09, "learning_rate": 0.0003536609829488465, "loss": 3.0081, "theoretical_loss": 3.6528978520947417, "tokens_seen": 989403136 }, { "epoch": 2.09, "learning_rate": 0.00035365095285857574, "loss": 2.9689, "theoretical_loss": 3.6528747810929487, "tokens_seen": 989468672 }, { "epoch": 2.09, "learning_rate": 0.0003536409227683049, "loss": 3.0294, "theoretical_loss": 3.652851712047007, "tokens_seen": 989534208 }, { "epoch": 2.09, "objective/train/docs_used": 1590158, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8802359104156494, "objective/train/theoretical_loss": 3.652840178257388, "objective/train/tokens_used": 1010026976, "theoretical_loss": 3.652840178257388, "tokens_seen": 989566976 }, { "epoch": 2.09, "learning_rate": 0.0003536308926780341, "loss": 2.7733, "theoretical_loss": 3.6528286449566214, "tokens_seen": 989599744 }, { "epoch": 2.09, "learning_rate": 0.0003536208625877633, "loss": 3.0123, "theoretical_loss": 3.652805579821497, "tokens_seen": 989665280 }, { "epoch": 2.09, "learning_rate": 0.00035361083249749246, "loss": 2.7971, "theoretical_loss": 3.652782516641338, "tokens_seen": 989730816 }, { "epoch": 2.09, "learning_rate": 0.0003536008024072217, "loss": 2.8074, "theoretical_loss": 3.65275945541585, "tokens_seen": 989796352 }, { "epoch": 2.09, "learning_rate": 0.0003535907723169509, "loss": 3.0878, "theoretical_loss": 3.652736396144738, "tokens_seen": 989861888 }, { "epoch": 2.09, "learning_rate": 0.00035358074222668006, "loss": 2.7901, "theoretical_loss": 3.6527133388277058, "tokens_seen": 989927424 }, { "epoch": 2.09, "learning_rate": 0.00035357071213640924, "loss": 2.8368, "theoretical_loss": 3.6526902834644597, "tokens_seen": 989992960 }, { "epoch": 2.09, "learning_rate": 0.0003535606820461384, "loss": 2.757, "theoretical_loss": 3.6526672300547043, "tokens_seen": 990058496 }, { "epoch": 2.09, "learning_rate": 0.0003535506519558676, "loss": 2.8447, "theoretical_loss": 3.6526441785981447, "tokens_seen": 990124032 }, { "epoch": 2.09, "learning_rate": 0.00035354062186559684, "loss": 2.993, "theoretical_loss": 3.6526211290944866, "tokens_seen": 990189568 }, { "epoch": 2.09, "learning_rate": 0.00035353059177532597, "loss": 2.8521, "theoretical_loss": 3.6525980815434345, "tokens_seen": 990255104 }, { "epoch": 2.09, "learning_rate": 0.0003535205616850552, "loss": 2.8146, "theoretical_loss": 3.652575035944695, "tokens_seen": 990320640 }, { "epoch": 2.09, "learning_rate": 0.0003535105315947844, "loss": 2.8954, "theoretical_loss": 3.6525519922979726, "tokens_seen": 990386176 }, { "epoch": 2.09, "learning_rate": 0.00035350050150451356, "loss": 2.7761, "theoretical_loss": 3.6525289506029726, "tokens_seen": 990451712 }, { "epoch": 2.09, "learning_rate": 0.00035349047141424274, "loss": 3.0351, "theoretical_loss": 3.652505910859402, "tokens_seen": 990517248 }, { "epoch": 2.09, "learning_rate": 0.0003534804413239719, "loss": 2.8498, "theoretical_loss": 3.6524828730669645, "tokens_seen": 990582784 }, { "epoch": 2.09, "learning_rate": 0.0003534704112337011, "loss": 2.8291, "theoretical_loss": 3.652459837225367, "tokens_seen": 990648320 }, { "epoch": 2.09, "learning_rate": 0.00035346038114343034, "loss": 2.9621, "theoretical_loss": 3.6524368033343153, "tokens_seen": 990713856 }, { "epoch": 2.09, "learning_rate": 0.00035345035105315947, "loss": 3.0525, "theoretical_loss": 3.6524137713935145, "tokens_seen": 990779392 }, { "epoch": 2.09, "learning_rate": 0.0003534403209628887, "loss": 2.8511, "theoretical_loss": 3.6523907414026713, "tokens_seen": 990844928 }, { "epoch": 2.09, "learning_rate": 0.00035343029087261783, "loss": 2.6775, "theoretical_loss": 3.652367713361491, "tokens_seen": 990910464 }, { "epoch": 2.09, "learning_rate": 0.00035342026078234707, "loss": 2.8985, "theoretical_loss": 3.6523446872696805, "tokens_seen": 990976000 }, { "epoch": 2.09, "learning_rate": 0.00035341023069207625, "loss": 3.0226, "theoretical_loss": 3.6523216631269446, "tokens_seen": 991041536 }, { "epoch": 2.09, "learning_rate": 0.00035340020060180543, "loss": 2.9088, "theoretical_loss": 3.6522986409329903, "tokens_seen": 991107072 }, { "epoch": 2.09, "learning_rate": 0.0003533901705115346, "loss": 3.1946, "theoretical_loss": 3.652275620687524, "tokens_seen": 991172608 }, { "epoch": 2.09, "objective/train/docs_used": 1592918, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1007988452911377, "objective/train/theoretical_loss": 3.6522641112953815, "objective/train/tokens_used": 1011665376, "theoretical_loss": 3.6522641112953815, "tokens_seen": 991205376 }, { "epoch": 2.09, "learning_rate": 0.00035338014042126384, "loss": 2.8879, "theoretical_loss": 3.6522526023902513, "tokens_seen": 991238144 }, { "epoch": 2.09, "learning_rate": 0.00035337011033099297, "loss": 2.8002, "theoretical_loss": 3.652229586040879, "tokens_seen": 991303680 }, { "epoch": 2.09, "learning_rate": 0.0003533600802407222, "loss": 2.8688, "theoretical_loss": 3.6522065716391134, "tokens_seen": 991369216 }, { "epoch": 2.09, "learning_rate": 0.00035335005015045133, "loss": 2.8943, "theoretical_loss": 3.6521835591846603, "tokens_seen": 991434752 }, { "epoch": 2.09, "learning_rate": 0.00035334002006018057, "loss": 2.8929, "theoretical_loss": 3.6521605486772275, "tokens_seen": 991500288 }, { "epoch": 2.09, "learning_rate": 0.00035332998996990975, "loss": 2.8703, "theoretical_loss": 3.652137540116521, "tokens_seen": 991565824 }, { "epoch": 2.09, "learning_rate": 0.00035331995987963893, "loss": 2.993, "theoretical_loss": 3.652114533502247, "tokens_seen": 991631360 }, { "epoch": 2.09, "learning_rate": 0.0003533099297893681, "loss": 2.8677, "theoretical_loss": 3.6520915288341125, "tokens_seen": 991696896 }, { "epoch": 2.09, "learning_rate": 0.0003532998996990973, "loss": 2.9925, "theoretical_loss": 3.6520685261118246, "tokens_seen": 991762432 }, { "epoch": 2.09, "learning_rate": 0.0003532898696088265, "loss": 2.9422, "theoretical_loss": 3.65204552533509, "tokens_seen": 991827968 }, { "epoch": 2.09, "learning_rate": 0.0003532798395185557, "loss": 2.792, "theoretical_loss": 3.6520225265036155, "tokens_seen": 991893504 }, { "epoch": 2.09, "learning_rate": 0.00035326980942828484, "loss": 2.8945, "theoretical_loss": 3.651999529617108, "tokens_seen": 991959040 }, { "epoch": 2.09, "learning_rate": 0.00035325977933801407, "loss": 2.9771, "theoretical_loss": 3.651976534675275, "tokens_seen": 992024576 }, { "epoch": 2.09, "learning_rate": 0.0003532497492477432, "loss": 2.9776, "theoretical_loss": 3.6519535416778224, "tokens_seen": 992090112 }, { "epoch": 2.09, "learning_rate": 0.00035323971915747243, "loss": 3.0508, "theoretical_loss": 3.651930550624459, "tokens_seen": 992155648 }, { "epoch": 2.09, "learning_rate": 0.0003532296890672016, "loss": 2.9603, "theoretical_loss": 3.651907561514891, "tokens_seen": 992221184 }, { "epoch": 2.09, "learning_rate": 0.0003532196589769308, "loss": 2.8133, "theoretical_loss": 3.651884574348826, "tokens_seen": 992286720 }, { "epoch": 2.09, "learning_rate": 0.00035320962888666, "loss": 2.6607, "theoretical_loss": 3.6518615891259714, "tokens_seen": 992352256 }, { "epoch": 2.09, "learning_rate": 0.0003531995987963892, "loss": 2.9473, "theoretical_loss": 3.6518386058460344, "tokens_seen": 992417792 }, { "epoch": 2.09, "learning_rate": 0.00035318956870611834, "loss": 2.8626, "theoretical_loss": 3.651815624508722, "tokens_seen": 992483328 }, { "epoch": 2.09, "learning_rate": 0.0003531795386158476, "loss": 2.8648, "theoretical_loss": 3.651792645113743, "tokens_seen": 992548864 }, { "epoch": 2.09, "learning_rate": 0.0003531695085255767, "loss": 2.6803, "theoretical_loss": 3.651769667660804, "tokens_seen": 992614400 }, { "epoch": 2.09, "learning_rate": 0.00035315947843530594, "loss": 2.979, "theoretical_loss": 3.6517466921496133, "tokens_seen": 992679936 }, { "epoch": 2.09, "learning_rate": 0.0003531494483450351, "loss": 2.8107, "theoretical_loss": 3.651723718579878, "tokens_seen": 992745472 }, { "epoch": 2.09, "learning_rate": 0.0003531394182547643, "loss": 2.9587, "theoretical_loss": 3.651700746951306, "tokens_seen": 992811008 }, { "epoch": 2.09, "objective/train/docs_used": 1595760, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0931923389434814, "objective/train/theoretical_loss": 3.651689261864865, "objective/train/tokens_used": 1013303776, "theoretical_loss": 3.651689261864865, "tokens_seen": 992843776 }, { "epoch": 2.09, "learning_rate": 0.0003531293881644935, "loss": 2.9549, "theoretical_loss": 3.6516777772636058, "tokens_seen": 992876544 }, { "epoch": 2.09, "learning_rate": 0.00035311935807422266, "loss": 2.8347, "theoretical_loss": 3.6516548095164847, "tokens_seen": 992942080 }, { "epoch": 2.09, "learning_rate": 0.00035310932798395184, "loss": 2.8614, "theoretical_loss": 3.651631843709651, "tokens_seen": 993007616 }, { "epoch": 2.09, "learning_rate": 0.0003530992978936811, "loss": 2.9961, "theoretical_loss": 3.6516088798428123, "tokens_seen": 993073152 }, { "epoch": 2.09, "learning_rate": 0.0003530892678034102, "loss": 2.8201, "theoretical_loss": 3.6515859179156775, "tokens_seen": 993138688 }, { "epoch": 2.09, "learning_rate": 0.00035307923771313944, "loss": 3.1054, "theoretical_loss": 3.651562957927954, "tokens_seen": 993204224 }, { "epoch": 2.09, "learning_rate": 0.00035306920762286857, "loss": 3.0293, "theoretical_loss": 3.651539999879351, "tokens_seen": 993269760 }, { "epoch": 2.09, "learning_rate": 0.0003530591775325978, "loss": 2.9544, "theoretical_loss": 3.6515170437695756, "tokens_seen": 993335296 }, { "epoch": 2.09, "learning_rate": 0.000353049147442327, "loss": 2.9763, "theoretical_loss": 3.6514940895983368, "tokens_seen": 993400832 }, { "epoch": 2.09, "learning_rate": 0.00035303911735205617, "loss": 2.9701, "theoretical_loss": 3.651471137365343, "tokens_seen": 993466368 }, { "epoch": 2.09, "learning_rate": 0.00035302908726178535, "loss": 2.8499, "theoretical_loss": 3.6514481870703026, "tokens_seen": 993531904 }, { "epoch": 2.09, "learning_rate": 0.0003530190571715146, "loss": 2.9024, "theoretical_loss": 3.6514252387129242, "tokens_seen": 993597440 }, { "epoch": 2.09, "learning_rate": 0.0003530090270812437, "loss": 2.826, "theoretical_loss": 3.651402292292916, "tokens_seen": 993662976 }, { "epoch": 2.09, "learning_rate": 0.00035299899699097294, "loss": 2.8525, "theoretical_loss": 3.651379347809988, "tokens_seen": 993728512 }, { "epoch": 2.09, "learning_rate": 0.00035298896690070207, "loss": 2.8094, "theoretical_loss": 3.6513564052638476, "tokens_seen": 993794048 }, { "epoch": 2.09, "learning_rate": 0.0003529789368104313, "loss": 2.912, "theoretical_loss": 3.6513334646542037, "tokens_seen": 993859584 }, { "epoch": 2.09, "learning_rate": 0.0003529689067201605, "loss": 2.9545, "theoretical_loss": 3.651310525980766, "tokens_seen": 993925120 }, { "epoch": 2.09, "learning_rate": 0.00035295887662988967, "loss": 2.7874, "theoretical_loss": 3.6512875892432426, "tokens_seen": 993990656 }, { "epoch": 2.09, "learning_rate": 0.00035294884653961885, "loss": 2.9984, "theoretical_loss": 3.651264654441343, "tokens_seen": 994056192 }, { "epoch": 2.09, "learning_rate": 0.00035293881644934803, "loss": 2.8334, "theoretical_loss": 3.6512417215747766, "tokens_seen": 994121728 }, { "epoch": 2.09, "learning_rate": 0.0003529287863590772, "loss": 3.0445, "theoretical_loss": 3.6512187906432514, "tokens_seen": 994187264 }, { "epoch": 2.09, "learning_rate": 0.00035291875626880645, "loss": 2.6707, "theoretical_loss": 3.6511958616464772, "tokens_seen": 994252800 }, { "epoch": 2.09, "learning_rate": 0.0003529087261785356, "loss": 2.866, "theoretical_loss": 3.6511729345841637, "tokens_seen": 994318336 }, { "epoch": 2.09, "learning_rate": 0.0003528986960882648, "loss": 2.8615, "theoretical_loss": 3.651150009456019, "tokens_seen": 994383872 }, { "epoch": 2.09, "learning_rate": 0.00035288866599799394, "loss": 2.9041, "theoretical_loss": 3.651127086261754, "tokens_seen": 994449408 }, { "epoch": 2.09, "objective/train/docs_used": 1598565, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0549213886260986, "objective/train/theoretical_loss": 3.6511156253897346, "objective/train/tokens_used": 1014942176, "theoretical_loss": 3.6511156253897346, "tokens_seen": 994482176 }, { "epoch": 2.09, "learning_rate": 0.00035287863590772317, "loss": 2.8098, "theoretical_loss": 3.651104165001077, "tokens_seen": 994514944 }, { "epoch": 2.09, "learning_rate": 0.00035286860581745235, "loss": 2.8804, "theoretical_loss": 3.651081245673698, "tokens_seen": 994580480 }, { "epoch": 2.09, "learning_rate": 0.00035285857572718153, "loss": 2.7728, "theoretical_loss": 3.6510583282793263, "tokens_seen": 994646016 }, { "epoch": 2.09, "learning_rate": 0.00035284854563691077, "loss": 3.0077, "theoretical_loss": 3.651035412817672, "tokens_seen": 994711552 }, { "epoch": 2.09, "learning_rate": 0.00035283851554663995, "loss": 2.858, "theoretical_loss": 3.6510124992884436, "tokens_seen": 994777088 }, { "epoch": 2.09, "learning_rate": 0.00035282848545636913, "loss": 2.9745, "theoretical_loss": 3.6509895876913525, "tokens_seen": 994842624 }, { "epoch": 2.09, "learning_rate": 0.0003528184553660983, "loss": 3.0556, "theoretical_loss": 3.650966678026107, "tokens_seen": 994908160 }, { "epoch": 2.09, "learning_rate": 0.0003528084252758275, "loss": 2.9345, "theoretical_loss": 3.650943770292418, "tokens_seen": 994973696 }, { "epoch": 2.09, "learning_rate": 0.0003527983951855567, "loss": 2.889, "theoretical_loss": 3.6509208644899953, "tokens_seen": 995039232 }, { "epoch": 2.09, "learning_rate": 0.0003527883650952859, "loss": 2.9628, "theoretical_loss": 3.6508979606185488, "tokens_seen": 995104768 }, { "epoch": 2.09, "learning_rate": 0.00035277833500501504, "loss": 2.7945, "theoretical_loss": 3.650875058677788, "tokens_seen": 995170304 }, { "epoch": 2.09, "learning_rate": 0.00035276830491474427, "loss": 2.9055, "theoretical_loss": 3.650852158667424, "tokens_seen": 995235840 }, { "epoch": 2.09, "learning_rate": 0.0003527582748244734, "loss": 2.864, "theoretical_loss": 3.6508292605871664, "tokens_seen": 995301376 }, { "epoch": 2.09, "learning_rate": 0.00035274824473420263, "loss": 2.7787, "theoretical_loss": 3.6508063644367255, "tokens_seen": 995366912 }, { "epoch": 2.09, "learning_rate": 0.0003527382146439318, "loss": 2.9785, "theoretical_loss": 3.6507834702158113, "tokens_seen": 995432448 }, { "epoch": 2.09, "learning_rate": 0.000352728184553661, "loss": 2.9266, "theoretical_loss": 3.650760577924135, "tokens_seen": 995497984 }, { "epoch": 2.09, "learning_rate": 0.0003527181544633902, "loss": 2.889, "theoretical_loss": 3.6507376875614064, "tokens_seen": 995563520 }, { "epoch": 2.09, "learning_rate": 0.0003527081243731194, "loss": 2.806, "theoretical_loss": 3.6507147991273357, "tokens_seen": 995629056 }, { "epoch": 2.09, "learning_rate": 0.00035269809428284854, "loss": 2.7373, "theoretical_loss": 3.6506919126216344, "tokens_seen": 995694592 }, { "epoch": 2.09, "learning_rate": 0.0003526880641925778, "loss": 2.6776, "theoretical_loss": 3.6506690280440126, "tokens_seen": 995760128 }, { "epoch": 2.09, "learning_rate": 0.0003526780341023069, "loss": 2.8387, "theoretical_loss": 3.650646145394181, "tokens_seen": 995825664 }, { "epoch": 2.09, "learning_rate": 0.00035266800401203614, "loss": 2.8391, "theoretical_loss": 3.65062326467185, "tokens_seen": 995891200 }, { "epoch": 2.09, "learning_rate": 0.0003526579739217653, "loss": 3.001, "theoretical_loss": 3.6506003858767313, "tokens_seen": 995956736 }, { "epoch": 2.09, "learning_rate": 0.0003526479438314945, "loss": 2.8491, "theoretical_loss": 3.6505775090085355, "tokens_seen": 996022272 }, { "epoch": 2.09, "learning_rate": 0.0003526379137412237, "loss": 2.8832, "theoretical_loss": 3.650554634066973, "tokens_seen": 996087808 }, { "epoch": 2.09, "objective/train/docs_used": 1601234, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9211387634277344, "objective/train/theoretical_loss": 3.650543197318589, "objective/train/tokens_used": 1016580576, "theoretical_loss": 3.650543197318589, "tokens_seen": 996120576 }, { "epoch": 2.09, "learning_rate": 0.00035262788365095286, "loss": 2.9969, "theoretical_loss": 3.650531761051755, "tokens_seen": 996153344 }, { "epoch": 2.09, "learning_rate": 0.00035261785356068204, "loss": 2.9083, "theoretical_loss": 3.650508889962593, "tokens_seen": 996218880 }, { "epoch": 2.09, "learning_rate": 0.0003526078234704113, "loss": 2.9804, "theoretical_loss": 3.6504860207991974, "tokens_seen": 996284416 }, { "epoch": 2.09, "learning_rate": 0.0003525977933801404, "loss": 2.8363, "theoretical_loss": 3.65046315356128, "tokens_seen": 996349952 }, { "epoch": 2.09, "learning_rate": 0.00035258776328986964, "loss": 2.8863, "theoretical_loss": 3.650440288248552, "tokens_seen": 996415488 }, { "epoch": 2.09, "learning_rate": 0.00035257773319959877, "loss": 2.895, "theoretical_loss": 3.6504174248607244, "tokens_seen": 996481024 }, { "epoch": 2.09, "learning_rate": 0.000352567703109328, "loss": 2.9422, "theoretical_loss": 3.6503945633975086, "tokens_seen": 996546560 }, { "epoch": 2.09, "learning_rate": 0.0003525576730190572, "loss": 2.9232, "theoretical_loss": 3.6503717038586156, "tokens_seen": 996612096 }, { "epoch": 2.09, "learning_rate": 0.00035254764292878637, "loss": 2.8196, "theoretical_loss": 3.6503488462437583, "tokens_seen": 996677632 }, { "epoch": 2.09, "learning_rate": 0.00035253761283851555, "loss": 2.8978, "theoretical_loss": 3.6503259905526475, "tokens_seen": 996743168 }, { "epoch": 2.09, "learning_rate": 0.0003525275827482448, "loss": 2.744, "theoretical_loss": 3.650303136784994, "tokens_seen": 996808704 }, { "epoch": 2.09, "learning_rate": 0.0003525175526579739, "loss": 2.9369, "theoretical_loss": 3.6502802849405107, "tokens_seen": 996874240 }, { "epoch": 2.1, "learning_rate": 0.00035250752256770314, "loss": 2.8621, "theoretical_loss": 3.6502574350189088, "tokens_seen": 996939776 }, { "epoch": 2.1, "learning_rate": 0.00035249749247743227, "loss": 2.921, "theoretical_loss": 3.6502345870199004, "tokens_seen": 997005312 }, { "epoch": 2.1, "learning_rate": 0.0003524874623871615, "loss": 2.7311, "theoretical_loss": 3.650211740943197, "tokens_seen": 997070848 }, { "epoch": 2.1, "learning_rate": 0.0003524774322968907, "loss": 2.8801, "theoretical_loss": 3.6501888967885106, "tokens_seen": 997136384 }, { "epoch": 2.1, "learning_rate": 0.00035246740220661987, "loss": 2.9681, "theoretical_loss": 3.6501660545555534, "tokens_seen": 997201920 }, { "epoch": 2.1, "learning_rate": 0.00035245737211634905, "loss": 2.8427, "theoretical_loss": 3.650143214244037, "tokens_seen": 997267456 }, { "epoch": 2.1, "learning_rate": 0.00035244734202607823, "loss": 2.8073, "theoretical_loss": 3.6501203758536738, "tokens_seen": 997332992 }, { "epoch": 2.1, "learning_rate": 0.0003524373119358074, "loss": 3.0458, "theoretical_loss": 3.6500975393841766, "tokens_seen": 997398528 }, { "epoch": 2.1, "learning_rate": 0.00035242728184553665, "loss": 2.8285, "theoretical_loss": 3.650074704835257, "tokens_seen": 997464064 }, { "epoch": 2.1, "learning_rate": 0.0003524172517552658, "loss": 2.8811, "theoretical_loss": 3.650051872206627, "tokens_seen": 997529600 }, { "epoch": 2.1, "learning_rate": 0.000352407221664995, "loss": 2.8725, "theoretical_loss": 3.6500290414979997, "tokens_seen": 997595136 }, { "epoch": 2.1, "learning_rate": 0.00035239719157472414, "loss": 2.9465, "theoretical_loss": 3.650006212709087, "tokens_seen": 997660672 }, { "epoch": 2.1, "learning_rate": 0.00035238716148445337, "loss": 2.8452, "theoretical_loss": 3.649983385839602, "tokens_seen": 997726208 }, { "epoch": 2.1, "objective/train/docs_used": 1602688, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.41609787940979, "objective/train/theoretical_loss": 3.649971973124554, "objective/train/tokens_used": 1018218976, "theoretical_loss": 3.649971973124554, "tokens_seen": 997758976 }, { "epoch": 2.1, "learning_rate": 0.00035237713139418255, "loss": 2.7693, "theoretical_loss": 3.649960560889256, "tokens_seen": 997791744 }, { "epoch": 2.1, "learning_rate": 0.00035236710130391173, "loss": 2.7954, "theoretical_loss": 3.649937737857763, "tokens_seen": 997857280 }, { "epoch": 2.1, "learning_rate": 0.0003523570712136409, "loss": 2.8501, "theoretical_loss": 3.649914916744835, "tokens_seen": 997922816 }, { "epoch": 2.1, "learning_rate": 0.00035234704112337015, "loss": 2.9444, "theoretical_loss": 3.6498920975501847, "tokens_seen": 997988352 }, { "epoch": 2.1, "learning_rate": 0.0003523370110330993, "loss": 2.8804, "theoretical_loss": 3.6498692802735255, "tokens_seen": 998053888 }, { "epoch": 2.1, "learning_rate": 0.0003523269809428285, "loss": 3.0806, "theoretical_loss": 3.649846464914569, "tokens_seen": 998119424 }, { "epoch": 2.1, "learning_rate": 0.00035231695085255764, "loss": 2.8971, "theoretical_loss": 3.64982365147303, "tokens_seen": 998184960 }, { "epoch": 2.1, "learning_rate": 0.0003523069207622869, "loss": 2.8429, "theoretical_loss": 3.64980083994862, "tokens_seen": 998250496 }, { "epoch": 2.1, "learning_rate": 0.00035229689067201606, "loss": 2.6288, "theoretical_loss": 3.6497780303410523, "tokens_seen": 998316032 }, { "epoch": 2.1, "learning_rate": 0.00035228686058174524, "loss": 2.9004, "theoretical_loss": 3.6497552226500405, "tokens_seen": 998381568 }, { "epoch": 2.1, "learning_rate": 0.0003522768304914744, "loss": 2.8158, "theoretical_loss": 3.649732416875297, "tokens_seen": 998447104 }, { "epoch": 2.1, "learning_rate": 0.0003522668004012036, "loss": 2.919, "theoretical_loss": 3.649709613016536, "tokens_seen": 998512640 }, { "epoch": 2.1, "learning_rate": 0.0003522567703109328, "loss": 2.9927, "theoretical_loss": 3.64968681107347, "tokens_seen": 998578176 }, { "epoch": 2.1, "learning_rate": 0.000352246740220662, "loss": 2.8456, "theoretical_loss": 3.649664011045813, "tokens_seen": 998643712 }, { "epoch": 2.1, "learning_rate": 0.00035223671013039114, "loss": 2.8852, "theoretical_loss": 3.649641212933278, "tokens_seen": 998709248 }, { "epoch": 2.1, "learning_rate": 0.0003522266800401204, "loss": 2.9031, "theoretical_loss": 3.6496184167355787, "tokens_seen": 998774784 }, { "epoch": 2.1, "learning_rate": 0.00035221664994984956, "loss": 2.8363, "theoretical_loss": 3.649595622452428, "tokens_seen": 998840320 }, { "epoch": 2.1, "learning_rate": 0.00035220661985957874, "loss": 2.8742, "theoretical_loss": 3.6495728300835397, "tokens_seen": 998905856 }, { "epoch": 2.1, "learning_rate": 0.0003521965897693079, "loss": 2.832, "theoretical_loss": 3.649550039628629, "tokens_seen": 998971392 }, { "epoch": 2.1, "learning_rate": 0.0003521865596790371, "loss": 2.9922, "theoretical_loss": 3.6495272510874077, "tokens_seen": 999036928 }, { "epoch": 2.1, "learning_rate": 0.0003521765295887663, "loss": 2.9223, "theoretical_loss": 3.64950446445959, "tokens_seen": 999102464 }, { "epoch": 2.1, "learning_rate": 0.0003521664994984955, "loss": 3.0839, "theoretical_loss": 3.6494816797448904, "tokens_seen": 999168000 }, { "epoch": 2.1, "learning_rate": 0.00035215646940822465, "loss": 2.756, "theoretical_loss": 3.649458896943022, "tokens_seen": 999233536 }, { "epoch": 2.1, "learning_rate": 0.0003521464393179539, "loss": 2.8804, "theoretical_loss": 3.6494361160536997, "tokens_seen": 999299072 }, { "epoch": 2.1, "learning_rate": 0.000352136409227683, "loss": 2.9849, "theoretical_loss": 3.649413337076637, "tokens_seen": 999364608 }, { "epoch": 2.1, "objective/train/docs_used": 1605516, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8316280841827393, "objective/train/theoretical_loss": 3.6494019483051137, "objective/train/tokens_used": 1019857376, "theoretical_loss": 3.6494019483051137, "tokens_seen": 999397376 }, { "epoch": 2.1, "learning_rate": 0.00035212637913741224, "loss": 2.9387, "theoretical_loss": 3.6493905600115477, "tokens_seen": 999430144 }, { "epoch": 2.1, "learning_rate": 0.0003521163490471414, "loss": 3.0366, "theoretical_loss": 3.649367784858147, "tokens_seen": 999495680 }, { "epoch": 2.1, "learning_rate": 0.0003521063189568706, "loss": 2.7372, "theoretical_loss": 3.6493450116161483, "tokens_seen": 999561216 }, { "epoch": 2.1, "learning_rate": 0.00035209628886659984, "loss": 3.1284, "theoretical_loss": 3.649322240285265, "tokens_seen": 999626752 }, { "epoch": 2.1, "learning_rate": 0.00035208625877632897, "loss": 2.8551, "theoretical_loss": 3.6492994708652136, "tokens_seen": 999692288 }, { "epoch": 2.1, "learning_rate": 0.0003520762286860582, "loss": 2.8078, "theoretical_loss": 3.649276703355707, "tokens_seen": 999757824 }, { "epoch": 2.1, "learning_rate": 0.0003520661985957874, "loss": 3.0163, "theoretical_loss": 3.64925393775646, "tokens_seen": 999823360 }, { "epoch": 2.1, "learning_rate": 0.00035205616850551657, "loss": 2.8968, "theoretical_loss": 3.649231174067187, "tokens_seen": 999888896 }, { "epoch": 2.1, "learning_rate": 0.00035204613841524575, "loss": 2.9651, "theoretical_loss": 3.649208412287603, "tokens_seen": 999954432 }, { "epoch": 2.1, "learning_rate": 0.000352036108324975, "loss": 3.0437, "theoretical_loss": 3.6491856524174224, "tokens_seen": 1000019968 }, { "epoch": 2.1, "learning_rate": 0.0003520260782347041, "loss": 2.8893, "theoretical_loss": 3.649162894456359, "tokens_seen": 1000085504 }, { "epoch": 2.1, "learning_rate": 0.00035201604814443334, "loss": 2.8239, "theoretical_loss": 3.6491401384041295, "tokens_seen": 1000151040 }, { "epoch": 2.1, "learning_rate": 0.00035200601805416247, "loss": 2.9072, "theoretical_loss": 3.649117384260448, "tokens_seen": 1000216576 }, { "epoch": 2.1, "learning_rate": 0.0003519959879638917, "loss": 2.9402, "theoretical_loss": 3.649094632025028, "tokens_seen": 1000282112 }, { "epoch": 2.1, "learning_rate": 0.0003519859578736209, "loss": 2.9095, "theoretical_loss": 3.6490718816975862, "tokens_seen": 1000347648 }, { "epoch": 2.1, "learning_rate": 0.00035197592778335007, "loss": 2.987, "theoretical_loss": 3.649049133277837, "tokens_seen": 1000413184 }, { "epoch": 2.1, "learning_rate": 0.00035196589769307925, "loss": 2.9062, "theoretical_loss": 3.6490263867654953, "tokens_seen": 1000478720 }, { "epoch": 2.1, "learning_rate": 0.00035195586760280843, "loss": 2.936, "theoretical_loss": 3.649003642160276, "tokens_seen": 1000544256 }, { "epoch": 2.1, "learning_rate": 0.0003519458375125376, "loss": 2.9053, "theoretical_loss": 3.648980899461895, "tokens_seen": 1000609792 }, { "epoch": 2.1, "learning_rate": 0.00035193580742226685, "loss": 2.9058, "theoretical_loss": 3.6489581586700677, "tokens_seen": 1000675328 }, { "epoch": 2.1, "learning_rate": 0.000351925777331996, "loss": 2.9094, "theoretical_loss": 3.6489354197845083, "tokens_seen": 1000740864 }, { "epoch": 2.1, "learning_rate": 0.0003519157472417252, "loss": 2.9989, "theoretical_loss": 3.648912682804933, "tokens_seen": 1000806400 }, { "epoch": 2.1, "learning_rate": 0.00035190571715145434, "loss": 2.9492, "theoretical_loss": 3.6488899477310577, "tokens_seen": 1000871936 }, { "epoch": 2.1, "learning_rate": 0.00035189568706118357, "loss": 2.786, "theoretical_loss": 3.648867214562596, "tokens_seen": 1000937472 }, { "epoch": 2.1, "learning_rate": 0.00035188565697091275, "loss": 2.9666, "theoretical_loss": 3.648844483299266, "tokens_seen": 1001003008 }, { "epoch": 2.1, "objective/train/docs_used": 1608464, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8904836177825928, "objective/train/theoretical_loss": 3.648833118381936, "objective/train/tokens_used": 1021495776, "theoretical_loss": 3.648833118381936, "tokens_seen": 1001035776 }, { "epoch": 2.1, "learning_rate": 0.00035187562688064193, "loss": 2.971, "theoretical_loss": 3.6488217539407817, "tokens_seen": 1001068544 }, { "epoch": 2.1, "learning_rate": 0.0003518655967903711, "loss": 2.8298, "theoretical_loss": 3.6487990264868593, "tokens_seen": 1001134080 }, { "epoch": 2.1, "learning_rate": 0.00035185556670010035, "loss": 2.8467, "theoretical_loss": 3.6487763009372145, "tokens_seen": 1001199616 }, { "epoch": 2.1, "learning_rate": 0.0003518455366098295, "loss": 2.8622, "theoretical_loss": 3.6487535772915627, "tokens_seen": 1001265152 }, { "epoch": 2.1, "learning_rate": 0.0003518355065195587, "loss": 3.0166, "theoretical_loss": 3.6487308555496205, "tokens_seen": 1001330688 }, { "epoch": 2.1, "learning_rate": 0.00035182547642928784, "loss": 2.8738, "theoretical_loss": 3.6487081357111033, "tokens_seen": 1001396224 }, { "epoch": 2.1, "learning_rate": 0.0003518154463390171, "loss": 2.8317, "theoretical_loss": 3.6486854177757273, "tokens_seen": 1001461760 }, { "epoch": 2.1, "learning_rate": 0.00035180541624874626, "loss": 2.9523, "theoretical_loss": 3.6486627017432083, "tokens_seen": 1001527296 }, { "epoch": 2.1, "learning_rate": 0.00035179538615847544, "loss": 2.864, "theoretical_loss": 3.648639987613263, "tokens_seen": 1001592832 }, { "epoch": 2.1, "learning_rate": 0.0003517853560682046, "loss": 2.8495, "theoretical_loss": 3.648617275385607, "tokens_seen": 1001658368 }, { "epoch": 2.1, "learning_rate": 0.0003517753259779338, "loss": 2.8776, "theoretical_loss": 3.6485945650599567, "tokens_seen": 1001723904 }, { "epoch": 2.1, "learning_rate": 0.000351765295887663, "loss": 2.6938, "theoretical_loss": 3.6485718566360283, "tokens_seen": 1001789440 }, { "epoch": 2.1, "learning_rate": 0.0003517552657973922, "loss": 2.7863, "theoretical_loss": 3.6485491501135385, "tokens_seen": 1001854976 }, { "epoch": 2.1, "learning_rate": 0.00035174523570712134, "loss": 2.9154, "theoretical_loss": 3.6485264454922035, "tokens_seen": 1001920512 }, { "epoch": 2.1, "learning_rate": 0.0003517352056168506, "loss": 2.8912, "theoretical_loss": 3.64850374277174, "tokens_seen": 1001986048 }, { "epoch": 2.1, "learning_rate": 0.00035172517552657976, "loss": 3.081, "theoretical_loss": 3.6484810419518645, "tokens_seen": 1002051584 }, { "epoch": 2.1, "learning_rate": 0.00035171514543630894, "loss": 2.962, "theoretical_loss": 3.648458343032293, "tokens_seen": 1002117120 }, { "epoch": 2.1, "learning_rate": 0.0003517051153460381, "loss": 2.7461, "theoretical_loss": 3.648435646012743, "tokens_seen": 1002182656 }, { "epoch": 2.1, "learning_rate": 0.0003516950852557673, "loss": 2.7799, "theoretical_loss": 3.64841295089293, "tokens_seen": 1002248192 }, { "epoch": 2.1, "learning_rate": 0.0003516850551654965, "loss": 2.7796, "theoretical_loss": 3.6483902576725726, "tokens_seen": 1002313728 }, { "epoch": 2.1, "learning_rate": 0.0003516750250752257, "loss": 2.7475, "theoretical_loss": 3.648367566351386, "tokens_seen": 1002379264 }, { "epoch": 2.1, "learning_rate": 0.00035166499498495485, "loss": 2.8086, "theoretical_loss": 3.6483448769290883, "tokens_seen": 1002444800 }, { "epoch": 2.1, "learning_rate": 0.0003516549648946841, "loss": 2.8711, "theoretical_loss": 3.6483221894053957, "tokens_seen": 1002510336 }, { "epoch": 2.1, "learning_rate": 0.0003516449348044132, "loss": 2.9396, "theoretical_loss": 3.6482995037800254, "tokens_seen": 1002575872 }, { "epoch": 2.1, "learning_rate": 0.00035163490471414244, "loss": 2.8567, "theoretical_loss": 3.648276820052695, "tokens_seen": 1002641408 }, { "epoch": 2.1, "objective/train/docs_used": 1611253, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.967592477798462, "objective/train/theoretical_loss": 3.648265478900706, "objective/train/tokens_used": 1023134176, "theoretical_loss": 3.648265478900706, "tokens_seen": 1002674176 }, { "epoch": 2.1, "learning_rate": 0.0003516248746238716, "loss": 2.6796, "theoretical_loss": 3.6482541382231206, "tokens_seen": 1002706944 }, { "epoch": 2.1, "learning_rate": 0.0003516148445336008, "loss": 2.7918, "theoretical_loss": 3.6482314582910202, "tokens_seen": 1002772480 }, { "epoch": 2.1, "learning_rate": 0.00035160481444333, "loss": 2.8562, "theoretical_loss": 3.6482087802561116, "tokens_seen": 1002838016 }, { "epoch": 2.1, "learning_rate": 0.00035159478435305917, "loss": 3.034, "theoretical_loss": 3.6481861041181105, "tokens_seen": 1002903552 }, { "epoch": 2.1, "learning_rate": 0.00035158475426278835, "loss": 2.9352, "theoretical_loss": 3.648163429876736, "tokens_seen": 1002969088 }, { "epoch": 2.1, "learning_rate": 0.0003515747241725176, "loss": 2.8097, "theoretical_loss": 3.648140757531704, "tokens_seen": 1003034624 }, { "epoch": 2.1, "learning_rate": 0.0003515646940822467, "loss": 2.9309, "theoretical_loss": 3.648118087082733, "tokens_seen": 1003100160 }, { "epoch": 2.1, "learning_rate": 0.00035155466399197595, "loss": 2.8605, "theoretical_loss": 3.648095418529541, "tokens_seen": 1003165696 }, { "epoch": 2.1, "learning_rate": 0.00035154463390170513, "loss": 2.7627, "theoretical_loss": 3.6480727518718443, "tokens_seen": 1003231232 }, { "epoch": 2.1, "learning_rate": 0.0003515346038114343, "loss": 2.8052, "theoretical_loss": 3.648050087109362, "tokens_seen": 1003296768 }, { "epoch": 2.1, "learning_rate": 0.0003515245737211635, "loss": 2.8241, "theoretical_loss": 3.6480274242418105, "tokens_seen": 1003362304 }, { "epoch": 2.1, "learning_rate": 0.00035151454363089267, "loss": 2.6978, "theoretical_loss": 3.6480047632689083, "tokens_seen": 1003427840 }, { "epoch": 2.1, "learning_rate": 0.00035150451354062185, "loss": 2.7952, "theoretical_loss": 3.6479821041903735, "tokens_seen": 1003493376 }, { "epoch": 2.1, "learning_rate": 0.0003514944834503511, "loss": 2.78, "theoretical_loss": 3.647959447005924, "tokens_seen": 1003558912 }, { "epoch": 2.1, "learning_rate": 0.0003514844533600802, "loss": 2.824, "theoretical_loss": 3.647936791715277, "tokens_seen": 1003624448 }, { "epoch": 2.1, "learning_rate": 0.00035147442326980945, "loss": 2.8685, "theoretical_loss": 3.6479141383181517, "tokens_seen": 1003689984 }, { "epoch": 2.1, "learning_rate": 0.0003514643931795386, "loss": 3.0035, "theoretical_loss": 3.647891486814265, "tokens_seen": 1003755520 }, { "epoch": 2.1, "learning_rate": 0.0003514543630892678, "loss": 2.8855, "theoretical_loss": 3.647868837203336, "tokens_seen": 1003821056 }, { "epoch": 2.1, "learning_rate": 0.000351444332998997, "loss": 2.9173, "theoretical_loss": 3.647846189485083, "tokens_seen": 1003886592 }, { "epoch": 2.1, "learning_rate": 0.0003514343029087262, "loss": 2.9173, "theoretical_loss": 3.6478235436592237, "tokens_seen": 1003952128 }, { "epoch": 2.1, "learning_rate": 0.00035142427281845536, "loss": 2.8706, "theoretical_loss": 3.6478008997254765, "tokens_seen": 1004017664 }, { "epoch": 2.1, "learning_rate": 0.00035141424272818454, "loss": 2.805, "theoretical_loss": 3.6477782576835605, "tokens_seen": 1004083200 }, { "epoch": 2.1, "learning_rate": 0.0003514042126379137, "loss": 2.8145, "theoretical_loss": 3.6477556175331936, "tokens_seen": 1004148736 }, { "epoch": 2.1, "learning_rate": 0.00035139418254764295, "loss": 3.0146, "theoretical_loss": 3.6477329792740942, "tokens_seen": 1004214272 }, { "epoch": 2.1, "learning_rate": 0.0003513841524573721, "loss": 2.9611, "theoretical_loss": 3.647710342905982, "tokens_seen": 1004279808 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.986518144607544, "objective/train/theoretical_loss": 3.647699025430957, "objective/train/tokens_used": 1024772576, "theoretical_loss": 3.647699025430957, "tokens_seen": 1004312576 }, { "epoch": 2.1, "learning_rate": 0.0003513741223671013, "loss": 2.782, "theoretical_loss": 3.647687708428574, "tokens_seen": 1004345344 }, { "epoch": 2.1, "learning_rate": 0.0003513640922768305, "loss": 2.8993, "theoretical_loss": 3.64766507584159, "tokens_seen": 1004410880 }, { "epoch": 2.1, "learning_rate": 0.0003513540621865597, "loss": 2.8469, "theoretical_loss": 3.647642445144748, "tokens_seen": 1004476416 }, { "epoch": 2.1, "learning_rate": 0.0003513440320962889, "loss": 2.7976, "theoretical_loss": 3.6476198163377678, "tokens_seen": 1004541952 }, { "epoch": 2.1, "learning_rate": 0.00035133400200601804, "loss": 2.8351, "theoretical_loss": 3.647597189420368, "tokens_seen": 1004607488 }, { "epoch": 2.1, "learning_rate": 0.0003513239719157473, "loss": 2.8332, "theoretical_loss": 3.6475745643922677, "tokens_seen": 1004673024 }, { "epoch": 2.1, "learning_rate": 0.00035131394182547646, "loss": 3.026, "theoretical_loss": 3.6475519412531856, "tokens_seen": 1004738560 }, { "epoch": 2.1, "learning_rate": 0.00035130391173520564, "loss": 2.9145, "theoretical_loss": 3.6475293200028402, "tokens_seen": 1004804096 }, { "epoch": 2.1, "learning_rate": 0.0003512938816449348, "loss": 3.0629, "theoretical_loss": 3.647506700640952, "tokens_seen": 1004869632 }, { "epoch": 2.1, "learning_rate": 0.000351283851554664, "loss": 3.0899, "theoretical_loss": 3.647484083167239, "tokens_seen": 1004935168 }, { "epoch": 2.1, "learning_rate": 0.0003512738214643932, "loss": 2.915, "theoretical_loss": 3.647461467581421, "tokens_seen": 1005000704 }, { "epoch": 2.1, "learning_rate": 0.0003512637913741224, "loss": 2.8405, "theoretical_loss": 3.647438853883217, "tokens_seen": 1005066240 }, { "epoch": 2.1, "learning_rate": 0.00035125376128385154, "loss": 2.7966, "theoretical_loss": 3.6474162420723473, "tokens_seen": 1005131776 }, { "epoch": 2.1, "learning_rate": 0.0003512437311935808, "loss": 2.7318, "theoretical_loss": 3.64739363214853, "tokens_seen": 1005197312 }, { "epoch": 2.1, "learning_rate": 0.00035123370110330996, "loss": 2.8516, "theoretical_loss": 3.6473710241114863, "tokens_seen": 1005262848 }, { "epoch": 2.1, "learning_rate": 0.00035122367101303914, "loss": 2.7903, "theoretical_loss": 3.6473484179609343, "tokens_seen": 1005328384 }, { "epoch": 2.1, "learning_rate": 0.0003512136409227683, "loss": 3.1374, "theoretical_loss": 3.6473258136965936, "tokens_seen": 1005393920 }, { "epoch": 2.1, "learning_rate": 0.0003512036108324975, "loss": 2.8929, "theoretical_loss": 3.647303211318185, "tokens_seen": 1005459456 }, { "epoch": 2.1, "learning_rate": 0.0003511935807422267, "loss": 2.8259, "theoretical_loss": 3.647280610825427, "tokens_seen": 1005524992 }, { "epoch": 2.1, "learning_rate": 0.0003511835506519559, "loss": 2.9536, "theoretical_loss": 3.6472580122180407, "tokens_seen": 1005590528 }, { "epoch": 2.1, "learning_rate": 0.00035117352056168505, "loss": 2.9138, "theoretical_loss": 3.6472354154957447, "tokens_seen": 1005656064 }, { "epoch": 2.1, "learning_rate": 0.0003511634904714143, "loss": 2.947, "theoretical_loss": 3.6472128206582597, "tokens_seen": 1005721600 }, { "epoch": 2.1, "learning_rate": 0.0003511534603811434, "loss": 2.8584, "theoretical_loss": 3.6471902277053054, "tokens_seen": 1005787136 }, { "epoch": 2.1, "learning_rate": 0.00035114343029087264, "loss": 3.0689, "theoretical_loss": 3.6471676366366017, "tokens_seen": 1005852672 }, { "epoch": 2.1, "learning_rate": 0.0003511334002006018, "loss": 2.7006, "theoretical_loss": 3.647145047451869, "tokens_seen": 1005918208 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8857154846191406, "objective/train/theoretical_loss": 3.647133753565905, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.647133753565905, "tokens_seen": 1005950976 }, { "epoch": 2.1, "learning_rate": 0.000351123370110331, "loss": 2.8027, "theoretical_loss": 3.647122460150828, "tokens_seen": 1005983744 }, { "epoch": 2.1, "learning_rate": 0.0003511133400200602, "loss": 2.9856, "theoretical_loss": 3.6470998747331977, "tokens_seen": 1006049280 }, { "epoch": 2.1, "learning_rate": 0.00035110330992978937, "loss": 2.8457, "theoretical_loss": 3.6470772911986993, "tokens_seen": 1006114816 }, { "epoch": 2.1, "learning_rate": 0.00035109327983951855, "loss": 2.8521, "theoretical_loss": 3.647054709547053, "tokens_seen": 1006180352 }, { "epoch": 2.1, "learning_rate": 0.0003510832497492478, "loss": 2.8583, "theoretical_loss": 3.647032129777978, "tokens_seen": 1006245888 }, { "epoch": 2.1, "learning_rate": 0.0003510732196589769, "loss": 2.8074, "theoretical_loss": 3.647009551891197, "tokens_seen": 1006311424 }, { "epoch": 2.1, "learning_rate": 0.00035106318956870615, "loss": 3.0219, "theoretical_loss": 3.6469869758864286, "tokens_seen": 1006376960 }, { "epoch": 2.1, "learning_rate": 0.00035105315947843533, "loss": 2.939, "theoretical_loss": 3.646964401763394, "tokens_seen": 1006442496 }, { "epoch": 2.1, "learning_rate": 0.0003510431293881645, "loss": 2.9949, "theoretical_loss": 3.646941829521814, "tokens_seen": 1006508032 }, { "epoch": 2.1, "learning_rate": 0.0003510330992978937, "loss": 2.846, "theoretical_loss": 3.6469192591614092, "tokens_seen": 1006573568 }, { "epoch": 2.1, "learning_rate": 0.00035102306920762287, "loss": 2.8539, "theoretical_loss": 3.646896690681901, "tokens_seen": 1006639104 }, { "epoch": 2.1, "learning_rate": 0.00035101303911735205, "loss": 2.8647, "theoretical_loss": 3.646874124083009, "tokens_seen": 1006704640 }, { "epoch": 2.1, "learning_rate": 0.0003510030090270813, "loss": 2.7841, "theoretical_loss": 3.6468515593644546, "tokens_seen": 1006770176 }, { "epoch": 2.1, "learning_rate": 0.0003509929789368104, "loss": 2.8908, "theoretical_loss": 3.646828996525959, "tokens_seen": 1006835712 }, { "epoch": 2.1, "learning_rate": 0.00035098294884653965, "loss": 2.9476, "theoretical_loss": 3.6468064355672434, "tokens_seen": 1006901248 }, { "epoch": 2.1, "learning_rate": 0.0003509729187562688, "loss": 2.8749, "theoretical_loss": 3.6467838764880276, "tokens_seen": 1006966784 }, { "epoch": 2.1, "learning_rate": 0.000350962888665998, "loss": 2.9604, "theoretical_loss": 3.646761319288034, "tokens_seen": 1007032320 }, { "epoch": 2.1, "learning_rate": 0.0003509528585757272, "loss": 2.8672, "theoretical_loss": 3.6467387639669835, "tokens_seen": 1007097856 }, { "epoch": 2.1, "learning_rate": 0.0003509428284854564, "loss": 2.755, "theoretical_loss": 3.646716210524597, "tokens_seen": 1007163392 }, { "epoch": 2.1, "learning_rate": 0.00035093279839518556, "loss": 3.0416, "theoretical_loss": 3.6466936589605963, "tokens_seen": 1007228928 }, { "epoch": 2.1, "learning_rate": 0.00035092276830491474, "loss": 2.8724, "theoretical_loss": 3.6466711092747017, "tokens_seen": 1007294464 }, { "epoch": 2.1, "learning_rate": 0.0003509127382146439, "loss": 2.8651, "theoretical_loss": 3.6466485614666357, "tokens_seen": 1007360000 }, { "epoch": 2.1, "learning_rate": 0.00035090270812437315, "loss": 2.8246, "theoretical_loss": 3.64662601553612, "tokens_seen": 1007425536 }, { "epoch": 2.1, "learning_rate": 0.0003508926780341023, "loss": 2.8882, "theoretical_loss": 3.646603471482875, "tokens_seen": 1007491072 }, { "epoch": 2.1, "learning_rate": 0.0003508826479438315, "loss": 2.9684, "theoretical_loss": 3.646580929306623, "tokens_seen": 1007556608 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1698482036590576, "objective/train/theoretical_loss": 3.646569658922282, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.646569658922282, "tokens_seen": 1007589376 }, { "epoch": 2.1, "learning_rate": 0.0003508726178535607, "loss": 3.0477, "theoretical_loss": 3.646558389007085, "tokens_seen": 1007622144 }, { "epoch": 2.1, "learning_rate": 0.0003508625877632899, "loss": 2.9333, "theoretical_loss": 3.6465358505839838, "tokens_seen": 1007687680 }, { "epoch": 2.1, "learning_rate": 0.00035085255767301906, "loss": 2.996, "theoretical_loss": 3.6465133140370405, "tokens_seen": 1007753216 }, { "epoch": 2.1, "learning_rate": 0.00035084252758274824, "loss": 2.8243, "theoretical_loss": 3.6464907793659767, "tokens_seen": 1007818752 }, { "epoch": 2.1, "learning_rate": 0.0003508324974924774, "loss": 3.0361, "theoretical_loss": 3.6464682465705147, "tokens_seen": 1007884288 }, { "epoch": 2.1, "learning_rate": 0.00035082246740220666, "loss": 2.9744, "theoretical_loss": 3.6464457156503762, "tokens_seen": 1007949824 }, { "epoch": 2.1, "learning_rate": 0.0003508124373119358, "loss": 2.839, "theoretical_loss": 3.6464231866052836, "tokens_seen": 1008015360 }, { "epoch": 2.1, "learning_rate": 0.000350802407221665, "loss": 2.804, "theoretical_loss": 3.646400659434959, "tokens_seen": 1008080896 }, { "epoch": 2.1, "learning_rate": 0.00035079237713139415, "loss": 2.8651, "theoretical_loss": 3.6463781341391233, "tokens_seen": 1008146432 }, { "epoch": 2.1, "learning_rate": 0.0003507823470411234, "loss": 2.7325, "theoretical_loss": 3.6463556107175004, "tokens_seen": 1008211968 }, { "epoch": 2.1, "learning_rate": 0.00035077231695085256, "loss": 2.8955, "theoretical_loss": 3.6463330891698114, "tokens_seen": 1008277504 }, { "epoch": 2.1, "learning_rate": 0.00035076228686058174, "loss": 2.7969, "theoretical_loss": 3.646310569495779, "tokens_seen": 1008343040 }, { "epoch": 2.1, "learning_rate": 0.0003507522567703109, "loss": 2.9079, "theoretical_loss": 3.6462880516951257, "tokens_seen": 1008408576 }, { "epoch": 2.1, "learning_rate": 0.00035074222668004016, "loss": 2.8605, "theoretical_loss": 3.646265535767574, "tokens_seen": 1008474112 }, { "epoch": 2.1, "learning_rate": 0.0003507321965897693, "loss": 2.7797, "theoretical_loss": 3.646243021712846, "tokens_seen": 1008539648 }, { "epoch": 2.1, "learning_rate": 0.0003507221664994985, "loss": 2.9418, "theoretical_loss": 3.646220509530664, "tokens_seen": 1008605184 }, { "epoch": 2.1, "learning_rate": 0.00035071213640922765, "loss": 2.9098, "theoretical_loss": 3.646197999220751, "tokens_seen": 1008670720 }, { "epoch": 2.1, "learning_rate": 0.0003507021063189569, "loss": 2.8748, "theoretical_loss": 3.64617549078283, "tokens_seen": 1008736256 }, { "epoch": 2.1, "learning_rate": 0.00035069207622868607, "loss": 2.9727, "theoretical_loss": 3.6461529842166227, "tokens_seen": 1008801792 }, { "epoch": 2.1, "learning_rate": 0.00035068204613841525, "loss": 2.7714, "theoretical_loss": 3.6461304795218528, "tokens_seen": 1008867328 }, { "epoch": 2.1, "learning_rate": 0.00035067201604814443, "loss": 2.9246, "theoretical_loss": 3.6461079766982434, "tokens_seen": 1008932864 }, { "epoch": 2.1, "learning_rate": 0.0003506619859578736, "loss": 2.8942, "theoretical_loss": 3.6460854757455166, "tokens_seen": 1008998400 }, { "epoch": 2.1, "learning_rate": 0.0003506519558676028, "loss": 2.7363, "theoretical_loss": 3.646062976663395, "tokens_seen": 1009063936 }, { "epoch": 2.1, "learning_rate": 0.000350641925777332, "loss": 2.8995, "theoretical_loss": 3.646040479451603, "tokens_seen": 1009129472 }, { "epoch": 2.1, "learning_rate": 0.00035063189568706115, "loss": 2.9075, "theoretical_loss": 3.6460179841098626, "tokens_seen": 1009195008 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.902592420578003, "objective/train/theoretical_loss": 3.6460067371401754, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6460067371401754, "tokens_seen": 1009227776 }, { "epoch": 2.1, "learning_rate": 0.0003506218655967904, "loss": 2.7698, "theoretical_loss": 3.645995490637897, "tokens_seen": 1009260544 }, { "epoch": 2.1, "learning_rate": 0.0003506118355065195, "loss": 2.871, "theoretical_loss": 3.64597299903543, "tokens_seen": 1009326080 }, { "epoch": 2.1, "learning_rate": 0.00035060180541624875, "loss": 2.9925, "theoretical_loss": 3.645950509302184, "tokens_seen": 1009391616 }, { "epoch": 2.1, "learning_rate": 0.000350591775325978, "loss": 2.9741, "theoretical_loss": 3.645928021437883, "tokens_seen": 1009457152 }, { "epoch": 2.1, "learning_rate": 0.0003505817452357071, "loss": 2.8469, "theoretical_loss": 3.6459055354422505, "tokens_seen": 1009522688 }, { "epoch": 2.1, "learning_rate": 0.00035057171514543635, "loss": 2.8699, "theoretical_loss": 3.6458830513150087, "tokens_seen": 1009588224 }, { "epoch": 2.1, "learning_rate": 0.00035056168505516553, "loss": 2.856, "theoretical_loss": 3.6458605690558823, "tokens_seen": 1009653760 }, { "epoch": 2.1, "learning_rate": 0.0003505516549648947, "loss": 2.938, "theoretical_loss": 3.645838088664595, "tokens_seen": 1009719296 }, { "epoch": 2.1, "learning_rate": 0.0003505416248746239, "loss": 2.9032, "theoretical_loss": 3.645815610140869, "tokens_seen": 1009784832 }, { "epoch": 2.1, "learning_rate": 0.00035053159478435307, "loss": 2.9444, "theoretical_loss": 3.64579313348443, "tokens_seen": 1009850368 }, { "epoch": 2.1, "learning_rate": 0.00035052156469408225, "loss": 2.764, "theoretical_loss": 3.6457706586949996, "tokens_seen": 1009915904 }, { "epoch": 2.1, "learning_rate": 0.0003505115346038115, "loss": 3.031, "theoretical_loss": 3.6457481857723026, "tokens_seen": 1009981440 }, { "epoch": 2.1, "learning_rate": 0.0003505015045135406, "loss": 2.9056, "theoretical_loss": 3.6457257147160633, "tokens_seen": 1010046976 }, { "epoch": 2.1, "learning_rate": 0.00035049147442326985, "loss": 2.9879, "theoretical_loss": 3.645703245526005, "tokens_seen": 1010112512 }, { "epoch": 2.1, "learning_rate": 0.000350481444332999, "loss": 2.7503, "theoretical_loss": 3.645680778201851, "tokens_seen": 1010178048 }, { "epoch": 2.1, "learning_rate": 0.0003504714142427282, "loss": 2.8603, "theoretical_loss": 3.6456583127433264, "tokens_seen": 1010243584 }, { "epoch": 2.1, "learning_rate": 0.0003504613841524574, "loss": 2.8339, "theoretical_loss": 3.6456358491501555, "tokens_seen": 1010309120 }, { "epoch": 2.1, "learning_rate": 0.0003504513540621866, "loss": 2.7274, "theoretical_loss": 3.6456133874220615, "tokens_seen": 1010374656 }, { "epoch": 2.1, "learning_rate": 0.00035044132397191576, "loss": 2.8099, "theoretical_loss": 3.645590927558769, "tokens_seen": 1010440192 }, { "epoch": 2.1, "learning_rate": 0.00035043129388164494, "loss": 2.8693, "theoretical_loss": 3.6455684695600015, "tokens_seen": 1010505728 }, { "epoch": 2.1, "learning_rate": 0.0003504212637913741, "loss": 2.9553, "theoretical_loss": 3.6455460134254847, "tokens_seen": 1010571264 }, { "epoch": 2.1, "learning_rate": 0.00035041123370110335, "loss": 2.7169, "theoretical_loss": 3.645523559154942, "tokens_seen": 1010636800 }, { "epoch": 2.1, "learning_rate": 0.0003504012036108325, "loss": 2.8256, "theoretical_loss": 3.6455011067480982, "tokens_seen": 1010702336 }, { "epoch": 2.1, "learning_rate": 0.0003503911735205617, "loss": 2.9035, "theoretical_loss": 3.645478656204678, "tokens_seen": 1010767872 }, { "epoch": 2.1, "learning_rate": 0.0003503811434302909, "loss": 2.856, "theoretical_loss": 3.645456207524405, "tokens_seen": 1010833408 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.953544855117798, "objective/train/theoretical_loss": 3.6454449838828626, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6454449838828626, "tokens_seen": 1010866176 }, { "epoch": 2.1, "learning_rate": 0.0003503711133400201, "loss": 2.9337, "theoretical_loss": 3.6454337607070046, "tokens_seen": 1010898944 }, { "epoch": 2.1, "learning_rate": 0.00035036108324974926, "loss": 2.9757, "theoretical_loss": 3.645411315752201, "tokens_seen": 1010964480 }, { "epoch": 2.1, "learning_rate": 0.00035035105315947844, "loss": 2.6662, "theoretical_loss": 3.6453888726597192, "tokens_seen": 1011030016 }, { "epoch": 2.1, "learning_rate": 0.0003503410230692076, "loss": 2.9801, "theoretical_loss": 3.6453664314292844, "tokens_seen": 1011095552 }, { "epoch": 2.1, "learning_rate": 0.00035033099297893686, "loss": 2.9039, "theoretical_loss": 3.6453439920606208, "tokens_seen": 1011161088 }, { "epoch": 2.1, "learning_rate": 0.000350320962888666, "loss": 2.7767, "theoretical_loss": 3.6453215545534534, "tokens_seen": 1011226624 }, { "epoch": 2.1, "learning_rate": 0.0003503109327983952, "loss": 2.9325, "theoretical_loss": 3.645299118907507, "tokens_seen": 1011292160 }, { "epoch": 2.1, "learning_rate": 0.00035030090270812435, "loss": 2.7727, "theoretical_loss": 3.645276685122507, "tokens_seen": 1011357696 }, { "epoch": 2.1, "learning_rate": 0.0003502908726178536, "loss": 3.027, "theoretical_loss": 3.6452542531981784, "tokens_seen": 1011423232 }, { "epoch": 2.1, "learning_rate": 0.00035028084252758276, "loss": 2.642, "theoretical_loss": 3.6452318231342464, "tokens_seen": 1011488768 }, { "epoch": 2.1, "learning_rate": 0.00035027081243731194, "loss": 2.8634, "theoretical_loss": 3.645209394930436, "tokens_seen": 1011554304 }, { "epoch": 2.1, "learning_rate": 0.0003502607823470411, "loss": 2.8664, "theoretical_loss": 3.645186968586472, "tokens_seen": 1011619840 }, { "epoch": 2.1, "learning_rate": 0.00035025075225677036, "loss": 2.6826, "theoretical_loss": 3.6451645441020806, "tokens_seen": 1011685376 }, { "epoch": 2.1, "learning_rate": 0.0003502407221664995, "loss": 3.0257, "theoretical_loss": 3.6451421214769866, "tokens_seen": 1011750912 }, { "epoch": 2.1, "learning_rate": 0.0003502306920762287, "loss": 2.9406, "theoretical_loss": 3.6451197007109153, "tokens_seen": 1011816448 }, { "epoch": 2.1, "learning_rate": 0.00035022066198595785, "loss": 2.8587, "theoretical_loss": 3.6450972818035927, "tokens_seen": 1011881984 }, { "epoch": 2.1, "learning_rate": 0.0003502106318956871, "loss": 2.8772, "theoretical_loss": 3.6450748647547444, "tokens_seen": 1011947520 }, { "epoch": 2.1, "learning_rate": 0.00035020060180541627, "loss": 2.7961, "theoretical_loss": 3.645052449564095, "tokens_seen": 1012013056 }, { "epoch": 2.1, "learning_rate": 0.00035019057171514545, "loss": 2.9907, "theoretical_loss": 3.645030036231371, "tokens_seen": 1012078592 }, { "epoch": 2.1, "learning_rate": 0.00035018054162487463, "loss": 2.8599, "theoretical_loss": 3.645007624756298, "tokens_seen": 1012144128 }, { "epoch": 2.1, "learning_rate": 0.0003501705115346038, "loss": 3.0866, "theoretical_loss": 3.6449852151386013, "tokens_seen": 1012209664 }, { "epoch": 2.1, "learning_rate": 0.000350160481444333, "loss": 2.7079, "theoretical_loss": 3.6449628073780076, "tokens_seen": 1012275200 }, { "epoch": 2.1, "learning_rate": 0.0003501504513540622, "loss": 2.8485, "theoretical_loss": 3.644940401474242, "tokens_seen": 1012340736 }, { "epoch": 2.1, "learning_rate": 0.00035014042126379135, "loss": 2.852, "theoretical_loss": 3.6449179974270307, "tokens_seen": 1012406272 }, { "epoch": 2.1, "learning_rate": 0.0003501303911735206, "loss": 3.015, "theoretical_loss": 3.6448955952360995, "tokens_seen": 1012471808 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9652717113494873, "objective/train/theoretical_loss": 3.6448843948366534, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6448843948366534, "tokens_seen": 1012504576 }, { "epoch": 2.1, "learning_rate": 0.0003501203610832497, "loss": 2.8788, "theoretical_loss": 3.644873194901175, "tokens_seen": 1012537344 }, { "epoch": 2.1, "learning_rate": 0.00035011033099297895, "loss": 2.8892, "theoretical_loss": 3.644850796421983, "tokens_seen": 1012602880 }, { "epoch": 2.1, "learning_rate": 0.00035010030090270813, "loss": 2.8793, "theoretical_loss": 3.6448283997982496, "tokens_seen": 1012668416 }, { "epoch": 2.1, "learning_rate": 0.0003500902708124373, "loss": 2.7196, "theoretical_loss": 3.644806005029701, "tokens_seen": 1012733952 }, { "epoch": 2.1, "learning_rate": 0.0003500802407221665, "loss": 2.8231, "theoretical_loss": 3.644783612116064, "tokens_seen": 1012799488 }, { "epoch": 2.1, "learning_rate": 0.00035007021063189573, "loss": 2.9485, "theoretical_loss": 3.644761221057064, "tokens_seen": 1012865024 }, { "epoch": 2.1, "learning_rate": 0.00035006018054162486, "loss": 2.7701, "theoretical_loss": 3.6447388318524285, "tokens_seen": 1012930560 }, { "epoch": 2.1, "learning_rate": 0.0003500501504513541, "loss": 2.9242, "theoretical_loss": 3.644716444501883, "tokens_seen": 1012996096 }, { "epoch": 2.1, "learning_rate": 0.0003500401203610832, "loss": 2.9507, "theoretical_loss": 3.644694059005155, "tokens_seen": 1013061632 }, { "epoch": 2.1, "learning_rate": 0.00035003009027081245, "loss": 2.9227, "theoretical_loss": 3.6446716753619697, "tokens_seen": 1013127168 }, { "epoch": 2.1, "learning_rate": 0.00035002006018054163, "loss": 2.973, "theoretical_loss": 3.644649293572055, "tokens_seen": 1013192704 }, { "epoch": 2.1, "learning_rate": 0.0003500100300902708, "loss": 2.9834, "theoretical_loss": 3.644626913635137, "tokens_seen": 1013258240 }, { "epoch": 2.1, "learning_rate": 0.00035, "loss": 2.8371, "theoretical_loss": 3.6446045355509424, "tokens_seen": 1013323776 }, { "epoch": 2.1, "learning_rate": 0.0003499899699097292, "loss": 2.7412, "theoretical_loss": 3.644582159319199, "tokens_seen": 1013389312 }, { "epoch": 2.1, "learning_rate": 0.00034997993981945836, "loss": 2.9098, "theoretical_loss": 3.6445597849396325, "tokens_seen": 1013454848 }, { "epoch": 2.1, "learning_rate": 0.0003499699097291876, "loss": 2.8932, "theoretical_loss": 3.64453741241197, "tokens_seen": 1013520384 }, { "epoch": 2.1, "learning_rate": 0.0003499598796389167, "loss": 2.9255, "theoretical_loss": 3.6445150417359393, "tokens_seen": 1013585920 }, { "epoch": 2.1, "learning_rate": 0.00034994984954864596, "loss": 2.9076, "theoretical_loss": 3.6444926729112663, "tokens_seen": 1013651456 }, { "epoch": 2.1, "learning_rate": 0.0003499398194583751, "loss": 2.9607, "theoretical_loss": 3.644470305937679, "tokens_seen": 1013716992 }, { "epoch": 2.1, "learning_rate": 0.0003499297893681043, "loss": 2.812, "theoretical_loss": 3.6444479408149038, "tokens_seen": 1013782528 }, { "epoch": 2.1, "learning_rate": 0.0003499197592778335, "loss": 2.7668, "theoretical_loss": 3.644425577542669, "tokens_seen": 1013848064 }, { "epoch": 2.1, "learning_rate": 0.0003499097291875627, "loss": 3.05, "theoretical_loss": 3.6444032161207005, "tokens_seen": 1013913600 }, { "epoch": 2.1, "learning_rate": 0.00034989969909729186, "loss": 2.9749, "theoretical_loss": 3.644380856548727, "tokens_seen": 1013979136 }, { "epoch": 2.1, "learning_rate": 0.0003498896690070211, "loss": 2.7788, "theoretical_loss": 3.644358498826475, "tokens_seen": 1014044672 }, { "epoch": 2.1, "learning_rate": 0.0003498796389167502, "loss": 2.9406, "theoretical_loss": 3.644336142953672, "tokens_seen": 1014110208 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.107957124710083, "objective/train/theoretical_loss": 3.644324965710729, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.644324965710729, "tokens_seen": 1014142976 }, { "epoch": 2.1, "learning_rate": 0.00034986960882647946, "loss": 2.9312, "theoretical_loss": 3.644313788930046, "tokens_seen": 1014175744 }, { "epoch": 2.1, "learning_rate": 0.0003498595787362086, "loss": 2.9225, "theoretical_loss": 3.644291436755324, "tokens_seen": 1014241280 }, { "epoch": 2.1, "learning_rate": 0.0003498495486459378, "loss": 2.8237, "theoretical_loss": 3.644269086429234, "tokens_seen": 1014306816 }, { "epoch": 2.1, "learning_rate": 0.00034983951855566706, "loss": 2.7865, "theoretical_loss": 3.6442467379515033, "tokens_seen": 1014372352 }, { "epoch": 2.1, "learning_rate": 0.0003498294884653962, "loss": 2.9782, "theoretical_loss": 3.6442243913218606, "tokens_seen": 1014437888 }, { "epoch": 2.1, "learning_rate": 0.0003498194583751254, "loss": 2.9223, "theoretical_loss": 3.644202046540032, "tokens_seen": 1014503424 }, { "epoch": 2.1, "learning_rate": 0.00034980942828485455, "loss": 2.7481, "theoretical_loss": 3.6441797036057477, "tokens_seen": 1014568960 }, { "epoch": 2.1, "learning_rate": 0.0003497993981945838, "loss": 2.8496, "theoretical_loss": 3.644157362518733, "tokens_seen": 1014634496 }, { "epoch": 2.1, "learning_rate": 0.00034978936810431296, "loss": 2.6976, "theoretical_loss": 3.644135023278718, "tokens_seen": 1014700032 }, { "epoch": 2.1, "learning_rate": 0.00034977933801404214, "loss": 2.98, "theoretical_loss": 3.64411268588543, "tokens_seen": 1014765568 }, { "epoch": 2.1, "learning_rate": 0.0003497693079237713, "loss": 2.9227, "theoretical_loss": 3.6440903503385966, "tokens_seen": 1014831104 }, { "epoch": 2.1, "learning_rate": 0.00034975927783350056, "loss": 2.9785, "theoretical_loss": 3.644068016637946, "tokens_seen": 1014896640 }, { "epoch": 2.1, "learning_rate": 0.0003497492477432297, "loss": 2.7177, "theoretical_loss": 3.644045684783207, "tokens_seen": 1014962176 }, { "epoch": 2.1, "learning_rate": 0.0003497392176529589, "loss": 2.6269, "theoretical_loss": 3.6440233547741077, "tokens_seen": 1015027712 }, { "epoch": 2.1, "learning_rate": 0.00034972918756268805, "loss": 2.9472, "theoretical_loss": 3.6440010266103755, "tokens_seen": 1015093248 }, { "epoch": 2.1, "learning_rate": 0.0003497191574724173, "loss": 2.8227, "theoretical_loss": 3.6439787002917408, "tokens_seen": 1015158784 }, { "epoch": 2.1, "learning_rate": 0.00034970912738214647, "loss": 2.8932, "theoretical_loss": 3.64395637581793, "tokens_seen": 1015224320 }, { "epoch": 2.1, "learning_rate": 0.00034969909729187565, "loss": 2.8527, "theoretical_loss": 3.6439340531886724, "tokens_seen": 1015289856 }, { "epoch": 2.1, "learning_rate": 0.00034968906720160483, "loss": 2.9954, "theoretical_loss": 3.6439117324036965, "tokens_seen": 1015355392 }, { "epoch": 2.1, "learning_rate": 0.000349679037111334, "loss": 2.9344, "theoretical_loss": 3.6438894134627313, "tokens_seen": 1015420928 }, { "epoch": 2.1, "learning_rate": 0.0003496690070210632, "loss": 2.8103, "theoretical_loss": 3.643867096365504, "tokens_seen": 1015486464 }, { "epoch": 2.1, "learning_rate": 0.0003496589769307924, "loss": 2.6204, "theoretical_loss": 3.6438447811117456, "tokens_seen": 1015552000 }, { "epoch": 2.1, "learning_rate": 0.00034964894684052155, "loss": 2.7909, "theoretical_loss": 3.6438224677011832, "tokens_seen": 1015617536 }, { "epoch": 2.1, "learning_rate": 0.0003496389167502508, "loss": 2.8205, "theoretical_loss": 3.6438001561335462, "tokens_seen": 1015683072 }, { "epoch": 2.1, "learning_rate": 0.0003496288866599799, "loss": 2.686, "theoretical_loss": 3.6437778464085637, "tokens_seen": 1015748608 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.790055751800537, "objective/train/theoretical_loss": 3.6437666922369827, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6437666922369827, "tokens_seen": 1015781376 }, { "epoch": 2.1, "learning_rate": 0.00034961885656970915, "loss": 2.782, "theoretical_loss": 3.643755538525964, "tokens_seen": 1015814144 }, { "epoch": 2.1, "learning_rate": 0.00034960882647943833, "loss": 3.0288, "theoretical_loss": 3.643733232485476, "tokens_seen": 1015879680 }, { "epoch": 2.1, "learning_rate": 0.0003495987963891675, "loss": 2.9236, "theoretical_loss": 3.6437109282868296, "tokens_seen": 1015945216 }, { "epoch": 2.1, "learning_rate": 0.0003495887662988967, "loss": 2.8203, "theoretical_loss": 3.6436886259297534, "tokens_seen": 1016010752 }, { "epoch": 2.1, "learning_rate": 0.00034957873620862593, "loss": 2.7806, "theoretical_loss": 3.643666325413977, "tokens_seen": 1016076288 }, { "epoch": 2.1, "learning_rate": 0.00034956870611835506, "loss": 2.8984, "theoretical_loss": 3.6436440267392287, "tokens_seen": 1016141824 }, { "epoch": 2.1, "learning_rate": 0.0003495586760280843, "loss": 2.8272, "theoretical_loss": 3.643621729905239, "tokens_seen": 1016207360 }, { "epoch": 2.1, "learning_rate": 0.0003495486459378134, "loss": 2.9649, "theoretical_loss": 3.6435994349117364, "tokens_seen": 1016272896 }, { "epoch": 2.1, "learning_rate": 0.00034953861584754265, "loss": 2.8224, "theoretical_loss": 3.643577141758451, "tokens_seen": 1016338432 }, { "epoch": 2.1, "learning_rate": 0.00034952858575727183, "loss": 2.9162, "theoretical_loss": 3.643554850445111, "tokens_seen": 1016403968 }, { "epoch": 2.1, "learning_rate": 0.000349518555667001, "loss": 2.8339, "theoretical_loss": 3.6435325609714475, "tokens_seen": 1016469504 }, { "epoch": 2.1, "learning_rate": 0.0003495085255767302, "loss": 2.701, "theoretical_loss": 3.6435102733371894, "tokens_seen": 1016535040 }, { "epoch": 2.1, "learning_rate": 0.0003494984954864594, "loss": 2.8769, "theoretical_loss": 3.6434879875420654, "tokens_seen": 1016600576 }, { "epoch": 2.1, "learning_rate": 0.00034948846539618856, "loss": 2.8621, "theoretical_loss": 3.6434657035858065, "tokens_seen": 1016666112 }, { "epoch": 2.1, "learning_rate": 0.0003494784353059178, "loss": 2.8035, "theoretical_loss": 3.643443421468142, "tokens_seen": 1016731648 }, { "epoch": 2.1, "learning_rate": 0.0003494684052156469, "loss": 2.9685, "theoretical_loss": 3.643421141188802, "tokens_seen": 1016797184 }, { "epoch": 2.1, "learning_rate": 0.00034945837512537616, "loss": 2.8239, "theoretical_loss": 3.643398862747516, "tokens_seen": 1016862720 }, { "epoch": 2.1, "learning_rate": 0.0003494483450351053, "loss": 2.7889, "theoretical_loss": 3.6433765861440137, "tokens_seen": 1016928256 }, { "epoch": 2.1, "learning_rate": 0.0003494383149448345, "loss": 2.6573, "theoretical_loss": 3.6433543113780256, "tokens_seen": 1016993792 }, { "epoch": 2.1, "learning_rate": 0.0003494282848545637, "loss": 2.9305, "theoretical_loss": 3.6433320384492816, "tokens_seen": 1017059328 }, { "epoch": 2.1, "learning_rate": 0.0003494182547642929, "loss": 2.988, "theoretical_loss": 3.643309767357511, "tokens_seen": 1017124864 }, { "epoch": 2.1, "learning_rate": 0.00034940822467402206, "loss": 2.8389, "theoretical_loss": 3.6432874981024455, "tokens_seen": 1017190400 }, { "epoch": 2.1, "learning_rate": 0.0003493981945837513, "loss": 2.8016, "theoretical_loss": 3.643265230683814, "tokens_seen": 1017255936 }, { "epoch": 2.1, "learning_rate": 0.0003493881644934804, "loss": 2.977, "theoretical_loss": 3.643242965101347, "tokens_seen": 1017321472 }, { "epoch": 2.1, "learning_rate": 0.00034937813440320966, "loss": 3.0549, "theoretical_loss": 3.6432207013547755, "tokens_seen": 1017387008 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9813570976257324, "objective/train/theoretical_loss": 3.643209570169866, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.643209570169866, "tokens_seen": 1017419776 }, { "epoch": 2.1, "learning_rate": 0.0003493681043129388, "loss": 2.8325, "theoretical_loss": 3.6431984394438293, "tokens_seen": 1017452544 }, { "epoch": 2.1, "learning_rate": 0.000349358074222668, "loss": 2.863, "theoretical_loss": 3.643176179368239, "tokens_seen": 1017518080 }, { "epoch": 2.1, "learning_rate": 0.0003493480441323972, "loss": 2.8618, "theoretical_loss": 3.643153921127735, "tokens_seen": 1017583616 }, { "epoch": 2.1, "learning_rate": 0.0003493380140421264, "loss": 2.851, "theoretical_loss": 3.6431316647220475, "tokens_seen": 1017649152 }, { "epoch": 2.1, "learning_rate": 0.00034932798395185557, "loss": 2.8652, "theoretical_loss": 3.643109410150908, "tokens_seen": 1017714688 }, { "epoch": 2.1, "learning_rate": 0.00034931795386158475, "loss": 2.8082, "theoretical_loss": 3.643087157414046, "tokens_seen": 1017780224 }, { "epoch": 2.1, "learning_rate": 0.00034930792377131393, "loss": 2.8857, "theoretical_loss": 3.643064906511193, "tokens_seen": 1017845760 }, { "epoch": 2.1, "learning_rate": 0.00034929789368104316, "loss": 2.7717, "theoretical_loss": 3.6430426574420802, "tokens_seen": 1017911296 }, { "epoch": 2.1, "learning_rate": 0.0003492878635907723, "loss": 2.9082, "theoretical_loss": 3.6430204102064376, "tokens_seen": 1017976832 }, { "epoch": 2.1, "learning_rate": 0.0003492778335005015, "loss": 2.8397, "theoretical_loss": 3.642998164803996, "tokens_seen": 1018042368 }, { "epoch": 2.1, "learning_rate": 0.00034926780341023065, "loss": 2.9865, "theoretical_loss": 3.6429759212344868, "tokens_seen": 1018107904 }, { "epoch": 2.1, "learning_rate": 0.0003492577733199599, "loss": 2.9383, "theoretical_loss": 3.6429536794976416, "tokens_seen": 1018173440 }, { "epoch": 2.1, "learning_rate": 0.00034924774322968907, "loss": 2.8409, "theoretical_loss": 3.64293143959319, "tokens_seen": 1018238976 }, { "epoch": 2.1, "learning_rate": 0.00034923771313941825, "loss": 2.4902, "theoretical_loss": 3.642909201520864, "tokens_seen": 1018304512 }, { "epoch": 2.1, "learning_rate": 0.00034922768304914743, "loss": 2.9239, "theoretical_loss": 3.642886965280395, "tokens_seen": 1018370048 }, { "epoch": 2.1, "learning_rate": 0.00034921765295887667, "loss": 2.9902, "theoretical_loss": 3.6428647308715134, "tokens_seen": 1018435584 }, { "epoch": 2.1, "learning_rate": 0.0003492076228686058, "loss": 2.8959, "theoretical_loss": 3.642842498293951, "tokens_seen": 1018501120 }, { "epoch": 2.1, "learning_rate": 0.00034919759277833503, "loss": 2.8851, "theoretical_loss": 3.6428202675474393, "tokens_seen": 1018566656 }, { "epoch": 2.1, "learning_rate": 0.00034918756268806416, "loss": 2.8028, "theoretical_loss": 3.6427980386317094, "tokens_seen": 1018632192 }, { "epoch": 2.1, "learning_rate": 0.0003491775325977934, "loss": 2.8943, "theoretical_loss": 3.6427758115464925, "tokens_seen": 1018697728 }, { "epoch": 2.1, "learning_rate": 0.00034916750250752257, "loss": 3.0225, "theoretical_loss": 3.642753586291521, "tokens_seen": 1018763264 }, { "epoch": 2.1, "learning_rate": 0.00034915747241725175, "loss": 2.9775, "theoretical_loss": 3.642731362866526, "tokens_seen": 1018828800 }, { "epoch": 2.1, "learning_rate": 0.00034914744232698093, "loss": 2.7837, "theoretical_loss": 3.6427091412712387, "tokens_seen": 1018894336 }, { "epoch": 2.1, "learning_rate": 0.0003491374122367101, "loss": 2.9549, "theoretical_loss": 3.6426869215053914, "tokens_seen": 1018959872 }, { "epoch": 2.1, "learning_rate": 0.0003491273821464393, "loss": 2.9416, "theoretical_loss": 3.642664703568715, "tokens_seen": 1019025408 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.150725841522217, "objective/train/theoretical_loss": 3.642653595286233, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.642653595286233, "tokens_seen": 1019058176 }, { "epoch": 2.1, "learning_rate": 0.00034911735205616853, "loss": 3.012, "theoretical_loss": 3.6426424874609427, "tokens_seen": 1019090944 }, { "epoch": 2.1, "learning_rate": 0.00034910732196589766, "loss": 2.808, "theoretical_loss": 3.642620273181805, "tokens_seen": 1019156480 }, { "epoch": 2.1, "learning_rate": 0.0003490972918756269, "loss": 2.8841, "theoretical_loss": 3.6425980607310344, "tokens_seen": 1019222016 }, { "epoch": 2.1, "learning_rate": 0.00034908726178535613, "loss": 2.8861, "theoretical_loss": 3.642575850108363, "tokens_seen": 1019287552 }, { "epoch": 2.1, "learning_rate": 0.00034907723169508526, "loss": 2.9126, "theoretical_loss": 3.6425536413135227, "tokens_seen": 1019353088 }, { "epoch": 2.1, "learning_rate": 0.0003490672016048145, "loss": 2.9172, "theoretical_loss": 3.642531434346245, "tokens_seen": 1019418624 }, { "epoch": 2.1, "learning_rate": 0.0003490571715145436, "loss": 2.783, "theoretical_loss": 3.642509229206263, "tokens_seen": 1019484160 }, { "epoch": 2.1, "learning_rate": 0.00034904714142427285, "loss": 2.924, "theoretical_loss": 3.642487025893308, "tokens_seen": 1019549696 }, { "epoch": 2.1, "learning_rate": 0.00034903711133400203, "loss": 2.9396, "theoretical_loss": 3.642464824407113, "tokens_seen": 1019615232 }, { "epoch": 2.1, "learning_rate": 0.0003490270812437312, "loss": 2.9003, "theoretical_loss": 3.64244262474741, "tokens_seen": 1019680768 }, { "epoch": 2.1, "learning_rate": 0.0003490170511534604, "loss": 2.6676, "theoretical_loss": 3.6424204269139313, "tokens_seen": 1019746304 }, { "epoch": 2.1, "learning_rate": 0.0003490070210631896, "loss": 2.9835, "theoretical_loss": 3.642398230906409, "tokens_seen": 1019811840 }, { "epoch": 2.1, "learning_rate": 0.00034899699097291876, "loss": 2.9491, "theoretical_loss": 3.6423760367245768, "tokens_seen": 1019877376 }, { "epoch": 2.1, "learning_rate": 0.000348986960882648, "loss": 2.7845, "theoretical_loss": 3.6423538443681656, "tokens_seen": 1019942912 }, { "epoch": 2.1, "learning_rate": 0.0003489769307923771, "loss": 3.0648, "theoretical_loss": 3.6423316538369086, "tokens_seen": 1020008448 }, { "epoch": 2.1, "learning_rate": 0.00034896690070210636, "loss": 2.9102, "theoretical_loss": 3.642309465130539, "tokens_seen": 1020073984 }, { "epoch": 2.1, "learning_rate": 0.0003489568706118355, "loss": 2.9085, "theoretical_loss": 3.642287278248789, "tokens_seen": 1020139520 }, { "epoch": 2.1, "learning_rate": 0.0003489468405215647, "loss": 3.0307, "theoretical_loss": 3.6422650931913916, "tokens_seen": 1020205056 }, { "epoch": 2.1, "learning_rate": 0.0003489368104312939, "loss": 2.808, "theoretical_loss": 3.642242909958079, "tokens_seen": 1020270592 }, { "epoch": 2.1, "learning_rate": 0.0003489267803410231, "loss": 2.7066, "theoretical_loss": 3.6422207285485846, "tokens_seen": 1020336128 }, { "epoch": 2.1, "learning_rate": 0.00034891675025075226, "loss": 2.6273, "theoretical_loss": 3.6421985489626416, "tokens_seen": 1020401664 }, { "epoch": 2.1, "learning_rate": 0.0003489067201604815, "loss": 2.9686, "theoretical_loss": 3.6421763711999824, "tokens_seen": 1020467200 }, { "epoch": 2.1, "learning_rate": 0.0003488966900702106, "loss": 3.0016, "theoretical_loss": 3.64215419526034, "tokens_seen": 1020532736 }, { "epoch": 2.1, "learning_rate": 0.00034888665997993986, "loss": 2.8709, "theoretical_loss": 3.642132021143448, "tokens_seen": 1020598272 }, { "epoch": 2.1, "learning_rate": 0.000348876629889669, "loss": 2.9764, "theoretical_loss": 3.6421098488490395, "tokens_seen": 1020663808 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.855858564376831, "objective/train/theoretical_loss": 3.6420987633851833, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6420987633851833, "tokens_seen": 1020696576 }, { "epoch": 2.1, "learning_rate": 0.0003488665997993982, "loss": 2.9325, "theoretical_loss": 3.642087678376847, "tokens_seen": 1020729344 }, { "epoch": 2.1, "learning_rate": 0.0003488565697091274, "loss": 2.9797, "theoretical_loss": 3.6420655097266055, "tokens_seen": 1020794880 }, { "epoch": 2.1, "learning_rate": 0.0003488465396188566, "loss": 2.8708, "theoretical_loss": 3.642043342898046, "tokens_seen": 1020860416 }, { "epoch": 2.1, "learning_rate": 0.00034883650952858577, "loss": 2.7632, "theoretical_loss": 3.642021177890903, "tokens_seen": 1020925952 }, { "epoch": 2.1, "learning_rate": 0.00034882647943831495, "loss": 2.8369, "theoretical_loss": 3.64199901470491, "tokens_seen": 1020991488 }, { "epoch": 2.1, "learning_rate": 0.00034881644934804413, "loss": 2.8053, "theoretical_loss": 3.6419768533398003, "tokens_seen": 1021057024 }, { "epoch": 2.1, "learning_rate": 0.00034880641925777336, "loss": 2.8323, "theoretical_loss": 3.6419546937953076, "tokens_seen": 1021122560 }, { "epoch": 2.1, "learning_rate": 0.0003487963891675025, "loss": 2.9046, "theoretical_loss": 3.6419325360711654, "tokens_seen": 1021188096 }, { "epoch": 2.1, "learning_rate": 0.0003487863590772317, "loss": 2.9899, "theoretical_loss": 3.6419103801671073, "tokens_seen": 1021253632 }, { "epoch": 2.1, "learning_rate": 0.00034877632898696085, "loss": 2.9091, "theoretical_loss": 3.641888226082867, "tokens_seen": 1021319168 }, { "epoch": 2.1, "learning_rate": 0.0003487662988966901, "loss": 2.9875, "theoretical_loss": 3.6418660738181785, "tokens_seen": 1021384704 }, { "epoch": 2.1, "learning_rate": 0.00034875626880641927, "loss": 2.9655, "theoretical_loss": 3.6418439233727757, "tokens_seen": 1021450240 }, { "epoch": 2.1, "learning_rate": 0.00034874623871614845, "loss": 2.9474, "theoretical_loss": 3.6418217747463917, "tokens_seen": 1021515776 }, { "epoch": 2.1, "learning_rate": 0.00034873620862587763, "loss": 2.7881, "theoretical_loss": 3.6417996279387617, "tokens_seen": 1021581312 }, { "epoch": 2.1, "learning_rate": 0.00034872617853560687, "loss": 2.9873, "theoretical_loss": 3.6417774829496183, "tokens_seen": 1021646848 }, { "epoch": 2.1, "learning_rate": 0.000348716148445336, "loss": 2.9421, "theoretical_loss": 3.6417553397786966, "tokens_seen": 1021712384 }, { "epoch": 2.1, "learning_rate": 0.00034870611835506523, "loss": 2.9118, "theoretical_loss": 3.64173319842573, "tokens_seen": 1021777920 }, { "epoch": 2.1, "learning_rate": 0.00034869608826479436, "loss": 2.9748, "theoretical_loss": 3.6417110588904533, "tokens_seen": 1021843456 }, { "epoch": 2.1, "learning_rate": 0.0003486860581745236, "loss": 2.8075, "theoretical_loss": 3.6416889211726, "tokens_seen": 1021908992 }, { "epoch": 2.1, "learning_rate": 0.00034867602808425277, "loss": 2.8065, "theoretical_loss": 3.641666785271905, "tokens_seen": 1021974528 }, { "epoch": 2.1, "learning_rate": 0.00034866599799398195, "loss": 2.9005, "theoretical_loss": 3.641644651188102, "tokens_seen": 1022040064 }, { "epoch": 2.1, "learning_rate": 0.00034865596790371113, "loss": 2.8473, "theoretical_loss": 3.6416225189209266, "tokens_seen": 1022105600 }, { "epoch": 2.1, "learning_rate": 0.0003486459378134403, "loss": 2.8313, "theoretical_loss": 3.6416003884701116, "tokens_seen": 1022171136 }, { "epoch": 2.1, "learning_rate": 0.0003486359077231695, "loss": 2.8135, "theoretical_loss": 3.6415782598353927, "tokens_seen": 1022236672 }, { "epoch": 2.1, "learning_rate": 0.00034862587763289873, "loss": 2.9625, "theoretical_loss": 3.641556133016504, "tokens_seen": 1022302208 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.715212345123291, "objective/train/theoretical_loss": 3.641545070287913, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.641545070287913, "tokens_seen": 1022334976 }, { "epoch": 2.1, "learning_rate": 0.00034861584754262786, "loss": 2.7832, "theoretical_loss": 3.64153400801318, "tokens_seen": 1022367744 }, { "epoch": 2.1, "learning_rate": 0.0003486058174523571, "loss": 2.7766, "theoretical_loss": 3.6415118848251558, "tokens_seen": 1022433280 }, { "epoch": 2.1, "learning_rate": 0.0003485957873620862, "loss": 2.8786, "theoretical_loss": 3.6414897634521655, "tokens_seen": 1022498816 }, { "epoch": 2.1, "learning_rate": 0.00034858575727181546, "loss": 2.812, "theoretical_loss": 3.641467643893944, "tokens_seen": 1022564352 }, { "epoch": 2.1, "learning_rate": 0.00034857572718154464, "loss": 2.8761, "theoretical_loss": 3.6414455261502265, "tokens_seen": 1022629888 }, { "epoch": 2.1, "learning_rate": 0.0003485656970912738, "loss": 2.853, "theoretical_loss": 3.6414234102207477, "tokens_seen": 1022695424 }, { "epoch": 2.1, "learning_rate": 0.000348555667001003, "loss": 2.9516, "theoretical_loss": 3.641401296105243, "tokens_seen": 1022760960 }, { "epoch": 2.1, "learning_rate": 0.00034854563691073223, "loss": 2.9492, "theoretical_loss": 3.641379183803447, "tokens_seen": 1022826496 }, { "epoch": 2.1, "learning_rate": 0.00034853560682046136, "loss": 2.6492, "theoretical_loss": 3.6413570733150937, "tokens_seen": 1022892032 }, { "epoch": 2.1, "learning_rate": 0.0003485255767301906, "loss": 2.9718, "theoretical_loss": 3.64133496463992, "tokens_seen": 1022957568 }, { "epoch": 2.1, "learning_rate": 0.0003485155466399197, "loss": 2.637, "theoretical_loss": 3.64131285777766, "tokens_seen": 1023023104 }, { "epoch": 2.1, "learning_rate": 0.00034850551654964896, "loss": 2.8032, "theoretical_loss": 3.6412907527280494, "tokens_seen": 1023088640 }, { "epoch": 2.1, "learning_rate": 0.00034849548645937814, "loss": 3.0177, "theoretical_loss": 3.641268649490823, "tokens_seen": 1023154176 }, { "epoch": 2.1, "learning_rate": 0.0003484854563691073, "loss": 2.7266, "theoretical_loss": 3.641246548065716, "tokens_seen": 1023219712 }, { "epoch": 2.1, "learning_rate": 0.0003484754262788365, "loss": 2.9382, "theoretical_loss": 3.641224448452465, "tokens_seen": 1023285248 }, { "epoch": 2.1, "learning_rate": 0.0003484653961885657, "loss": 2.8568, "theoretical_loss": 3.641202350650804, "tokens_seen": 1023350784 }, { "epoch": 2.1, "learning_rate": 0.00034845536609829486, "loss": 2.918, "theoretical_loss": 3.64118025466047, "tokens_seen": 1023416320 }, { "epoch": 2.1, "learning_rate": 0.0003484453360080241, "loss": 2.8834, "theoretical_loss": 3.6411581604811967, "tokens_seen": 1023481856 }, { "epoch": 2.1, "learning_rate": 0.0003484353059177532, "loss": 2.9364, "theoretical_loss": 3.641136068112721, "tokens_seen": 1023547392 }, { "epoch": 2.1, "learning_rate": 0.00034842527582748246, "loss": 2.8391, "theoretical_loss": 3.6411139775547783, "tokens_seen": 1023612928 }, { "epoch": 2.1, "learning_rate": 0.00034841524573721164, "loss": 2.875, "theoretical_loss": 3.6410918888071038, "tokens_seen": 1023678464 }, { "epoch": 2.1, "learning_rate": 0.0003484052156469408, "loss": 2.8334, "theoretical_loss": 3.641069801869434, "tokens_seen": 1023744000 }, { "epoch": 2.1, "learning_rate": 0.00034839518555667, "loss": 2.8588, "theoretical_loss": 3.6410477167415047, "tokens_seen": 1023809536 }, { "epoch": 2.1, "learning_rate": 0.0003483851554663992, "loss": 2.7187, "theoretical_loss": 3.6410256334230513, "tokens_seen": 1023875072 }, { "epoch": 2.1, "learning_rate": 0.00034837512537612837, "loss": 2.995, "theoretical_loss": 3.64100355191381, "tokens_seen": 1023940608 }, { "epoch": 2.1, "objective/train/docs_used": 1613583, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7899601459503174, "objective/train/theoretical_loss": 3.6409925118375615, "objective/train/tokens_used": 1025103328, "theoretical_loss": 3.6409925118375615, "tokens_seen": 1023973376 }, { "epoch": 2.1, "learning_rate": 0.0003483650952858576, "loss": 2.9348, "theoretical_loss": 3.6409814722135163, "tokens_seen": 1024006144 }, { "epoch": 2.1, "learning_rate": 0.00034835506519558673, "loss": 2.9573, "theoretical_loss": 3.640959394321907, "tokens_seen": 1024071680 }, { "epoch": 2.1, "learning_rate": 0.00034834503510531597, "loss": 2.8298, "theoretical_loss": 3.6409373182387186, "tokens_seen": 1024137216 }, { "epoch": 2.1, "learning_rate": 0.00034833500501504515, "loss": 3.064, "theoretical_loss": 3.6409152439636863, "tokens_seen": 1024202752 }, { "epoch": 2.1, "learning_rate": 0.00034832497492477433, "loss": 2.741, "theoretical_loss": 3.640893171496546, "tokens_seen": 1024268288 }, { "epoch": 2.1, "learning_rate": 0.00034831494483450356, "loss": 2.8227, "theoretical_loss": 3.640871100837035, "tokens_seen": 1024333824 }, { "epoch": 2.1, "learning_rate": 0.0003483049147442327, "loss": 2.9415, "theoretical_loss": 3.6408490319848887, "tokens_seen": 1024399360 }, { "epoch": 2.1, "learning_rate": 0.0003482948846539619, "loss": 2.8866, "theoretical_loss": 3.640826964939845, "tokens_seen": 1024464896 }, { "epoch": 2.1, "learning_rate": 0.00034828485456369105, "loss": 2.8352, "theoretical_loss": 3.6408048997016387, "tokens_seen": 1024530432 }, { "epoch": 2.1, "learning_rate": 0.0003482748244734203, "loss": 2.8466, "theoretical_loss": 3.6407828362700068, "tokens_seen": 1024595968 }, { "epoch": 2.1, "learning_rate": 0.00034826479438314947, "loss": 2.8167, "theoretical_loss": 3.640760774644686, "tokens_seen": 1024661504 }, { "epoch": 2.1, "learning_rate": 0.00034825476429287865, "loss": 2.9761, "theoretical_loss": 3.640738714825413, "tokens_seen": 1024727040 }, { "epoch": 2.1, "learning_rate": 0.00034824473420260783, "loss": 2.7646, "theoretical_loss": 3.6407166568119242, "tokens_seen": 1024792576 }, { "epoch": 2.1, "learning_rate": 0.00034823470411233707, "loss": 2.8841, "theoretical_loss": 3.6406946006039567, "tokens_seen": 1024858112 }, { "epoch": 2.1, "learning_rate": 0.0003482246740220662, "loss": 2.8255, "theoretical_loss": 3.640672546201247, "tokens_seen": 1024923648 }, { "epoch": 2.1, "learning_rate": 0.00034821464393179543, "loss": 2.8135, "theoretical_loss": 3.6406504936035313, "tokens_seen": 1024989184 }, { "epoch": 2.1, "learning_rate": 0.00034820461384152456, "loss": 2.7685, "theoretical_loss": 3.6406284428105478, "tokens_seen": 1025054720 }, { "epoch": 2.1, "learning_rate": 0.0003481945837512538, "loss": 3.031, "theoretical_loss": 3.6406063938220323, "tokens_seen": 1025120256 }, { "epoch": 3.0, "learning_rate": 0.00034818455366098297, "loss": 3.561, "theoretical_loss": 3.6405833132202217, "tokens_seen": 1025188864 }, { "epoch": 3.0, "learning_rate": 0.00034817452357071215, "loss": 2.8543, "theoretical_loss": 3.640561267924408, "tokens_seen": 1025254400 }, { "epoch": 3.0, "learning_rate": 0.00034816449348044133, "loss": 2.9481, "theoretical_loss": 3.640539224432261, "tokens_seen": 1025319936 }, { "epoch": 3.0, "learning_rate": 0.0003481544633901705, "loss": 2.8764, "theoretical_loss": 3.640517182743519, "tokens_seen": 1025385472 }, { "epoch": 3.0, "learning_rate": 0.0003481444332998997, "loss": 2.88, "theoretical_loss": 3.640495142857918, "tokens_seen": 1025451008 }, { "epoch": 3.0, "learning_rate": 0.00034813440320962893, "loss": 2.9916, "theoretical_loss": 3.640473104775196, "tokens_seen": 1025516544 }, { "epoch": 3.0, "learning_rate": 0.00034812437311935806, "loss": 2.8732, "theoretical_loss": 3.640451068495091, "tokens_seen": 1025582080 }, { "epoch": 3.0, "objective/train/docs_used": 1646905, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5984838008880615, "objective/train/theoretical_loss": 3.6404455597066923, "objective/train/tokens_used": 1046058464, "theoretical_loss": 3.6404455597066923, "tokens_seen": 1025598464 }, { "epoch": 3.0, "learning_rate": 0.0003481143430290873, "loss": 2.9326, "theoretical_loss": 3.640429034017339, "tokens_seen": 1025647616 }, { "epoch": 3.0, "learning_rate": 0.0003481043129388164, "loss": 2.9361, "theoretical_loss": 3.6404070013416776, "tokens_seen": 1025713152 }, { "epoch": 3.0, "learning_rate": 0.00034809428284854566, "loss": 2.6869, "theoretical_loss": 3.6403849704678457, "tokens_seen": 1025778688 }, { "epoch": 3.0, "learning_rate": 0.00034808425275827484, "loss": 2.8682, "theoretical_loss": 3.640362941395579, "tokens_seen": 1025844224 }, { "epoch": 3.0, "learning_rate": 0.000348074222668004, "loss": 2.9606, "theoretical_loss": 3.6403409141246166, "tokens_seen": 1025909760 }, { "epoch": 3.0, "learning_rate": 0.0003480641925777332, "loss": 2.8557, "theoretical_loss": 3.6403188886546953, "tokens_seen": 1025975296 }, { "epoch": 3.0, "learning_rate": 0.00034805416248746243, "loss": 2.8665, "theoretical_loss": 3.6402968649855527, "tokens_seen": 1026040832 }, { "epoch": 3.0, "learning_rate": 0.00034804413239719156, "loss": 3.01, "theoretical_loss": 3.640274843116927, "tokens_seen": 1026106368 }, { "epoch": 3.0, "learning_rate": 0.0003480341023069208, "loss": 2.9587, "theoretical_loss": 3.6402528230485567, "tokens_seen": 1026171904 }, { "epoch": 3.0, "learning_rate": 0.0003480240722166499, "loss": 2.9533, "theoretical_loss": 3.6402308047801784, "tokens_seen": 1026237440 }, { "epoch": 3.0, "learning_rate": 0.00034801404212637916, "loss": 2.989, "theoretical_loss": 3.64020878831153, "tokens_seen": 1026302976 }, { "epoch": 3.0, "learning_rate": 0.00034800401203610834, "loss": 2.9962, "theoretical_loss": 3.640186773642351, "tokens_seen": 1026368512 }, { "epoch": 3.0, "learning_rate": 0.0003479939819458375, "loss": 2.9956, "theoretical_loss": 3.640164760772378, "tokens_seen": 1026434048 }, { "epoch": 3.0, "learning_rate": 0.0003479839518555667, "loss": 3.0375, "theoretical_loss": 3.640142749701349, "tokens_seen": 1026499584 }, { "epoch": 3.0, "learning_rate": 0.0003479739217652959, "loss": 2.9654, "theoretical_loss": 3.640120740429003, "tokens_seen": 1026565120 }, { "epoch": 3.0, "learning_rate": 0.00034796389167502506, "loss": 2.8347, "theoretical_loss": 3.640098732955078, "tokens_seen": 1026630656 }, { "epoch": 3.0, "learning_rate": 0.0003479538615847543, "loss": 2.9716, "theoretical_loss": 3.6400767272793124, "tokens_seen": 1026696192 }, { "epoch": 3.0, "learning_rate": 0.00034794383149448343, "loss": 2.9644, "theoretical_loss": 3.6400547234014438, "tokens_seen": 1026761728 }, { "epoch": 3.0, "learning_rate": 0.00034793380140421266, "loss": 3.0056, "theoretical_loss": 3.6400327213212114, "tokens_seen": 1026827264 }, { "epoch": 3.0, "learning_rate": 0.00034792377131394184, "loss": 2.8699, "theoretical_loss": 3.6400107210383528, "tokens_seen": 1026892800 }, { "epoch": 3.0, "learning_rate": 0.000347913741223671, "loss": 3.0131, "theoretical_loss": 3.6399887225526073, "tokens_seen": 1026958336 }, { "epoch": 3.0, "learning_rate": 0.0003479037111334002, "loss": 2.9254, "theoretical_loss": 3.639966725863713, "tokens_seen": 1027023872 }, { "epoch": 3.0, "learning_rate": 0.0003478936810431294, "loss": 2.9332, "theoretical_loss": 3.6399447309714086, "tokens_seen": 1027089408 }, { "epoch": 3.0, "learning_rate": 0.00034788365095285857, "loss": 2.9419, "theoretical_loss": 3.639922737875432, "tokens_seen": 1027154944 }, { "epoch": 3.0, "learning_rate": 0.0003478736208625878, "loss": 2.9324, "theoretical_loss": 3.639900746575523, "tokens_seen": 1027220480 }, { "epoch": 3.0, "objective/train/docs_used": 1649823, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.982206344604492, "objective/train/theoretical_loss": 3.6398952490311514, "objective/train/tokens_used": 1047696864, "theoretical_loss": 3.6398952490311514, "tokens_seen": 1027236864 }, { "epoch": 3.0, "learning_rate": 0.00034786359077231693, "loss": 2.8476, "theoretical_loss": 3.63987875707142, "tokens_seen": 1027286016 }, { "epoch": 3.0, "learning_rate": 0.00034785356068204617, "loss": 2.7875, "theoretical_loss": 3.6398567693628623, "tokens_seen": 1027351552 }, { "epoch": 3.0, "learning_rate": 0.0003478435305917753, "loss": 2.9255, "theoretical_loss": 3.6398347834495874, "tokens_seen": 1027417088 }, { "epoch": 3.0, "learning_rate": 0.00034783350050150453, "loss": 2.9642, "theoretical_loss": 3.6398127993313354, "tokens_seen": 1027482624 }, { "epoch": 3.0, "learning_rate": 0.0003478234704112337, "loss": 3.0063, "theoretical_loss": 3.639790817007845, "tokens_seen": 1027548160 }, { "epoch": 3.0, "learning_rate": 0.0003478134403209629, "loss": 2.725, "theoretical_loss": 3.6397688364788543, "tokens_seen": 1027613696 }, { "epoch": 3.0, "learning_rate": 0.00034780341023069207, "loss": 3.1011, "theoretical_loss": 3.639746857744104, "tokens_seen": 1027679232 }, { "epoch": 3.0, "learning_rate": 0.00034779338014042125, "loss": 3.0302, "theoretical_loss": 3.639724880803332, "tokens_seen": 1027744768 }, { "epoch": 3.0, "learning_rate": 0.00034778335005015043, "loss": 2.8274, "theoretical_loss": 3.6397029056562777, "tokens_seen": 1027810304 }, { "epoch": 3.0, "learning_rate": 0.00034777331995987967, "loss": 2.8631, "theoretical_loss": 3.6396809323026815, "tokens_seen": 1027875840 }, { "epoch": 3.0, "learning_rate": 0.0003477632898696088, "loss": 2.9608, "theoretical_loss": 3.639658960742281, "tokens_seen": 1027941376 }, { "epoch": 3.0, "learning_rate": 0.00034775325977933803, "loss": 2.9646, "theoretical_loss": 3.6396369909748163, "tokens_seen": 1028006912 }, { "epoch": 3.0, "learning_rate": 0.0003477432296890672, "loss": 2.8987, "theoretical_loss": 3.639615023000027, "tokens_seen": 1028072448 }, { "epoch": 3.0, "learning_rate": 0.0003477331995987964, "loss": 2.8474, "theoretical_loss": 3.6395930568176516, "tokens_seen": 1028137984 }, { "epoch": 3.0, "learning_rate": 0.0003477231695085256, "loss": 3.1104, "theoretical_loss": 3.639571092427431, "tokens_seen": 1028203520 }, { "epoch": 3.0, "learning_rate": 0.00034771313941825476, "loss": 2.9, "theoretical_loss": 3.639549129829104, "tokens_seen": 1028269056 }, { "epoch": 3.0, "learning_rate": 0.00034770310932798394, "loss": 2.9543, "theoretical_loss": 3.6395271690224105, "tokens_seen": 1028334592 }, { "epoch": 3.0, "learning_rate": 0.00034769307923771317, "loss": 3.0787, "theoretical_loss": 3.6395052100070897, "tokens_seen": 1028400128 }, { "epoch": 3.0, "learning_rate": 0.0003476830491474423, "loss": 2.9021, "theoretical_loss": 3.639483252782882, "tokens_seen": 1028465664 }, { "epoch": 3.0, "learning_rate": 0.00034767301905717153, "loss": 2.9356, "theoretical_loss": 3.639461297349527, "tokens_seen": 1028531200 }, { "epoch": 3.0, "learning_rate": 0.00034766298896690066, "loss": 2.9074, "theoretical_loss": 3.639439343706764, "tokens_seen": 1028596736 }, { "epoch": 3.0, "learning_rate": 0.0003476529588766299, "loss": 2.8876, "theoretical_loss": 3.6394173918543333, "tokens_seen": 1028662272 }, { "epoch": 3.0, "learning_rate": 0.0003476429287863591, "loss": 2.8757, "theoretical_loss": 3.639395441791975, "tokens_seen": 1028727808 }, { "epoch": 3.0, "learning_rate": 0.00034763289869608826, "loss": 2.8686, "theoretical_loss": 3.639373493519429, "tokens_seen": 1028793344 }, { "epoch": 3.0, "learning_rate": 0.00034762286860581744, "loss": 2.9721, "theoretical_loss": 3.6393515470364353, "tokens_seen": 1028858880 }, { "epoch": 3.0, "objective/train/docs_used": 1652746, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8428022861480713, "objective/train/theoretical_loss": 3.6393460606952734, "objective/train/tokens_used": 1049335264, "theoretical_loss": 3.6393460606952734, "tokens_seen": 1028875264 }, { "epoch": 3.0, "learning_rate": 0.0003476128385155466, "loss": 2.8798, "theoretical_loss": 3.639329602342734, "tokens_seen": 1028924416 }, { "epoch": 3.0, "learning_rate": 0.0003476028084252758, "loss": 2.7313, "theoretical_loss": 3.6393076594380647, "tokens_seen": 1028989952 }, { "epoch": 3.0, "learning_rate": 0.00034759277833500504, "loss": 2.8288, "theoretical_loss": 3.639285718322169, "tokens_seen": 1029055488 }, { "epoch": 3.0, "learning_rate": 0.0003475827482447342, "loss": 2.8953, "theoretical_loss": 3.6392637789947857, "tokens_seen": 1029121024 }, { "epoch": 3.0, "learning_rate": 0.0003475727181544634, "loss": 2.7743, "theoretical_loss": 3.6392418414556564, "tokens_seen": 1029186560 }, { "epoch": 3.0, "learning_rate": 0.00034756268806419263, "loss": 2.9943, "theoretical_loss": 3.6392199057045205, "tokens_seen": 1029252096 }, { "epoch": 3.0, "learning_rate": 0.00034755265797392176, "loss": 3.0102, "theoretical_loss": 3.639197971741119, "tokens_seen": 1029317632 }, { "epoch": 3.0, "learning_rate": 0.000347542627883651, "loss": 2.8234, "theoretical_loss": 3.639176039565192, "tokens_seen": 1029383168 }, { "epoch": 3.0, "learning_rate": 0.0003475325977933801, "loss": 2.8921, "theoretical_loss": 3.6391541091764803, "tokens_seen": 1029448704 }, { "epoch": 3.0, "learning_rate": 0.00034752256770310936, "loss": 2.9768, "theoretical_loss": 3.6391321805747245, "tokens_seen": 1029514240 }, { "epoch": 3.0, "learning_rate": 0.00034751253761283854, "loss": 2.8502, "theoretical_loss": 3.6391102537596653, "tokens_seen": 1029579776 }, { "epoch": 3.0, "learning_rate": 0.0003475025075225677, "loss": 2.755, "theoretical_loss": 3.6390883287310434, "tokens_seen": 1029645312 }, { "epoch": 3.0, "learning_rate": 0.0003474924774322969, "loss": 2.7999, "theoretical_loss": 3.639066405488599, "tokens_seen": 1029710848 }, { "epoch": 3.0, "learning_rate": 0.0003474824473420261, "loss": 2.9341, "theoretical_loss": 3.639044484032074, "tokens_seen": 1029776384 }, { "epoch": 3.0, "learning_rate": 0.00034747241725175527, "loss": 2.6777, "theoretical_loss": 3.639022564361208, "tokens_seen": 1029841920 }, { "epoch": 3.0, "learning_rate": 0.0003474623871614845, "loss": 2.8607, "theoretical_loss": 3.6390006464757434, "tokens_seen": 1029907456 }, { "epoch": 3.0, "learning_rate": 0.00034745235707121363, "loss": 2.8993, "theoretical_loss": 3.63897873037542, "tokens_seen": 1029972992 }, { "epoch": 3.0, "learning_rate": 0.00034744232698094286, "loss": 2.9745, "theoretical_loss": 3.638956816059979, "tokens_seen": 1030038528 }, { "epoch": 3.0, "learning_rate": 0.00034743229689067204, "loss": 2.9773, "theoretical_loss": 3.638934903529162, "tokens_seen": 1030104064 }, { "epoch": 3.0, "learning_rate": 0.0003474222668004012, "loss": 2.7886, "theoretical_loss": 3.6389129927827097, "tokens_seen": 1030169600 }, { "epoch": 3.0, "learning_rate": 0.0003474122367101304, "loss": 2.8205, "theoretical_loss": 3.6388910838203636, "tokens_seen": 1030235136 }, { "epoch": 3.0, "learning_rate": 0.0003474022066198596, "loss": 2.9197, "theoretical_loss": 3.638869176641865, "tokens_seen": 1030300672 }, { "epoch": 3.0, "learning_rate": 0.00034739217652958877, "loss": 2.8838, "theoretical_loss": 3.6388472712469544, "tokens_seen": 1030366208 }, { "epoch": 3.0, "learning_rate": 0.000347382146439318, "loss": 2.8571, "theoretical_loss": 3.638825367635374, "tokens_seen": 1030431744 }, { "epoch": 3.0, "learning_rate": 0.00034737211634904713, "loss": 2.9062, "theoretical_loss": 3.638803465806865, "tokens_seen": 1030497280 }, { "epoch": 3.0, "objective/train/docs_used": 1654929, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.109440565109253, "objective/train/theoretical_loss": 3.638797990628312, "objective/train/tokens_used": 1050973664, "theoretical_loss": 3.638797990628312, "tokens_seen": 1030513664 }, { "epoch": 3.0, "learning_rate": 0.00034736208625877637, "loss": 2.9481, "theoretical_loss": 3.6387815657611684, "tokens_seen": 1030562816 }, { "epoch": 3.0, "learning_rate": 0.0003473520561685055, "loss": 2.9082, "theoretical_loss": 3.638759667498027, "tokens_seen": 1030628352 }, { "epoch": 3.0, "learning_rate": 0.00034734202607823473, "loss": 2.9071, "theoretical_loss": 3.6387377710171807, "tokens_seen": 1030693888 }, { "epoch": 3.0, "learning_rate": 0.0003473319959879639, "loss": 2.7056, "theoretical_loss": 3.6387158763183725, "tokens_seen": 1030759424 }, { "epoch": 3.0, "learning_rate": 0.0003473219658976931, "loss": 2.904, "theoretical_loss": 3.6386939834013434, "tokens_seen": 1030824960 }, { "epoch": 3.0, "learning_rate": 0.00034731193580742227, "loss": 2.8677, "theoretical_loss": 3.6386720922658347, "tokens_seen": 1030890496 }, { "epoch": 3.0, "learning_rate": 0.00034730190571715145, "loss": 2.7242, "theoretical_loss": 3.6386502029115895, "tokens_seen": 1030956032 }, { "epoch": 3.0, "learning_rate": 0.00034729187562688063, "loss": 2.9009, "theoretical_loss": 3.6386283153383485, "tokens_seen": 1031021568 }, { "epoch": 3.0, "learning_rate": 0.00034728184553660987, "loss": 2.8244, "theoretical_loss": 3.6386064295458542, "tokens_seen": 1031087104 }, { "epoch": 3.0, "learning_rate": 0.000347271815446339, "loss": 3.0472, "theoretical_loss": 3.638584545533848, "tokens_seen": 1031152640 }, { "epoch": 3.0, "learning_rate": 0.00034726178535606823, "loss": 2.9095, "theoretical_loss": 3.6385626633020722, "tokens_seen": 1031218176 }, { "epoch": 3.0, "learning_rate": 0.0003472517552657974, "loss": 3.0527, "theoretical_loss": 3.63854078285027, "tokens_seen": 1031283712 }, { "epoch": 3.0, "learning_rate": 0.0003472417251755266, "loss": 2.9148, "theoretical_loss": 3.638518904178181, "tokens_seen": 1031349248 }, { "epoch": 3.0, "learning_rate": 0.0003472316950852558, "loss": 2.8947, "theoretical_loss": 3.638497027285549, "tokens_seen": 1031414784 }, { "epoch": 3.0, "learning_rate": 0.00034722166499498496, "loss": 2.8469, "theoretical_loss": 3.6384751521721164, "tokens_seen": 1031480320 }, { "epoch": 3.0, "learning_rate": 0.00034721163490471414, "loss": 2.7379, "theoretical_loss": 3.638453278837625, "tokens_seen": 1031545856 }, { "epoch": 3.0, "learning_rate": 0.00034720160481444337, "loss": 2.9275, "theoretical_loss": 3.6384314072818174, "tokens_seen": 1031611392 }, { "epoch": 3.0, "learning_rate": 0.0003471915747241725, "loss": 2.8824, "theoretical_loss": 3.6384095375044354, "tokens_seen": 1031676928 }, { "epoch": 3.0, "learning_rate": 0.00034718154463390173, "loss": 2.9838, "theoretical_loss": 3.6383876695052217, "tokens_seen": 1031742464 }, { "epoch": 3.0, "learning_rate": 0.00034717151454363086, "loss": 2.9201, "theoretical_loss": 3.6383658032839192, "tokens_seen": 1031808000 }, { "epoch": 3.0, "learning_rate": 0.0003471614844533601, "loss": 2.8092, "theoretical_loss": 3.6383439388402703, "tokens_seen": 1031873536 }, { "epoch": 3.0, "learning_rate": 0.0003471514543630893, "loss": 2.8065, "theoretical_loss": 3.638322076174017, "tokens_seen": 1031939072 }, { "epoch": 3.0, "learning_rate": 0.00034714142427281846, "loss": 2.9463, "theoretical_loss": 3.6383002152849024, "tokens_seen": 1032004608 }, { "epoch": 3.0, "learning_rate": 0.00034713139418254764, "loss": 2.8289, "theoretical_loss": 3.6382783561726697, "tokens_seen": 1032070144 }, { "epoch": 3.0, "learning_rate": 0.0003471213640922768, "loss": 2.927, "theoretical_loss": 3.6382564988370607, "tokens_seen": 1032135680 }, { "epoch": 3.0, "objective/train/docs_used": 1657823, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9219143390655518, "objective/train/theoretical_loss": 3.6382510347807258, "objective/train/tokens_used": 1052612064, "theoretical_loss": 3.6382510347807258, "tokens_seen": 1032152064 }, { "epoch": 3.0, "learning_rate": 0.000347111334002006, "loss": 3.0677, "theoretical_loss": 3.6382346432778183, "tokens_seen": 1032201216 }, { "epoch": 3.0, "learning_rate": 0.00034710130391173524, "loss": 2.9354, "theoretical_loss": 3.638212789494686, "tokens_seen": 1032266752 }, { "epoch": 3.0, "learning_rate": 0.00034709127382146436, "loss": 2.6857, "theoretical_loss": 3.6381909374874066, "tokens_seen": 1032332288 }, { "epoch": 3.0, "learning_rate": 0.0003470812437311936, "loss": 2.8369, "theoretical_loss": 3.638169087255723, "tokens_seen": 1032397824 }, { "epoch": 3.0, "learning_rate": 0.0003470712136409228, "loss": 2.8872, "theoretical_loss": 3.6381472387993776, "tokens_seen": 1032463360 }, { "epoch": 3.0, "learning_rate": 0.00034706118355065196, "loss": 2.9056, "theoretical_loss": 3.6381253921181145, "tokens_seen": 1032528896 }, { "epoch": 3.0, "learning_rate": 0.00034705115346038114, "loss": 2.8484, "theoretical_loss": 3.638103547211676, "tokens_seen": 1032594432 }, { "epoch": 3.0, "learning_rate": 0.0003470411233701103, "loss": 2.8687, "theoretical_loss": 3.6380817040798057, "tokens_seen": 1032659968 }, { "epoch": 3.0, "learning_rate": 0.0003470310932798395, "loss": 2.9026, "theoretical_loss": 3.6380598627222467, "tokens_seen": 1032725504 }, { "epoch": 3.0, "learning_rate": 0.00034702106318956874, "loss": 2.9976, "theoretical_loss": 3.6380380231387424, "tokens_seen": 1032791040 }, { "epoch": 3.0, "learning_rate": 0.00034701103309929787, "loss": 2.8573, "theoretical_loss": 3.638016185329036, "tokens_seen": 1032856576 }, { "epoch": 3.0, "learning_rate": 0.0003470010030090271, "loss": 2.7595, "theoretical_loss": 3.637994349292871, "tokens_seen": 1032922112 }, { "epoch": 3.0, "learning_rate": 0.00034699097291875623, "loss": 2.9989, "theoretical_loss": 3.6379725150299906, "tokens_seen": 1032987648 }, { "epoch": 3.0, "learning_rate": 0.00034698094282848547, "loss": 2.8769, "theoretical_loss": 3.637950682540139, "tokens_seen": 1033053184 }, { "epoch": 3.0, "learning_rate": 0.00034697091273821465, "loss": 2.8689, "theoretical_loss": 3.637928851823059, "tokens_seen": 1033118720 }, { "epoch": 3.0, "learning_rate": 0.00034696088264794383, "loss": 2.7975, "theoretical_loss": 3.6379070228784944, "tokens_seen": 1033184256 }, { "epoch": 3.0, "learning_rate": 0.000346950852557673, "loss": 2.7647, "theoretical_loss": 3.6378851957061893, "tokens_seen": 1033249792 }, { "epoch": 3.0, "learning_rate": 0.00034694082246740224, "loss": 2.8799, "theoretical_loss": 3.6378633703058867, "tokens_seen": 1033315328 }, { "epoch": 3.0, "learning_rate": 0.00034693079237713137, "loss": 2.8908, "theoretical_loss": 3.637841546677331, "tokens_seen": 1033380864 }, { "epoch": 3.0, "learning_rate": 0.0003469207622868606, "loss": 2.8488, "theoretical_loss": 3.6378197248202655, "tokens_seen": 1033446400 }, { "epoch": 3.0, "learning_rate": 0.00034691073219658973, "loss": 2.8569, "theoretical_loss": 3.637797904734435, "tokens_seen": 1033511936 }, { "epoch": 3.0, "learning_rate": 0.00034690070210631897, "loss": 2.8941, "theoretical_loss": 3.6377760864195814, "tokens_seen": 1033577472 }, { "epoch": 3.0, "learning_rate": 0.00034689067201604815, "loss": 2.9497, "theoretical_loss": 3.6377542698754515, "tokens_seen": 1033643008 }, { "epoch": 3.0, "learning_rate": 0.00034688064192577733, "loss": 2.8145, "theoretical_loss": 3.637732455101787, "tokens_seen": 1033708544 }, { "epoch": 3.0, "learning_rate": 0.0003468706118355065, "loss": 2.9218, "theoretical_loss": 3.6377106420983334, "tokens_seen": 1033774080 }, { "epoch": 3.0, "objective/train/docs_used": 1660748, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9962761402130127, "objective/train/theoretical_loss": 3.637705189124035, "objective/train/tokens_used": 1054250464, "theoretical_loss": 3.637705189124035, "tokens_seen": 1033790464 }, { "epoch": 3.0, "learning_rate": 0.0003468605817452357, "loss": 3.0571, "theoretical_loss": 3.637688830864834, "tokens_seen": 1033839616 }, { "epoch": 3.0, "learning_rate": 0.0003468505516549649, "loss": 2.9475, "theoretical_loss": 3.6376670214010334, "tokens_seen": 1033905152 }, { "epoch": 3.0, "learning_rate": 0.0003468405215646941, "loss": 2.9812, "theoretical_loss": 3.637645213706676, "tokens_seen": 1033970688 }, { "epoch": 3.0, "learning_rate": 0.0003468304914744233, "loss": 2.8615, "theoretical_loss": 3.637623407781506, "tokens_seen": 1034036224 }, { "epoch": 3.0, "learning_rate": 0.00034682046138415247, "loss": 2.8425, "theoretical_loss": 3.637601603625267, "tokens_seen": 1034101760 }, { "epoch": 3.0, "learning_rate": 0.00034681043129388165, "loss": 3.0426, "theoretical_loss": 3.6375798012377043, "tokens_seen": 1034167296 }, { "epoch": 3.0, "learning_rate": 0.00034680040120361083, "loss": 2.8835, "theoretical_loss": 3.6375580006185624, "tokens_seen": 1034232832 }, { "epoch": 3.0, "learning_rate": 0.00034679037111334007, "loss": 2.8217, "theoretical_loss": 3.6375362017675856, "tokens_seen": 1034298368 }, { "epoch": 3.0, "learning_rate": 0.0003467803410230692, "loss": 2.885, "theoretical_loss": 3.6375144046845183, "tokens_seen": 1034363904 }, { "epoch": 3.0, "learning_rate": 0.00034677031093279843, "loss": 2.8799, "theoretical_loss": 3.6374926093691045, "tokens_seen": 1034429440 }, { "epoch": 3.0, "learning_rate": 0.0003467602808425276, "loss": 3.0092, "theoretical_loss": 3.63747081582109, "tokens_seen": 1034494976 }, { "epoch": 3.0, "learning_rate": 0.0003467502507522568, "loss": 2.9397, "theoretical_loss": 3.6374490240402197, "tokens_seen": 1034560512 }, { "epoch": 3.0, "learning_rate": 0.000346740220661986, "loss": 2.8254, "theoretical_loss": 3.6374272340262372, "tokens_seen": 1034626048 }, { "epoch": 3.0, "learning_rate": 0.00034673019057171516, "loss": 2.9687, "theoretical_loss": 3.6374054457788882, "tokens_seen": 1034691584 }, { "epoch": 3.0, "learning_rate": 0.00034672016048144434, "loss": 2.8396, "theoretical_loss": 3.6373836592979174, "tokens_seen": 1034757120 }, { "epoch": 3.0, "learning_rate": 0.00034671013039117357, "loss": 2.8504, "theoretical_loss": 3.6373618745830694, "tokens_seen": 1034822656 }, { "epoch": 3.0, "learning_rate": 0.0003467001003009027, "loss": 2.8029, "theoretical_loss": 3.6373400916340892, "tokens_seen": 1034888192 }, { "epoch": 3.0, "learning_rate": 0.00034669007021063193, "loss": 2.9816, "theoretical_loss": 3.6373183104507225, "tokens_seen": 1034953728 }, { "epoch": 3.0, "learning_rate": 0.00034668004012036106, "loss": 2.8003, "theoretical_loss": 3.637296531032714, "tokens_seen": 1035019264 }, { "epoch": 3.0, "learning_rate": 0.0003466700100300903, "loss": 3.0354, "theoretical_loss": 3.637274753379809, "tokens_seen": 1035084800 }, { "epoch": 3.0, "learning_rate": 0.0003466599799398195, "loss": 3.1338, "theoretical_loss": 3.637252977491752, "tokens_seen": 1035150336 }, { "epoch": 3.0, "learning_rate": 0.00034664994984954866, "loss": 2.8415, "theoretical_loss": 3.637231203368289, "tokens_seen": 1035215872 }, { "epoch": 3.0, "learning_rate": 0.00034663991975927784, "loss": 3.0994, "theoretical_loss": 3.6372094310091656, "tokens_seen": 1035281408 }, { "epoch": 3.0, "learning_rate": 0.000346629889669007, "loss": 2.9826, "theoretical_loss": 3.637187660414126, "tokens_seen": 1035346944 }, { "epoch": 3.0, "learning_rate": 0.0003466198595787362, "loss": 2.9012, "theoretical_loss": 3.637165891582917, "tokens_seen": 1035412480 }, { "epoch": 3.0, "objective/train/docs_used": 1663752, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0893702507019043, "objective/train/theoretical_loss": 3.6371604496506826, "objective/train/tokens_used": 1055888864, "theoretical_loss": 3.6371604496506826, "tokens_seen": 1035428864 }, { "epoch": 3.0, "learning_rate": 0.00034660982948846544, "loss": 3.0792, "theoretical_loss": 3.6371441245152827, "tokens_seen": 1035478016 }, { "epoch": 3.0, "learning_rate": 0.00034659979939819456, "loss": 2.8868, "theoretical_loss": 3.6371223592109696, "tokens_seen": 1035543552 }, { "epoch": 3.0, "learning_rate": 0.0003465897693079238, "loss": 2.8766, "theoretical_loss": 3.637100595669723, "tokens_seen": 1035609088 }, { "epoch": 3.0, "learning_rate": 0.000346579739217653, "loss": 2.9633, "theoretical_loss": 3.6370788338912883, "tokens_seen": 1035674624 }, { "epoch": 3.0, "learning_rate": 0.00034656970912738216, "loss": 2.9256, "theoretical_loss": 3.6370570738754116, "tokens_seen": 1035740160 }, { "epoch": 3.0, "learning_rate": 0.00034655967903711134, "loss": 2.835, "theoretical_loss": 3.6370353156218385, "tokens_seen": 1035805696 }, { "epoch": 3.0, "learning_rate": 0.0003465496489468405, "loss": 2.9273, "theoretical_loss": 3.6370135591303145, "tokens_seen": 1035871232 }, { "epoch": 3.0, "learning_rate": 0.0003465396188565697, "loss": 2.8343, "theoretical_loss": 3.636991804400586, "tokens_seen": 1035936768 }, { "epoch": 3.0, "learning_rate": 0.00034652958876629894, "loss": 2.9371, "theoretical_loss": 3.6369700514323977, "tokens_seen": 1036002304 }, { "epoch": 3.0, "learning_rate": 0.00034651955867602807, "loss": 2.9761, "theoretical_loss": 3.636948300225497, "tokens_seen": 1036067840 }, { "epoch": 3.0, "learning_rate": 0.0003465095285857573, "loss": 2.8991, "theoretical_loss": 3.636926550779629, "tokens_seen": 1036133376 }, { "epoch": 3.0, "learning_rate": 0.00034649949849548643, "loss": 3.0718, "theoretical_loss": 3.6369048030945406, "tokens_seen": 1036198912 }, { "epoch": 3.0, "learning_rate": 0.00034648946840521567, "loss": 2.9347, "theoretical_loss": 3.6368830571699764, "tokens_seen": 1036264448 }, { "epoch": 3.0, "learning_rate": 0.00034647943831494485, "loss": 2.9211, "theoretical_loss": 3.636861313005684, "tokens_seen": 1036329984 }, { "epoch": 3.0, "learning_rate": 0.00034646940822467403, "loss": 2.8315, "theoretical_loss": 3.636839570601409, "tokens_seen": 1036395520 }, { "epoch": 3.0, "learning_rate": 0.0003464593781344032, "loss": 3.0315, "theoretical_loss": 3.6368178299568976, "tokens_seen": 1036461056 }, { "epoch": 3.0, "learning_rate": 0.00034644934804413244, "loss": 2.8214, "theoretical_loss": 3.6367960910718966, "tokens_seen": 1036526592 }, { "epoch": 3.0, "learning_rate": 0.00034643931795386157, "loss": 2.9335, "theoretical_loss": 3.6367743539461515, "tokens_seen": 1036592128 }, { "epoch": 3.0, "learning_rate": 0.0003464292878635908, "loss": 2.8671, "theoretical_loss": 3.63675261857941, "tokens_seen": 1036657664 }, { "epoch": 3.0, "learning_rate": 0.00034641925777331993, "loss": 2.8922, "theoretical_loss": 3.636730884971417, "tokens_seen": 1036723200 }, { "epoch": 3.0, "learning_rate": 0.00034640922768304917, "loss": 2.7258, "theoretical_loss": 3.6367091531219202, "tokens_seen": 1036788736 }, { "epoch": 3.0, "learning_rate": 0.00034639919759277835, "loss": 2.9771, "theoretical_loss": 3.636687423030666, "tokens_seen": 1036854272 }, { "epoch": 3.0, "learning_rate": 0.00034638916750250753, "loss": 2.953, "theoretical_loss": 3.6366656946974008, "tokens_seen": 1036919808 }, { "epoch": 3.0, "learning_rate": 0.0003463791374122367, "loss": 2.9597, "theoretical_loss": 3.636643968121871, "tokens_seen": 1036985344 }, { "epoch": 3.0, "learning_rate": 0.0003463691073219659, "loss": 2.8145, "theoretical_loss": 3.6366222433038238, "tokens_seen": 1037050880 }, { "epoch": 3.0, "objective/train/docs_used": 1666738, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.700777769088745, "objective/train/theoretical_loss": 3.6366168123738887, "objective/train/tokens_used": 1057527264, "theoretical_loss": 3.6366168123738887, "tokens_seen": 1037067264 }, { "epoch": 3.0, "learning_rate": 0.0003463590772316951, "loss": 2.8694, "theoretical_loss": 3.6366005202430056, "tokens_seen": 1037116416 }, { "epoch": 3.0, "learning_rate": 0.0003463490471414243, "loss": 3.0571, "theoretical_loss": 3.636578798939164, "tokens_seen": 1037181952 }, { "epoch": 3.0, "learning_rate": 0.00034633901705115344, "loss": 3.0202, "theoretical_loss": 3.6365570793920448, "tokens_seen": 1037247488 }, { "epoch": 3.0, "learning_rate": 0.00034632898696088267, "loss": 2.8583, "theoretical_loss": 3.636535361601396, "tokens_seen": 1037313024 }, { "epoch": 3.0, "learning_rate": 0.0003463189568706118, "loss": 2.837, "theoretical_loss": 3.6365136455669638, "tokens_seen": 1037378560 }, { "epoch": 3.0, "learning_rate": 0.00034630892678034103, "loss": 2.98, "theoretical_loss": 3.6364919312884956, "tokens_seen": 1037444096 }, { "epoch": 3.0, "learning_rate": 0.0003462988966900702, "loss": 2.9443, "theoretical_loss": 3.636470218765738, "tokens_seen": 1037509632 }, { "epoch": 3.0, "learning_rate": 0.0003462888665997994, "loss": 2.9713, "theoretical_loss": 3.6364485079984394, "tokens_seen": 1037575168 }, { "epoch": 3.0, "learning_rate": 0.0003462788365095286, "loss": 2.952, "theoretical_loss": 3.6364267989863457, "tokens_seen": 1037640704 }, { "epoch": 3.0, "learning_rate": 0.0003462688064192578, "loss": 2.8755, "theoretical_loss": 3.6364050917292046, "tokens_seen": 1037706240 }, { "epoch": 3.0, "learning_rate": 0.00034625877632898694, "loss": 2.8775, "theoretical_loss": 3.636383386226764, "tokens_seen": 1037771776 }, { "epoch": 3.0, "learning_rate": 0.0003462487462387162, "loss": 2.779, "theoretical_loss": 3.6363616824787703, "tokens_seen": 1037837312 }, { "epoch": 3.0, "learning_rate": 0.0003462387161484453, "loss": 2.9784, "theoretical_loss": 3.636339980484972, "tokens_seen": 1037902848 }, { "epoch": 3.0, "learning_rate": 0.00034622868605817454, "loss": 2.9224, "theoretical_loss": 3.6363182802451153, "tokens_seen": 1037968384 }, { "epoch": 3.0, "learning_rate": 0.0003462186559679037, "loss": 2.8976, "theoretical_loss": 3.6362965817589483, "tokens_seen": 1038033920 }, { "epoch": 3.0, "learning_rate": 0.0003462086258776329, "loss": 2.9091, "theoretical_loss": 3.6362748850262188, "tokens_seen": 1038099456 }, { "epoch": 3.0, "learning_rate": 0.0003461985957873621, "loss": 2.9973, "theoretical_loss": 3.6362531900466744, "tokens_seen": 1038164992 }, { "epoch": 3.0, "learning_rate": 0.00034618856569709126, "loss": 2.9218, "theoretical_loss": 3.636231496820062, "tokens_seen": 1038230528 }, { "epoch": 3.0, "learning_rate": 0.00034617853560682044, "loss": 2.9319, "theoretical_loss": 3.6362098053461303, "tokens_seen": 1038296064 }, { "epoch": 3.0, "learning_rate": 0.0003461685055165497, "loss": 2.7815, "theoretical_loss": 3.636188115624627, "tokens_seen": 1038361600 }, { "epoch": 3.0, "learning_rate": 0.0003461584754262788, "loss": 2.7018, "theoretical_loss": 3.636166427655299, "tokens_seen": 1038427136 }, { "epoch": 3.0, "learning_rate": 0.00034614844533600804, "loss": 2.8084, "theoretical_loss": 3.6361447414378953, "tokens_seen": 1038492672 }, { "epoch": 3.0, "learning_rate": 0.00034613841524573717, "loss": 2.9296, "theoretical_loss": 3.6361230569721634, "tokens_seen": 1038558208 }, { "epoch": 3.0, "learning_rate": 0.0003461283851554664, "loss": 3.0258, "theoretical_loss": 3.6361013742578514, "tokens_seen": 1038623744 }, { "epoch": 3.0, "learning_rate": 0.0003461183550651956, "loss": 2.7601, "theoretical_loss": 3.6360796932947066, "tokens_seen": 1038689280 }, { "epoch": 3.0, "objective/train/docs_used": 1668593, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9072201251983643, "objective/train/theoretical_loss": 3.636074273327511, "objective/train/tokens_used": 1059165664, "theoretical_loss": 3.636074273327511, "tokens_seen": 1038705664 }, { "epoch": 3.0, "learning_rate": 0.00034610832497492476, "loss": 2.9416, "theoretical_loss": 3.636058014082478, "tokens_seen": 1038754816 }, { "epoch": 3.0, "learning_rate": 0.00034609829488465395, "loss": 2.795, "theoretical_loss": 3.6360363366209136, "tokens_seen": 1038820352 }, { "epoch": 3.0, "learning_rate": 0.0003460882647943832, "loss": 2.9072, "theoretical_loss": 3.6360146609097614, "tokens_seen": 1038885888 }, { "epoch": 3.0, "learning_rate": 0.00034607823470411236, "loss": 2.8222, "theoretical_loss": 3.6359929869487697, "tokens_seen": 1038951424 }, { "epoch": 3.0, "learning_rate": 0.00034606820461384154, "loss": 2.8615, "theoretical_loss": 3.635971314737686, "tokens_seen": 1039016960 }, { "epoch": 3.0, "learning_rate": 0.0003460581745235707, "loss": 2.6762, "theoretical_loss": 3.635949644276261, "tokens_seen": 1039082496 }, { "epoch": 3.0, "learning_rate": 0.0003460481444332999, "loss": 2.9214, "theoretical_loss": 3.6359279755642406, "tokens_seen": 1039148032 }, { "epoch": 3.0, "learning_rate": 0.00034603811434302914, "loss": 2.8633, "theoretical_loss": 3.635906308601374, "tokens_seen": 1039213568 }, { "epoch": 3.0, "learning_rate": 0.00034602808425275827, "loss": 2.9631, "theoretical_loss": 3.6358846433874104, "tokens_seen": 1039279104 }, { "epoch": 3.0, "learning_rate": 0.0003460180541624875, "loss": 3.0007, "theoretical_loss": 3.6358629799220976, "tokens_seen": 1039344640 }, { "epoch": 3.0, "learning_rate": 0.00034600802407221663, "loss": 3.0118, "theoretical_loss": 3.635841318205185, "tokens_seen": 1039410176 }, { "epoch": 3.0, "learning_rate": 0.00034599799398194587, "loss": 2.9106, "theoretical_loss": 3.63581965823642, "tokens_seen": 1039475712 }, { "epoch": 3.0, "learning_rate": 0.00034598796389167505, "loss": 2.8632, "theoretical_loss": 3.6357980000155523, "tokens_seen": 1039541248 }, { "epoch": 3.0, "learning_rate": 0.00034597793380140423, "loss": 2.9124, "theoretical_loss": 3.635776343542331, "tokens_seen": 1039606784 }, { "epoch": 3.0, "learning_rate": 0.0003459679037111334, "loss": 2.8953, "theoretical_loss": 3.6357546888165038, "tokens_seen": 1039672320 }, { "epoch": 3.0, "learning_rate": 0.00034595787362086264, "loss": 2.7951, "theoretical_loss": 3.635733035837821, "tokens_seen": 1039737856 }, { "epoch": 3.0, "learning_rate": 0.00034594784353059177, "loss": 2.8724, "theoretical_loss": 3.6357113846060294, "tokens_seen": 1039803392 }, { "epoch": 3.0, "learning_rate": 0.000345937813440321, "loss": 2.9391, "theoretical_loss": 3.63568973512088, "tokens_seen": 1039868928 }, { "epoch": 3.0, "learning_rate": 0.00034592778335005013, "loss": 2.7742, "theoretical_loss": 3.635668087382121, "tokens_seen": 1039934464 }, { "epoch": 3.0, "learning_rate": 0.00034591775325977937, "loss": 3.0617, "theoretical_loss": 3.6356464413895013, "tokens_seen": 1040000000 }, { "epoch": 3.0, "learning_rate": 0.00034590772316950855, "loss": 3.0469, "theoretical_loss": 3.6356247971427704, "tokens_seen": 1040065536 }, { "epoch": 3.0, "learning_rate": 0.00034589769307923773, "loss": 2.9839, "theoretical_loss": 3.635603154641678, "tokens_seen": 1040131072 }, { "epoch": 3.0, "learning_rate": 0.0003458876629889669, "loss": 2.9524, "theoretical_loss": 3.6355815138859717, "tokens_seen": 1040196608 }, { "epoch": 3.0, "learning_rate": 0.0003458776328986961, "loss": 2.874, "theoretical_loss": 3.635559874875402, "tokens_seen": 1040262144 }, { "epoch": 3.0, "learning_rate": 0.0003458676028084253, "loss": 2.7473, "theoretical_loss": 3.635538237609719, "tokens_seen": 1040327680 }, { "epoch": 3.0, "objective/train/docs_used": 1671371, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.81103515625, "objective/train/theoretical_loss": 3.635532828565907, "objective/train/tokens_used": 1060804064, "theoretical_loss": 3.635532828565907, "tokens_seen": 1040344064 }, { "epoch": 3.0, "learning_rate": 0.0003458575727181545, "loss": 2.8198, "theoretical_loss": 3.63551660208867, "tokens_seen": 1040393216 }, { "epoch": 3.0, "learning_rate": 0.00034584754262788364, "loss": 2.8767, "theoretical_loss": 3.635494968312006, "tokens_seen": 1040458752 }, { "epoch": 3.0, "learning_rate": 0.00034583751253761287, "loss": 2.9172, "theoretical_loss": 3.6354733362794764, "tokens_seen": 1040524288 }, { "epoch": 3.0, "learning_rate": 0.000345827482447342, "loss": 2.8413, "theoretical_loss": 3.63545170599083, "tokens_seen": 1040589824 }, { "epoch": 3.0, "learning_rate": 0.00034581745235707123, "loss": 2.8374, "theoretical_loss": 3.6354300774458173, "tokens_seen": 1040655360 }, { "epoch": 3.0, "learning_rate": 0.0003458074222668004, "loss": 2.9877, "theoretical_loss": 3.635408450644187, "tokens_seen": 1040720896 }, { "epoch": 3.0, "learning_rate": 0.0003457973921765296, "loss": 2.8465, "theoretical_loss": 3.635386825585689, "tokens_seen": 1040786432 }, { "epoch": 3.0, "learning_rate": 0.0003457873620862588, "loss": 2.9361, "theoretical_loss": 3.6353652022700738, "tokens_seen": 1040851968 }, { "epoch": 3.0, "learning_rate": 0.000345777331995988, "loss": 2.877, "theoretical_loss": 3.635343580697091, "tokens_seen": 1040917504 }, { "epoch": 3.0, "learning_rate": 0.00034576730190571714, "loss": 2.7897, "theoretical_loss": 3.6353219608664897, "tokens_seen": 1040983040 }, { "epoch": 3.0, "learning_rate": 0.0003457572718154464, "loss": 2.8722, "theoretical_loss": 3.635300342778021, "tokens_seen": 1041048576 }, { "epoch": 3.0, "learning_rate": 0.0003457472417251755, "loss": 2.9342, "theoretical_loss": 3.6352787264314332, "tokens_seen": 1041114112 }, { "epoch": 3.0, "learning_rate": 0.00034573721163490474, "loss": 2.9948, "theoretical_loss": 3.635257111826478, "tokens_seen": 1041179648 }, { "epoch": 3.0, "learning_rate": 0.0003457271815446339, "loss": 2.9009, "theoretical_loss": 3.6352354989629045, "tokens_seen": 1041245184 }, { "epoch": 3.0, "learning_rate": 0.0003457171514543631, "loss": 2.8364, "theoretical_loss": 3.635213887840463, "tokens_seen": 1041310720 }, { "epoch": 3.0, "learning_rate": 0.0003457071213640923, "loss": 2.9119, "theoretical_loss": 3.635192278458904, "tokens_seen": 1041376256 }, { "epoch": 3.0, "learning_rate": 0.00034569709127382146, "loss": 2.6338, "theoretical_loss": 3.6351706708179767, "tokens_seen": 1041441792 }, { "epoch": 3.0, "learning_rate": 0.00034568706118355064, "loss": 3.0745, "theoretical_loss": 3.6351490649174334, "tokens_seen": 1041507328 }, { "epoch": 3.0, "learning_rate": 0.0003456770310932799, "loss": 2.9162, "theoretical_loss": 3.635127460757022, "tokens_seen": 1041572864 }, { "epoch": 3.01, "learning_rate": 0.000345667001003009, "loss": 2.8702, "theoretical_loss": 3.6351058583364946, "tokens_seen": 1041638400 }, { "epoch": 3.01, "learning_rate": 0.00034565697091273824, "loss": 2.8169, "theoretical_loss": 3.635084257655601, "tokens_seen": 1041703936 }, { "epoch": 3.01, "learning_rate": 0.00034564694082246737, "loss": 2.8702, "theoretical_loss": 3.6350626587140913, "tokens_seen": 1041769472 }, { "epoch": 3.01, "learning_rate": 0.0003456369107321966, "loss": 2.8572, "theoretical_loss": 3.635041061511717, "tokens_seen": 1041835008 }, { "epoch": 3.01, "learning_rate": 0.0003456268806419258, "loss": 2.738, "theoretical_loss": 3.635019466048228, "tokens_seen": 1041900544 }, { "epoch": 3.01, "learning_rate": 0.00034561685055165496, "loss": 2.8507, "theoretical_loss": 3.6349978723233747, "tokens_seen": 1041966080 }, { "epoch": 3.01, "objective/train/docs_used": 1674131, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.764479398727417, "objective/train/theoretical_loss": 3.634992474163794, "objective/train/tokens_used": 1062442464, "theoretical_loss": 3.634992474163794, "tokens_seen": 1041982464 }, { "epoch": 3.01, "learning_rate": 0.00034560682046138415, "loss": 2.9065, "theoretical_loss": 3.6349762803369083, "tokens_seen": 1042031616 }, { "epoch": 3.01, "learning_rate": 0.0003455967903711134, "loss": 2.7972, "theoretical_loss": 3.6349546900885796, "tokens_seen": 1042097152 }, { "epoch": 3.01, "learning_rate": 0.0003455867602808425, "loss": 3.013, "theoretical_loss": 3.634933101578139, "tokens_seen": 1042162688 }, { "epoch": 3.01, "learning_rate": 0.00034557673019057174, "loss": 2.9529, "theoretical_loss": 3.6349115148053377, "tokens_seen": 1042228224 }, { "epoch": 3.01, "learning_rate": 0.00034556670010030087, "loss": 2.7511, "theoretical_loss": 3.634889929769926, "tokens_seen": 1042293760 }, { "epoch": 3.01, "learning_rate": 0.0003455566700100301, "loss": 3.0306, "theoretical_loss": 3.6348683464716554, "tokens_seen": 1042359296 }, { "epoch": 3.01, "learning_rate": 0.0003455466399197593, "loss": 2.8714, "theoretical_loss": 3.634846764910277, "tokens_seen": 1042424832 }, { "epoch": 3.01, "learning_rate": 0.00034553660982948847, "loss": 2.994, "theoretical_loss": 3.634825185085541, "tokens_seen": 1042490368 }, { "epoch": 3.01, "learning_rate": 0.00034552657973921765, "loss": 2.9075, "theoretical_loss": 3.634803606997199, "tokens_seen": 1042555904 }, { "epoch": 3.01, "learning_rate": 0.00034551654964894683, "loss": 2.8229, "theoretical_loss": 3.634782030645003, "tokens_seen": 1042621440 }, { "epoch": 3.01, "learning_rate": 0.000345506519558676, "loss": 2.7623, "theoretical_loss": 3.634760456028703, "tokens_seen": 1042686976 }, { "epoch": 3.01, "learning_rate": 0.00034549648946840525, "loss": 2.942, "theoretical_loss": 3.6347388831480503, "tokens_seen": 1042752512 }, { "epoch": 3.01, "learning_rate": 0.0003454864593781344, "loss": 2.9849, "theoretical_loss": 3.634717312002797, "tokens_seen": 1042818048 }, { "epoch": 3.01, "learning_rate": 0.0003454764292878636, "loss": 2.9206, "theoretical_loss": 3.634695742592694, "tokens_seen": 1042883584 }, { "epoch": 3.01, "learning_rate": 0.00034546639919759274, "loss": 2.7725, "theoretical_loss": 3.6346741749174925, "tokens_seen": 1042949120 }, { "epoch": 3.01, "learning_rate": 0.00034545636910732197, "loss": 2.8333, "theoretical_loss": 3.6346526089769435, "tokens_seen": 1043014656 }, { "epoch": 3.01, "learning_rate": 0.00034544633901705115, "loss": 2.8447, "theoretical_loss": 3.6346310447708, "tokens_seen": 1043080192 }, { "epoch": 3.01, "learning_rate": 0.00034543630892678033, "loss": 2.91, "theoretical_loss": 3.6346094822988126, "tokens_seen": 1043145728 }, { "epoch": 3.01, "learning_rate": 0.0003454262788365095, "loss": 2.9753, "theoretical_loss": 3.6345879215607333, "tokens_seen": 1043211264 }, { "epoch": 3.01, "learning_rate": 0.00034541624874623875, "loss": 2.9046, "theoretical_loss": 3.6345663625563125, "tokens_seen": 1043276800 }, { "epoch": 3.01, "learning_rate": 0.0003454062186559679, "loss": 2.9418, "theoretical_loss": 3.6345448052853038, "tokens_seen": 1043342336 }, { "epoch": 3.01, "learning_rate": 0.0003453961885656971, "loss": 2.9367, "theoretical_loss": 3.634523249747458, "tokens_seen": 1043407872 }, { "epoch": 3.01, "learning_rate": 0.00034538615847542624, "loss": 2.8968, "theoretical_loss": 3.6345016959425265, "tokens_seen": 1043473408 }, { "epoch": 3.01, "learning_rate": 0.0003453761283851555, "loss": 2.8771, "theoretical_loss": 3.6344801438702614, "tokens_seen": 1043538944 }, { "epoch": 3.01, "learning_rate": 0.00034536609829488466, "loss": 2.923, "theoretical_loss": 3.6344585935304154, "tokens_seen": 1043604480 }, { "epoch": 3.01, "objective/train/docs_used": 1677074, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.641371011734009, "objective/train/theoretical_loss": 3.634453206216115, "objective/train/tokens_used": 1064080864, "theoretical_loss": 3.634453206216115, "tokens_seen": 1043620864 }, { "epoch": 3.01, "learning_rate": 0.00034535606820461384, "loss": 2.914, "theoretical_loss": 3.63443704492274, "tokens_seen": 1043670016 }, { "epoch": 3.01, "learning_rate": 0.000345346038114343, "loss": 3.0032, "theoretical_loss": 3.6344154980469865, "tokens_seen": 1043735552 }, { "epoch": 3.01, "learning_rate": 0.0003453360080240722, "loss": 3.0021, "theoretical_loss": 3.634393952902908, "tokens_seen": 1043801088 }, { "epoch": 3.01, "learning_rate": 0.00034532597793380143, "loss": 2.8049, "theoretical_loss": 3.634372409490256, "tokens_seen": 1043866624 }, { "epoch": 3.01, "learning_rate": 0.0003453159478435306, "loss": 2.7724, "theoretical_loss": 3.6343508678087826, "tokens_seen": 1043932160 }, { "epoch": 3.01, "learning_rate": 0.0003453059177532598, "loss": 2.9796, "theoretical_loss": 3.6343293278582407, "tokens_seen": 1043997696 }, { "epoch": 3.01, "learning_rate": 0.000345295887662989, "loss": 2.8795, "theoretical_loss": 3.634307789638382, "tokens_seen": 1044063232 }, { "epoch": 3.01, "learning_rate": 0.0003452858575727182, "loss": 2.8815, "theoretical_loss": 3.634286253148959, "tokens_seen": 1044128768 }, { "epoch": 3.01, "learning_rate": 0.00034527582748244734, "loss": 2.9091, "theoretical_loss": 3.634264718389724, "tokens_seen": 1044194304 }, { "epoch": 3.01, "learning_rate": 0.0003452657973921766, "loss": 2.808, "theoretical_loss": 3.63424318536043, "tokens_seen": 1044259840 }, { "epoch": 3.01, "learning_rate": 0.0003452557673019057, "loss": 2.952, "theoretical_loss": 3.634221654060828, "tokens_seen": 1044325376 }, { "epoch": 3.01, "learning_rate": 0.00034524573721163494, "loss": 2.8647, "theoretical_loss": 3.634200124490672, "tokens_seen": 1044390912 }, { "epoch": 3.01, "learning_rate": 0.0003452357071213641, "loss": 2.7897, "theoretical_loss": 3.634178596649714, "tokens_seen": 1044456448 }, { "epoch": 3.01, "learning_rate": 0.0003452256770310933, "loss": 2.8712, "theoretical_loss": 3.6341570705377064, "tokens_seen": 1044521984 }, { "epoch": 3.01, "learning_rate": 0.0003452156469408225, "loss": 2.7805, "theoretical_loss": 3.6341355461544023, "tokens_seen": 1044587520 }, { "epoch": 3.01, "learning_rate": 0.00034520561685055166, "loss": 2.91, "theoretical_loss": 3.6341140234995546, "tokens_seen": 1044653056 }, { "epoch": 3.01, "learning_rate": 0.00034519558676028084, "loss": 2.9478, "theoretical_loss": 3.6340925025729147, "tokens_seen": 1044718592 }, { "epoch": 3.01, "learning_rate": 0.0003451855566700101, "loss": 2.9036, "theoretical_loss": 3.6340709833742375, "tokens_seen": 1044784128 }, { "epoch": 3.01, "learning_rate": 0.0003451755265797392, "loss": 2.9942, "theoretical_loss": 3.6340494659032747, "tokens_seen": 1044849664 }, { "epoch": 3.01, "learning_rate": 0.00034516549648946844, "loss": 2.8296, "theoretical_loss": 3.6340279501597794, "tokens_seen": 1044915200 }, { "epoch": 3.01, "learning_rate": 0.00034515546639919757, "loss": 2.6642, "theoretical_loss": 3.6340064361435047, "tokens_seen": 1044980736 }, { "epoch": 3.01, "learning_rate": 0.0003451454363089268, "loss": 2.8803, "theoretical_loss": 3.6339849238542032, "tokens_seen": 1045046272 }, { "epoch": 3.01, "learning_rate": 0.000345135406218656, "loss": 2.8171, "theoretical_loss": 3.633963413291628, "tokens_seen": 1045111808 }, { "epoch": 3.01, "learning_rate": 0.00034512537612838517, "loss": 2.9032, "theoretical_loss": 3.6339419044555332, "tokens_seen": 1045177344 }, { "epoch": 3.01, "learning_rate": 0.00034511534603811435, "loss": 2.846, "theoretical_loss": 3.633920397345671, "tokens_seen": 1045242880 }, { "epoch": 3.01, "objective/train/docs_used": 1678483, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2894299030303955, "objective/train/theoretical_loss": 3.6339150208379003, "objective/train/tokens_used": 1065719264, "theoretical_loss": 3.6339150208379003, "tokens_seen": 1045259264 }, { "epoch": 3.01, "learning_rate": 0.0003451053159478436, "loss": 3.0833, "theoretical_loss": 3.6338988919617954, "tokens_seen": 1045308416 }, { "epoch": 3.01, "learning_rate": 0.0003450952858575727, "loss": 2.901, "theoretical_loss": 3.6338773883036586, "tokens_seen": 1045373952 }, { "epoch": 3.01, "learning_rate": 0.00034508525576730194, "loss": 2.7929, "theoretical_loss": 3.6338558863710153, "tokens_seen": 1045439488 }, { "epoch": 3.01, "learning_rate": 0.00034507522567703107, "loss": 2.9, "theoretical_loss": 3.6338343861636173, "tokens_seen": 1045505024 }, { "epoch": 3.01, "learning_rate": 0.0003450651955867603, "loss": 2.8058, "theoretical_loss": 3.6338128876812195, "tokens_seen": 1045570560 }, { "epoch": 3.01, "learning_rate": 0.0003450551654964895, "loss": 2.8842, "theoretical_loss": 3.633791390923575, "tokens_seen": 1045636096 }, { "epoch": 3.01, "learning_rate": 0.00034504513540621867, "loss": 2.8472, "theoretical_loss": 3.6337698958904365, "tokens_seen": 1045701632 }, { "epoch": 3.01, "learning_rate": 0.00034503510531594785, "loss": 3.0337, "theoretical_loss": 3.6337484025815585, "tokens_seen": 1045767168 }, { "epoch": 3.01, "learning_rate": 0.00034502507522567703, "loss": 2.9947, "theoretical_loss": 3.6337269109966943, "tokens_seen": 1045832704 }, { "epoch": 3.01, "learning_rate": 0.0003450150451354062, "loss": 2.7872, "theoretical_loss": 3.6337054211355984, "tokens_seen": 1045898240 }, { "epoch": 3.01, "learning_rate": 0.00034500501504513545, "loss": 2.7057, "theoretical_loss": 3.633683932998023, "tokens_seen": 1045963776 }, { "epoch": 3.01, "learning_rate": 0.0003449949849548646, "loss": 2.8965, "theoretical_loss": 3.633662446583723, "tokens_seen": 1046029312 }, { "epoch": 3.01, "learning_rate": 0.0003449849548645938, "loss": 2.7341, "theoretical_loss": 3.6336409618924517, "tokens_seen": 1046094848 }, { "epoch": 3.01, "learning_rate": 0.00034497492477432294, "loss": 2.9672, "theoretical_loss": 3.6336194789239635, "tokens_seen": 1046160384 }, { "epoch": 3.01, "learning_rate": 0.00034496489468405217, "loss": 2.7285, "theoretical_loss": 3.633597997678012, "tokens_seen": 1046225920 }, { "epoch": 3.01, "learning_rate": 0.00034495486459378135, "loss": 2.8872, "theoretical_loss": 3.6335765181543516, "tokens_seen": 1046291456 }, { "epoch": 3.01, "learning_rate": 0.00034494483450351053, "loss": 2.8881, "theoretical_loss": 3.6335550403527357, "tokens_seen": 1046356992 }, { "epoch": 3.01, "learning_rate": 0.0003449348044132397, "loss": 2.8244, "theoretical_loss": 3.633533564272919, "tokens_seen": 1046422528 }, { "epoch": 3.01, "learning_rate": 0.00034492477432296895, "loss": 2.7265, "theoretical_loss": 3.633512089914656, "tokens_seen": 1046488064 }, { "epoch": 3.01, "learning_rate": 0.0003449147442326981, "loss": 2.8253, "theoretical_loss": 3.6334906172776993, "tokens_seen": 1046553600 }, { "epoch": 3.01, "learning_rate": 0.0003449047141424273, "loss": 2.8315, "theoretical_loss": 3.6334691463618043, "tokens_seen": 1046619136 }, { "epoch": 3.01, "learning_rate": 0.00034489468405215644, "loss": 2.9315, "theoretical_loss": 3.6334476771667252, "tokens_seen": 1046684672 }, { "epoch": 3.01, "learning_rate": 0.0003448846539618857, "loss": 2.7263, "theoretical_loss": 3.6334262096922165, "tokens_seen": 1046750208 }, { "epoch": 3.01, "learning_rate": 0.00034487462387161486, "loss": 2.9903, "theoretical_loss": 3.633404743938032, "tokens_seen": 1046815744 }, { "epoch": 3.01, "learning_rate": 0.00034486459378134404, "loss": 2.7686, "theoretical_loss": 3.633383279903927, "tokens_seen": 1046881280 }, { "epoch": 3.01, "objective/train/docs_used": 1681281, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.833157777786255, "objective/train/theoretical_loss": 3.633377914164134, "objective/train/tokens_used": 1067357664, "theoretical_loss": 3.633377914164134, "tokens_seen": 1046897664 }, { "epoch": 3.01, "learning_rate": 0.0003448545636910732, "loss": 2.8446, "theoretical_loss": 3.6333618175896554, "tokens_seen": 1046946816 }, { "epoch": 3.01, "learning_rate": 0.0003448445336008024, "loss": 2.6947, "theoretical_loss": 3.633340356994972, "tokens_seen": 1047012352 }, { "epoch": 3.01, "learning_rate": 0.0003448345035105316, "loss": 2.8237, "theoretical_loss": 3.633318898119631, "tokens_seen": 1047077888 }, { "epoch": 3.01, "learning_rate": 0.0003448244734202608, "loss": 2.9525, "theoretical_loss": 3.6332974409633874, "tokens_seen": 1047143424 }, { "epoch": 3.01, "learning_rate": 0.00034481444332998994, "loss": 2.6953, "theoretical_loss": 3.633275985525996, "tokens_seen": 1047208960 }, { "epoch": 3.01, "learning_rate": 0.0003448044132397192, "loss": 3.102, "theoretical_loss": 3.6332545318072116, "tokens_seen": 1047274496 }, { "epoch": 3.01, "learning_rate": 0.00034479438314944836, "loss": 2.9505, "theoretical_loss": 3.6332330798067884, "tokens_seen": 1047340032 }, { "epoch": 3.01, "learning_rate": 0.00034478435305917754, "loss": 3.071, "theoretical_loss": 3.633211629524482, "tokens_seen": 1047405568 }, { "epoch": 3.01, "learning_rate": 0.0003447743229689067, "loss": 2.8719, "theoretical_loss": 3.633190180960047, "tokens_seen": 1047471104 }, { "epoch": 3.01, "learning_rate": 0.0003447642928786359, "loss": 2.8635, "theoretical_loss": 3.633168734113238, "tokens_seen": 1047536640 }, { "epoch": 3.01, "learning_rate": 0.0003447542627883651, "loss": 2.7976, "theoretical_loss": 3.633147288983811, "tokens_seen": 1047602176 }, { "epoch": 3.01, "learning_rate": 0.0003447442326980943, "loss": 2.9467, "theoretical_loss": 3.63312584557152, "tokens_seen": 1047667712 }, { "epoch": 3.01, "learning_rate": 0.00034473420260782345, "loss": 2.8408, "theoretical_loss": 3.6331044038761204, "tokens_seen": 1047733248 }, { "epoch": 3.01, "learning_rate": 0.0003447241725175527, "loss": 3.0526, "theoretical_loss": 3.633082963897367, "tokens_seen": 1047798784 }, { "epoch": 3.01, "learning_rate": 0.0003447141424272818, "loss": 2.9531, "theoretical_loss": 3.633061525635016, "tokens_seen": 1047864320 }, { "epoch": 3.01, "learning_rate": 0.00034470411233701104, "loss": 3.0066, "theoretical_loss": 3.6330400890888224, "tokens_seen": 1047929856 }, { "epoch": 3.01, "learning_rate": 0.0003446940822467402, "loss": 2.9085, "theoretical_loss": 3.6330186542585405, "tokens_seen": 1047995392 }, { "epoch": 3.01, "learning_rate": 0.0003446840521564694, "loss": 2.8106, "theoretical_loss": 3.632997221143927, "tokens_seen": 1048060928 }, { "epoch": 3.01, "learning_rate": 0.0003446740220661986, "loss": 2.7903, "theoretical_loss": 3.6329757897447363, "tokens_seen": 1048126464 }, { "epoch": 3.01, "learning_rate": 0.00034466399197592777, "loss": 2.8154, "theoretical_loss": 3.632954360060725, "tokens_seen": 1048192000 }, { "epoch": 3.01, "learning_rate": 0.00034465396188565695, "loss": 2.8769, "theoretical_loss": 3.632932932091647, "tokens_seen": 1048257536 }, { "epoch": 3.01, "learning_rate": 0.0003446439317953862, "loss": 2.937, "theoretical_loss": 3.632911505837259, "tokens_seen": 1048323072 }, { "epoch": 3.01, "learning_rate": 0.0003446339017051153, "loss": 2.9429, "theoretical_loss": 3.632890081297316, "tokens_seen": 1048388608 }, { "epoch": 3.01, "learning_rate": 0.00034462387161484455, "loss": 2.9223, "theoretical_loss": 3.6328686584715744, "tokens_seen": 1048454144 }, { "epoch": 3.01, "learning_rate": 0.00034461384152457373, "loss": 2.8567, "theoretical_loss": 3.6328472373597895, "tokens_seen": 1048519680 }, { "epoch": 3.01, "objective/train/docs_used": 1683900, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.064023733139038, "objective/train/theoretical_loss": 3.63284188234962, "objective/train/tokens_used": 1068996064, "theoretical_loss": 3.63284188234962, "tokens_seen": 1048536064 }, { "epoch": 3.01, "learning_rate": 0.0003446038114343029, "loss": 2.8623, "theoretical_loss": 3.6328258179617166, "tokens_seen": 1048585216 }, { "epoch": 3.01, "learning_rate": 0.00034459378134403214, "loss": 3.0258, "theoretical_loss": 3.6328044002771125, "tokens_seen": 1048650752 }, { "epoch": 3.01, "learning_rate": 0.00034458375125376127, "loss": 2.7639, "theoretical_loss": 3.632782984305732, "tokens_seen": 1048716288 }, { "epoch": 3.01, "learning_rate": 0.0003445737211634905, "loss": 2.9837, "theoretical_loss": 3.6327615700473324, "tokens_seen": 1048781824 }, { "epoch": 3.01, "learning_rate": 0.0003445636910732197, "loss": 2.9284, "theoretical_loss": 3.632740157501668, "tokens_seen": 1048847360 }, { "epoch": 3.01, "learning_rate": 0.00034455366098294887, "loss": 2.8806, "theoretical_loss": 3.6327187466684956, "tokens_seen": 1048912896 }, { "epoch": 3.01, "learning_rate": 0.00034454363089267805, "loss": 3.0039, "theoretical_loss": 3.6326973375475715, "tokens_seen": 1048978432 }, { "epoch": 3.01, "learning_rate": 0.00034453360080240723, "loss": 2.8487, "theoretical_loss": 3.6326759301386513, "tokens_seen": 1049043968 }, { "epoch": 3.01, "learning_rate": 0.0003445235707121364, "loss": 2.8708, "theoretical_loss": 3.6326545244414916, "tokens_seen": 1049109504 }, { "epoch": 3.01, "learning_rate": 0.00034451354062186565, "loss": 2.7529, "theoretical_loss": 3.632633120455848, "tokens_seen": 1049175040 }, { "epoch": 3.01, "learning_rate": 0.0003445035105315948, "loss": 2.8541, "theoretical_loss": 3.6326117181814777, "tokens_seen": 1049240576 }, { "epoch": 3.01, "learning_rate": 0.000344493480441324, "loss": 2.7841, "theoretical_loss": 3.6325903176181358, "tokens_seen": 1049306112 }, { "epoch": 3.01, "learning_rate": 0.00034448345035105314, "loss": 2.9198, "theoretical_loss": 3.63256891876558, "tokens_seen": 1049371648 }, { "epoch": 3.01, "learning_rate": 0.00034447342026078237, "loss": 2.8017, "theoretical_loss": 3.6325475216235663, "tokens_seen": 1049437184 }, { "epoch": 3.01, "learning_rate": 0.00034446339017051155, "loss": 2.7703, "theoretical_loss": 3.6325261261918502, "tokens_seen": 1049502720 }, { "epoch": 3.01, "learning_rate": 0.00034445336008024073, "loss": 2.8963, "theoretical_loss": 3.632504732470189, "tokens_seen": 1049568256 }, { "epoch": 3.01, "learning_rate": 0.0003444433299899699, "loss": 2.8264, "theoretical_loss": 3.6324833404583394, "tokens_seen": 1049633792 }, { "epoch": 3.01, "learning_rate": 0.00034443329989969915, "loss": 3.1191, "theoretical_loss": 3.6324619501560576, "tokens_seen": 1049699328 }, { "epoch": 3.01, "learning_rate": 0.0003444232698094283, "loss": 2.8857, "theoretical_loss": 3.632440561563101, "tokens_seen": 1049764864 }, { "epoch": 3.01, "learning_rate": 0.0003444132397191575, "loss": 2.8518, "theoretical_loss": 3.632419174679225, "tokens_seen": 1049830400 }, { "epoch": 3.01, "learning_rate": 0.00034440320962888664, "loss": 2.7682, "theoretical_loss": 3.632397789504187, "tokens_seen": 1049895936 }, { "epoch": 3.01, "learning_rate": 0.0003443931795386159, "loss": 2.8945, "theoretical_loss": 3.6323764060377446, "tokens_seen": 1049961472 }, { "epoch": 3.01, "learning_rate": 0.00034438314944834506, "loss": 2.7498, "theoretical_loss": 3.6323550242796534, "tokens_seen": 1050027008 }, { "epoch": 3.01, "learning_rate": 0.00034437311935807424, "loss": 2.9783, "theoretical_loss": 3.632333644229671, "tokens_seen": 1050092544 }, { "epoch": 3.01, "learning_rate": 0.0003443630892678034, "loss": 2.9291, "theoretical_loss": 3.6323122658875544, "tokens_seen": 1050158080 }, { "epoch": 3.01, "objective/train/docs_used": 1686597, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.909562826156616, "objective/train/theoretical_loss": 3.632306921568851, "objective/train/tokens_used": 1070634464, "theoretical_loss": 3.632306921568851, "tokens_seen": 1050174464 }, { "epoch": 3.01, "learning_rate": 0.0003443530591775326, "loss": 2.883, "theoretical_loss": 3.63229088925306, "tokens_seen": 1050223616 }, { "epoch": 3.01, "learning_rate": 0.0003443430290872618, "loss": 2.812, "theoretical_loss": 3.632269514325946, "tokens_seen": 1050289152 }, { "epoch": 3.01, "learning_rate": 0.000344332998996991, "loss": 2.9431, "theoretical_loss": 3.6322481411059684, "tokens_seen": 1050354688 }, { "epoch": 3.01, "learning_rate": 0.00034432296890672014, "loss": 3.081, "theoretical_loss": 3.6322267695928847, "tokens_seen": 1050420224 }, { "epoch": 3.01, "learning_rate": 0.0003443129388164494, "loss": 2.8905, "theoretical_loss": 3.632205399786452, "tokens_seen": 1050485760 }, { "epoch": 3.01, "learning_rate": 0.00034430290872617856, "loss": 2.7414, "theoretical_loss": 3.632184031686428, "tokens_seen": 1050551296 }, { "epoch": 3.01, "learning_rate": 0.00034429287863590774, "loss": 2.9984, "theoretical_loss": 3.6321626652925696, "tokens_seen": 1050616832 }, { "epoch": 3.01, "learning_rate": 0.0003442828485456369, "loss": 2.8768, "theoretical_loss": 3.6321413006046344, "tokens_seen": 1050682368 }, { "epoch": 3.01, "learning_rate": 0.0003442728184553661, "loss": 3.136, "theoretical_loss": 3.632119937622379, "tokens_seen": 1050747904 }, { "epoch": 3.01, "learning_rate": 0.0003442627883650953, "loss": 2.9085, "theoretical_loss": 3.632098576345562, "tokens_seen": 1050813440 }, { "epoch": 3.01, "learning_rate": 0.0003442527582748245, "loss": 2.8498, "theoretical_loss": 3.6320772167739404, "tokens_seen": 1050878976 }, { "epoch": 3.01, "learning_rate": 0.00034424272818455365, "loss": 2.815, "theoretical_loss": 3.632055858907272, "tokens_seen": 1050944512 }, { "epoch": 3.01, "learning_rate": 0.0003442326980942829, "loss": 2.8458, "theoretical_loss": 3.632034502745314, "tokens_seen": 1051010048 }, { "epoch": 3.01, "learning_rate": 0.000344222668004012, "loss": 2.8348, "theoretical_loss": 3.6320131482878235, "tokens_seen": 1051075584 }, { "epoch": 3.01, "learning_rate": 0.00034421263791374124, "loss": 2.8609, "theoretical_loss": 3.6319917955345598, "tokens_seen": 1051141120 }, { "epoch": 3.01, "learning_rate": 0.0003442026078234704, "loss": 2.8524, "theoretical_loss": 3.6319704444852796, "tokens_seen": 1051206656 }, { "epoch": 3.01, "learning_rate": 0.0003441925777331996, "loss": 2.8894, "theoretical_loss": 3.6319490951397406, "tokens_seen": 1051272192 }, { "epoch": 3.01, "learning_rate": 0.0003441825476429288, "loss": 3.0386, "theoretical_loss": 3.631927747497701, "tokens_seen": 1051337728 }, { "epoch": 3.01, "learning_rate": 0.00034417251755265797, "loss": 2.9578, "theoretical_loss": 3.631906401558919, "tokens_seen": 1051403264 }, { "epoch": 3.01, "learning_rate": 0.00034416248746238715, "loss": 2.9702, "theoretical_loss": 3.6318850573231516, "tokens_seen": 1051468800 }, { "epoch": 3.01, "learning_rate": 0.0003441524573721164, "loss": 2.9231, "theoretical_loss": 3.631863714790158, "tokens_seen": 1051534336 }, { "epoch": 3.01, "learning_rate": 0.0003441424272818455, "loss": 2.8692, "theoretical_loss": 3.631842373959695, "tokens_seen": 1051599872 }, { "epoch": 3.01, "learning_rate": 0.00034413239719157475, "loss": 2.9304, "theoretical_loss": 3.631821034831521, "tokens_seen": 1051665408 }, { "epoch": 3.01, "learning_rate": 0.00034412236710130393, "loss": 2.7196, "theoretical_loss": 3.6317996974053957, "tokens_seen": 1051730944 }, { "epoch": 3.01, "learning_rate": 0.0003441123370110331, "loss": 2.9331, "theoretical_loss": 3.631778361681075, "tokens_seen": 1051796480 }, { "epoch": 3.01, "objective/train/docs_used": 1689245, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.908477544784546, "objective/train/theoretical_loss": 3.631773028015874, "objective/train/tokens_used": 1072272864, "theoretical_loss": 3.631773028015874, "tokens_seen": 1051812864 }, { "epoch": 3.01, "learning_rate": 0.0003441023069207623, "loss": 2.7357, "theoretical_loss": 3.6317570276583186, "tokens_seen": 1051862016 }, { "epoch": 3.01, "learning_rate": 0.00034409227683049147, "loss": 2.8003, "theoretical_loss": 3.631735695336884, "tokens_seen": 1051927552 }, { "epoch": 3.01, "learning_rate": 0.00034408224674022065, "loss": 2.7883, "theoretical_loss": 3.6317143647165304, "tokens_seen": 1051993088 }, { "epoch": 3.01, "learning_rate": 0.0003440722166499499, "loss": 2.7593, "theoretical_loss": 3.6316930357970154, "tokens_seen": 1052058624 }, { "epoch": 3.01, "learning_rate": 0.000344062186559679, "loss": 2.9158, "theoretical_loss": 3.6316717085780983, "tokens_seen": 1052124160 }, { "epoch": 3.01, "learning_rate": 0.00034405215646940825, "loss": 2.9633, "theoretical_loss": 3.6316503830595366, "tokens_seen": 1052189696 }, { "epoch": 3.01, "learning_rate": 0.0003440421263791374, "loss": 3.0054, "theoretical_loss": 3.631629059241089, "tokens_seen": 1052255232 }, { "epoch": 3.01, "learning_rate": 0.0003440320962888666, "loss": 2.8877, "theoretical_loss": 3.631607737122515, "tokens_seen": 1052320768 }, { "epoch": 3.01, "learning_rate": 0.0003440220661985958, "loss": 2.9651, "theoretical_loss": 3.6315864167035725, "tokens_seen": 1052386304 }, { "epoch": 3.01, "learning_rate": 0.000344012036108325, "loss": 2.8719, "theoretical_loss": 3.63156509798402, "tokens_seen": 1052451840 }, { "epoch": 3.01, "learning_rate": 0.00034400200601805416, "loss": 2.8936, "theoretical_loss": 3.6315437809636166, "tokens_seen": 1052517376 }, { "epoch": 3.01, "learning_rate": 0.00034399197592778334, "loss": 2.9159, "theoretical_loss": 3.6315224656421212, "tokens_seen": 1052582912 }, { "epoch": 3.01, "learning_rate": 0.0003439819458375125, "loss": 2.9347, "theoretical_loss": 3.6315011520192924, "tokens_seen": 1052648448 }, { "epoch": 3.01, "learning_rate": 0.00034397191574724175, "loss": 2.9548, "theoretical_loss": 3.6314798400948893, "tokens_seen": 1052713984 }, { "epoch": 3.01, "learning_rate": 0.0003439618856569709, "loss": 2.9623, "theoretical_loss": 3.631458529868671, "tokens_seen": 1052779520 }, { "epoch": 3.01, "learning_rate": 0.0003439518555667001, "loss": 2.9511, "theoretical_loss": 3.631437221340396, "tokens_seen": 1052845056 }, { "epoch": 3.01, "learning_rate": 0.0003439418254764293, "loss": 3.0976, "theoretical_loss": 3.6314159145098226, "tokens_seen": 1052910592 }, { "epoch": 3.01, "learning_rate": 0.0003439317953861585, "loss": 2.7834, "theoretical_loss": 3.631394609376712, "tokens_seen": 1052976128 }, { "epoch": 3.01, "learning_rate": 0.00034392176529588766, "loss": 2.723, "theoretical_loss": 3.631373305940821, "tokens_seen": 1053041664 }, { "epoch": 3.01, "learning_rate": 0.00034391173520561684, "loss": 2.8456, "theoretical_loss": 3.6313520042019105, "tokens_seen": 1053107200 }, { "epoch": 3.01, "learning_rate": 0.000343901705115346, "loss": 2.828, "theoretical_loss": 3.6313307041597396, "tokens_seen": 1053172736 }, { "epoch": 3.01, "learning_rate": 0.00034389167502507526, "loss": 2.9784, "theoretical_loss": 3.6313094058140667, "tokens_seen": 1053238272 }, { "epoch": 3.01, "learning_rate": 0.0003438816449348044, "loss": 2.8342, "theoretical_loss": 3.631288109164651, "tokens_seen": 1053303808 }, { "epoch": 3.01, "learning_rate": 0.0003438716148445336, "loss": 3.0299, "theoretical_loss": 3.6312668142112536, "tokens_seen": 1053369344 }, { "epoch": 3.01, "learning_rate": 0.00034386158475426275, "loss": 2.822, "theoretical_loss": 3.631245520953632, "tokens_seen": 1053434880 }, { "epoch": 3.01, "objective/train/docs_used": 1691935, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7306087017059326, "objective/train/theoretical_loss": 3.6312401979041637, "objective/train/tokens_used": 1073911264, "theoretical_loss": 3.6312401979041637, "tokens_seen": 1053451264 }, { "epoch": 3.01, "learning_rate": 0.000343851554663992, "loss": 2.9733, "theoretical_loss": 3.6312242293915467, "tokens_seen": 1053500416 }, { "epoch": 3.01, "learning_rate": 0.0003438415245737212, "loss": 2.7571, "theoretical_loss": 3.6312029395247567, "tokens_seen": 1053565952 }, { "epoch": 3.01, "learning_rate": 0.00034383149448345034, "loss": 2.9077, "theoretical_loss": 3.631181651353022, "tokens_seen": 1053631488 }, { "epoch": 3.01, "learning_rate": 0.0003438214643931796, "loss": 2.7394, "theoretical_loss": 3.631160364876102, "tokens_seen": 1053697024 }, { "epoch": 3.01, "learning_rate": 0.00034381143430290876, "loss": 2.7987, "theoretical_loss": 3.6311390800937566, "tokens_seen": 1053762560 }, { "epoch": 3.01, "learning_rate": 0.00034380140421263794, "loss": 3.0091, "theoretical_loss": 3.6311177970057456, "tokens_seen": 1053828096 }, { "epoch": 3.01, "learning_rate": 0.0003437913741223671, "loss": 2.931, "theoretical_loss": 3.631096515611828, "tokens_seen": 1053893632 }, { "epoch": 3.01, "learning_rate": 0.0003437813440320963, "loss": 2.8479, "theoretical_loss": 3.631075235911764, "tokens_seen": 1053959168 }, { "epoch": 3.01, "learning_rate": 0.0003437713139418255, "loss": 2.9835, "theoretical_loss": 3.631053957905314, "tokens_seen": 1054024704 }, { "epoch": 3.01, "learning_rate": 0.0003437612838515547, "loss": 2.8553, "theoretical_loss": 3.6310326815922376, "tokens_seen": 1054090240 }, { "epoch": 3.01, "learning_rate": 0.00034375125376128385, "loss": 2.9662, "theoretical_loss": 3.631011406972294, "tokens_seen": 1054155776 }, { "epoch": 3.01, "learning_rate": 0.0003437412236710131, "loss": 2.8209, "theoretical_loss": 3.630990134045245, "tokens_seen": 1054221312 }, { "epoch": 3.01, "learning_rate": 0.0003437311935807422, "loss": 2.9005, "theoretical_loss": 3.6309688628108487, "tokens_seen": 1054286848 }, { "epoch": 3.01, "learning_rate": 0.00034372116349047144, "loss": 2.8982, "theoretical_loss": 3.630947593268866, "tokens_seen": 1054352384 }, { "epoch": 3.01, "learning_rate": 0.0003437111334002006, "loss": 2.884, "theoretical_loss": 3.6309263254190576, "tokens_seen": 1054417920 }, { "epoch": 3.01, "learning_rate": 0.0003437011033099298, "loss": 2.8824, "theoretical_loss": 3.6309050592611833, "tokens_seen": 1054483456 }, { "epoch": 3.01, "learning_rate": 0.000343691073219659, "loss": 2.8228, "theoretical_loss": 3.630883794795003, "tokens_seen": 1054548992 }, { "epoch": 3.01, "learning_rate": 0.00034368104312938817, "loss": 2.8473, "theoretical_loss": 3.6308625320202776, "tokens_seen": 1054614528 }, { "epoch": 3.01, "learning_rate": 0.00034367101303911735, "loss": 2.7342, "theoretical_loss": 3.6308412709367675, "tokens_seen": 1054680064 }, { "epoch": 3.01, "learning_rate": 0.0003436609829488466, "loss": 2.7891, "theoretical_loss": 3.630820011544232, "tokens_seen": 1054745600 }, { "epoch": 3.01, "learning_rate": 0.0003436509528585757, "loss": 2.8075, "theoretical_loss": 3.630798753842433, "tokens_seen": 1054811136 }, { "epoch": 3.01, "learning_rate": 0.00034364092276830495, "loss": 2.9641, "theoretical_loss": 3.63077749783113, "tokens_seen": 1054876672 }, { "epoch": 3.01, "learning_rate": 0.00034363089267803413, "loss": 2.9639, "theoretical_loss": 3.630756243510084, "tokens_seen": 1054942208 }, { "epoch": 3.01, "learning_rate": 0.0003436208625877633, "loss": 2.9766, "theoretical_loss": 3.630734990879055, "tokens_seen": 1055007744 }, { "epoch": 3.01, "learning_rate": 0.0003436108324974925, "loss": 2.8093, "theoretical_loss": 3.6307137399378044, "tokens_seen": 1055073280 }, { "epoch": 3.01, "objective/train/docs_used": 1693381, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.87005352973938, "objective/train/theoretical_loss": 3.630708427466492, "objective/train/tokens_used": 1075549664, "theoretical_loss": 3.630708427466492, "tokens_seen": 1055089664 }, { "epoch": 3.01, "learning_rate": 0.00034360080240722167, "loss": 2.7446, "theoretical_loss": 3.6306924906860933, "tokens_seen": 1055138816 }, { "epoch": 3.01, "learning_rate": 0.00034359077231695085, "loss": 2.8811, "theoretical_loss": 3.630671243123681, "tokens_seen": 1055204352 }, { "epoch": 3.01, "learning_rate": 0.0003435807422266801, "loss": 2.863, "theoretical_loss": 3.6306499972503294, "tokens_seen": 1055269888 }, { "epoch": 3.01, "learning_rate": 0.0003435707121364092, "loss": 2.7369, "theoretical_loss": 3.6306287530657992, "tokens_seen": 1055335424 }, { "epoch": 3.01, "learning_rate": 0.00034356068204613845, "loss": 2.9299, "theoretical_loss": 3.630607510569851, "tokens_seen": 1055400960 }, { "epoch": 3.01, "learning_rate": 0.0003435506519558676, "loss": 2.9114, "theoretical_loss": 3.630586269762246, "tokens_seen": 1055466496 }, { "epoch": 3.01, "learning_rate": 0.0003435406218655968, "loss": 3.0415, "theoretical_loss": 3.630565030642745, "tokens_seen": 1055532032 }, { "epoch": 3.01, "learning_rate": 0.000343530591775326, "loss": 2.5328, "theoretical_loss": 3.6305437932111086, "tokens_seen": 1055597568 }, { "epoch": 3.01, "learning_rate": 0.0003435205616850552, "loss": 2.8762, "theoretical_loss": 3.630522557467099, "tokens_seen": 1055663104 }, { "epoch": 3.01, "learning_rate": 0.00034351053159478436, "loss": 2.9375, "theoretical_loss": 3.630501323410477, "tokens_seen": 1055728640 }, { "epoch": 3.01, "learning_rate": 0.00034350050150451354, "loss": 2.8509, "theoretical_loss": 3.6304800910410027, "tokens_seen": 1055794176 }, { "epoch": 3.01, "learning_rate": 0.0003434904714142427, "loss": 2.9448, "theoretical_loss": 3.6304588603584387, "tokens_seen": 1055859712 }, { "epoch": 3.01, "learning_rate": 0.00034348044132397195, "loss": 2.8278, "theoretical_loss": 3.6304376313625455, "tokens_seen": 1055925248 }, { "epoch": 3.01, "learning_rate": 0.0003434704112337011, "loss": 2.9481, "theoretical_loss": 3.6304164040530846, "tokens_seen": 1055990784 }, { "epoch": 3.01, "learning_rate": 0.0003434603811434303, "loss": 3.0221, "theoretical_loss": 3.6303951784298176, "tokens_seen": 1056056320 }, { "epoch": 3.01, "learning_rate": 0.0003434503510531595, "loss": 2.9676, "theoretical_loss": 3.630373954492506, "tokens_seen": 1056121856 }, { "epoch": 3.01, "learning_rate": 0.0003434403209628887, "loss": 2.9808, "theoretical_loss": 3.630352732240911, "tokens_seen": 1056187392 }, { "epoch": 3.01, "learning_rate": 0.00034343029087261786, "loss": 2.9874, "theoretical_loss": 3.6303315116747936, "tokens_seen": 1056252928 }, { "epoch": 3.01, "learning_rate": 0.00034342026078234704, "loss": 2.7595, "theoretical_loss": 3.6303102927939164, "tokens_seen": 1056318464 }, { "epoch": 3.01, "learning_rate": 0.0003434102306920762, "loss": 2.9526, "theoretical_loss": 3.63028907559804, "tokens_seen": 1056384000 }, { "epoch": 3.01, "learning_rate": 0.00034340020060180546, "loss": 2.9941, "theoretical_loss": 3.630267860086928, "tokens_seen": 1056449536 }, { "epoch": 3.01, "learning_rate": 0.0003433901705115346, "loss": 2.8407, "theoretical_loss": 3.6302466462603395, "tokens_seen": 1056515072 }, { "epoch": 3.01, "learning_rate": 0.0003433801404212638, "loss": 2.907, "theoretical_loss": 3.6302254341180378, "tokens_seen": 1056580608 }, { "epoch": 3.01, "learning_rate": 0.00034337011033099295, "loss": 3.0356, "theoretical_loss": 3.630204223659785, "tokens_seen": 1056646144 }, { "epoch": 3.01, "learning_rate": 0.0003433600802407222, "loss": 2.7543, "theoretical_loss": 3.6301830148853416, "tokens_seen": 1056711680 }, { "epoch": 3.01, "objective/train/docs_used": 1696185, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.863558053970337, "objective/train/theoretical_loss": 3.6301777129547985, "objective/train/tokens_used": 1077188064, "theoretical_loss": 3.6301777129547985, "tokens_seen": 1056728064 }, { "epoch": 3.01, "learning_rate": 0.00034335005015045136, "loss": 2.7673, "theoretical_loss": 3.630161807794471, "tokens_seen": 1056777216 }, { "epoch": 3.01, "learning_rate": 0.00034334002006018054, "loss": 2.8786, "theoretical_loss": 3.6301406023869345, "tokens_seen": 1056842752 }, { "epoch": 3.01, "learning_rate": 0.0003433299899699097, "loss": 2.5939, "theoretical_loss": 3.630119398662494, "tokens_seen": 1056908288 }, { "epoch": 3.01, "learning_rate": 0.0003433199598796389, "loss": 2.874, "theoretical_loss": 3.630098196620911, "tokens_seen": 1056973824 }, { "epoch": 3.01, "learning_rate": 0.0003433099297893681, "loss": 2.9094, "theoretical_loss": 3.6300769962619492, "tokens_seen": 1057039360 }, { "epoch": 3.01, "learning_rate": 0.0003432998996990973, "loss": 2.6764, "theoretical_loss": 3.630055797585369, "tokens_seen": 1057104896 }, { "epoch": 3.01, "learning_rate": 0.00034328986960882645, "loss": 2.8365, "theoretical_loss": 3.630034600590934, "tokens_seen": 1057170432 }, { "epoch": 3.01, "learning_rate": 0.0003432798395185557, "loss": 3.0592, "theoretical_loss": 3.630013405278406, "tokens_seen": 1057235968 }, { "epoch": 3.01, "learning_rate": 0.00034326980942828487, "loss": 2.8102, "theoretical_loss": 3.6299922116475467, "tokens_seen": 1057301504 }, { "epoch": 3.01, "learning_rate": 0.00034325977933801405, "loss": 2.6453, "theoretical_loss": 3.6299710196981194, "tokens_seen": 1057367040 }, { "epoch": 3.01, "learning_rate": 0.00034324974924774323, "loss": 2.9146, "theoretical_loss": 3.6299498294298855, "tokens_seen": 1057432576 }, { "epoch": 3.01, "learning_rate": 0.0003432397191574724, "loss": 2.778, "theoretical_loss": 3.6299286408426084, "tokens_seen": 1057498112 }, { "epoch": 3.01, "learning_rate": 0.0003432296890672016, "loss": 2.9065, "theoretical_loss": 3.6299074539360503, "tokens_seen": 1057563648 }, { "epoch": 3.01, "learning_rate": 0.0003432196589769308, "loss": 2.8688, "theoretical_loss": 3.6298862687099733, "tokens_seen": 1057629184 }, { "epoch": 3.01, "learning_rate": 0.00034320962888665995, "loss": 2.7931, "theoretical_loss": 3.6298650851641403, "tokens_seen": 1057694720 }, { "epoch": 3.01, "learning_rate": 0.0003431995987963892, "loss": 2.829, "theoretical_loss": 3.6298439032983136, "tokens_seen": 1057760256 }, { "epoch": 3.01, "learning_rate": 0.0003431895687061183, "loss": 2.919, "theoretical_loss": 3.629822723112257, "tokens_seen": 1057825792 }, { "epoch": 3.01, "learning_rate": 0.00034317953861584755, "loss": 2.8195, "theoretical_loss": 3.629801544605732, "tokens_seen": 1057891328 }, { "epoch": 3.01, "learning_rate": 0.00034316950852557673, "loss": 2.8269, "theoretical_loss": 3.629780367778502, "tokens_seen": 1057956864 }, { "epoch": 3.01, "learning_rate": 0.0003431594784353059, "loss": 2.9344, "theoretical_loss": 3.6297591926303294, "tokens_seen": 1058022400 }, { "epoch": 3.01, "learning_rate": 0.0003431494483450351, "loss": 2.9832, "theoretical_loss": 3.629738019160978, "tokens_seen": 1058087936 }, { "epoch": 3.01, "learning_rate": 0.00034313941825476433, "loss": 2.8131, "theoretical_loss": 3.6297168473702097, "tokens_seen": 1058153472 }, { "epoch": 3.01, "learning_rate": 0.00034312938816449345, "loss": 3.0152, "theoretical_loss": 3.6296956772577875, "tokens_seen": 1058219008 }, { "epoch": 3.01, "learning_rate": 0.0003431193580742227, "loss": 2.8927, "theoretical_loss": 3.6296745088234754, "tokens_seen": 1058284544 }, { "epoch": 3.01, "learning_rate": 0.0003431093279839518, "loss": 2.7701, "theoretical_loss": 3.629653342067036, "tokens_seen": 1058350080 }, { "epoch": 3.01, "objective/train/docs_used": 1699207, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8425145149230957, "objective/train/theoretical_loss": 3.629648050640066, "objective/train/tokens_used": 1078826464, "theoretical_loss": 3.629648050640066, "tokens_seen": 1058366464 }, { "epoch": 3.01, "learning_rate": 0.00034309929789368105, "loss": 2.8718, "theoretical_loss": 3.6296321769882316, "tokens_seen": 1058415616 }, { "epoch": 3.01, "learning_rate": 0.0003430892678034103, "loss": 2.915, "theoretical_loss": 3.6296110135868265, "tokens_seen": 1058481152 }, { "epoch": 3.01, "learning_rate": 0.0003430792377131394, "loss": 2.6707, "theoretical_loss": 3.6295898518625833, "tokens_seen": 1058546688 }, { "epoch": 3.01, "learning_rate": 0.0003430792377131394, "loss": 3.0282, "theoretical_loss": 3.6295686918152654, "tokens_seen": 1058612224 }, { "epoch": 3.01, "learning_rate": 0.00034306920762286865, "loss": 2.9156, "theoretical_loss": 3.629547533444636, "tokens_seen": 1058677760 }, { "epoch": 3.01, "learning_rate": 0.0003430591775325978, "loss": 2.9902, "theoretical_loss": 3.6295263767504586, "tokens_seen": 1058743296 }, { "epoch": 3.01, "learning_rate": 0.000343049147442327, "loss": 2.9266, "theoretical_loss": 3.6295052217324972, "tokens_seen": 1058808832 }, { "epoch": 3.01, "learning_rate": 0.0003430391173520562, "loss": 2.7769, "theoretical_loss": 3.6294840683905143, "tokens_seen": 1058874368 }, { "epoch": 3.01, "learning_rate": 0.0003430290872617854, "loss": 2.9305, "theoretical_loss": 3.6294629167242736, "tokens_seen": 1058939904 }, { "epoch": 3.01, "learning_rate": 0.00034301905717151456, "loss": 2.6576, "theoretical_loss": 3.629441766733539, "tokens_seen": 1059005440 }, { "epoch": 3.01, "learning_rate": 0.00034300902708124374, "loss": 3.1151, "theoretical_loss": 3.6294206184180737, "tokens_seen": 1059070976 }, { "epoch": 3.01, "learning_rate": 0.0003429989969909729, "loss": 2.8814, "theoretical_loss": 3.629399471777642, "tokens_seen": 1059136512 }, { "epoch": 3.01, "learning_rate": 0.00034298896690070215, "loss": 2.7515, "theoretical_loss": 3.6293783268120072, "tokens_seen": 1059202048 }, { "epoch": 3.01, "learning_rate": 0.0003429789368104313, "loss": 2.7956, "theoretical_loss": 3.629357183520933, "tokens_seen": 1059267584 }, { "epoch": 3.01, "learning_rate": 0.0003429689067201605, "loss": 2.9158, "theoretical_loss": 3.6293360419041827, "tokens_seen": 1059333120 }, { "epoch": 3.01, "learning_rate": 0.0003429588766298897, "loss": 2.943, "theoretical_loss": 3.6293149019615214, "tokens_seen": 1059398656 }, { "epoch": 3.01, "learning_rate": 0.0003429488465396189, "loss": 2.9475, "theoretical_loss": 3.6292937636927114, "tokens_seen": 1059464192 }, { "epoch": 3.01, "learning_rate": 0.00034293881644934806, "loss": 2.9237, "theoretical_loss": 3.629272627097518, "tokens_seen": 1059529728 }, { "epoch": 3.01, "learning_rate": 0.00034292878635907724, "loss": 2.8556, "theoretical_loss": 3.6292514921757046, "tokens_seen": 1059595264 }, { "epoch": 3.01, "learning_rate": 0.0003429187562688064, "loss": 2.93, "theoretical_loss": 3.6292303589270354, "tokens_seen": 1059660800 }, { "epoch": 3.01, "learning_rate": 0.00034290872617853566, "loss": 2.8847, "theoretical_loss": 3.629209227351274, "tokens_seen": 1059726336 }, { "epoch": 3.01, "learning_rate": 0.0003428986960882648, "loss": 2.8397, "theoretical_loss": 3.6291880974481856, "tokens_seen": 1059791872 }, { "epoch": 3.01, "learning_rate": 0.000342888665997994, "loss": 2.9289, "theoretical_loss": 3.6291669692175335, "tokens_seen": 1059857408 }, { "epoch": 3.01, "learning_rate": 0.00034287863590772315, "loss": 2.8629, "theoretical_loss": 3.6291458426590815, "tokens_seen": 1059922944 }, { "epoch": 3.01, "learning_rate": 0.0003428686058174524, "loss": 2.7589, "theoretical_loss": 3.629124717772595, "tokens_seen": 1059988480 }, { "epoch": 3.01, "objective/train/docs_used": 1701903, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.646085500717163, "objective/train/theoretical_loss": 3.62911943681219, "objective/train/tokens_used": 1080464864, "theoretical_loss": 3.62911943681219, "tokens_seen": 1060004864 }, { "epoch": 3.01, "learning_rate": 0.00034285857572718156, "loss": 2.9062, "theoretical_loss": 3.629103594557838, "tokens_seen": 1060054016 }, { "epoch": 3.01, "learning_rate": 0.00034284854563691074, "loss": 2.9513, "theoretical_loss": 3.629082473014574, "tokens_seen": 1060119552 }, { "epoch": 3.01, "learning_rate": 0.0003428385155466399, "loss": 2.7804, "theoretical_loss": 3.6290613531425686, "tokens_seen": 1060185088 }, { "epoch": 3.01, "learning_rate": 0.0003428284854563691, "loss": 2.7715, "theoretical_loss": 3.6290402349415856, "tokens_seen": 1060250624 }, { "epoch": 3.01, "learning_rate": 0.0003428184553660983, "loss": 2.8627, "theoretical_loss": 3.6290191184113896, "tokens_seen": 1060316160 }, { "epoch": 3.01, "learning_rate": 0.0003428084252758275, "loss": 2.8856, "theoretical_loss": 3.6289980035517453, "tokens_seen": 1060381696 }, { "epoch": 3.01, "learning_rate": 0.00034279839518555665, "loss": 2.8094, "theoretical_loss": 3.6289768903624173, "tokens_seen": 1060447232 }, { "epoch": 3.01, "learning_rate": 0.0003427883650952859, "loss": 2.8034, "theoretical_loss": 3.62895577884317, "tokens_seen": 1060512768 }, { "epoch": 3.01, "learning_rate": 0.00034277833500501507, "loss": 2.778, "theoretical_loss": 3.628934668993768, "tokens_seen": 1060578304 }, { "epoch": 3.01, "learning_rate": 0.00034276830491474425, "loss": 2.811, "theoretical_loss": 3.628913560813977, "tokens_seen": 1060643840 }, { "epoch": 3.01, "learning_rate": 0.00034275827482447343, "loss": 3.024, "theoretical_loss": 3.6288924543035606, "tokens_seen": 1060709376 }, { "epoch": 3.01, "learning_rate": 0.0003427482447342026, "loss": 2.8064, "theoretical_loss": 3.628871349462284, "tokens_seen": 1060774912 }, { "epoch": 3.01, "learning_rate": 0.0003427382146439318, "loss": 3.0664, "theoretical_loss": 3.628850246289913, "tokens_seen": 1060840448 }, { "epoch": 3.01, "learning_rate": 0.000342728184553661, "loss": 3.0079, "theoretical_loss": 3.628829144786211, "tokens_seen": 1060905984 }, { "epoch": 3.01, "learning_rate": 0.00034271815446339015, "loss": 2.9721, "theoretical_loss": 3.6288080449509446, "tokens_seen": 1060971520 }, { "epoch": 3.01, "learning_rate": 0.0003427081243731194, "loss": 2.9546, "theoretical_loss": 3.628786946783878, "tokens_seen": 1061037056 }, { "epoch": 3.01, "learning_rate": 0.0003426980942828485, "loss": 2.8882, "theoretical_loss": 3.6287658502847755, "tokens_seen": 1061102592 }, { "epoch": 3.01, "learning_rate": 0.00034268806419257775, "loss": 2.8905, "theoretical_loss": 3.6287447554534036, "tokens_seen": 1061168128 }, { "epoch": 3.01, "learning_rate": 0.00034267803410230693, "loss": 2.8921, "theoretical_loss": 3.628723662289527, "tokens_seen": 1061233664 }, { "epoch": 3.01, "learning_rate": 0.0003426680040120361, "loss": 2.7114, "theoretical_loss": 3.6287025707929104, "tokens_seen": 1061299200 }, { "epoch": 3.01, "learning_rate": 0.0003426579739217653, "loss": 2.6744, "theoretical_loss": 3.6286814809633197, "tokens_seen": 1061364736 }, { "epoch": 3.01, "learning_rate": 0.00034264794383149453, "loss": 2.9529, "theoretical_loss": 3.62866039280052, "tokens_seen": 1061430272 }, { "epoch": 3.01, "learning_rate": 0.00034263791374122366, "loss": 2.871, "theoretical_loss": 3.628639306304277, "tokens_seen": 1061495808 }, { "epoch": 3.01, "learning_rate": 0.0003426278836509529, "loss": 2.8861, "theoretical_loss": 3.628618221474355, "tokens_seen": 1061561344 }, { "epoch": 3.01, "learning_rate": 0.000342617853560682, "loss": 2.912, "theoretical_loss": 3.6285971383105213, "tokens_seen": 1061626880 }, { "epoch": 3.01, "objective/train/docs_used": 1704784, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7478954792022705, "objective/train/theoretical_loss": 3.6285918677798614, "objective/train/tokens_used": 1082103264, "theoretical_loss": 3.6285918677798614, "tokens_seen": 1061643264 }, { "epoch": 3.01, "learning_rate": 0.00034260782347041125, "loss": 2.7355, "theoretical_loss": 3.62857605681254, "tokens_seen": 1061692416 }, { "epoch": 3.01, "learning_rate": 0.00034259779338014043, "loss": 2.8366, "theoretical_loss": 3.628554976980177, "tokens_seen": 1061757952 }, { "epoch": 3.01, "learning_rate": 0.0003425877632898696, "loss": 2.9332, "theoretical_loss": 3.6285338988131977, "tokens_seen": 1061823488 }, { "epoch": 3.01, "learning_rate": 0.0003425777331995988, "loss": 2.9873, "theoretical_loss": 3.628512822311368, "tokens_seen": 1061889024 }, { "epoch": 3.01, "learning_rate": 0.000342567703109328, "loss": 2.8965, "theoretical_loss": 3.6284917474744542, "tokens_seen": 1061954560 }, { "epoch": 3.01, "learning_rate": 0.00034255767301905716, "loss": 2.8499, "theoretical_loss": 3.6284706743022213, "tokens_seen": 1062020096 }, { "epoch": 3.01, "learning_rate": 0.0003425476429287864, "loss": 2.8895, "theoretical_loss": 3.6284496027944346, "tokens_seen": 1062085632 }, { "epoch": 3.01, "learning_rate": 0.0003425376128385155, "loss": 2.7667, "theoretical_loss": 3.628428532950861, "tokens_seen": 1062151168 }, { "epoch": 3.01, "learning_rate": 0.00034252758274824476, "loss": 2.8087, "theoretical_loss": 3.628407464771266, "tokens_seen": 1062216704 }, { "epoch": 3.01, "learning_rate": 0.0003425175526579739, "loss": 2.9951, "theoretical_loss": 3.6283863982554156, "tokens_seen": 1062282240 }, { "epoch": 3.01, "learning_rate": 0.0003425075225677031, "loss": 2.8316, "theoretical_loss": 3.6283653334030754, "tokens_seen": 1062347776 }, { "epoch": 3.01, "learning_rate": 0.0003424974924774323, "loss": 2.7556, "theoretical_loss": 3.6283442702140123, "tokens_seen": 1062413312 }, { "epoch": 3.01, "learning_rate": 0.0003424874623871615, "loss": 2.8606, "theoretical_loss": 3.6283232086879913, "tokens_seen": 1062478848 }, { "epoch": 3.01, "learning_rate": 0.00034247743229689066, "loss": 2.8931, "theoretical_loss": 3.6283021488247793, "tokens_seen": 1062544384 }, { "epoch": 3.01, "learning_rate": 0.0003424674022066199, "loss": 3.0024, "theoretical_loss": 3.6282810906241423, "tokens_seen": 1062609920 }, { "epoch": 3.01, "learning_rate": 0.000342457372116349, "loss": 2.7034, "theoretical_loss": 3.628260034085846, "tokens_seen": 1062675456 }, { "epoch": 3.01, "learning_rate": 0.00034244734202607826, "loss": 2.9312, "theoretical_loss": 3.628238979209658, "tokens_seen": 1062740992 }, { "epoch": 3.01, "learning_rate": 0.0003424373119358074, "loss": 2.932, "theoretical_loss": 3.628217925995343, "tokens_seen": 1062806528 }, { "epoch": 3.01, "learning_rate": 0.0003424272818455366, "loss": 2.9348, "theoretical_loss": 3.6281968744426676, "tokens_seen": 1062872064 }, { "epoch": 3.01, "learning_rate": 0.0003424172517552658, "loss": 2.9651, "theoretical_loss": 3.6281758245513993, "tokens_seen": 1062937600 }, { "epoch": 3.01, "learning_rate": 0.000342407221664995, "loss": 2.9944, "theoretical_loss": 3.628154776321304, "tokens_seen": 1063003136 }, { "epoch": 3.01, "learning_rate": 0.00034239719157472416, "loss": 2.833, "theoretical_loss": 3.628133729752148, "tokens_seen": 1063068672 }, { "epoch": 3.01, "learning_rate": 0.00034238716148445335, "loss": 2.8011, "theoretical_loss": 3.628112684843698, "tokens_seen": 1063134208 }, { "epoch": 3.01, "learning_rate": 0.0003423771313941825, "loss": 2.8832, "theoretical_loss": 3.628091641595721, "tokens_seen": 1063199744 }, { "epoch": 3.01, "learning_rate": 0.00034236710130391176, "loss": 2.9786, "theoretical_loss": 3.628070600007983, "tokens_seen": 1063265280 }, { "epoch": 3.01, "objective/train/docs_used": 1706210, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.999783754348755, "objective/train/theoretical_loss": 3.6280653398704334, "objective/train/tokens_used": 1083741664, "theoretical_loss": 3.6280653398704334, "tokens_seen": 1063281664 }, { "epoch": 3.01, "learning_rate": 0.0003423570712136409, "loss": 2.8747, "theoretical_loss": 3.6280495600802505, "tokens_seen": 1063330816 }, { "epoch": 3.01, "learning_rate": 0.0003423470411233701, "loss": 2.9512, "theoretical_loss": 3.628028521812291, "tokens_seen": 1063396352 }, { "epoch": 3.01, "learning_rate": 0.0003423370110330993, "loss": 2.936, "theoretical_loss": 3.628007485203871, "tokens_seen": 1063461888 }, { "epoch": 3.01, "learning_rate": 0.0003423269809428285, "loss": 2.8181, "theoretical_loss": 3.627986450254758, "tokens_seen": 1063527424 }, { "epoch": 3.01, "learning_rate": 0.0003423169508525577, "loss": 2.9239, "theoretical_loss": 3.6279654169647175, "tokens_seen": 1063592960 }, { "epoch": 3.01, "learning_rate": 0.00034230692076228685, "loss": 2.9975, "theoretical_loss": 3.6279443853335165, "tokens_seen": 1063658496 }, { "epoch": 3.01, "learning_rate": 0.0003422968906720161, "loss": 2.9342, "theoretical_loss": 3.627923355360924, "tokens_seen": 1063724032 }, { "epoch": 3.01, "learning_rate": 0.00034228686058174527, "loss": 2.9025, "theoretical_loss": 3.6279023270467046, "tokens_seen": 1063789568 }, { "epoch": 3.01, "learning_rate": 0.00034227683049147445, "loss": 2.7495, "theoretical_loss": 3.6278813003906265, "tokens_seen": 1063855104 }, { "epoch": 3.01, "learning_rate": 0.00034226680040120363, "loss": 2.8977, "theoretical_loss": 3.627860275392457, "tokens_seen": 1063920640 }, { "epoch": 3.01, "learning_rate": 0.0003422567703109328, "loss": 2.972, "theoretical_loss": 3.627839252051963, "tokens_seen": 1063986176 }, { "epoch": 3.01, "learning_rate": 0.000342246740220662, "loss": 2.977, "theoretical_loss": 3.6278182303689115, "tokens_seen": 1064051712 }, { "epoch": 3.01, "learning_rate": 0.0003422367101303912, "loss": 2.9231, "theoretical_loss": 3.62779721034307, "tokens_seen": 1064117248 }, { "epoch": 3.01, "learning_rate": 0.00034222668004012035, "loss": 2.8885, "theoretical_loss": 3.6277761919742053, "tokens_seen": 1064182784 }, { "epoch": 3.01, "learning_rate": 0.0003422166499498496, "loss": 2.804, "theoretical_loss": 3.6277551752620862, "tokens_seen": 1064248320 }, { "epoch": 3.01, "learning_rate": 0.0003422066198595787, "loss": 2.7895, "theoretical_loss": 3.6277341602064785, "tokens_seen": 1064313856 }, { "epoch": 3.01, "learning_rate": 0.00034219658976930795, "loss": 2.8748, "theoretical_loss": 3.6277131468071504, "tokens_seen": 1064379392 }, { "epoch": 3.01, "learning_rate": 0.00034218655967903713, "loss": 2.9201, "theoretical_loss": 3.627692135063869, "tokens_seen": 1064444928 }, { "epoch": 3.01, "learning_rate": 0.0003421765295887663, "loss": 2.8843, "theoretical_loss": 3.627671124976402, "tokens_seen": 1064510464 }, { "epoch": 3.01, "learning_rate": 0.0003421664994984955, "loss": 2.8363, "theoretical_loss": 3.627650116544517, "tokens_seen": 1064576000 }, { "epoch": 3.01, "learning_rate": 0.00034215646940822473, "loss": 2.9957, "theoretical_loss": 3.6276291097679825, "tokens_seen": 1064641536 }, { "epoch": 3.01, "learning_rate": 0.00034214643931795386, "loss": 2.8837, "theoretical_loss": 3.6276081046465647, "tokens_seen": 1064707072 }, { "epoch": 3.01, "learning_rate": 0.0003421364092276831, "loss": 2.6636, "theoretical_loss": 3.6275871011800325, "tokens_seen": 1064772608 }, { "epoch": 3.01, "learning_rate": 0.0003421263791374122, "loss": 2.9736, "theoretical_loss": 3.6275660993681527, "tokens_seen": 1064838144 }, { "epoch": 3.01, "learning_rate": 0.00034211634904714145, "loss": 2.8166, "theoretical_loss": 3.6275450992106935, "tokens_seen": 1064903680 }, { "epoch": 3.01, "objective/train/docs_used": 1708934, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6907949447631836, "objective/train/theoretical_loss": 3.6275398494298052, "objective/train/tokens_used": 1085380064, "theoretical_loss": 3.6275398494298052, "tokens_seen": 1064920064 }, { "epoch": 3.01, "learning_rate": 0.00034210631895687063, "loss": 2.9334, "theoretical_loss": 3.6275241007074235, "tokens_seen": 1064969216 }, { "epoch": 3.01, "learning_rate": 0.0003420962888665998, "loss": 2.8554, "theoretical_loss": 3.6275031038581096, "tokens_seen": 1065034752 }, { "epoch": 3.01, "learning_rate": 0.000342086258776329, "loss": 2.8389, "theoretical_loss": 3.6274821086625204, "tokens_seen": 1065100288 }, { "epoch": 3.01, "learning_rate": 0.0003420762286860582, "loss": 2.9738, "theoretical_loss": 3.6274611151204237, "tokens_seen": 1065165824 }, { "epoch": 3.01, "learning_rate": 0.00034206619859578736, "loss": 2.8611, "theoretical_loss": 3.6274401232315867, "tokens_seen": 1065231360 }, { "epoch": 3.01, "learning_rate": 0.0003420561685055166, "loss": 2.7243, "theoretical_loss": 3.6274191329957794, "tokens_seen": 1065296896 }, { "epoch": 3.01, "learning_rate": 0.0003420461384152457, "loss": 2.8339, "theoretical_loss": 3.6273981444127683, "tokens_seen": 1065362432 }, { "epoch": 3.01, "learning_rate": 0.00034203610832497496, "loss": 2.8661, "theoretical_loss": 3.6273771574823224, "tokens_seen": 1065427968 }, { "epoch": 3.01, "learning_rate": 0.0003420260782347041, "loss": 2.8413, "theoretical_loss": 3.62735617220421, "tokens_seen": 1065493504 }, { "epoch": 3.01, "learning_rate": 0.0003420160481444333, "loss": 3.1328, "theoretical_loss": 3.6273351885781984, "tokens_seen": 1065559040 }, { "epoch": 3.01, "learning_rate": 0.0003420060180541625, "loss": 3.0226, "theoretical_loss": 3.6273142066040567, "tokens_seen": 1065624576 }, { "epoch": 3.01, "learning_rate": 0.0003419959879638917, "loss": 2.8769, "theoretical_loss": 3.627293226281554, "tokens_seen": 1065690112 }, { "epoch": 3.01, "learning_rate": 0.00034198595787362086, "loss": 2.9488, "theoretical_loss": 3.627272247610457, "tokens_seen": 1065755648 }, { "epoch": 3.01, "learning_rate": 0.0003419759277833501, "loss": 2.9175, "theoretical_loss": 3.627251270590536, "tokens_seen": 1065821184 }, { "epoch": 3.01, "learning_rate": 0.0003419658976930792, "loss": 2.7893, "theoretical_loss": 3.6272302952215583, "tokens_seen": 1065886720 }, { "epoch": 3.01, "learning_rate": 0.00034195586760280846, "loss": 2.8937, "theoretical_loss": 3.627209321503293, "tokens_seen": 1065952256 }, { "epoch": 3.01, "learning_rate": 0.0003419458375125376, "loss": 2.9753, "theoretical_loss": 3.627188349435508, "tokens_seen": 1066017792 }, { "epoch": 3.01, "learning_rate": 0.0003419358074222668, "loss": 2.6738, "theoretical_loss": 3.6271673790179726, "tokens_seen": 1066083328 }, { "epoch": 3.01, "learning_rate": 0.000341925777331996, "loss": 2.7624, "theoretical_loss": 3.627146410250456, "tokens_seen": 1066148864 }, { "epoch": 3.01, "learning_rate": 0.0003419157472417252, "loss": 3.0234, "theoretical_loss": 3.627125443132726, "tokens_seen": 1066214400 }, { "epoch": 3.01, "learning_rate": 0.00034190571715145436, "loss": 2.9143, "theoretical_loss": 3.627104477664552, "tokens_seen": 1066279936 }, { "epoch": 3.01, "learning_rate": 0.00034189568706118355, "loss": 2.9804, "theoretical_loss": 3.6270835138457023, "tokens_seen": 1066345472 }, { "epoch": 3.01, "learning_rate": 0.0003418856569709127, "loss": 2.7468, "theoretical_loss": 3.6270625516759463, "tokens_seen": 1066411008 }, { "epoch": 3.01, "learning_rate": 0.00034187562688064196, "loss": 2.913, "theoretical_loss": 3.627041591155053, "tokens_seen": 1066476544 }, { "epoch": 3.01, "learning_rate": 0.0003418655967903711, "loss": 2.9475, "theoretical_loss": 3.6270206322827905, "tokens_seen": 1066542080 }, { "epoch": 3.01, "objective/train/docs_used": 1711457, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8633267879486084, "objective/train/theoretical_loss": 3.6270153928222966, "objective/train/tokens_used": 1087018464, "theoretical_loss": 3.6270153928222966, "tokens_seen": 1066558464 }, { "epoch": 3.01, "learning_rate": 0.0003418555667001003, "loss": 2.9428, "theoretical_loss": 3.6269996750589293, "tokens_seen": 1066607616 }, { "epoch": 3.01, "learning_rate": 0.00034184553660982945, "loss": 2.9165, "theoretical_loss": 3.6269787194832377, "tokens_seen": 1066673152 }, { "epoch": 3.01, "learning_rate": 0.0003418355065195587, "loss": 2.8435, "theoretical_loss": 3.626957765555484, "tokens_seen": 1066738688 }, { "epoch": 3.01, "learning_rate": 0.00034182547642928787, "loss": 2.9708, "theoretical_loss": 3.6269368132754387, "tokens_seen": 1066804224 }, { "epoch": 3.01, "learning_rate": 0.00034181544633901705, "loss": 2.9931, "theoretical_loss": 3.6269158626428704, "tokens_seen": 1066869760 }, { "epoch": 3.01, "learning_rate": 0.00034180541624874623, "loss": 3.0361, "theoretical_loss": 3.626894913657549, "tokens_seen": 1066935296 }, { "epoch": 3.01, "learning_rate": 0.00034179538615847547, "loss": 2.9073, "theoretical_loss": 3.626873966319242, "tokens_seen": 1067000832 }, { "epoch": 3.01, "learning_rate": 0.0003417853560682046, "loss": 2.9071, "theoretical_loss": 3.6268530206277214, "tokens_seen": 1067066368 }, { "epoch": 3.01, "learning_rate": 0.00034177532597793383, "loss": 2.872, "theoretical_loss": 3.6268320765827546, "tokens_seen": 1067131904 }, { "epoch": 3.01, "learning_rate": 0.00034176529588766295, "loss": 2.8807, "theoretical_loss": 3.626811134184112, "tokens_seen": 1067197440 }, { "epoch": 3.01, "learning_rate": 0.0003417552657973922, "loss": 2.9632, "theoretical_loss": 3.6267901934315625, "tokens_seen": 1067262976 }, { "epoch": 3.01, "learning_rate": 0.00034174523570712137, "loss": 2.9583, "theoretical_loss": 3.626769254324876, "tokens_seen": 1067328512 }, { "epoch": 3.01, "learning_rate": 0.00034173520561685055, "loss": 2.7867, "theoretical_loss": 3.6267483168638224, "tokens_seen": 1067394048 }, { "epoch": 3.01, "learning_rate": 0.00034172517552657973, "loss": 2.9051, "theoretical_loss": 3.6267273810481706, "tokens_seen": 1067459584 }, { "epoch": 3.01, "learning_rate": 0.0003417151454363089, "loss": 2.8117, "theoretical_loss": 3.6267064468776913, "tokens_seen": 1067525120 }, { "epoch": 3.01, "learning_rate": 0.0003417051153460381, "loss": 2.8434, "theoretical_loss": 3.626685514352153, "tokens_seen": 1067590656 }, { "epoch": 3.01, "learning_rate": 0.00034169508525576733, "loss": 2.9373, "theoretical_loss": 3.6266645834713263, "tokens_seen": 1067656192 }, { "epoch": 3.01, "learning_rate": 0.00034168505516549646, "loss": 2.9069, "theoretical_loss": 3.626643654234981, "tokens_seen": 1067721728 }, { "epoch": 3.01, "learning_rate": 0.0003416750250752257, "loss": 2.9708, "theoretical_loss": 3.6266227266428865, "tokens_seen": 1067787264 }, { "epoch": 3.01, "learning_rate": 0.0003416649949849548, "loss": 2.9475, "theoretical_loss": 3.6266018006948135, "tokens_seen": 1067852800 }, { "epoch": 3.01, "learning_rate": 0.00034165496489468406, "loss": 2.9337, "theoretical_loss": 3.6265808763905314, "tokens_seen": 1067918336 }, { "epoch": 3.01, "learning_rate": 0.00034164493480441324, "loss": 2.9329, "theoretical_loss": 3.62655995372981, "tokens_seen": 1067983872 }, { "epoch": 3.01, "learning_rate": 0.0003416349047141424, "loss": 2.7461, "theoretical_loss": 3.6265390327124196, "tokens_seen": 1068049408 }, { "epoch": 3.01, "learning_rate": 0.0003416248746238716, "loss": 2.7631, "theoretical_loss": 3.626518113338131, "tokens_seen": 1068114944 }, { "epoch": 3.01, "learning_rate": 0.00034161484453360083, "loss": 2.8745, "theoretical_loss": 3.6264971956067127, "tokens_seen": 1068180480 }, { "epoch": 3.01, "objective/train/docs_used": 1714336, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.748981237411499, "objective/train/theoretical_loss": 3.62649196643053, "objective/train/tokens_used": 1088656864, "theoretical_loss": 3.62649196643053, "tokens_seen": 1068196864 }, { "epoch": 3.01, "learning_rate": 0.00034160481444332996, "loss": 2.9735, "theoretical_loss": 3.626476279517936, "tokens_seen": 1068246016 }, { "epoch": 3.01, "learning_rate": 0.0003415947843530592, "loss": 2.9196, "theoretical_loss": 3.626455365071572, "tokens_seen": 1068311552 }, { "epoch": 3.01, "learning_rate": 0.0003415847542627884, "loss": 2.8999, "theoretical_loss": 3.6264344522673895, "tokens_seen": 1068377088 }, { "epoch": 3.01, "learning_rate": 0.00034157472417251756, "loss": 2.8964, "theoretical_loss": 3.6264135411051592, "tokens_seen": 1068442624 }, { "epoch": 3.01, "learning_rate": 0.0003415646940822468, "loss": 2.8614, "theoretical_loss": 3.626392631584652, "tokens_seen": 1068508160 }, { "epoch": 3.01, "learning_rate": 0.0003415546639919759, "loss": 2.9645, "theoretical_loss": 3.626371723705638, "tokens_seen": 1068573696 }, { "epoch": 3.01, "learning_rate": 0.00034154463390170516, "loss": 2.8863, "theoretical_loss": 3.6263508174678876, "tokens_seen": 1068639232 }, { "epoch": 3.01, "learning_rate": 0.0003415346038114343, "loss": 2.8747, "theoretical_loss": 3.626329912871171, "tokens_seen": 1068704768 }, { "epoch": 3.01, "learning_rate": 0.0003415245737211635, "loss": 2.8387, "theoretical_loss": 3.6263090099152597, "tokens_seen": 1068770304 }, { "epoch": 3.01, "learning_rate": 0.0003415145436308927, "loss": 2.9661, "theoretical_loss": 3.6262881085999235, "tokens_seen": 1068835840 }, { "epoch": 3.01, "learning_rate": 0.0003415045135406219, "loss": 2.8357, "theoretical_loss": 3.626267208924933, "tokens_seen": 1068901376 }, { "epoch": 3.01, "learning_rate": 0.00034149448345035106, "loss": 2.9704, "theoretical_loss": 3.6262463108900596, "tokens_seen": 1068966912 }, { "epoch": 3.01, "learning_rate": 0.0003414844533600803, "loss": 2.8811, "theoretical_loss": 3.6262254144950736, "tokens_seen": 1069032448 }, { "epoch": 3.01, "learning_rate": 0.0003414744232698094, "loss": 2.7118, "theoretical_loss": 3.626204519739746, "tokens_seen": 1069097984 }, { "epoch": 3.01, "learning_rate": 0.00034146439317953866, "loss": 2.8876, "theoretical_loss": 3.626183626623847, "tokens_seen": 1069163520 }, { "epoch": 3.01, "learning_rate": 0.0003414543630892678, "loss": 2.9356, "theoretical_loss": 3.6261627351471484, "tokens_seen": 1069229056 }, { "epoch": 3.01, "learning_rate": 0.000341444332998997, "loss": 2.9104, "theoretical_loss": 3.6261418453094207, "tokens_seen": 1069294592 }, { "epoch": 3.01, "learning_rate": 0.0003414343029087262, "loss": 2.7672, "theoretical_loss": 3.6261209571104347, "tokens_seen": 1069360128 }, { "epoch": 3.01, "learning_rate": 0.0003414242728184554, "loss": 2.8576, "theoretical_loss": 3.6261000705499624, "tokens_seen": 1069425664 }, { "epoch": 3.01, "learning_rate": 0.00034141424272818456, "loss": 2.9199, "theoretical_loss": 3.6260791856277734, "tokens_seen": 1069491200 }, { "epoch": 3.01, "learning_rate": 0.00034140421263791375, "loss": 2.7821, "theoretical_loss": 3.6260583023436395, "tokens_seen": 1069556736 }, { "epoch": 3.01, "learning_rate": 0.00034139418254764293, "loss": 2.9228, "theoretical_loss": 3.6260374206973323, "tokens_seen": 1069622272 }, { "epoch": 3.01, "learning_rate": 0.00034138415245737216, "loss": 2.7637, "theoretical_loss": 3.6260165406886222, "tokens_seen": 1069687808 }, { "epoch": 3.01, "learning_rate": 0.0003413741223671013, "loss": 3.0472, "theoretical_loss": 3.6259956623172815, "tokens_seen": 1069753344 }, { "epoch": 3.01, "learning_rate": 0.0003413640922768305, "loss": 2.7318, "theoretical_loss": 3.6259747855830797, "tokens_seen": 1069818880 }, { "epoch": 3.01, "objective/train/docs_used": 1717358, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7464444637298584, "objective/train/theoretical_loss": 3.6259695666553062, "objective/train/tokens_used": 1090295264, "theoretical_loss": 3.6259695666553062, "tokens_seen": 1069835264 }, { "epoch": 3.01, "learning_rate": 0.00034135406218655965, "loss": 2.7262, "theoretical_loss": 3.6259539104857907, "tokens_seen": 1069884416 }, { "epoch": 3.01, "learning_rate": 0.0003413440320962889, "loss": 2.6666, "theoretical_loss": 3.6259330370251837, "tokens_seen": 1069949952 }, { "epoch": 3.01, "learning_rate": 0.00034133400200601807, "loss": 2.9932, "theoretical_loss": 3.625912165201031, "tokens_seen": 1070015488 }, { "epoch": 3.01, "learning_rate": 0.00034132397191574725, "loss": 2.8644, "theoretical_loss": 3.625891295013104, "tokens_seen": 1070081024 }, { "epoch": 3.01, "learning_rate": 0.00034131394182547643, "loss": 2.9218, "theoretical_loss": 3.6258704264611747, "tokens_seen": 1070146560 }, { "epoch": 3.01, "learning_rate": 0.00034130391173520567, "loss": 2.9776, "theoretical_loss": 3.6258495595450135, "tokens_seen": 1070212096 }, { "epoch": 3.01, "learning_rate": 0.0003412938816449348, "loss": 2.9045, "theoretical_loss": 3.625828694264394, "tokens_seen": 1070277632 }, { "epoch": 3.01, "learning_rate": 0.00034128385155466403, "loss": 2.9996, "theoretical_loss": 3.6258078306190855, "tokens_seen": 1070343168 }, { "epoch": 3.01, "learning_rate": 0.00034127382146439315, "loss": 2.7483, "theoretical_loss": 3.6257869686088613, "tokens_seen": 1070408704 }, { "epoch": 3.01, "learning_rate": 0.0003412637913741224, "loss": 2.8165, "theoretical_loss": 3.6257661082334924, "tokens_seen": 1070474240 }, { "epoch": 3.01, "learning_rate": 0.00034125376128385157, "loss": 2.8122, "theoretical_loss": 3.625745249492751, "tokens_seen": 1070539776 }, { "epoch": 3.01, "learning_rate": 0.00034124373119358075, "loss": 2.7962, "theoretical_loss": 3.6257243923864095, "tokens_seen": 1070605312 }, { "epoch": 3.01, "learning_rate": 0.00034123370110330993, "loss": 2.7685, "theoretical_loss": 3.625703536914239, "tokens_seen": 1070670848 }, { "epoch": 3.01, "learning_rate": 0.0003412236710130391, "loss": 2.8879, "theoretical_loss": 3.625682683076011, "tokens_seen": 1070736384 }, { "epoch": 3.01, "learning_rate": 0.0003412136409227683, "loss": 2.8601, "theoretical_loss": 3.6256618308714987, "tokens_seen": 1070801920 }, { "epoch": 3.01, "learning_rate": 0.00034120361083249753, "loss": 2.7112, "theoretical_loss": 3.6256409803004734, "tokens_seen": 1070867456 }, { "epoch": 3.01, "learning_rate": 0.00034119358074222666, "loss": 2.8117, "theoretical_loss": 3.625620131362707, "tokens_seen": 1070932992 }, { "epoch": 3.01, "learning_rate": 0.0003411835506519559, "loss": 2.8247, "theoretical_loss": 3.625599284057973, "tokens_seen": 1070998528 }, { "epoch": 3.01, "learning_rate": 0.000341173520561685, "loss": 2.76, "theoretical_loss": 3.625578438386041, "tokens_seen": 1071064064 }, { "epoch": 3.01, "learning_rate": 0.00034116349047141426, "loss": 2.8607, "theoretical_loss": 3.6255575943466862, "tokens_seen": 1071129600 }, { "epoch": 3.01, "learning_rate": 0.00034115346038114344, "loss": 2.9012, "theoretical_loss": 3.625536751939679, "tokens_seen": 1071195136 }, { "epoch": 3.01, "learning_rate": 0.0003411434302908726, "loss": 2.9042, "theoretical_loss": 3.6255159111647917, "tokens_seen": 1071260672 }, { "epoch": 3.01, "learning_rate": 0.0003411334002006018, "loss": 2.784, "theoretical_loss": 3.6254950720217973, "tokens_seen": 1071326208 }, { "epoch": 3.01, "learning_rate": 0.00034112337011033103, "loss": 2.906, "theoretical_loss": 3.625474234510468, "tokens_seen": 1071391744 }, { "epoch": 3.01, "learning_rate": 0.00034111334002006016, "loss": 2.9546, "theoretical_loss": 3.625453398630576, "tokens_seen": 1071457280 }, { "epoch": 3.01, "objective/train/docs_used": 1720270, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.097710132598877, "objective/train/theoretical_loss": 3.6254481899154887, "objective/train/tokens_used": 1091933664, "theoretical_loss": 3.6254481899154887, "tokens_seen": 1071473664 }, { "epoch": 3.01, "learning_rate": 0.0003411033099297894, "loss": 2.8616, "theoretical_loss": 3.6254325643818945, "tokens_seen": 1071522816 }, { "epoch": 3.01, "learning_rate": 0.0003410932798395185, "loss": 3.0557, "theoretical_loss": 3.6254117317641956, "tokens_seen": 1071588352 }, { "epoch": 3.01, "learning_rate": 0.00034108324974924776, "loss": 2.8163, "theoretical_loss": 3.6253909007772513, "tokens_seen": 1071653888 }, { "epoch": 3.01, "learning_rate": 0.00034107321965897694, "loss": 2.8322, "theoretical_loss": 3.6253700714208357, "tokens_seen": 1071719424 }, { "epoch": 3.01, "learning_rate": 0.0003410631895687061, "loss": 2.822, "theoretical_loss": 3.6253492436947194, "tokens_seen": 1071784960 }, { "epoch": 3.01, "learning_rate": 0.0003410531594784353, "loss": 2.9793, "theoretical_loss": 3.625328417598677, "tokens_seen": 1071850496 }, { "epoch": 3.01, "learning_rate": 0.0003410431293881645, "loss": 2.9115, "theoretical_loss": 3.6253075931324807, "tokens_seen": 1071916032 }, { "epoch": 3.01, "learning_rate": 0.00034103309929789366, "loss": 2.9282, "theoretical_loss": 3.6252867702959026, "tokens_seen": 1071981568 }, { "epoch": 3.01, "learning_rate": 0.0003410230692076229, "loss": 2.9309, "theoretical_loss": 3.6252659490887167, "tokens_seen": 1072047104 }, { "epoch": 3.01, "learning_rate": 0.000341013039117352, "loss": 2.7311, "theoretical_loss": 3.625245129510695, "tokens_seen": 1072112640 }, { "epoch": 3.01, "learning_rate": 0.00034100300902708126, "loss": 2.927, "theoretical_loss": 3.6252243115616114, "tokens_seen": 1072178176 }, { "epoch": 3.01, "learning_rate": 0.00034099297893681044, "loss": 2.8256, "theoretical_loss": 3.6252034952412373, "tokens_seen": 1072243712 }, { "epoch": 3.01, "learning_rate": 0.0003409829488465396, "loss": 2.9919, "theoretical_loss": 3.6251826805493472, "tokens_seen": 1072309248 }, { "epoch": 3.01, "learning_rate": 0.0003409729187562688, "loss": 2.8012, "theoretical_loss": 3.625161867485714, "tokens_seen": 1072374784 }, { "epoch": 3.01, "learning_rate": 0.000340962888665998, "loss": 3.0289, "theoretical_loss": 3.6251410560501105, "tokens_seen": 1072440320 }, { "epoch": 3.01, "learning_rate": 0.00034095285857572717, "loss": 2.9404, "theoretical_loss": 3.625120246242309, "tokens_seen": 1072505856 }, { "epoch": 3.01, "learning_rate": 0.0003409428284854564, "loss": 3.0394, "theoretical_loss": 3.625099438062085, "tokens_seen": 1072571392 }, { "epoch": 3.01, "learning_rate": 0.00034093279839518553, "loss": 2.8484, "theoretical_loss": 3.625078631509209, "tokens_seen": 1072636928 }, { "epoch": 3.01, "learning_rate": 0.00034092276830491477, "loss": 2.9424, "theoretical_loss": 3.6250578265834568, "tokens_seen": 1072702464 }, { "epoch": 3.01, "learning_rate": 0.0003409127382146439, "loss": 2.9032, "theoretical_loss": 3.6250370232846, "tokens_seen": 1072768000 }, { "epoch": 3.01, "learning_rate": 0.00034090270812437313, "loss": 2.9782, "theoretical_loss": 3.6250162216124133, "tokens_seen": 1072833536 }, { "epoch": 3.01, "learning_rate": 0.0003408926780341023, "loss": 2.8366, "theoretical_loss": 3.624995421566669, "tokens_seen": 1072899072 }, { "epoch": 3.01, "learning_rate": 0.0003408826479438315, "loss": 2.8745, "theoretical_loss": 3.624974623147141, "tokens_seen": 1072964608 }, { "epoch": 3.01, "learning_rate": 0.00034087261785356067, "loss": 2.9378, "theoretical_loss": 3.624953826353604, "tokens_seen": 1073030144 }, { "epoch": 3.01, "learning_rate": 0.00034086258776328985, "loss": 2.8837, "theoretical_loss": 3.6249330311858294, "tokens_seen": 1073095680 }, { "epoch": 3.01, "objective/train/docs_used": 1723064, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.117689371109009, "objective/train/theoretical_loss": 3.624927832647885, "objective/train/tokens_used": 1093572064, "theoretical_loss": 3.624927832647885, "tokens_seen": 1073112064 }, { "epoch": 3.01, "learning_rate": 0.00034085255767301903, "loss": 3.0541, "theoretical_loss": 3.6249122376435925, "tokens_seen": 1073161216 }, { "epoch": 3.01, "learning_rate": 0.00034084252758274827, "loss": 2.8342, "theoretical_loss": 3.624891445726666, "tokens_seen": 1073226752 }, { "epoch": 3.01, "learning_rate": 0.00034083249749247745, "loss": 3.0326, "theoretical_loss": 3.6248706554348247, "tokens_seen": 1073292288 }, { "epoch": 3.01, "learning_rate": 0.00034082246740220663, "loss": 2.8683, "theoretical_loss": 3.6248498667678417, "tokens_seen": 1073357824 }, { "epoch": 3.01, "learning_rate": 0.00034081243731193587, "loss": 2.8953, "theoretical_loss": 3.6248290797254903, "tokens_seen": 1073423360 }, { "epoch": 3.01, "learning_rate": 0.000340802407221665, "loss": 2.9216, "theoretical_loss": 3.6248082943075457, "tokens_seen": 1073488896 }, { "epoch": 3.01, "learning_rate": 0.00034079237713139423, "loss": 2.8634, "theoretical_loss": 3.6247875105137806, "tokens_seen": 1073554432 }, { "epoch": 3.01, "learning_rate": 0.00034078234704112335, "loss": 2.8803, "theoretical_loss": 3.6247667283439693, "tokens_seen": 1073619968 }, { "epoch": 3.01, "learning_rate": 0.0003407723169508526, "loss": 3.0046, "theoretical_loss": 3.624745947797886, "tokens_seen": 1073685504 }, { "epoch": 3.01, "learning_rate": 0.00034076228686058177, "loss": 2.9129, "theoretical_loss": 3.6247251688753046, "tokens_seen": 1073751040 }, { "epoch": 3.01, "learning_rate": 0.00034075225677031095, "loss": 3.0109, "theoretical_loss": 3.6247043915759987, "tokens_seen": 1073816576 }, { "epoch": 3.01, "learning_rate": 0.00034074222668004013, "loss": 2.9504, "theoretical_loss": 3.6246836158997437, "tokens_seen": 1073882112 }, { "epoch": 3.01, "learning_rate": 0.0003407321965897693, "loss": 2.895, "theoretical_loss": 3.6246628418463125, "tokens_seen": 1073947648 }, { "epoch": 3.01, "learning_rate": 0.0003407221664994985, "loss": 2.8082, "theoretical_loss": 3.62464206941548, "tokens_seen": 1074013184 }, { "epoch": 3.01, "learning_rate": 0.00034071213640922773, "loss": 2.9338, "theoretical_loss": 3.6246212986070203, "tokens_seen": 1074078720 }, { "epoch": 3.01, "learning_rate": 0.00034070210631895686, "loss": 2.8448, "theoretical_loss": 3.6246005294207073, "tokens_seen": 1074144256 }, { "epoch": 3.01, "learning_rate": 0.0003406920762286861, "loss": 3.0079, "theoretical_loss": 3.6245797618563165, "tokens_seen": 1074209792 }, { "epoch": 3.01, "learning_rate": 0.0003406820461384152, "loss": 2.9807, "theoretical_loss": 3.624558995913621, "tokens_seen": 1074275328 }, { "epoch": 3.01, "learning_rate": 0.00034067201604814446, "loss": 2.8172, "theoretical_loss": 3.6245382315923957, "tokens_seen": 1074340864 }, { "epoch": 3.01, "learning_rate": 0.00034066198595787364, "loss": 2.8847, "theoretical_loss": 3.6245174688924156, "tokens_seen": 1074406400 }, { "epoch": 3.01, "learning_rate": 0.0003406519558676028, "loss": 2.9618, "theoretical_loss": 3.6244967078134542, "tokens_seen": 1074471936 }, { "epoch": 3.01, "learning_rate": 0.000340641925777332, "loss": 2.7733, "theoretical_loss": 3.624475948355287, "tokens_seen": 1074537472 }, { "epoch": 3.01, "learning_rate": 0.00034063189568706123, "loss": 2.637, "theoretical_loss": 3.6244551905176885, "tokens_seen": 1074603008 }, { "epoch": 3.02, "learning_rate": 0.00034062186559679036, "loss": 3.083, "theoretical_loss": 3.624434434300433, "tokens_seen": 1074668544 }, { "epoch": 3.02, "learning_rate": 0.0003406118355065196, "loss": 2.8528, "theoretical_loss": 3.624413679703295, "tokens_seen": 1074734080 }, { "epoch": 3.02, "objective/train/docs_used": 1725354, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.997434616088867, "objective/train/theoretical_loss": 3.624408491307128, "objective/train/tokens_used": 1095210464, "theoretical_loss": 3.624408491307128, "tokens_seen": 1074750464 }, { "epoch": 3.02, "learning_rate": 0.0003406018054162487, "loss": 2.9775, "theoretical_loss": 3.6243929267260504, "tokens_seen": 1074799616 }, { "epoch": 3.02, "learning_rate": 0.00034059177532597796, "loss": 2.9534, "theoretical_loss": 3.624372175368473, "tokens_seen": 1074865152 }, { "epoch": 3.02, "learning_rate": 0.00034058174523570714, "loss": 2.8432, "theoretical_loss": 3.624351425630338, "tokens_seen": 1074930688 }, { "epoch": 3.02, "learning_rate": 0.0003405717151454363, "loss": 2.9523, "theoretical_loss": 3.62433067751142, "tokens_seen": 1074996224 }, { "epoch": 3.02, "learning_rate": 0.0003405616850551655, "loss": 2.7374, "theoretical_loss": 3.624309931011495, "tokens_seen": 1075061760 }, { "epoch": 3.02, "learning_rate": 0.0003405516549648947, "loss": 2.843, "theoretical_loss": 3.624289186130336, "tokens_seen": 1075127296 }, { "epoch": 3.02, "learning_rate": 0.00034054162487462386, "loss": 2.8387, "theoretical_loss": 3.624268442867719, "tokens_seen": 1075192832 }, { "epoch": 3.02, "learning_rate": 0.0003405315947843531, "loss": 2.9446, "theoretical_loss": 3.62424770122342, "tokens_seen": 1075258368 }, { "epoch": 3.02, "learning_rate": 0.0003405215646940822, "loss": 2.8819, "theoretical_loss": 3.6242269611972135, "tokens_seen": 1075323904 }, { "epoch": 3.02, "learning_rate": 0.00034051153460381146, "loss": 2.9596, "theoretical_loss": 3.6242062227888745, "tokens_seen": 1075389440 }, { "epoch": 3.02, "learning_rate": 0.00034050150451354064, "loss": 2.8189, "theoretical_loss": 3.624185485998178, "tokens_seen": 1075454976 }, { "epoch": 3.02, "learning_rate": 0.0003404914744232698, "loss": 2.8538, "theoretical_loss": 3.6241647508248995, "tokens_seen": 1075520512 }, { "epoch": 3.02, "learning_rate": 0.000340481444332999, "loss": 2.9254, "theoretical_loss": 3.624144017268814, "tokens_seen": 1075586048 }, { "epoch": 3.02, "learning_rate": 0.0003404714142427282, "loss": 2.9093, "theoretical_loss": 3.6241232853296976, "tokens_seen": 1075651584 }, { "epoch": 3.02, "learning_rate": 0.00034046138415245737, "loss": 2.9074, "theoretical_loss": 3.624102555007325, "tokens_seen": 1075717120 }, { "epoch": 3.02, "learning_rate": 0.0003404513540621866, "loss": 2.924, "theoretical_loss": 3.624081826301472, "tokens_seen": 1075782656 }, { "epoch": 3.02, "learning_rate": 0.00034044132397191573, "loss": 3.0362, "theoretical_loss": 3.6240610992119144, "tokens_seen": 1075848192 }, { "epoch": 3.02, "learning_rate": 0.00034043129388164497, "loss": 2.771, "theoretical_loss": 3.624040373738427, "tokens_seen": 1075913728 }, { "epoch": 3.02, "learning_rate": 0.0003404212637913741, "loss": 2.9016, "theoretical_loss": 3.6240196498807853, "tokens_seen": 1075979264 }, { "epoch": 3.02, "learning_rate": 0.00034041123370110333, "loss": 2.9343, "theoretical_loss": 3.6239989276387656, "tokens_seen": 1076044800 }, { "epoch": 3.02, "learning_rate": 0.0003404012036108325, "loss": 3.0058, "theoretical_loss": 3.623978207012143, "tokens_seen": 1076110336 }, { "epoch": 3.02, "learning_rate": 0.0003403911735205617, "loss": 2.8717, "theoretical_loss": 3.6239574880006935, "tokens_seen": 1076175872 }, { "epoch": 3.02, "learning_rate": 0.00034038114343029087, "loss": 2.8726, "theoretical_loss": 3.623936770604193, "tokens_seen": 1076241408 }, { "epoch": 3.02, "learning_rate": 0.00034037111334002005, "loss": 2.97, "theoretical_loss": 3.623916054822417, "tokens_seen": 1076306944 }, { "epoch": 3.02, "learning_rate": 0.00034036108324974923, "loss": 2.9274, "theoretical_loss": 3.623895340655141, "tokens_seen": 1076372480 }, { "epoch": 3.02, "objective/train/docs_used": 1728064, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.729905843734741, "objective/train/theoretical_loss": 3.623890162365562, "objective/train/tokens_used": 1096848864, "theoretical_loss": 3.623890162365562, "tokens_seen": 1076388864 }, { "epoch": 3.02, "learning_rate": 0.00034035105315947847, "loss": 2.9181, "theoretical_loss": 3.6238746281021417, "tokens_seen": 1076438016 }, { "epoch": 3.02, "learning_rate": 0.0003403410230692076, "loss": 2.8591, "theoretical_loss": 3.6238539171631947, "tokens_seen": 1076503552 }, { "epoch": 3.02, "learning_rate": 0.00034033099297893683, "loss": 2.8272, "theoretical_loss": 3.623833207838076, "tokens_seen": 1076569088 }, { "epoch": 3.02, "learning_rate": 0.000340320962888666, "loss": 2.9227, "theoretical_loss": 3.623812500126561, "tokens_seen": 1076634624 }, { "epoch": 3.02, "learning_rate": 0.0003403109327983952, "loss": 2.8918, "theoretical_loss": 3.6237917940284268, "tokens_seen": 1076700160 }, { "epoch": 3.02, "learning_rate": 0.0003403009027081244, "loss": 2.9213, "theoretical_loss": 3.6237710895434487, "tokens_seen": 1076765696 }, { "epoch": 3.02, "learning_rate": 0.00034029087261785356, "loss": 2.7342, "theoretical_loss": 3.623750386671403, "tokens_seen": 1076831232 }, { "epoch": 3.02, "learning_rate": 0.00034028084252758274, "loss": 2.953, "theoretical_loss": 3.6237296854120666, "tokens_seen": 1076896768 }, { "epoch": 3.02, "learning_rate": 0.00034027081243731197, "loss": 2.9202, "theoretical_loss": 3.623708985765215, "tokens_seen": 1076962304 }, { "epoch": 3.02, "learning_rate": 0.0003402607823470411, "loss": 2.8783, "theoretical_loss": 3.6236882877306242, "tokens_seen": 1077027840 }, { "epoch": 3.02, "learning_rate": 0.00034025075225677033, "loss": 2.8006, "theoretical_loss": 3.623667591308071, "tokens_seen": 1077093376 }, { "epoch": 3.02, "learning_rate": 0.00034024072216649946, "loss": 2.8273, "theoretical_loss": 3.6236468964973323, "tokens_seen": 1077158912 }, { "epoch": 3.02, "learning_rate": 0.0003402306920762287, "loss": 2.9906, "theoretical_loss": 3.623626203298184, "tokens_seen": 1077224448 }, { "epoch": 3.02, "learning_rate": 0.0003402206619859579, "loss": 2.8908, "theoretical_loss": 3.6236055117104025, "tokens_seen": 1077289984 }, { "epoch": 3.02, "learning_rate": 0.00034021063189568706, "loss": 3.001, "theoretical_loss": 3.623584821733764, "tokens_seen": 1077355520 }, { "epoch": 3.02, "learning_rate": 0.00034020060180541624, "loss": 2.8942, "theoretical_loss": 3.6235641333680455, "tokens_seen": 1077421056 }, { "epoch": 3.02, "learning_rate": 0.0003401905717151454, "loss": 2.9384, "theoretical_loss": 3.6235434466130236, "tokens_seen": 1077486592 }, { "epoch": 3.02, "learning_rate": 0.0003401805416248746, "loss": 2.9182, "theoretical_loss": 3.6235227614684753, "tokens_seen": 1077552128 }, { "epoch": 3.02, "learning_rate": 0.00034017051153460384, "loss": 2.8165, "theoretical_loss": 3.6235020779341762, "tokens_seen": 1077617664 }, { "epoch": 3.02, "learning_rate": 0.00034016048144433296, "loss": 2.8447, "theoretical_loss": 3.623481396009904, "tokens_seen": 1077683200 }, { "epoch": 3.02, "learning_rate": 0.0003401504513540622, "loss": 2.8564, "theoretical_loss": 3.623460715695435, "tokens_seen": 1077748736 }, { "epoch": 3.02, "learning_rate": 0.0003401404212637914, "loss": 2.9987, "theoretical_loss": 3.623440036990546, "tokens_seen": 1077814272 }, { "epoch": 3.02, "learning_rate": 0.00034013039117352056, "loss": 2.9102, "theoretical_loss": 3.6234193598950144, "tokens_seen": 1077879808 }, { "epoch": 3.02, "learning_rate": 0.00034012036108324974, "loss": 2.863, "theoretical_loss": 3.623398684408617, "tokens_seen": 1077945344 }, { "epoch": 3.02, "learning_rate": 0.0003401103309929789, "loss": 2.8481, "theoretical_loss": 3.6233780105311304, "tokens_seen": 1078010880 }, { "epoch": 3.02, "objective/train/docs_used": 1730764, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6427290439605713, "objective/train/theoretical_loss": 3.623372842313125, "objective/train/tokens_used": 1098487264, "theoretical_loss": 3.623372842313125, "tokens_seen": 1078027264 }, { "epoch": 3.02, "learning_rate": 0.0003401003009027081, "loss": 2.7604, "theoretical_loss": 3.6233573382623314, "tokens_seen": 1078076416 }, { "epoch": 3.02, "learning_rate": 0.00034009027081243734, "loss": 2.8619, "theoretical_loss": 3.6233366676019974, "tokens_seen": 1078141952 }, { "epoch": 3.02, "learning_rate": 0.0003400802407221665, "loss": 2.9415, "theoretical_loss": 3.6233159985499057, "tokens_seen": 1078207488 }, { "epoch": 3.02, "learning_rate": 0.0003400702106318957, "loss": 2.9081, "theoretical_loss": 3.6232953311058327, "tokens_seen": 1078273024 }, { "epoch": 3.02, "learning_rate": 0.0003400601805416249, "loss": 2.9052, "theoretical_loss": 3.623274665269556, "tokens_seen": 1078338560 }, { "epoch": 3.02, "learning_rate": 0.00034005015045135406, "loss": 2.997, "theoretical_loss": 3.6232540010408534, "tokens_seen": 1078404096 }, { "epoch": 3.02, "learning_rate": 0.0003400401203610833, "loss": 3.0337, "theoretical_loss": 3.6232333384195012, "tokens_seen": 1078469632 }, { "epoch": 3.02, "learning_rate": 0.0003400300902708124, "loss": 2.8085, "theoretical_loss": 3.6232126774052777, "tokens_seen": 1078535168 }, { "epoch": 3.02, "learning_rate": 0.00034002006018054166, "loss": 2.8626, "theoretical_loss": 3.623192017997959, "tokens_seen": 1078600704 }, { "epoch": 3.02, "learning_rate": 0.00034001003009027084, "loss": 2.7752, "theoretical_loss": 3.6231713601973237, "tokens_seen": 1078666240 }, { "epoch": 3.02, "learning_rate": 0.00034, "loss": 2.9533, "theoretical_loss": 3.623150704003149, "tokens_seen": 1078731776 }, { "epoch": 3.02, "learning_rate": 0.0003399899699097292, "loss": 2.8314, "theoretical_loss": 3.6231300494152117, "tokens_seen": 1078797312 }, { "epoch": 3.02, "learning_rate": 0.0003399799398194584, "loss": 2.842, "theoretical_loss": 3.6231093964332897, "tokens_seen": 1078862848 }, { "epoch": 3.02, "learning_rate": 0.00033996990972918757, "loss": 2.7729, "theoretical_loss": 3.623088745057161, "tokens_seen": 1078928384 }, { "epoch": 3.02, "learning_rate": 0.0003399598796389168, "loss": 2.9855, "theoretical_loss": 3.6230680952866026, "tokens_seen": 1078993920 }, { "epoch": 3.02, "learning_rate": 0.00033994984954864593, "loss": 2.9941, "theoretical_loss": 3.623047447121392, "tokens_seen": 1079059456 }, { "epoch": 3.02, "learning_rate": 0.00033993981945837517, "loss": 2.8971, "theoretical_loss": 3.623026800561308, "tokens_seen": 1079124992 }, { "epoch": 3.02, "learning_rate": 0.0003399297893681043, "loss": 2.8421, "theoretical_loss": 3.6230061556061273, "tokens_seen": 1079190528 }, { "epoch": 3.02, "learning_rate": 0.00033991975927783353, "loss": 2.9386, "theoretical_loss": 3.622985512255628, "tokens_seen": 1079256064 }, { "epoch": 3.02, "learning_rate": 0.0003399097291875627, "loss": 2.713, "theoretical_loss": 3.6229648705095885, "tokens_seen": 1079321600 }, { "epoch": 3.02, "learning_rate": 0.0003398996990972919, "loss": 2.8457, "theoretical_loss": 3.622944230367786, "tokens_seen": 1079387136 }, { "epoch": 3.02, "learning_rate": 0.00033988966900702107, "loss": 2.8307, "theoretical_loss": 3.6229235918299985, "tokens_seen": 1079452672 }, { "epoch": 3.02, "learning_rate": 0.00033987963891675025, "loss": 2.8747, "theoretical_loss": 3.622902954896005, "tokens_seen": 1079518208 }, { "epoch": 3.02, "learning_rate": 0.00033986960882647943, "loss": 2.978, "theoretical_loss": 3.6228823195655813, "tokens_seen": 1079583744 }, { "epoch": 3.02, "learning_rate": 0.00033985957873620867, "loss": 2.8431, "theoretical_loss": 3.622861685838507, "tokens_seen": 1079649280 }, { "epoch": 3.02, "objective/train/docs_used": 1732164, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.287489652633667, "objective/train/theoretical_loss": 3.6228565276572358, "objective/train/tokens_used": 1100125664, "theoretical_loss": 3.6228565276572358, "tokens_seen": 1079665664 }, { "epoch": 3.02, "learning_rate": 0.0003398495486459378, "loss": 3.0905, "theoretical_loss": 3.6228410537145606, "tokens_seen": 1079714816 }, { "epoch": 3.02, "learning_rate": 0.00033983951855566703, "loss": 2.7276, "theoretical_loss": 3.6228204231935193, "tokens_seen": 1079780352 }, { "epoch": 3.02, "learning_rate": 0.0003398294884653962, "loss": 2.9139, "theoretical_loss": 3.6227997942751617, "tokens_seen": 1079845888 }, { "epoch": 3.02, "learning_rate": 0.0003398194583751254, "loss": 2.8314, "theoretical_loss": 3.6227791669592664, "tokens_seen": 1079911424 }, { "epoch": 3.02, "learning_rate": 0.0003398094282848546, "loss": 2.992, "theoretical_loss": 3.622758541245611, "tokens_seen": 1079976960 }, { "epoch": 3.02, "learning_rate": 0.00033979939819458376, "loss": 2.9563, "theoretical_loss": 3.622737917133974, "tokens_seen": 1080042496 }, { "epoch": 3.02, "learning_rate": 0.00033978936810431294, "loss": 2.8438, "theoretical_loss": 3.6227172946241337, "tokens_seen": 1080108032 }, { "epoch": 3.02, "learning_rate": 0.00033977933801404217, "loss": 2.8564, "theoretical_loss": 3.6226966737158692, "tokens_seen": 1080173568 }, { "epoch": 3.02, "learning_rate": 0.0003397693079237713, "loss": 2.9477, "theoretical_loss": 3.622676054408958, "tokens_seen": 1080239104 }, { "epoch": 3.02, "learning_rate": 0.00033975927783350053, "loss": 2.9171, "theoretical_loss": 3.6226554367031794, "tokens_seen": 1080304640 }, { "epoch": 3.02, "learning_rate": 0.00033974924774322966, "loss": 2.9618, "theoretical_loss": 3.622634820598311, "tokens_seen": 1080370176 }, { "epoch": 3.02, "learning_rate": 0.0003397392176529589, "loss": 2.9963, "theoretical_loss": 3.6226142060941324, "tokens_seen": 1080435712 }, { "epoch": 3.02, "learning_rate": 0.0003397291875626881, "loss": 2.8676, "theoretical_loss": 3.6225935931904223, "tokens_seen": 1080501248 }, { "epoch": 3.02, "learning_rate": 0.00033971915747241726, "loss": 2.9844, "theoretical_loss": 3.6225729818869583, "tokens_seen": 1080566784 }, { "epoch": 3.02, "learning_rate": 0.00033970912738214644, "loss": 2.8999, "theoretical_loss": 3.62255237218352, "tokens_seen": 1080632320 }, { "epoch": 3.02, "learning_rate": 0.0003396990972918756, "loss": 2.8753, "theoretical_loss": 3.622531764079886, "tokens_seen": 1080697856 }, { "epoch": 3.02, "learning_rate": 0.0003396890672016048, "loss": 2.691, "theoretical_loss": 3.6225111575758344, "tokens_seen": 1080763392 }, { "epoch": 3.02, "learning_rate": 0.00033967903711133404, "loss": 2.8119, "theoretical_loss": 3.6224905526711453, "tokens_seen": 1080828928 }, { "epoch": 3.02, "learning_rate": 0.00033966900702106316, "loss": 2.773, "theoretical_loss": 3.6224699493655965, "tokens_seen": 1080894464 }, { "epoch": 3.02, "learning_rate": 0.0003396589769307924, "loss": 2.8502, "theoretical_loss": 3.622449347658968, "tokens_seen": 1080960000 }, { "epoch": 3.02, "learning_rate": 0.0003396489468405216, "loss": 2.8075, "theoretical_loss": 3.622428747551038, "tokens_seen": 1081025536 }, { "epoch": 3.02, "learning_rate": 0.00033963891675025076, "loss": 2.8775, "theoretical_loss": 3.6224081490415854, "tokens_seen": 1081091072 }, { "epoch": 3.02, "learning_rate": 0.00033962888665997994, "loss": 2.9769, "theoretical_loss": 3.62238755213039, "tokens_seen": 1081156608 }, { "epoch": 3.02, "learning_rate": 0.0003396188565697091, "loss": 2.8744, "theoretical_loss": 3.6223669568172303, "tokens_seen": 1081222144 }, { "epoch": 3.02, "learning_rate": 0.0003396088264794383, "loss": 2.7572, "theoretical_loss": 3.6223463631018857, "tokens_seen": 1081287680 }, { "epoch": 3.02, "objective/train/docs_used": 1734795, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.898815393447876, "objective/train/theoretical_loss": 3.622341214922683, "objective/train/tokens_used": 1101764064, "theoretical_loss": 3.622341214922683, "tokens_seen": 1081304064 }, { "epoch": 3.02, "learning_rate": 0.00033959879638916754, "loss": 2.9515, "theoretical_loss": 3.622325770984136, "tokens_seen": 1081353216 }, { "epoch": 3.02, "learning_rate": 0.00033958876629889667, "loss": 2.9251, "theoretical_loss": 3.6223051804637594, "tokens_seen": 1081418752 }, { "epoch": 3.02, "learning_rate": 0.0003395787362086259, "loss": 2.9709, "theoretical_loss": 3.6222845915405353, "tokens_seen": 1081484288 }, { "epoch": 3.02, "learning_rate": 0.00033956870611835503, "loss": 2.8107, "theoretical_loss": 3.6222640042142435, "tokens_seen": 1081549824 }, { "epoch": 3.02, "learning_rate": 0.00033955867602808426, "loss": 2.898, "theoretical_loss": 3.622243418484664, "tokens_seen": 1081615360 }, { "epoch": 3.02, "learning_rate": 0.00033954864593781345, "loss": 2.9389, "theoretical_loss": 3.622222834351575, "tokens_seen": 1081680896 }, { "epoch": 3.02, "learning_rate": 0.0003395386158475426, "loss": 2.8786, "theoretical_loss": 3.6222022518147563, "tokens_seen": 1081746432 }, { "epoch": 3.02, "learning_rate": 0.0003395285857572718, "loss": 2.9086, "theoretical_loss": 3.622181670873988, "tokens_seen": 1081811968 }, { "epoch": 3.02, "learning_rate": 0.00033951855566700104, "loss": 2.9172, "theoretical_loss": 3.622161091529049, "tokens_seen": 1081877504 }, { "epoch": 3.02, "learning_rate": 0.00033950852557673017, "loss": 3.0382, "theoretical_loss": 3.622140513779719, "tokens_seen": 1081943040 }, { "epoch": 3.02, "learning_rate": 0.0003394984954864594, "loss": 2.8833, "theoretical_loss": 3.6221199376257784, "tokens_seen": 1082008576 }, { "epoch": 3.02, "learning_rate": 0.00033948846539618853, "loss": 2.9259, "theoretical_loss": 3.6220993630670058, "tokens_seen": 1082074112 }, { "epoch": 3.02, "learning_rate": 0.00033947843530591777, "loss": 2.9042, "theoretical_loss": 3.6220787901031812, "tokens_seen": 1082139648 }, { "epoch": 3.02, "learning_rate": 0.00033946840521564695, "loss": 2.8613, "theoretical_loss": 3.622058218734085, "tokens_seen": 1082205184 }, { "epoch": 3.02, "learning_rate": 0.00033945837512537613, "loss": 2.9064, "theoretical_loss": 3.6220376489594965, "tokens_seen": 1082270720 }, { "epoch": 3.02, "learning_rate": 0.0003394483450351053, "loss": 2.9196, "theoretical_loss": 3.6220170807791963, "tokens_seen": 1082336256 }, { "epoch": 3.02, "learning_rate": 0.0003394383149448345, "loss": 2.9833, "theoretical_loss": 3.621996514192963, "tokens_seen": 1082401792 }, { "epoch": 3.02, "learning_rate": 0.0003394282848545637, "loss": 2.9202, "theoretical_loss": 3.6219759492005776, "tokens_seen": 1082467328 }, { "epoch": 3.02, "learning_rate": 0.0003394182547642929, "loss": 2.8345, "theoretical_loss": 3.6219553858018196, "tokens_seen": 1082532864 }, { "epoch": 3.02, "learning_rate": 0.00033940822467402204, "loss": 2.8876, "theoretical_loss": 3.621934823996469, "tokens_seen": 1082598400 }, { "epoch": 3.02, "learning_rate": 0.00033939819458375127, "loss": 2.9411, "theoretical_loss": 3.6219142637843063, "tokens_seen": 1082663936 }, { "epoch": 3.02, "learning_rate": 0.0003393881644934804, "loss": 2.8077, "theoretical_loss": 3.621893705165111, "tokens_seen": 1082729472 }, { "epoch": 3.02, "learning_rate": 0.00033937813440320963, "loss": 2.8592, "theoretical_loss": 3.6218731481386643, "tokens_seen": 1082795008 }, { "epoch": 3.02, "learning_rate": 0.0003393681043129388, "loss": 2.957, "theoretical_loss": 3.6218525927047454, "tokens_seen": 1082860544 }, { "epoch": 3.02, "learning_rate": 0.000339358074222668, "loss": 2.9211, "theoretical_loss": 3.6218320388631353, "tokens_seen": 1082926080 }, { "epoch": 3.02, "objective/train/docs_used": 1737651, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1534736156463623, "objective/train/theoretical_loss": 3.621826900651505, "objective/train/tokens_used": 1103402464, "theoretical_loss": 3.621826900651505, "tokens_seen": 1082942464 }, { "epoch": 3.02, "learning_rate": 0.0003393480441323972, "loss": 2.9252, "theoretical_loss": 3.6218114866136135, "tokens_seen": 1082991616 }, { "epoch": 3.02, "learning_rate": 0.0003393380140421264, "loss": 2.9198, "theoretical_loss": 3.621790935955961, "tokens_seen": 1083057152 }, { "epoch": 3.02, "learning_rate": 0.0003393279839518556, "loss": 2.9199, "theoretical_loss": 3.621770386889958, "tokens_seen": 1083122688 }, { "epoch": 3.02, "learning_rate": 0.0003393179538615848, "loss": 2.8857, "theoretical_loss": 3.6217498394153846, "tokens_seen": 1083188224 }, { "epoch": 3.02, "learning_rate": 0.00033930792377131396, "loss": 2.9033, "theoretical_loss": 3.621729293532022, "tokens_seen": 1083253760 }, { "epoch": 3.02, "learning_rate": 0.00033929789368104314, "loss": 2.9013, "theoretical_loss": 3.62170874923965, "tokens_seen": 1083319296 }, { "epoch": 3.02, "learning_rate": 0.00033928786359077237, "loss": 2.9587, "theoretical_loss": 3.62168820653805, "tokens_seen": 1083384832 }, { "epoch": 3.02, "learning_rate": 0.0003392778335005015, "loss": 2.8745, "theoretical_loss": 3.621667665427002, "tokens_seen": 1083450368 }, { "epoch": 3.02, "learning_rate": 0.00033926780341023073, "loss": 2.7278, "theoretical_loss": 3.621647125906286, "tokens_seen": 1083515904 }, { "epoch": 3.02, "learning_rate": 0.00033925777331995986, "loss": 2.8418, "theoretical_loss": 3.621626587975684, "tokens_seen": 1083581440 }, { "epoch": 3.02, "learning_rate": 0.0003392477432296891, "loss": 2.9574, "theoretical_loss": 3.621606051634976, "tokens_seen": 1083646976 }, { "epoch": 3.02, "learning_rate": 0.0003392377131394183, "loss": 2.6582, "theoretical_loss": 3.6215855168839433, "tokens_seen": 1083712512 }, { "epoch": 3.02, "learning_rate": 0.00033922768304914746, "loss": 2.9886, "theoretical_loss": 3.6215649837223656, "tokens_seen": 1083778048 }, { "epoch": 3.02, "learning_rate": 0.00033921765295887664, "loss": 2.9884, "theoretical_loss": 3.621544452150025, "tokens_seen": 1083843584 }, { "epoch": 3.02, "learning_rate": 0.0003392076228686058, "loss": 2.7708, "theoretical_loss": 3.621523922166702, "tokens_seen": 1083909120 }, { "epoch": 3.02, "learning_rate": 0.000339197592778335, "loss": 2.9621, "theoretical_loss": 3.6215033937721772, "tokens_seen": 1083974656 }, { "epoch": 3.02, "learning_rate": 0.00033918756268806424, "loss": 2.7996, "theoretical_loss": 3.6214828669662325, "tokens_seen": 1084040192 }, { "epoch": 3.02, "learning_rate": 0.00033917753259779336, "loss": 2.8465, "theoretical_loss": 3.6214623417486482, "tokens_seen": 1084105728 }, { "epoch": 3.02, "learning_rate": 0.0003391675025075226, "loss": 2.9828, "theoretical_loss": 3.621441818119205, "tokens_seen": 1084171264 }, { "epoch": 3.02, "learning_rate": 0.0003391574724172518, "loss": 2.9516, "theoretical_loss": 3.621421296077685, "tokens_seen": 1084236800 }, { "epoch": 3.02, "learning_rate": 0.00033914744232698096, "loss": 2.9971, "theoretical_loss": 3.6214007756238686, "tokens_seen": 1084302336 }, { "epoch": 3.02, "learning_rate": 0.00033913741223671014, "loss": 2.9708, "theoretical_loss": 3.6213802567575377, "tokens_seen": 1084367872 }, { "epoch": 3.02, "learning_rate": 0.0003391273821464393, "loss": 2.8311, "theoretical_loss": 3.621359739478473, "tokens_seen": 1084433408 }, { "epoch": 3.02, "learning_rate": 0.0003391173520561685, "loss": 3.0883, "theoretical_loss": 3.621339223786456, "tokens_seen": 1084498944 }, { "epoch": 3.02, "learning_rate": 0.00033910732196589774, "loss": 3.0278, "theoretical_loss": 3.621318709681268, "tokens_seen": 1084564480 }, { "epoch": 3.02, "objective/train/docs_used": 1740671, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9658401012420654, "objective/train/theoretical_loss": 3.6213135814028874, "objective/train/tokens_used": 1105040864, "theoretical_loss": 3.6213135814028874, "tokens_seen": 1084580864 }, { "epoch": 3.02, "learning_rate": 0.00033909729187562687, "loss": 2.7608, "theoretical_loss": 3.6212981971626905, "tokens_seen": 1084630016 }, { "epoch": 3.02, "learning_rate": 0.0003390872617853561, "loss": 2.9921, "theoretical_loss": 3.621277686230505, "tokens_seen": 1084695552 }, { "epoch": 3.02, "learning_rate": 0.00033907723169508523, "loss": 2.8979, "theoretical_loss": 3.621257176884493, "tokens_seen": 1084761088 }, { "epoch": 3.02, "learning_rate": 0.00033906720160481446, "loss": 2.9753, "theoretical_loss": 3.6212366691244355, "tokens_seen": 1084826624 }, { "epoch": 3.02, "learning_rate": 0.00033905717151454365, "loss": 2.874, "theoretical_loss": 3.6212161629501143, "tokens_seen": 1084892160 }, { "epoch": 3.02, "learning_rate": 0.00033904714142427283, "loss": 2.8565, "theoretical_loss": 3.6211956583613114, "tokens_seen": 1084957696 }, { "epoch": 3.02, "learning_rate": 0.000339037111334002, "loss": 2.9191, "theoretical_loss": 3.621175155357808, "tokens_seen": 1085023232 }, { "epoch": 3.02, "learning_rate": 0.00033902708124373124, "loss": 2.7388, "theoretical_loss": 3.621154653939386, "tokens_seen": 1085088768 }, { "epoch": 3.02, "learning_rate": 0.00033901705115346037, "loss": 3.0648, "theoretical_loss": 3.6211341541058273, "tokens_seen": 1085154304 }, { "epoch": 3.02, "learning_rate": 0.0003390070210631896, "loss": 2.9279, "theoretical_loss": 3.6211136558569135, "tokens_seen": 1085219840 }, { "epoch": 3.02, "learning_rate": 0.00033899699097291873, "loss": 2.8439, "theoretical_loss": 3.6210931591924265, "tokens_seen": 1085285376 }, { "epoch": 3.02, "learning_rate": 0.00033898696088264797, "loss": 2.9547, "theoretical_loss": 3.6210726641121473, "tokens_seen": 1085350912 }, { "epoch": 3.02, "learning_rate": 0.00033897693079237715, "loss": 2.9603, "theoretical_loss": 3.621052170615859, "tokens_seen": 1085416448 }, { "epoch": 3.02, "learning_rate": 0.00033896690070210633, "loss": 2.9712, "theoretical_loss": 3.6210316787033436, "tokens_seen": 1085481984 }, { "epoch": 3.02, "learning_rate": 0.0003389568706118355, "loss": 3.0346, "theoretical_loss": 3.621011188374382, "tokens_seen": 1085547520 }, { "epoch": 3.02, "learning_rate": 0.0003389468405215647, "loss": 2.8053, "theoretical_loss": 3.6209906996287575, "tokens_seen": 1085613056 }, { "epoch": 3.02, "learning_rate": 0.0003389368104312939, "loss": 2.7845, "theoretical_loss": 3.620970212466251, "tokens_seen": 1085678592 }, { "epoch": 3.02, "learning_rate": 0.0003389267803410231, "loss": 2.8883, "theoretical_loss": 3.6209497268866455, "tokens_seen": 1085744128 }, { "epoch": 3.02, "learning_rate": 0.00033891675025075224, "loss": 2.8495, "theoretical_loss": 3.6209292428897224, "tokens_seen": 1085809664 }, { "epoch": 3.02, "learning_rate": 0.00033890672016048147, "loss": 2.909, "theoretical_loss": 3.620908760475265, "tokens_seen": 1085875200 }, { "epoch": 3.02, "learning_rate": 0.0003388966900702106, "loss": 2.7803, "theoretical_loss": 3.6208882796430544, "tokens_seen": 1085940736 }, { "epoch": 3.02, "learning_rate": 0.00033888665997993983, "loss": 2.8204, "theoretical_loss": 3.6208678003928734, "tokens_seen": 1086006272 }, { "epoch": 3.02, "learning_rate": 0.000338876629889669, "loss": 2.9273, "theoretical_loss": 3.6208473227245044, "tokens_seen": 1086071808 }, { "epoch": 3.02, "learning_rate": 0.0003388665997993982, "loss": 2.9977, "theoretical_loss": 3.62082684663773, "tokens_seen": 1086137344 }, { "epoch": 3.02, "learning_rate": 0.0003388565697091274, "loss": 3.0976, "theoretical_loss": 3.6208063721323316, "tokens_seen": 1086202880 }, { "epoch": 3.02, "objective/train/docs_used": 1743440, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.843622922897339, "objective/train/theoretical_loss": 3.6208012537530467, "objective/train/tokens_used": 1106679264, "theoretical_loss": 3.6208012537530467, "tokens_seen": 1086219264 }, { "epoch": 3.02, "learning_rate": 0.0003388465396188566, "loss": 2.8953, "theoretical_loss": 3.6207858992080926, "tokens_seen": 1086268416 }, { "epoch": 3.02, "learning_rate": 0.00033883650952858574, "loss": 2.774, "theoretical_loss": 3.6207654278647956, "tokens_seen": 1086333952 }, { "epoch": 3.02, "learning_rate": 0.000338826479438315, "loss": 2.8035, "theoretical_loss": 3.620744958102223, "tokens_seen": 1086399488 }, { "epoch": 3.02, "learning_rate": 0.0003388164493480441, "loss": 2.9713, "theoretical_loss": 3.620724489920157, "tokens_seen": 1086465024 }, { "epoch": 3.02, "learning_rate": 0.00033880641925777334, "loss": 2.8475, "theoretical_loss": 3.620704023318381, "tokens_seen": 1086530560 }, { "epoch": 3.02, "learning_rate": 0.0003387963891675025, "loss": 2.9717, "theoretical_loss": 3.6206835582966765, "tokens_seen": 1086596096 }, { "epoch": 3.02, "learning_rate": 0.0003387863590772317, "loss": 2.9789, "theoretical_loss": 3.6206630948548275, "tokens_seen": 1086661632 }, { "epoch": 3.02, "learning_rate": 0.0003387763289869609, "loss": 2.8995, "theoretical_loss": 3.620642632992616, "tokens_seen": 1086727168 }, { "epoch": 3.02, "learning_rate": 0.00033876629889669006, "loss": 2.8271, "theoretical_loss": 3.6206221727098247, "tokens_seen": 1086792704 }, { "epoch": 3.02, "learning_rate": 0.00033875626880641924, "loss": 2.9134, "theoretical_loss": 3.620601714006237, "tokens_seen": 1086858240 }, { "epoch": 3.02, "learning_rate": 0.0003387462387161485, "loss": 2.9515, "theoretical_loss": 3.6205812568816356, "tokens_seen": 1086923776 }, { "epoch": 3.02, "learning_rate": 0.0003387362086258776, "loss": 2.9263, "theoretical_loss": 3.6205608013358033, "tokens_seen": 1086989312 }, { "epoch": 3.02, "learning_rate": 0.00033872617853560684, "loss": 2.9306, "theoretical_loss": 3.620540347368523, "tokens_seen": 1087054848 }, { "epoch": 3.02, "learning_rate": 0.00033871614844533597, "loss": 2.8635, "theoretical_loss": 3.620519894979578, "tokens_seen": 1087120384 }, { "epoch": 3.02, "learning_rate": 0.0003387061183550652, "loss": 2.7813, "theoretical_loss": 3.6204994441687512, "tokens_seen": 1087185920 }, { "epoch": 3.02, "learning_rate": 0.0003386960882647944, "loss": 2.9351, "theoretical_loss": 3.620478994935826, "tokens_seen": 1087251456 }, { "epoch": 3.02, "learning_rate": 0.00033868605817452356, "loss": 2.8789, "theoretical_loss": 3.6204585472805855, "tokens_seen": 1087316992 }, { "epoch": 3.02, "learning_rate": 0.00033867602808425275, "loss": 2.8042, "theoretical_loss": 3.6204381012028124, "tokens_seen": 1087382528 }, { "epoch": 3.02, "learning_rate": 0.000338665997993982, "loss": 2.8029, "theoretical_loss": 3.6204176567022905, "tokens_seen": 1087448064 }, { "epoch": 3.02, "learning_rate": 0.0003386559679037111, "loss": 2.9176, "theoretical_loss": 3.6203972137788027, "tokens_seen": 1087513600 }, { "epoch": 3.02, "learning_rate": 0.00033864593781344034, "loss": 2.88, "theoretical_loss": 3.6203767724321327, "tokens_seen": 1087579136 }, { "epoch": 3.02, "learning_rate": 0.00033863590772316947, "loss": 2.8801, "theoretical_loss": 3.6203563326620634, "tokens_seen": 1087644672 }, { "epoch": 3.02, "learning_rate": 0.0003386258776328987, "loss": 2.937, "theoretical_loss": 3.6203358944683783, "tokens_seen": 1087710208 }, { "epoch": 3.02, "learning_rate": 0.0003386158475426279, "loss": 3.0223, "theoretical_loss": 3.620315457850862, "tokens_seen": 1087775744 }, { "epoch": 3.02, "learning_rate": 0.00033860581745235707, "loss": 2.8273, "theoretical_loss": 3.6202950228092963, "tokens_seen": 1087841280 }, { "epoch": 3.02, "objective/train/docs_used": 1744938, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2281439304351807, "objective/train/theoretical_loss": 3.620289914295122, "objective/train/tokens_used": 1108317664, "theoretical_loss": 3.620289914295122, "tokens_seen": 1087857664 }, { "epoch": 3.02, "learning_rate": 0.00033859578736208625, "loss": 2.9878, "theoretical_loss": 3.620274589343465, "tokens_seen": 1087906816 }, { "epoch": 3.02, "learning_rate": 0.00033858575727181543, "loss": 2.8946, "theoretical_loss": 3.620254157453153, "tokens_seen": 1087972352 }, { "epoch": 3.02, "learning_rate": 0.00033857572718154467, "loss": 2.7804, "theoretical_loss": 3.620233727138143, "tokens_seen": 1088037888 }, { "epoch": 3.02, "learning_rate": 0.00033856569709127385, "loss": 2.9237, "theoretical_loss": 3.6202132983982183, "tokens_seen": 1088103424 }, { "epoch": 3.02, "learning_rate": 0.00033855566700100303, "loss": 2.8762, "theoretical_loss": 3.6201928712331632, "tokens_seen": 1088168960 }, { "epoch": 3.02, "learning_rate": 0.0003385456369107322, "loss": 3.023, "theoretical_loss": 3.6201724456427615, "tokens_seen": 1088234496 }, { "epoch": 3.02, "learning_rate": 0.00033853560682046144, "loss": 2.9964, "theoretical_loss": 3.620152021626797, "tokens_seen": 1088300032 }, { "epoch": 3.02, "learning_rate": 0.00033852557673019057, "loss": 2.9565, "theoretical_loss": 3.620131599185053, "tokens_seen": 1088365568 }, { "epoch": 3.02, "learning_rate": 0.0003385155466399198, "loss": 2.8698, "theoretical_loss": 3.6201111783173134, "tokens_seen": 1088431104 }, { "epoch": 3.02, "learning_rate": 0.00033850551654964893, "loss": 2.915, "theoretical_loss": 3.6200907590233635, "tokens_seen": 1088496640 }, { "epoch": 3.02, "learning_rate": 0.00033849548645937817, "loss": 2.8242, "theoretical_loss": 3.620070341302985, "tokens_seen": 1088562176 }, { "epoch": 3.02, "learning_rate": 0.00033848545636910735, "loss": 2.8947, "theoretical_loss": 3.620049925155964, "tokens_seen": 1088627712 }, { "epoch": 3.02, "learning_rate": 0.00033847542627883653, "loss": 2.7822, "theoretical_loss": 3.620029510582084, "tokens_seen": 1088693248 }, { "epoch": 3.02, "learning_rate": 0.0003384653961885657, "loss": 2.9863, "theoretical_loss": 3.620009097581128, "tokens_seen": 1088758784 }, { "epoch": 3.02, "learning_rate": 0.0003384553660982949, "loss": 3.1086, "theoretical_loss": 3.6199886861528814, "tokens_seen": 1088824320 }, { "epoch": 3.02, "learning_rate": 0.0003384453360080241, "loss": 2.8508, "theoretical_loss": 3.6199682762971275, "tokens_seen": 1088889856 }, { "epoch": 3.02, "learning_rate": 0.0003384353059177533, "loss": 2.7961, "theoretical_loss": 3.619947868013651, "tokens_seen": 1088955392 }, { "epoch": 3.02, "learning_rate": 0.00033842527582748244, "loss": 2.9183, "theoretical_loss": 3.6199274613022365, "tokens_seen": 1089020928 }, { "epoch": 3.02, "learning_rate": 0.00033841524573721167, "loss": 2.9715, "theoretical_loss": 3.619907056162668, "tokens_seen": 1089086464 }, { "epoch": 3.02, "learning_rate": 0.0003384052156469408, "loss": 2.8445, "theoretical_loss": 3.6198866525947295, "tokens_seen": 1089152000 }, { "epoch": 3.02, "learning_rate": 0.00033839518555667003, "loss": 2.7445, "theoretical_loss": 3.6198662505982053, "tokens_seen": 1089217536 }, { "epoch": 3.02, "learning_rate": 0.0003383851554663992, "loss": 2.8358, "theoretical_loss": 3.6198458501728807, "tokens_seen": 1089283072 }, { "epoch": 3.02, "learning_rate": 0.0003383751253761284, "loss": 2.7122, "theoretical_loss": 3.6198254513185395, "tokens_seen": 1089348608 }, { "epoch": 3.02, "learning_rate": 0.0003383650952858576, "loss": 3.0448, "theoretical_loss": 3.619805054034966, "tokens_seen": 1089414144 }, { "epoch": 3.02, "learning_rate": 0.0003383550651955868, "loss": 2.7102, "theoretical_loss": 3.6197846583219464, "tokens_seen": 1089479680 }, { "epoch": 3.02, "objective/train/docs_used": 1747703, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9921348094940186, "objective/train/theoretical_loss": 3.6197795596390643, "objective/train/tokens_used": 1109956064, "theoretical_loss": 3.6197795596390643, "tokens_seen": 1089496064 }, { "epoch": 3.02, "learning_rate": 0.00033834503510531594, "loss": 2.9519, "theoretical_loss": 3.619764264179263, "tokens_seen": 1089545216 }, { "epoch": 3.02, "learning_rate": 0.0003383350050150452, "loss": 2.9333, "theoretical_loss": 3.6197438716067016, "tokens_seen": 1089610752 }, { "epoch": 3.02, "learning_rate": 0.0003383249749247743, "loss": 2.9049, "theoretical_loss": 3.619723480604047, "tokens_seen": 1089676288 }, { "epoch": 3.02, "learning_rate": 0.00033831494483450354, "loss": 3.0056, "theoretical_loss": 3.6197030911710835, "tokens_seen": 1089741824 }, { "epoch": 3.02, "learning_rate": 0.0003383049147442327, "loss": 2.983, "theoretical_loss": 3.6196827033075967, "tokens_seen": 1089807360 }, { "epoch": 3.02, "learning_rate": 0.0003382948846539619, "loss": 2.9383, "theoretical_loss": 3.6196623170133706, "tokens_seen": 1089872896 }, { "epoch": 3.02, "learning_rate": 0.0003382848545636911, "loss": 2.8761, "theoretical_loss": 3.6196419322881903, "tokens_seen": 1089938432 }, { "epoch": 3.02, "learning_rate": 0.00033827482447342026, "loss": 2.9125, "theoretical_loss": 3.619621549131841, "tokens_seen": 1090003968 }, { "epoch": 3.02, "learning_rate": 0.00033826479438314944, "loss": 2.8419, "theoretical_loss": 3.619601167544107, "tokens_seen": 1090069504 }, { "epoch": 3.02, "learning_rate": 0.0003382547642928787, "loss": 2.8365, "theoretical_loss": 3.6195807875247734, "tokens_seen": 1090135040 }, { "epoch": 3.02, "learning_rate": 0.0003382447342026078, "loss": 2.8841, "theoretical_loss": 3.6195604090736255, "tokens_seen": 1090200576 }, { "epoch": 3.02, "learning_rate": 0.00033823470411233704, "loss": 2.9063, "theoretical_loss": 3.6195400321904487, "tokens_seen": 1090266112 }, { "epoch": 3.02, "learning_rate": 0.00033822467402206617, "loss": 2.9136, "theoretical_loss": 3.6195196568750276, "tokens_seen": 1090331648 }, { "epoch": 3.02, "learning_rate": 0.0003382146439317954, "loss": 2.8437, "theoretical_loss": 3.6194992831271477, "tokens_seen": 1090397184 }, { "epoch": 3.02, "learning_rate": 0.0003382046138415246, "loss": 2.6784, "theoretical_loss": 3.6194789109465937, "tokens_seen": 1090462720 }, { "epoch": 3.02, "learning_rate": 0.00033819458375125376, "loss": 2.9124, "theoretical_loss": 3.6194585403331514, "tokens_seen": 1090528256 }, { "epoch": 3.02, "learning_rate": 0.00033818455366098295, "loss": 2.9884, "theoretical_loss": 3.619438171286606, "tokens_seen": 1090593792 }, { "epoch": 3.02, "learning_rate": 0.0003381745235707122, "loss": 2.8159, "theoretical_loss": 3.6194178038067424, "tokens_seen": 1090659328 }, { "epoch": 3.02, "learning_rate": 0.0003381644934804413, "loss": 2.7519, "theoretical_loss": 3.6193974378933467, "tokens_seen": 1090724864 }, { "epoch": 3.02, "learning_rate": 0.00033815446339017054, "loss": 2.8702, "theoretical_loss": 3.619377073546203, "tokens_seen": 1090790400 }, { "epoch": 3.02, "learning_rate": 0.00033814443329989967, "loss": 2.9507, "theoretical_loss": 3.6193567107650986, "tokens_seen": 1090855936 }, { "epoch": 3.02, "learning_rate": 0.0003381344032096289, "loss": 2.7829, "theoretical_loss": 3.619336349549817, "tokens_seen": 1090921472 }, { "epoch": 3.02, "learning_rate": 0.0003381243731193581, "loss": 3.0038, "theoretical_loss": 3.619315989900146, "tokens_seen": 1090987008 }, { "epoch": 3.02, "learning_rate": 0.00033811434302908727, "loss": 2.848, "theoretical_loss": 3.619295631815869, "tokens_seen": 1091052544 }, { "epoch": 3.02, "learning_rate": 0.00033810431293881645, "loss": 2.8249, "theoretical_loss": 3.619275275296773, "tokens_seen": 1091118080 }, { "epoch": 3.02, "objective/train/docs_used": 1750650, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.90329909324646, "objective/train/theoretical_loss": 3.6192701864115335, "objective/train/tokens_used": 1111594464, "theoretical_loss": 3.6192701864115335, "tokens_seen": 1091134464 }, { "epoch": 3.02, "learning_rate": 0.00033809428284854563, "loss": 2.9408, "theoretical_loss": 3.619254920342643, "tokens_seen": 1091183616 }, { "epoch": 3.02, "learning_rate": 0.0003380842527582748, "loss": 2.8926, "theoretical_loss": 3.6192345669532653, "tokens_seen": 1091249152 }, { "epoch": 3.02, "learning_rate": 0.00033807422266800405, "loss": 2.9885, "theoretical_loss": 3.619214215128425, "tokens_seen": 1091314688 }, { "epoch": 3.02, "learning_rate": 0.0003380641925777332, "loss": 3.0546, "theoretical_loss": 3.619193864867908, "tokens_seen": 1091380224 }, { "epoch": 3.02, "learning_rate": 0.0003380541624874624, "loss": 2.8271, "theoretical_loss": 3.6191735161715006, "tokens_seen": 1091445760 }, { "epoch": 3.02, "learning_rate": 0.00033804413239719154, "loss": 2.9196, "theoretical_loss": 3.6191531690389884, "tokens_seen": 1091511296 }, { "epoch": 3.02, "learning_rate": 0.00033803410230692077, "loss": 2.9979, "theoretical_loss": 3.6191328234701574, "tokens_seen": 1091576832 }, { "epoch": 3.02, "learning_rate": 0.00033802407221664995, "loss": 2.9204, "theoretical_loss": 3.619112479464793, "tokens_seen": 1091642368 }, { "epoch": 3.02, "learning_rate": 0.00033801404212637913, "loss": 2.9333, "theoretical_loss": 3.6190921370226823, "tokens_seen": 1091707904 }, { "epoch": 3.02, "learning_rate": 0.0003380040120361083, "loss": 2.8831, "theoretical_loss": 3.6190717961436105, "tokens_seen": 1091773440 }, { "epoch": 3.02, "learning_rate": 0.00033799398194583755, "loss": 2.6923, "theoretical_loss": 3.6190514568273633, "tokens_seen": 1091838976 }, { "epoch": 3.02, "learning_rate": 0.0003379839518555667, "loss": 2.9557, "theoretical_loss": 3.6190311190737283, "tokens_seen": 1091904512 }, { "epoch": 3.02, "learning_rate": 0.0003379739217652959, "loss": 2.885, "theoretical_loss": 3.6190107828824907, "tokens_seen": 1091970048 }, { "epoch": 3.02, "learning_rate": 0.00033796389167502504, "loss": 2.8164, "theoretical_loss": 3.6189904482534363, "tokens_seen": 1092035584 }, { "epoch": 3.02, "learning_rate": 0.0003379538615847543, "loss": 2.8321, "theoretical_loss": 3.6189701151863525, "tokens_seen": 1092101120 }, { "epoch": 3.02, "learning_rate": 0.00033794383149448346, "loss": 2.9781, "theoretical_loss": 3.6189497836810247, "tokens_seen": 1092166656 }, { "epoch": 3.02, "learning_rate": 0.00033793380140421264, "loss": 2.8662, "theoretical_loss": 3.6189294537372394, "tokens_seen": 1092232192 }, { "epoch": 3.02, "learning_rate": 0.0003379237713139418, "loss": 2.8744, "theoretical_loss": 3.6189091253547834, "tokens_seen": 1092297728 }, { "epoch": 3.02, "learning_rate": 0.000337913741223671, "loss": 2.9536, "theoretical_loss": 3.6188887985334426, "tokens_seen": 1092363264 }, { "epoch": 3.02, "learning_rate": 0.0003379037111334002, "loss": 3.015, "theoretical_loss": 3.618868473273004, "tokens_seen": 1092428800 }, { "epoch": 3.02, "learning_rate": 0.0003378936810431294, "loss": 2.8725, "theoretical_loss": 3.618848149573253, "tokens_seen": 1092494336 }, { "epoch": 3.02, "learning_rate": 0.00033788365095285854, "loss": 2.9339, "theoretical_loss": 3.618827827433978, "tokens_seen": 1092559872 }, { "epoch": 3.02, "learning_rate": 0.0003378736208625878, "loss": 3.0098, "theoretical_loss": 3.618807506854964, "tokens_seen": 1092625408 }, { "epoch": 3.02, "learning_rate": 0.0003378635907723169, "loss": 2.9904, "theoretical_loss": 3.6187871878359985, "tokens_seen": 1092690944 }, { "epoch": 3.02, "learning_rate": 0.00033785356068204614, "loss": 2.7727, "theoretical_loss": 3.6187668703768674, "tokens_seen": 1092756480 }, { "epoch": 3.02, "objective/train/docs_used": 1753040, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.452921152114868, "objective/train/theoretical_loss": 3.6187617912557837, "objective/train/tokens_used": 1113232864, "theoretical_loss": 3.6187617912557837, "tokens_seen": 1092772864 }, { "epoch": 3.02, "learning_rate": 0.0003378435305917753, "loss": 2.9242, "theoretical_loss": 3.618746554477358, "tokens_seen": 1092822016 }, { "epoch": 3.02, "learning_rate": 0.0003378335005015045, "loss": 2.9617, "theoretical_loss": 3.6187262401372573, "tokens_seen": 1092887552 }, { "epoch": 3.02, "learning_rate": 0.00033782347041123374, "loss": 2.9446, "theoretical_loss": 3.618705927356351, "tokens_seen": 1092953088 }, { "epoch": 3.02, "learning_rate": 0.0003378134403209629, "loss": 2.976, "theoretical_loss": 3.6186856161344276, "tokens_seen": 1093018624 }, { "epoch": 3.02, "learning_rate": 0.0003378034102306921, "loss": 2.8174, "theoretical_loss": 3.618665306471273, "tokens_seen": 1093084160 }, { "epoch": 3.02, "learning_rate": 0.0003377933801404213, "loss": 3.0496, "theoretical_loss": 3.6186449983666735, "tokens_seen": 1093149696 }, { "epoch": 3.02, "learning_rate": 0.00033778335005015046, "loss": 3.1029, "theoretical_loss": 3.6186246918204175, "tokens_seen": 1093215232 }, { "epoch": 3.02, "learning_rate": 0.00033777331995987964, "loss": 2.8877, "theoretical_loss": 3.618604386832291, "tokens_seen": 1093280768 }, { "epoch": 3.02, "learning_rate": 0.0003377632898696089, "loss": 3.0719, "theoretical_loss": 3.618584083402081, "tokens_seen": 1093346304 }, { "epoch": 3.02, "learning_rate": 0.000337753259779338, "loss": 2.9689, "theoretical_loss": 3.6185637815295753, "tokens_seen": 1093411840 }, { "epoch": 3.02, "learning_rate": 0.00033774322968906724, "loss": 2.8834, "theoretical_loss": 3.6185434812145605, "tokens_seen": 1093477376 }, { "epoch": 3.02, "learning_rate": 0.00033773319959879637, "loss": 2.845, "theoretical_loss": 3.6185231824568236, "tokens_seen": 1093542912 }, { "epoch": 3.02, "learning_rate": 0.0003377231695085256, "loss": 2.895, "theoretical_loss": 3.6185028852561527, "tokens_seen": 1093608448 }, { "epoch": 3.02, "learning_rate": 0.0003377131394182548, "loss": 2.732, "theoretical_loss": 3.6184825896123343, "tokens_seen": 1093673984 }, { "epoch": 3.02, "learning_rate": 0.00033770310932798396, "loss": 2.8804, "theoretical_loss": 3.6184622955251564, "tokens_seen": 1093739520 }, { "epoch": 3.02, "learning_rate": 0.00033769307923771315, "loss": 2.8567, "theoretical_loss": 3.6184420029944055, "tokens_seen": 1093805056 }, { "epoch": 3.02, "learning_rate": 0.0003376830491474424, "loss": 2.9998, "theoretical_loss": 3.6184217120198694, "tokens_seen": 1093870592 }, { "epoch": 3.02, "learning_rate": 0.0003376730190571715, "loss": 3.0224, "theoretical_loss": 3.6184014226013357, "tokens_seen": 1093936128 }, { "epoch": 3.02, "learning_rate": 0.00033766298896690074, "loss": 2.9082, "theoretical_loss": 3.6183811347385912, "tokens_seen": 1094001664 }, { "epoch": 3.02, "learning_rate": 0.00033765295887662987, "loss": 2.9162, "theoretical_loss": 3.6183608484314242, "tokens_seen": 1094067200 }, { "epoch": 3.02, "learning_rate": 0.0003376429287863591, "loss": 3.0513, "theoretical_loss": 3.618340563679622, "tokens_seen": 1094132736 }, { "epoch": 3.02, "learning_rate": 0.0003376328986960883, "loss": 2.8334, "theoretical_loss": 3.6183202804829717, "tokens_seen": 1094198272 }, { "epoch": 3.02, "learning_rate": 0.00033762286860581747, "loss": 2.989, "theoretical_loss": 3.6182999988412616, "tokens_seen": 1094263808 }, { "epoch": 3.02, "learning_rate": 0.00033761283851554665, "loss": 2.9694, "theoretical_loss": 3.6182797187542795, "tokens_seen": 1094329344 }, { "epoch": 3.02, "learning_rate": 0.00033760280842527583, "loss": 2.8337, "theoretical_loss": 3.618259440221813, "tokens_seen": 1094394880 }, { "epoch": 3.02, "objective/train/docs_used": 1755697, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.007908821105957, "objective/train/theoretical_loss": 3.618254370831564, "objective/train/tokens_used": 1114871264, "theoretical_loss": 3.618254370831564, "tokens_seen": 1094411264 }, { "epoch": 3.02, "learning_rate": 0.000337592778335005, "loss": 2.8868, "theoretical_loss": 3.618239163243649, "tokens_seen": 1094460416 }, { "epoch": 3.02, "learning_rate": 0.00033758274824473425, "loss": 3.1044, "theoretical_loss": 3.618218887819576, "tokens_seen": 1094525952 }, { "epoch": 3.02, "learning_rate": 0.0003375727181544634, "loss": 2.7178, "theoretical_loss": 3.618198613949382, "tokens_seen": 1094591488 }, { "epoch": 3.02, "learning_rate": 0.0003375626880641926, "loss": 2.8844, "theoretical_loss": 3.618178341632855, "tokens_seen": 1094657024 }, { "epoch": 3.02, "learning_rate": 0.00033755265797392174, "loss": 2.8694, "theoretical_loss": 3.6181580708697823, "tokens_seen": 1094722560 }, { "epoch": 3.02, "learning_rate": 0.00033754262788365097, "loss": 2.881, "theoretical_loss": 3.6181378016599517, "tokens_seen": 1094788096 }, { "epoch": 3.02, "learning_rate": 0.00033753259779338015, "loss": 2.8126, "theoretical_loss": 3.6181175340031526, "tokens_seen": 1094853632 }, { "epoch": 3.02, "learning_rate": 0.00033752256770310933, "loss": 2.9808, "theoretical_loss": 3.6180972678991714, "tokens_seen": 1094919168 }, { "epoch": 3.02, "learning_rate": 0.0003375125376128385, "loss": 2.9533, "theoretical_loss": 3.618077003347797, "tokens_seen": 1094984704 }, { "epoch": 3.02, "learning_rate": 0.00033750250752256775, "loss": 2.9608, "theoretical_loss": 3.618056740348818, "tokens_seen": 1095050240 }, { "epoch": 3.02, "learning_rate": 0.0003374924774322969, "loss": 2.8835, "theoretical_loss": 3.618036478902022, "tokens_seen": 1095115776 }, { "epoch": 3.02, "learning_rate": 0.0003374824473420261, "loss": 2.8914, "theoretical_loss": 3.618016219007197, "tokens_seen": 1095181312 }, { "epoch": 3.02, "learning_rate": 0.00033747241725175524, "loss": 2.6364, "theoretical_loss": 3.617995960664132, "tokens_seen": 1095246848 }, { "epoch": 3.02, "learning_rate": 0.0003374623871614845, "loss": 3.0137, "theoretical_loss": 3.617975703872615, "tokens_seen": 1095312384 }, { "epoch": 3.02, "learning_rate": 0.00033745235707121366, "loss": 2.8492, "theoretical_loss": 3.6179554486324332, "tokens_seen": 1095377920 }, { "epoch": 3.02, "learning_rate": 0.00033744232698094284, "loss": 2.9732, "theoretical_loss": 3.6179351949433767, "tokens_seen": 1095443456 }, { "epoch": 3.02, "learning_rate": 0.000337432296890672, "loss": 2.9482, "theoretical_loss": 3.617914942805233, "tokens_seen": 1095508992 }, { "epoch": 3.02, "learning_rate": 0.0003374222668004012, "loss": 2.6993, "theoretical_loss": 3.6178946922177913, "tokens_seen": 1095574528 }, { "epoch": 3.02, "learning_rate": 0.0003374122367101304, "loss": 2.9122, "theoretical_loss": 3.617874443180839, "tokens_seen": 1095640064 }, { "epoch": 3.02, "learning_rate": 0.0003374022066198596, "loss": 2.825, "theoretical_loss": 3.6178541956941657, "tokens_seen": 1095705600 }, { "epoch": 3.02, "learning_rate": 0.00033739217652958874, "loss": 2.9879, "theoretical_loss": 3.6178339497575593, "tokens_seen": 1095771136 }, { "epoch": 3.02, "learning_rate": 0.000337382146439318, "loss": 2.9648, "theoretical_loss": 3.617813705370809, "tokens_seen": 1095836672 }, { "epoch": 3.02, "learning_rate": 0.0003373721163490471, "loss": 3.0933, "theoretical_loss": 3.6177934625337027, "tokens_seen": 1095902208 }, { "epoch": 3.02, "learning_rate": 0.00033736208625877634, "loss": 2.834, "theoretical_loss": 3.61777322124603, "tokens_seen": 1095967744 }, { "epoch": 3.02, "learning_rate": 0.0003373520561685055, "loss": 2.9508, "theoretical_loss": 3.617752981507579, "tokens_seen": 1096033280 }, { "epoch": 3.02, "objective/train/docs_used": 1758536, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8290581703186035, "objective/train/theoretical_loss": 3.617747921815008, "objective/train/tokens_used": 1116509664, "theoretical_loss": 3.617747921815008, "tokens_seen": 1096049664 }, { "epoch": 3.02, "learning_rate": 0.0003373420260782347, "loss": 3.0507, "theoretical_loss": 3.617732743318139, "tokens_seen": 1096098816 }, { "epoch": 3.02, "learning_rate": 0.0003373319959879639, "loss": 2.8687, "theoretical_loss": 3.6177125066774987, "tokens_seen": 1096164352 }, { "epoch": 3.02, "learning_rate": 0.0003373219658976931, "loss": 2.8944, "theoretical_loss": 3.6176922715854465, "tokens_seen": 1096229888 }, { "epoch": 3.02, "learning_rate": 0.00033731193580742225, "loss": 2.8568, "theoretical_loss": 3.617672038041772, "tokens_seen": 1096295424 }, { "epoch": 3.02, "learning_rate": 0.0003373019057171515, "loss": 2.8839, "theoretical_loss": 3.617651806046264, "tokens_seen": 1096360960 }, { "epoch": 3.02, "learning_rate": 0.0003372918756268806, "loss": 2.9019, "theoretical_loss": 3.617631575598711, "tokens_seen": 1096426496 }, { "epoch": 3.02, "learning_rate": 0.00033728184553660984, "loss": 2.9026, "theoretical_loss": 3.6176113466989026, "tokens_seen": 1096492032 }, { "epoch": 3.02, "learning_rate": 0.000337271815446339, "loss": 2.8993, "theoretical_loss": 3.6175911193466277, "tokens_seen": 1096557568 }, { "epoch": 3.02, "learning_rate": 0.0003372617853560682, "loss": 2.9147, "theoretical_loss": 3.617570893541676, "tokens_seen": 1096623104 }, { "epoch": 3.02, "learning_rate": 0.0003372517552657974, "loss": 2.8242, "theoretical_loss": 3.617550669283836, "tokens_seen": 1096688640 }, { "epoch": 3.02, "learning_rate": 0.00033724172517552657, "loss": 2.8032, "theoretical_loss": 3.6175304465728964, "tokens_seen": 1096754176 }, { "epoch": 3.02, "learning_rate": 0.00033723169508525575, "loss": 2.8955, "theoretical_loss": 3.6175102254086475, "tokens_seen": 1096819712 }, { "epoch": 3.02, "learning_rate": 0.000337221664994985, "loss": 2.9631, "theoretical_loss": 3.6174900057908785, "tokens_seen": 1096885248 }, { "epoch": 3.02, "learning_rate": 0.0003372116349047141, "loss": 2.79, "theoretical_loss": 3.617469787719378, "tokens_seen": 1096950784 }, { "epoch": 3.02, "learning_rate": 0.00033720160481444335, "loss": 2.845, "theoretical_loss": 3.6174495711939363, "tokens_seen": 1097016320 }, { "epoch": 3.02, "learning_rate": 0.0003371915747241725, "loss": 2.9589, "theoretical_loss": 3.617429356214342, "tokens_seen": 1097081856 }, { "epoch": 3.02, "learning_rate": 0.0003371815446339017, "loss": 2.8288, "theoretical_loss": 3.617409142780385, "tokens_seen": 1097147392 }, { "epoch": 3.02, "learning_rate": 0.0003371715145436309, "loss": 2.9506, "theoretical_loss": 3.617388930891855, "tokens_seen": 1097212928 }, { "epoch": 3.02, "learning_rate": 0.00033716148445336007, "loss": 2.9556, "theoretical_loss": 3.6173687205485408, "tokens_seen": 1097278464 }, { "epoch": 3.02, "learning_rate": 0.00033715145436308925, "loss": 2.9116, "theoretical_loss": 3.617348511750233, "tokens_seen": 1097344000 }, { "epoch": 3.02, "learning_rate": 0.0003371414242728185, "loss": 2.9627, "theoretical_loss": 3.617328304496721, "tokens_seen": 1097409536 }, { "epoch": 3.02, "learning_rate": 0.0003371313941825476, "loss": 2.7759, "theoretical_loss": 3.6173080987877935, "tokens_seen": 1097475072 }, { "epoch": 3.02, "learning_rate": 0.00033712136409227685, "loss": 2.8629, "theoretical_loss": 3.617287894623241, "tokens_seen": 1097540608 }, { "epoch": 3.02, "learning_rate": 0.000337111334002006, "loss": 2.8043, "theoretical_loss": 3.6172676920028533, "tokens_seen": 1097606144 }, { "epoch": 3.02, "learning_rate": 0.0003371013039117352, "loss": 2.7798, "theoretical_loss": 3.61724749092642, "tokens_seen": 1097671680 }, { "epoch": 3.02, "objective/train/docs_used": 1761251, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1671676635742188, "objective/train/theoretical_loss": 3.6172424408985293, "objective/train/tokens_used": 1118148064, "theoretical_loss": 3.6172424408985293, "tokens_seen": 1097688064 }, { "epoch": 3.02, "learning_rate": 0.0003370912738214644, "loss": 3.0293, "theoretical_loss": 3.6172272913937307, "tokens_seen": 1097737216 }, { "epoch": 3.02, "learning_rate": 0.0003370812437311936, "loss": 3.0796, "theoretical_loss": 3.6172070934045757, "tokens_seen": 1097802752 }, { "epoch": 3.02, "learning_rate": 0.0003370712136409228, "loss": 2.8988, "theoretical_loss": 3.617186896958745, "tokens_seen": 1097868288 }, { "epoch": 3.02, "learning_rate": 0.00033706118355065194, "loss": 2.8801, "theoretical_loss": 3.617166702056028, "tokens_seen": 1097933824 }, { "epoch": 3.02, "learning_rate": 0.00033705115346038117, "loss": 2.9023, "theoretical_loss": 3.617146508696215, "tokens_seen": 1097999360 }, { "epoch": 3.02, "learning_rate": 0.00033704112337011035, "loss": 2.756, "theoretical_loss": 3.617126316879096, "tokens_seen": 1098064896 }, { "epoch": 3.02, "learning_rate": 0.00033703109327983953, "loss": 2.8292, "theoretical_loss": 3.6171061266044617, "tokens_seen": 1098130432 }, { "epoch": 3.02, "learning_rate": 0.0003370210631895687, "loss": 3.0456, "theoretical_loss": 3.617085937872101, "tokens_seen": 1098195968 }, { "epoch": 3.02, "learning_rate": 0.00033701103309929795, "loss": 2.8476, "theoretical_loss": 3.617065750681806, "tokens_seen": 1098261504 }, { "epoch": 3.02, "learning_rate": 0.0003370010030090271, "loss": 2.9883, "theoretical_loss": 3.617045565033364, "tokens_seen": 1098327040 }, { "epoch": 3.02, "learning_rate": 0.0003369909729187563, "loss": 2.7889, "theoretical_loss": 3.6170253809265676, "tokens_seen": 1098392576 }, { "epoch": 3.02, "learning_rate": 0.00033698094282848544, "loss": 2.7944, "theoretical_loss": 3.6170051983612064, "tokens_seen": 1098458112 }, { "epoch": 3.02, "learning_rate": 0.0003369709127382147, "loss": 2.8049, "theoretical_loss": 3.616985017337071, "tokens_seen": 1098523648 }, { "epoch": 3.02, "learning_rate": 0.00033696088264794386, "loss": 2.7992, "theoretical_loss": 3.616964837853951, "tokens_seen": 1098589184 }, { "epoch": 3.02, "learning_rate": 0.00033695085255767304, "loss": 2.6566, "theoretical_loss": 3.6169446599116375, "tokens_seen": 1098654720 }, { "epoch": 3.02, "learning_rate": 0.0003369408224674022, "loss": 2.7988, "theoretical_loss": 3.61692448350992, "tokens_seen": 1098720256 }, { "epoch": 3.02, "learning_rate": 0.0003369307923771314, "loss": 2.9401, "theoretical_loss": 3.6169043086485906, "tokens_seen": 1098785792 }, { "epoch": 3.02, "learning_rate": 0.0003369207622868606, "loss": 2.83, "theoretical_loss": 3.616884135327439, "tokens_seen": 1098851328 }, { "epoch": 3.02, "learning_rate": 0.0003369107321965898, "loss": 2.828, "theoretical_loss": 3.616863963546255, "tokens_seen": 1098916864 }, { "epoch": 3.02, "learning_rate": 0.00033690070210631894, "loss": 2.9666, "theoretical_loss": 3.6168437933048305, "tokens_seen": 1098982400 }, { "epoch": 3.02, "learning_rate": 0.0003368906720160482, "loss": 2.8667, "theoretical_loss": 3.6168236246029557, "tokens_seen": 1099047936 }, { "epoch": 3.02, "learning_rate": 0.0003368806419257773, "loss": 2.923, "theoretical_loss": 3.6168034574404206, "tokens_seen": 1099113472 }, { "epoch": 3.02, "learning_rate": 0.00033687061183550654, "loss": 2.9758, "theoretical_loss": 3.616783291817017, "tokens_seen": 1099179008 }, { "epoch": 3.02, "learning_rate": 0.0003368605817452357, "loss": 2.8608, "theoretical_loss": 3.6167631277325354, "tokens_seen": 1099244544 }, { "epoch": 3.02, "learning_rate": 0.0003368505516549649, "loss": 2.895, "theoretical_loss": 3.616742965186766, "tokens_seen": 1099310080 }, { "epoch": 3.02, "objective/train/docs_used": 1764151, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.768031358718872, "objective/train/theoretical_loss": 3.6167379247907228, "objective/train/tokens_used": 1119786464, "theoretical_loss": 3.6167379247907228, "tokens_seen": 1099326464 }, { "epoch": 3.02, "learning_rate": 0.0003368405215646941, "loss": 2.8387, "theoretical_loss": 3.6167228041795, "tokens_seen": 1099375616 }, { "epoch": 3.02, "learning_rate": 0.0003368304914744233, "loss": 2.9822, "theoretical_loss": 3.6167026447105286, "tokens_seen": 1099441152 }, { "epoch": 3.02, "learning_rate": 0.00033682046138415245, "loss": 2.982, "theoretical_loss": 3.6166824867796428, "tokens_seen": 1099506688 }, { "epoch": 3.02, "learning_rate": 0.0003368104312938817, "loss": 2.9556, "theoretical_loss": 3.6166623303866334, "tokens_seen": 1099572224 }, { "epoch": 3.02, "learning_rate": 0.0003368004012036108, "loss": 2.8633, "theoretical_loss": 3.6166421755312905, "tokens_seen": 1099637760 }, { "epoch": 3.02, "learning_rate": 0.00033679037111334004, "loss": 2.8623, "theoretical_loss": 3.6166220222134067, "tokens_seen": 1099703296 }, { "epoch": 3.02, "learning_rate": 0.0003367803410230692, "loss": 2.8231, "theoretical_loss": 3.6166018704327723, "tokens_seen": 1099768832 }, { "epoch": 3.02, "learning_rate": 0.0003367703109327984, "loss": 2.8483, "theoretical_loss": 3.6165817201891786, "tokens_seen": 1099834368 }, { "epoch": 3.02, "learning_rate": 0.0003367602808425276, "loss": 2.9499, "theoretical_loss": 3.616561571482417, "tokens_seen": 1099899904 }, { "epoch": 3.02, "learning_rate": 0.00033675025075225677, "loss": 2.9643, "theoretical_loss": 3.6165414243122775, "tokens_seen": 1099965440 }, { "epoch": 3.02, "learning_rate": 0.00033674022066198595, "loss": 3.0157, "theoretical_loss": 3.6165212786785537, "tokens_seen": 1100030976 }, { "epoch": 3.02, "learning_rate": 0.0003367301905717152, "loss": 2.9162, "theoretical_loss": 3.6165011345810347, "tokens_seen": 1100096512 }, { "epoch": 3.02, "learning_rate": 0.0003367201604814443, "loss": 2.8326, "theoretical_loss": 3.616480992019513, "tokens_seen": 1100162048 }, { "epoch": 3.02, "learning_rate": 0.00033671013039117355, "loss": 2.8124, "theoretical_loss": 3.6164608509937795, "tokens_seen": 1100227584 }, { "epoch": 3.02, "learning_rate": 0.00033670010030090273, "loss": 2.7789, "theoretical_loss": 3.6164407115036257, "tokens_seen": 1100293120 }, { "epoch": 3.02, "learning_rate": 0.0003366900702106319, "loss": 2.8697, "theoretical_loss": 3.6164205735488433, "tokens_seen": 1100358656 }, { "epoch": 3.02, "learning_rate": 0.0003366800401203611, "loss": 2.9813, "theoretical_loss": 3.616400437129224, "tokens_seen": 1100424192 }, { "epoch": 3.02, "learning_rate": 0.00033667001003009027, "loss": 2.9242, "theoretical_loss": 3.6163803022445586, "tokens_seen": 1100489728 }, { "epoch": 3.02, "learning_rate": 0.00033665997993981945, "loss": 2.952, "theoretical_loss": 3.61636016889464, "tokens_seen": 1100555264 }, { "epoch": 3.02, "learning_rate": 0.0003366499498495487, "loss": 2.9225, "theoretical_loss": 3.616340037079258, "tokens_seen": 1100620800 }, { "epoch": 3.02, "learning_rate": 0.0003366399197592778, "loss": 2.8771, "theoretical_loss": 3.616319906798206, "tokens_seen": 1100686336 }, { "epoch": 3.02, "learning_rate": 0.00033662988966900705, "loss": 2.8174, "theoretical_loss": 3.6162997780512747, "tokens_seen": 1100751872 }, { "epoch": 3.02, "learning_rate": 0.0003366198595787362, "loss": 2.99, "theoretical_loss": 3.616279650838256, "tokens_seen": 1100817408 }, { "epoch": 3.02, "learning_rate": 0.0003366098294884654, "loss": 2.8935, "theoretical_loss": 3.6162595251589424, "tokens_seen": 1100882944 }, { "epoch": 3.02, "learning_rate": 0.0003365997993981946, "loss": 2.8117, "theoretical_loss": 3.6162394010131247, "tokens_seen": 1100948480 }, { "epoch": 3.02, "objective/train/docs_used": 1766715, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9439899921417236, "objective/train/theoretical_loss": 3.6162343702162545, "objective/train/tokens_used": 1121424864, "theoretical_loss": 3.6162343702162545, "tokens_seen": 1100964864 }, { "epoch": 3.02, "learning_rate": 0.0003365897693079238, "loss": 2.8018, "theoretical_loss": 3.6162192784005955, "tokens_seen": 1101014016 }, { "epoch": 3.02, "learning_rate": 0.00033657973921765295, "loss": 2.9646, "theoretical_loss": 3.6161991573211463, "tokens_seen": 1101079552 }, { "epoch": 3.02, "learning_rate": 0.00033656970912738214, "loss": 2.6961, "theoretical_loss": 3.6161790377745695, "tokens_seen": 1101145088 }, { "epoch": 3.02, "learning_rate": 0.0003365596790371113, "loss": 2.8303, "theoretical_loss": 3.6161589197606565, "tokens_seen": 1101210624 }, { "epoch": 3.02, "learning_rate": 0.00033654964894684055, "loss": 2.7937, "theoretical_loss": 3.6161388032792, "tokens_seen": 1101276160 }, { "epoch": 3.02, "learning_rate": 0.0003365396188565697, "loss": 2.6791, "theoretical_loss": 3.6161186883299923, "tokens_seen": 1101341696 }, { "epoch": 3.02, "learning_rate": 0.0003365295887662989, "loss": 2.8361, "theoretical_loss": 3.616098574912824, "tokens_seen": 1101407232 }, { "epoch": 3.02, "learning_rate": 0.0003365195586760281, "loss": 2.8214, "theoretical_loss": 3.6160784630274887, "tokens_seen": 1101472768 }, { "epoch": 3.02, "learning_rate": 0.0003365095285857573, "loss": 2.9791, "theoretical_loss": 3.616058352673778, "tokens_seen": 1101538304 }, { "epoch": 3.02, "learning_rate": 0.00033649949849548646, "loss": 2.7965, "theoretical_loss": 3.616038243851485, "tokens_seen": 1101603840 }, { "epoch": 3.02, "learning_rate": 0.00033648946840521564, "loss": 2.9459, "theoretical_loss": 3.6160181365604007, "tokens_seen": 1101669376 }, { "epoch": 3.02, "learning_rate": 0.0003364794383149448, "loss": 2.9613, "theoretical_loss": 3.615998030800318, "tokens_seen": 1101734912 }, { "epoch": 3.02, "learning_rate": 0.00033646940822467406, "loss": 2.9649, "theoretical_loss": 3.6159779265710297, "tokens_seen": 1101800448 }, { "epoch": 3.02, "learning_rate": 0.0003364593781344032, "loss": 2.9089, "theoretical_loss": 3.6159578238723276, "tokens_seen": 1101865984 }, { "epoch": 3.02, "learning_rate": 0.0003364493480441324, "loss": 2.9031, "theoretical_loss": 3.615937722704004, "tokens_seen": 1101931520 }, { "epoch": 3.02, "learning_rate": 0.00033643931795386154, "loss": 2.9416, "theoretical_loss": 3.6159176230658523, "tokens_seen": 1101997056 }, { "epoch": 3.02, "learning_rate": 0.0003364292878635908, "loss": 2.8776, "theoretical_loss": 3.6158975249576644, "tokens_seen": 1102062592 }, { "epoch": 3.02, "learning_rate": 0.00033641925777331996, "loss": 2.9658, "theoretical_loss": 3.6158774283792328, "tokens_seen": 1102128128 }, { "epoch": 3.02, "learning_rate": 0.00033640922768304914, "loss": 2.7994, "theoretical_loss": 3.61585733333035, "tokens_seen": 1102193664 }, { "epoch": 3.02, "learning_rate": 0.0003363991975927783, "loss": 3.0173, "theoretical_loss": 3.6158372398108094, "tokens_seen": 1102259200 }, { "epoch": 3.02, "learning_rate": 0.0003363891675025075, "loss": 2.918, "theoretical_loss": 3.615817147820403, "tokens_seen": 1102324736 }, { "epoch": 3.02, "learning_rate": 0.0003363791374122367, "loss": 2.8305, "theoretical_loss": 3.6157970573589235, "tokens_seen": 1102390272 }, { "epoch": 3.02, "learning_rate": 0.0003363691073219659, "loss": 2.8634, "theoretical_loss": 3.6157769684261636, "tokens_seen": 1102455808 }, { "epoch": 3.02, "learning_rate": 0.00033635907723169505, "loss": 2.9197, "theoretical_loss": 3.615756881021917, "tokens_seen": 1102521344 }, { "epoch": 3.02, "learning_rate": 0.0003363490471414243, "loss": 2.7406, "theoretical_loss": 3.615736795145976, "tokens_seen": 1102586880 }, { "epoch": 3.02, "objective/train/docs_used": 1768042, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6401076316833496, "objective/train/theoretical_loss": 3.615731773915764, "objective/train/tokens_used": 1123063264, "theoretical_loss": 3.615731773915764, "tokens_seen": 1102603264 }, { "epoch": 3.02, "learning_rate": 0.00033633901705115346, "loss": 2.8881, "theoretical_loss": 3.615716710798133, "tokens_seen": 1102652416 }, { "epoch": 3.02, "learning_rate": 0.00033632898696088265, "loss": 2.8702, "theoretical_loss": 3.615696627978182, "tokens_seen": 1102717952 }, { "epoch": 3.02, "learning_rate": 0.0003363189568706119, "loss": 2.8101, "theoretical_loss": 3.615676546685915, "tokens_seen": 1102783488 }, { "epoch": 3.02, "learning_rate": 0.000336308926780341, "loss": 2.8898, "theoretical_loss": 3.6156564669211253, "tokens_seen": 1102849024 }, { "epoch": 3.02, "learning_rate": 0.00033629889669007024, "loss": 2.7983, "theoretical_loss": 3.615636388683606, "tokens_seen": 1102914560 }, { "epoch": 3.02, "learning_rate": 0.0003362888665997994, "loss": 2.9586, "theoretical_loss": 3.6156163119731506, "tokens_seen": 1102980096 }, { "epoch": 3.02, "learning_rate": 0.0003362788365095286, "loss": 2.8227, "theoretical_loss": 3.6155962367895516, "tokens_seen": 1103045632 }, { "epoch": 3.02, "learning_rate": 0.0003362688064192578, "loss": 2.9365, "theoretical_loss": 3.6155761631326024, "tokens_seen": 1103111168 }, { "epoch": 3.02, "learning_rate": 0.00033625877632898697, "loss": 2.8844, "theoretical_loss": 3.6155560910020963, "tokens_seen": 1103176704 }, { "epoch": 3.02, "learning_rate": 0.00033624874623871615, "loss": 2.9255, "theoretical_loss": 3.615536020397826, "tokens_seen": 1103242240 }, { "epoch": 3.02, "learning_rate": 0.0003362387161484454, "loss": 2.8707, "theoretical_loss": 3.615515951319586, "tokens_seen": 1103307776 }, { "epoch": 3.02, "learning_rate": 0.0003362286860581745, "loss": 2.8122, "theoretical_loss": 3.6154958837671685, "tokens_seen": 1103373312 }, { "epoch": 3.02, "learning_rate": 0.00033621865596790375, "loss": 2.8213, "theoretical_loss": 3.615475817740368, "tokens_seen": 1103438848 }, { "epoch": 3.02, "learning_rate": 0.00033620862587763293, "loss": 2.9513, "theoretical_loss": 3.6154557532389764, "tokens_seen": 1103504384 }, { "epoch": 3.02, "learning_rate": 0.0003361985957873621, "loss": 2.7467, "theoretical_loss": 3.615435690262788, "tokens_seen": 1103569920 }, { "epoch": 3.02, "learning_rate": 0.0003361885656970913, "loss": 2.8483, "theoretical_loss": 3.615415628811597, "tokens_seen": 1103635456 }, { "epoch": 3.02, "learning_rate": 0.00033617853560682047, "loss": 2.8884, "theoretical_loss": 3.6153955688851953, "tokens_seen": 1103700992 }, { "epoch": 3.02, "learning_rate": 0.00033616850551654965, "loss": 2.9331, "theoretical_loss": 3.615375510483378, "tokens_seen": 1103766528 }, { "epoch": 3.02, "learning_rate": 0.0003361584754262789, "loss": 2.8067, "theoretical_loss": 3.615355453605938, "tokens_seen": 1103832064 }, { "epoch": 3.02, "learning_rate": 0.000336148445336008, "loss": 2.8569, "theoretical_loss": 3.615335398252669, "tokens_seen": 1103897600 }, { "epoch": 3.02, "learning_rate": 0.00033613841524573725, "loss": 2.9502, "theoretical_loss": 3.615315344423365, "tokens_seen": 1103963136 }, { "epoch": 3.02, "learning_rate": 0.0003361283851554664, "loss": 3.005, "theoretical_loss": 3.6152952921178194, "tokens_seen": 1104028672 }, { "epoch": 3.02, "learning_rate": 0.0003361183550651956, "loss": 2.8402, "theoretical_loss": 3.615275241335826, "tokens_seen": 1104094208 }, { "epoch": 3.02, "learning_rate": 0.0003361083249749248, "loss": 2.9041, "theoretical_loss": 3.6152551920771785, "tokens_seen": 1104159744 }, { "epoch": 3.02, "learning_rate": 0.000336098294884654, "loss": 2.7104, "theoretical_loss": 3.6152351443416713, "tokens_seen": 1104225280 }, { "epoch": 3.02, "objective/train/docs_used": 1770814, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8287065029144287, "objective/train/theoretical_loss": 3.615230132645761, "objective/train/tokens_used": 1124701664, "theoretical_loss": 3.615230132645761, "tokens_seen": 1104241664 }, { "epoch": 3.02, "learning_rate": 0.00033608826479438316, "loss": 2.7483, "theoretical_loss": 3.615215098129098, "tokens_seen": 1104290816 }, { "epoch": 3.02, "learning_rate": 0.00033607823470411234, "loss": 2.7349, "theoretical_loss": 3.615195053439252, "tokens_seen": 1104356352 }, { "epoch": 3.02, "learning_rate": 0.0003360682046138415, "loss": 2.955, "theoretical_loss": 3.6151750102719284, "tokens_seen": 1104421888 }, { "epoch": 3.02, "learning_rate": 0.00033605817452357075, "loss": 2.8348, "theoretical_loss": 3.6151549686269204, "tokens_seen": 1104487424 }, { "epoch": 3.02, "learning_rate": 0.0003360481444332999, "loss": 2.9265, "theoretical_loss": 3.615134928504022, "tokens_seen": 1104552960 }, { "epoch": 3.02, "learning_rate": 0.0003360381143430291, "loss": 2.8321, "theoretical_loss": 3.615114889903028, "tokens_seen": 1104618496 }, { "epoch": 3.02, "learning_rate": 0.0003360280842527583, "loss": 2.8233, "theoretical_loss": 3.6150948528237317, "tokens_seen": 1104684032 }, { "epoch": 3.02, "learning_rate": 0.0003360180541624875, "loss": 2.9224, "theoretical_loss": 3.6150748172659277, "tokens_seen": 1104749568 }, { "epoch": 3.02, "learning_rate": 0.00033600802407221666, "loss": 2.8754, "theoretical_loss": 3.6150547832294104, "tokens_seen": 1104815104 }, { "epoch": 3.02, "learning_rate": 0.00033599799398194584, "loss": 3.1028, "theoretical_loss": 3.615034750713974, "tokens_seen": 1104880640 }, { "epoch": 3.02, "learning_rate": 0.000335987963891675, "loss": 2.9706, "theoretical_loss": 3.6150147197194125, "tokens_seen": 1104946176 }, { "epoch": 3.02, "learning_rate": 0.00033597793380140426, "loss": 2.873, "theoretical_loss": 3.61499469024552, "tokens_seen": 1105011712 }, { "epoch": 3.02, "learning_rate": 0.0003359679037111334, "loss": 2.8403, "theoretical_loss": 3.6149746622920915, "tokens_seen": 1105077248 }, { "epoch": 3.02, "learning_rate": 0.0003359578736208626, "loss": 2.92, "theoretical_loss": 3.614954635858921, "tokens_seen": 1105142784 }, { "epoch": 3.02, "learning_rate": 0.00033594784353059174, "loss": 2.7978, "theoretical_loss": 3.614934610945804, "tokens_seen": 1105208320 }, { "epoch": 3.02, "learning_rate": 0.000335937813440321, "loss": 2.9216, "theoretical_loss": 3.6149145875525335, "tokens_seen": 1105273856 }, { "epoch": 3.02, "learning_rate": 0.00033592778335005016, "loss": 2.8682, "theoretical_loss": 3.6148945656789047, "tokens_seen": 1105339392 }, { "epoch": 3.02, "learning_rate": 0.00033591775325977934, "loss": 2.8879, "theoretical_loss": 3.614874545324712, "tokens_seen": 1105404928 }, { "epoch": 3.02, "learning_rate": 0.0003359077231695085, "loss": 2.8338, "theoretical_loss": 3.6148545264897507, "tokens_seen": 1105470464 }, { "epoch": 3.02, "learning_rate": 0.0003358976930792377, "loss": 2.8494, "theoretical_loss": 3.6148345091738143, "tokens_seen": 1105536000 }, { "epoch": 3.02, "learning_rate": 0.0003358876629889669, "loss": 3.0682, "theoretical_loss": 3.6148144933766986, "tokens_seen": 1105601536 }, { "epoch": 3.02, "learning_rate": 0.0003358776328986961, "loss": 2.8834, "theoretical_loss": 3.6147944790981983, "tokens_seen": 1105667072 }, { "epoch": 3.02, "learning_rate": 0.00033586760280842525, "loss": 2.7748, "theoretical_loss": 3.6147744663381074, "tokens_seen": 1105732608 }, { "epoch": 3.02, "learning_rate": 0.0003358575727181545, "loss": 2.9117, "theoretical_loss": 3.6147544550962207, "tokens_seen": 1105798144 }, { "epoch": 3.02, "learning_rate": 0.00033584754262788366, "loss": 2.8933, "theoretical_loss": 3.614734445372334, "tokens_seen": 1105863680 }, { "epoch": 3.02, "objective/train/docs_used": 1773480, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.135927677154541, "objective/train/theoretical_loss": 3.6147294431785255, "objective/train/tokens_used": 1126340064, "theoretical_loss": 3.6147294431785255, "tokens_seen": 1105880064 }, { "epoch": 3.02, "learning_rate": 0.00033583751253761285, "loss": 3.0158, "theoretical_loss": 3.614714437166241, "tokens_seen": 1105929216 }, { "epoch": 3.02, "learning_rate": 0.000335827482447342, "loss": 2.8001, "theoretical_loss": 3.614694430477738, "tokens_seen": 1105994752 }, { "epoch": 3.02, "learning_rate": 0.0003358174523570712, "loss": 2.8308, "theoretical_loss": 3.6146744253066183, "tokens_seen": 1106060288 }, { "epoch": 3.02, "learning_rate": 0.0003358074222668004, "loss": 2.8561, "theoretical_loss": 3.614654421652679, "tokens_seen": 1106125824 }, { "epoch": 3.02, "learning_rate": 0.0003357973921765296, "loss": 3.0504, "theoretical_loss": 3.6146344195157134, "tokens_seen": 1106191360 }, { "epoch": 3.02, "learning_rate": 0.00033578736208625875, "loss": 2.9413, "theoretical_loss": 3.6146144188955174, "tokens_seen": 1106256896 }, { "epoch": 3.02, "learning_rate": 0.000335777331995988, "loss": 2.7936, "theoretical_loss": 3.614594419791886, "tokens_seen": 1106322432 }, { "epoch": 3.02, "learning_rate": 0.0003357673019057171, "loss": 2.8708, "theoretical_loss": 3.614574422204614, "tokens_seen": 1106387968 }, { "epoch": 3.02, "learning_rate": 0.00033575727181544635, "loss": 2.76, "theoretical_loss": 3.6145544261334974, "tokens_seen": 1106453504 }, { "epoch": 3.02, "learning_rate": 0.00033574724172517553, "loss": 2.8387, "theoretical_loss": 3.614534431578331, "tokens_seen": 1106519040 }, { "epoch": 3.02, "learning_rate": 0.0003357372116349047, "loss": 2.8398, "theoretical_loss": 3.61451443853891, "tokens_seen": 1106584576 }, { "epoch": 3.02, "learning_rate": 0.0003357271815446339, "loss": 2.7765, "theoretical_loss": 3.61449444701503, "tokens_seen": 1106650112 }, { "epoch": 3.02, "learning_rate": 0.00033571715145436313, "loss": 2.8492, "theoretical_loss": 3.6144744570064855, "tokens_seen": 1106715648 }, { "epoch": 3.02, "learning_rate": 0.00033570712136409225, "loss": 2.6675, "theoretical_loss": 3.614454468513074, "tokens_seen": 1106781184 }, { "epoch": 3.02, "learning_rate": 0.0003356970912738215, "loss": 2.8999, "theoretical_loss": 3.6144344815345884, "tokens_seen": 1106846720 }, { "epoch": 3.02, "learning_rate": 0.0003356870611835506, "loss": 2.7562, "theoretical_loss": 3.614414496070826, "tokens_seen": 1106912256 }, { "epoch": 3.02, "learning_rate": 0.00033567703109327985, "loss": 2.8386, "theoretical_loss": 3.6143945121215815, "tokens_seen": 1106977792 }, { "epoch": 3.02, "learning_rate": 0.00033566700100300903, "loss": 3.0239, "theoretical_loss": 3.6143745296866507, "tokens_seen": 1107043328 }, { "epoch": 3.02, "learning_rate": 0.0003356569709127382, "loss": 2.8594, "theoretical_loss": 3.6143545487658297, "tokens_seen": 1107108864 }, { "epoch": 3.02, "learning_rate": 0.0003356469408224674, "loss": 2.8126, "theoretical_loss": 3.6143345693589133, "tokens_seen": 1107174400 }, { "epoch": 3.02, "learning_rate": 0.0003356369107321966, "loss": 2.9636, "theoretical_loss": 3.614314591465697, "tokens_seen": 1107239936 }, { "epoch": 3.02, "learning_rate": 0.00033562688064192576, "loss": 2.937, "theoretical_loss": 3.614294615085978, "tokens_seen": 1107305472 }, { "epoch": 3.02, "learning_rate": 0.000335616850551655, "loss": 2.9297, "theoretical_loss": 3.614274640219551, "tokens_seen": 1107371008 }, { "epoch": 3.02, "learning_rate": 0.0003356068204613841, "loss": 2.9701, "theoretical_loss": 3.614254666866212, "tokens_seen": 1107436544 }, { "epoch": 3.02, "learning_rate": 0.00033559679037111336, "loss": 2.8063, "theoretical_loss": 3.6142346950257567, "tokens_seen": 1107502080 }, { "epoch": 3.02, "objective/train/docs_used": 1776229, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2258996963500977, "objective/train/theoretical_loss": 3.6142297023020067, "objective/train/tokens_used": 1127978464, "theoretical_loss": 3.6142297023020067, "tokens_seen": 1107518464 }, { "epoch": 3.02, "learning_rate": 0.0003355867602808425, "loss": 2.8103, "theoretical_loss": 3.614214724697981, "tokens_seen": 1107567616 }, { "epoch": 3.03, "learning_rate": 0.0003355767301905717, "loss": 3.077, "theoretical_loss": 3.6141947558826812, "tokens_seen": 1107633152 }, { "epoch": 3.03, "learning_rate": 0.00033556670010030095, "loss": 2.8892, "theoretical_loss": 3.614174788579653, "tokens_seen": 1107698688 }, { "epoch": 3.03, "learning_rate": 0.0003355566700100301, "loss": 2.8852, "theoretical_loss": 3.6141548227886924, "tokens_seen": 1107764224 }, { "epoch": 3.03, "learning_rate": 0.0003355466399197593, "loss": 2.7473, "theoretical_loss": 3.614134858509596, "tokens_seen": 1107829760 }, { "epoch": 3.03, "learning_rate": 0.0003355366098294885, "loss": 2.7895, "theoretical_loss": 3.6141148957421585, "tokens_seen": 1107895296 }, { "epoch": 3.03, "learning_rate": 0.0003355265797392177, "loss": 3.0512, "theoretical_loss": 3.6140949344861775, "tokens_seen": 1107960832 }, { "epoch": 3.03, "learning_rate": 0.00033551654964894686, "loss": 2.9066, "theoretical_loss": 3.6140749747414485, "tokens_seen": 1108026368 }, { "epoch": 3.03, "learning_rate": 0.00033550651955867604, "loss": 2.8881, "theoretical_loss": 3.614055016507768, "tokens_seen": 1108091904 }, { "epoch": 3.03, "learning_rate": 0.0003354964894684052, "loss": 2.8894, "theoretical_loss": 3.6140350597849316, "tokens_seen": 1108157440 }, { "epoch": 3.03, "learning_rate": 0.00033548645937813446, "loss": 2.8693, "theoretical_loss": 3.6140151045727364, "tokens_seen": 1108222976 }, { "epoch": 3.03, "learning_rate": 0.0003354764292878636, "loss": 2.8191, "theoretical_loss": 3.613995150870978, "tokens_seen": 1108288512 }, { "epoch": 3.03, "learning_rate": 0.0003354663991975928, "loss": 2.5494, "theoretical_loss": 3.6139751986794533, "tokens_seen": 1108354048 }, { "epoch": 3.03, "learning_rate": 0.00033545636910732195, "loss": 2.7351, "theoretical_loss": 3.613955247997959, "tokens_seen": 1108419584 }, { "epoch": 3.03, "learning_rate": 0.0003354463390170512, "loss": 2.9293, "theoretical_loss": 3.6139352988262905, "tokens_seen": 1108485120 }, { "epoch": 3.03, "learning_rate": 0.00033543630892678036, "loss": 2.9781, "theoretical_loss": 3.6139153511642452, "tokens_seen": 1108550656 }, { "epoch": 3.03, "learning_rate": 0.00033542627883650954, "loss": 2.9417, "theoretical_loss": 3.613895405011619, "tokens_seen": 1108616192 }, { "epoch": 3.03, "learning_rate": 0.0003354162487462387, "loss": 2.9009, "theoretical_loss": 3.6138754603682095, "tokens_seen": 1108681728 }, { "epoch": 3.03, "learning_rate": 0.0003354062186559679, "loss": 2.6306, "theoretical_loss": 3.6138555172338114, "tokens_seen": 1108747264 }, { "epoch": 3.03, "learning_rate": 0.0003353961885656971, "loss": 2.8815, "theoretical_loss": 3.613835575608223, "tokens_seen": 1108812800 }, { "epoch": 3.03, "learning_rate": 0.0003353861584754263, "loss": 2.9565, "theoretical_loss": 3.613815635491241, "tokens_seen": 1108878336 }, { "epoch": 3.03, "learning_rate": 0.00033537612838515545, "loss": 2.9256, "theoretical_loss": 3.6137956968826614, "tokens_seen": 1108943872 }, { "epoch": 3.03, "learning_rate": 0.0003353660982948847, "loss": 2.8205, "theoretical_loss": 3.613775759782281, "tokens_seen": 1109009408 }, { "epoch": 3.03, "learning_rate": 0.00033535606820461386, "loss": 3.0214, "theoretical_loss": 3.6137558241898966, "tokens_seen": 1109074944 }, { "epoch": 3.03, "learning_rate": 0.00033534603811434305, "loss": 2.7949, "theoretical_loss": 3.613735890105305, "tokens_seen": 1109140480 }, { "epoch": 3.03, "objective/train/docs_used": 1779121, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0580368041992188, "objective/train/theoretical_loss": 3.6137309068197263, "objective/train/tokens_used": 1129616864, "theoretical_loss": 3.6137309068197263, "tokens_seen": 1109156864 }, { "epoch": 3.03, "learning_rate": 0.0003353360080240722, "loss": 2.9046, "theoretical_loss": 3.613715957528304, "tokens_seen": 1109206016 }, { "epoch": 3.03, "learning_rate": 0.0003353259779338014, "loss": 2.9147, "theoretical_loss": 3.61369602645869, "tokens_seen": 1109271552 }, { "epoch": 3.03, "learning_rate": 0.0003353159478435306, "loss": 3.0322, "theoretical_loss": 3.613676096896259, "tokens_seen": 1109337088 }, { "epoch": 3.03, "learning_rate": 0.0003353059177532598, "loss": 2.6878, "theoretical_loss": 3.6136561688408095, "tokens_seen": 1109402624 }, { "epoch": 3.03, "learning_rate": 0.00033529588766298895, "loss": 2.8346, "theoretical_loss": 3.613636242292137, "tokens_seen": 1109468160 }, { "epoch": 3.03, "learning_rate": 0.0003352858575727182, "loss": 2.8273, "theoretical_loss": 3.61361631725004, "tokens_seen": 1109533696 }, { "epoch": 3.03, "learning_rate": 0.0003352758274824473, "loss": 2.9751, "theoretical_loss": 3.6135963937143147, "tokens_seen": 1109599232 }, { "epoch": 3.03, "learning_rate": 0.00033526579739217655, "loss": 2.9639, "theoretical_loss": 3.6135764716847585, "tokens_seen": 1109664768 }, { "epoch": 3.03, "learning_rate": 0.00033525576730190573, "loss": 2.969, "theoretical_loss": 3.613556551161169, "tokens_seen": 1109730304 }, { "epoch": 3.03, "learning_rate": 0.0003352457372116349, "loss": 2.7009, "theoretical_loss": 3.613536632143343, "tokens_seen": 1109795840 }, { "epoch": 3.03, "learning_rate": 0.0003352357071213641, "loss": 2.8817, "theoretical_loss": 3.6135167146310776, "tokens_seen": 1109861376 }, { "epoch": 3.03, "learning_rate": 0.00033522567703109333, "loss": 2.8737, "theoretical_loss": 3.61349679862417, "tokens_seen": 1109926912 }, { "epoch": 3.03, "learning_rate": 0.00033521564694082245, "loss": 2.7757, "theoretical_loss": 3.6134768841224187, "tokens_seen": 1109992448 }, { "epoch": 3.03, "learning_rate": 0.0003352056168505517, "loss": 2.8938, "theoretical_loss": 3.61345697112562, "tokens_seen": 1110057984 }, { "epoch": 3.03, "learning_rate": 0.0003351955867602808, "loss": 2.8209, "theoretical_loss": 3.613437059633572, "tokens_seen": 1110123520 }, { "epoch": 3.03, "learning_rate": 0.00033518555667001005, "loss": 2.9107, "theoretical_loss": 3.613417149646071, "tokens_seen": 1110189056 }, { "epoch": 3.03, "learning_rate": 0.00033517552657973923, "loss": 2.8939, "theoretical_loss": 3.6133972411629163, "tokens_seen": 1110254592 }, { "epoch": 3.03, "learning_rate": 0.0003351654964894684, "loss": 2.8592, "theoretical_loss": 3.6133773341839035, "tokens_seen": 1110320128 }, { "epoch": 3.03, "learning_rate": 0.0003351554663991976, "loss": 2.6176, "theoretical_loss": 3.6133574287088317, "tokens_seen": 1110385664 }, { "epoch": 3.03, "learning_rate": 0.0003351454363089268, "loss": 2.7948, "theoretical_loss": 3.613337524737498, "tokens_seen": 1110451200 }, { "epoch": 3.03, "learning_rate": 0.00033513540621865596, "loss": 2.843, "theoretical_loss": 3.6133176222697, "tokens_seen": 1110516736 }, { "epoch": 3.03, "learning_rate": 0.0003351253761283852, "loss": 2.9431, "theoretical_loss": 3.6132977213052353, "tokens_seen": 1110582272 }, { "epoch": 3.03, "learning_rate": 0.0003351153460381143, "loss": 2.9451, "theoretical_loss": 3.613277821843902, "tokens_seen": 1110647808 }, { "epoch": 3.03, "learning_rate": 0.00033510531594784356, "loss": 2.7636, "theoretical_loss": 3.6132579238854974, "tokens_seen": 1110713344 }, { "epoch": 3.03, "learning_rate": 0.0003350952858575727, "loss": 2.7368, "theoretical_loss": 3.6132380274298197, "tokens_seen": 1110778880 }, { "epoch": 3.03, "objective/train/docs_used": 1780704, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9335076808929443, "objective/train/theoretical_loss": 3.613233053550678, "objective/train/tokens_used": 1131255264, "theoretical_loss": 3.613233053550678, "tokens_seen": 1110795264 }, { "epoch": 3.03, "learning_rate": 0.0003350852557673019, "loss": 2.9585, "theoretical_loss": 3.613218132476667, "tokens_seen": 1110844416 }, { "epoch": 3.03, "learning_rate": 0.0003350752256770311, "loss": 2.8459, "theoretical_loss": 3.613198239025836, "tokens_seen": 1110909952 }, { "epoch": 3.03, "learning_rate": 0.0003350651955867603, "loss": 2.9026, "theoretical_loss": 3.6131783470771266, "tokens_seen": 1110975488 }, { "epoch": 3.03, "learning_rate": 0.00033505516549648946, "loss": 2.9281, "theoretical_loss": 3.6131584566303347, "tokens_seen": 1111041024 }, { "epoch": 3.03, "learning_rate": 0.0003350451354062187, "loss": 2.8582, "theoretical_loss": 3.61313856768526, "tokens_seen": 1111106560 }, { "epoch": 3.03, "learning_rate": 0.0003350351053159478, "loss": 2.7809, "theoretical_loss": 3.6131186802416995, "tokens_seen": 1111172096 }, { "epoch": 3.03, "learning_rate": 0.00033502507522567706, "loss": 2.846, "theoretical_loss": 3.613098794299452, "tokens_seen": 1111237632 }, { "epoch": 3.03, "learning_rate": 0.0003350150451354062, "loss": 2.8218, "theoretical_loss": 3.6130789098583147, "tokens_seen": 1111303168 }, { "epoch": 3.03, "learning_rate": 0.0003350050150451354, "loss": 2.9124, "theoretical_loss": 3.6130590269180862, "tokens_seen": 1111368704 }, { "epoch": 3.03, "learning_rate": 0.0003349949849548646, "loss": 2.8815, "theoretical_loss": 3.6130391454785658, "tokens_seen": 1111434240 }, { "epoch": 3.03, "learning_rate": 0.0003349849548645938, "loss": 2.7564, "theoretical_loss": 3.61301926553955, "tokens_seen": 1111499776 }, { "epoch": 3.03, "learning_rate": 0.00033497492477432296, "loss": 2.8046, "theoretical_loss": 3.612999387100838, "tokens_seen": 1111565312 }, { "epoch": 3.03, "learning_rate": 0.00033496489468405215, "loss": 2.8272, "theoretical_loss": 3.612979510162228, "tokens_seen": 1111630848 }, { "epoch": 3.03, "learning_rate": 0.0003349548645937813, "loss": 2.7598, "theoretical_loss": 3.612959634723519, "tokens_seen": 1111696384 }, { "epoch": 3.03, "learning_rate": 0.00033494483450351056, "loss": 2.7622, "theoretical_loss": 3.612939760784508, "tokens_seen": 1111761920 }, { "epoch": 3.03, "learning_rate": 0.0003349348044132397, "loss": 3.0008, "theoretical_loss": 3.6129198883449947, "tokens_seen": 1111827456 }, { "epoch": 3.03, "learning_rate": 0.0003349247743229689, "loss": 2.9286, "theoretical_loss": 3.6129000174047774, "tokens_seen": 1111892992 }, { "epoch": 3.03, "learning_rate": 0.00033491474423269805, "loss": 2.8793, "theoretical_loss": 3.6128801479636543, "tokens_seen": 1111958528 }, { "epoch": 3.03, "learning_rate": 0.0003349047141424273, "loss": 2.8699, "theoretical_loss": 3.6128602800214233, "tokens_seen": 1112024064 }, { "epoch": 3.03, "learning_rate": 0.00033489468405215647, "loss": 2.7857, "theoretical_loss": 3.612840413577884, "tokens_seen": 1112089600 }, { "epoch": 3.03, "learning_rate": 0.00033488465396188565, "loss": 2.8717, "theoretical_loss": 3.6128205486328353, "tokens_seen": 1112155136 }, { "epoch": 3.03, "learning_rate": 0.00033487462387161483, "loss": 2.8998, "theoretical_loss": 3.612800685186075, "tokens_seen": 1112220672 }, { "epoch": 3.03, "learning_rate": 0.00033486459378134406, "loss": 2.8182, "theoretical_loss": 3.6127808232374026, "tokens_seen": 1112286208 }, { "epoch": 3.03, "learning_rate": 0.0003348545636910732, "loss": 2.9893, "theoretical_loss": 3.612760962786616, "tokens_seen": 1112351744 }, { "epoch": 3.03, "learning_rate": 0.0003348445336008024, "loss": 2.8611, "theoretical_loss": 3.612741103833514, "tokens_seen": 1112417280 }, { "epoch": 3.03, "objective/train/docs_used": 1783596, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9412996768951416, "objective/train/theoretical_loss": 3.6127361393292285, "objective/train/tokens_used": 1132893664, "theoretical_loss": 3.6127361393292285, "tokens_seen": 1112433664 }, { "epoch": 3.03, "learning_rate": 0.0003348345035105316, "loss": 2.822, "theoretical_loss": 3.6127212463778964, "tokens_seen": 1112482816 }, { "epoch": 3.03, "learning_rate": 0.0003348244734202608, "loss": 2.8542, "theoretical_loss": 3.612701390419562, "tokens_seen": 1112548352 }, { "epoch": 3.03, "learning_rate": 0.00033481444332999, "loss": 2.8462, "theoretical_loss": 3.612681535958308, "tokens_seen": 1112613888 }, { "epoch": 3.03, "learning_rate": 0.00033480441323971915, "loss": 2.9405, "theoretical_loss": 3.6126616829939353, "tokens_seen": 1112679424 }, { "epoch": 3.03, "learning_rate": 0.0003347943831494484, "loss": 2.8249, "theoretical_loss": 3.6126418315262425, "tokens_seen": 1112744960 }, { "epoch": 3.03, "learning_rate": 0.0003347843530591775, "loss": 2.9179, "theoretical_loss": 3.6126219815550282, "tokens_seen": 1112810496 }, { "epoch": 3.03, "learning_rate": 0.00033477432296890675, "loss": 2.8631, "theoretical_loss": 3.6126021330800913, "tokens_seen": 1112876032 }, { "epoch": 3.03, "learning_rate": 0.00033476429287863593, "loss": 2.9187, "theoretical_loss": 3.6125822861012313, "tokens_seen": 1112941568 }, { "epoch": 3.03, "learning_rate": 0.0003347542627883651, "loss": 2.8019, "theoretical_loss": 3.6125624406182473, "tokens_seen": 1113007104 }, { "epoch": 3.03, "learning_rate": 0.0003347442326980943, "loss": 2.8808, "theoretical_loss": 3.612542596630938, "tokens_seen": 1113072640 }, { "epoch": 3.03, "learning_rate": 0.00033473420260782353, "loss": 2.8695, "theoretical_loss": 3.6125227541391034, "tokens_seen": 1113138176 }, { "epoch": 3.03, "learning_rate": 0.00033472417251755265, "loss": 2.8146, "theoretical_loss": 3.6125029131425426, "tokens_seen": 1113203712 }, { "epoch": 3.03, "learning_rate": 0.0003347141424272819, "loss": 2.9956, "theoretical_loss": 3.6124830736410543, "tokens_seen": 1113269248 }, { "epoch": 3.03, "learning_rate": 0.000334704112337011, "loss": 2.9151, "theoretical_loss": 3.6124632356344386, "tokens_seen": 1113334784 }, { "epoch": 3.03, "learning_rate": 0.00033469408224674025, "loss": 2.9632, "theoretical_loss": 3.6124433991224945, "tokens_seen": 1113400320 }, { "epoch": 3.03, "learning_rate": 0.00033468405215646943, "loss": 2.8918, "theoretical_loss": 3.612423564105021, "tokens_seen": 1113465856 }, { "epoch": 3.03, "learning_rate": 0.0003346740220661986, "loss": 2.8383, "theoretical_loss": 3.6124037305818186, "tokens_seen": 1113531392 }, { "epoch": 3.03, "learning_rate": 0.0003346639919759278, "loss": 2.8096, "theoretical_loss": 3.612383898552686, "tokens_seen": 1113596928 }, { "epoch": 3.03, "learning_rate": 0.000334653961885657, "loss": 2.9389, "theoretical_loss": 3.612364068017423, "tokens_seen": 1113662464 }, { "epoch": 3.03, "learning_rate": 0.00033464393179538616, "loss": 2.7641, "theoretical_loss": 3.612344238975829, "tokens_seen": 1113728000 }, { "epoch": 3.03, "learning_rate": 0.0003346339017051154, "loss": 2.9479, "theoretical_loss": 3.6123244114277036, "tokens_seen": 1113793536 }, { "epoch": 3.03, "learning_rate": 0.0003346238716148445, "loss": 2.9779, "theoretical_loss": 3.612304585372847, "tokens_seen": 1113859072 }, { "epoch": 3.03, "learning_rate": 0.00033461384152457376, "loss": 2.9273, "theoretical_loss": 3.612284760811058, "tokens_seen": 1113924608 }, { "epoch": 3.03, "learning_rate": 0.0003346038114343029, "loss": 3.032, "theoretical_loss": 3.612264937742137, "tokens_seen": 1113990144 }, { "epoch": 3.03, "learning_rate": 0.0003345937813440321, "loss": 2.7863, "theoretical_loss": 3.6122451161658837, "tokens_seen": 1114055680 }, { "epoch": 3.03, "objective/train/docs_used": 1786457, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.881303310394287, "objective/train/theoretical_loss": 3.612240161005026, "objective/train/tokens_used": 1134532064, "theoretical_loss": 3.612240161005026, "tokens_seen": 1114072064 }, { "epoch": 3.03, "learning_rate": 0.0003345837512537613, "loss": 2.9861, "theoretical_loss": 3.612225296082097, "tokens_seen": 1114121216 }, { "epoch": 3.03, "learning_rate": 0.0003345737211634905, "loss": 2.9367, "theoretical_loss": 3.6122054774905785, "tokens_seen": 1114186752 }, { "epoch": 3.03, "learning_rate": 0.00033456369107321966, "loss": 2.8102, "theoretical_loss": 3.6121856603911264, "tokens_seen": 1114252288 }, { "epoch": 3.03, "learning_rate": 0.0003345536609829489, "loss": 2.9678, "theoretical_loss": 3.612165844783542, "tokens_seen": 1114317824 }, { "epoch": 3.03, "learning_rate": 0.000334543630892678, "loss": 2.852, "theoretical_loss": 3.6121460306676236, "tokens_seen": 1114383360 }, { "epoch": 3.03, "learning_rate": 0.00033453360080240726, "loss": 2.9925, "theoretical_loss": 3.612126218043173, "tokens_seen": 1114448896 }, { "epoch": 3.03, "learning_rate": 0.0003345235707121364, "loss": 2.8604, "theoretical_loss": 3.612106406909989, "tokens_seen": 1114514432 }, { "epoch": 3.03, "learning_rate": 0.0003345135406218656, "loss": 2.8995, "theoretical_loss": 3.612086597267872, "tokens_seen": 1114579968 }, { "epoch": 3.03, "learning_rate": 0.0003345035105315948, "loss": 2.9028, "theoretical_loss": 3.6120667891166223, "tokens_seen": 1114645504 }, { "epoch": 3.03, "learning_rate": 0.000334493480441324, "loss": 2.9692, "theoretical_loss": 3.6120469824560395, "tokens_seen": 1114711040 }, { "epoch": 3.03, "learning_rate": 0.00033448345035105316, "loss": 2.9301, "theoretical_loss": 3.612027177285925, "tokens_seen": 1114776576 }, { "epoch": 3.03, "learning_rate": 0.00033447342026078235, "loss": 2.8143, "theoretical_loss": 3.612007373606078, "tokens_seen": 1114842112 }, { "epoch": 3.03, "learning_rate": 0.0003344633901705115, "loss": 2.7035, "theoretical_loss": 3.6119875714162983, "tokens_seen": 1114907648 }, { "epoch": 3.03, "learning_rate": 0.00033445336008024076, "loss": 2.8046, "theoretical_loss": 3.6119677707163875, "tokens_seen": 1114973184 }, { "epoch": 3.03, "learning_rate": 0.0003344433299899699, "loss": 2.8347, "theoretical_loss": 3.6119479715061455, "tokens_seen": 1115038720 }, { "epoch": 3.03, "learning_rate": 0.0003344332998996991, "loss": 2.9695, "theoretical_loss": 3.6119281737853726, "tokens_seen": 1115104256 }, { "epoch": 3.03, "learning_rate": 0.00033442326980942825, "loss": 3.0068, "theoretical_loss": 3.611908377553869, "tokens_seen": 1115169792 }, { "epoch": 3.03, "learning_rate": 0.0003344132397191575, "loss": 2.8478, "theoretical_loss": 3.611888582811435, "tokens_seen": 1115235328 }, { "epoch": 3.03, "learning_rate": 0.00033440320962888667, "loss": 2.7963, "theoretical_loss": 3.6118687895578723, "tokens_seen": 1115300864 }, { "epoch": 3.03, "learning_rate": 0.00033439317953861585, "loss": 2.8111, "theoretical_loss": 3.61184899779298, "tokens_seen": 1115366400 }, { "epoch": 3.03, "learning_rate": 0.00033438314944834503, "loss": 2.7411, "theoretical_loss": 3.6118292075165592, "tokens_seen": 1115431936 }, { "epoch": 3.03, "learning_rate": 0.00033437311935807427, "loss": 2.846, "theoretical_loss": 3.6118094187284107, "tokens_seen": 1115497472 }, { "epoch": 3.03, "learning_rate": 0.0003343630892678034, "loss": 2.9916, "theoretical_loss": 3.6117896314283344, "tokens_seen": 1115563008 }, { "epoch": 3.03, "learning_rate": 0.00033435305917753263, "loss": 2.774, "theoretical_loss": 3.6117698456161325, "tokens_seen": 1115628544 }, { "epoch": 3.03, "learning_rate": 0.00033434302908726175, "loss": 2.9826, "theoretical_loss": 3.6117500612916045, "tokens_seen": 1115694080 }, { "epoch": 3.03, "objective/train/docs_used": 1789480, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.660053253173828, "objective/train/theoretical_loss": 3.6117451154428983, "objective/train/tokens_used": 1136170464, "theoretical_loss": 3.6117451154428983, "tokens_seen": 1115710464 }, { "epoch": 3.03, "learning_rate": 0.000334332998996991, "loss": 2.8797, "theoretical_loss": 3.6117302784545515, "tokens_seen": 1115759616 }, { "epoch": 3.03, "learning_rate": 0.00033432296890672017, "loss": 2.9507, "theoretical_loss": 3.611710497104774, "tokens_seen": 1115825152 }, { "epoch": 3.03, "learning_rate": 0.00033431293881644935, "loss": 2.8158, "theoretical_loss": 3.6116907172420736, "tokens_seen": 1115890688 }, { "epoch": 3.03, "learning_rate": 0.00033430290872617853, "loss": 2.9369, "theoretical_loss": 3.6116709388662507, "tokens_seen": 1115956224 }, { "epoch": 3.03, "learning_rate": 0.0003342928786359077, "loss": 2.7401, "theoretical_loss": 3.6116511619771057, "tokens_seen": 1116021760 }, { "epoch": 3.03, "learning_rate": 0.0003342828485456369, "loss": 2.9204, "theoretical_loss": 3.6116313865744405, "tokens_seen": 1116087296 }, { "epoch": 3.03, "learning_rate": 0.00033427281845536613, "loss": 2.9718, "theoretical_loss": 3.6116116126580557, "tokens_seen": 1116152832 }, { "epoch": 3.03, "learning_rate": 0.00033426278836509526, "loss": 2.8568, "theoretical_loss": 3.611591840227752, "tokens_seen": 1116218368 }, { "epoch": 3.03, "learning_rate": 0.0003342527582748245, "loss": 2.8922, "theoretical_loss": 3.6115720692833313, "tokens_seen": 1116283904 }, { "epoch": 3.03, "learning_rate": 0.0003342427281845536, "loss": 2.8029, "theoretical_loss": 3.611552299824594, "tokens_seen": 1116349440 }, { "epoch": 3.03, "learning_rate": 0.00033423269809428285, "loss": 2.8592, "theoretical_loss": 3.611532531851341, "tokens_seen": 1116414976 }, { "epoch": 3.03, "learning_rate": 0.00033422266800401204, "loss": 2.813, "theoretical_loss": 3.6115127653633747, "tokens_seen": 1116480512 }, { "epoch": 3.03, "learning_rate": 0.0003342126379137412, "loss": 2.8702, "theoretical_loss": 3.6114930003604955, "tokens_seen": 1116546048 }, { "epoch": 3.03, "learning_rate": 0.0003342026078234704, "loss": 2.8941, "theoretical_loss": 3.611473236842505, "tokens_seen": 1116611584 }, { "epoch": 3.03, "learning_rate": 0.00033419257773319963, "loss": 2.8542, "theoretical_loss": 3.6114534748092035, "tokens_seen": 1116677120 }, { "epoch": 3.03, "learning_rate": 0.00033418254764292876, "loss": 2.8992, "theoretical_loss": 3.611433714260393, "tokens_seen": 1116742656 }, { "epoch": 3.03, "learning_rate": 0.000334172517552658, "loss": 2.9566, "theoretical_loss": 3.611413955195876, "tokens_seen": 1116808192 }, { "epoch": 3.03, "learning_rate": 0.0003341624874623871, "loss": 2.9342, "theoretical_loss": 3.6113941976154518, "tokens_seen": 1116873728 }, { "epoch": 3.03, "learning_rate": 0.00033415245737211636, "loss": 2.7905, "theoretical_loss": 3.6113744415189237, "tokens_seen": 1116939264 }, { "epoch": 3.03, "learning_rate": 0.00033414242728184554, "loss": 3.055, "theoretical_loss": 3.6113546869060924, "tokens_seen": 1117004800 }, { "epoch": 3.03, "learning_rate": 0.0003341323971915747, "loss": 2.8152, "theoretical_loss": 3.6113349337767593, "tokens_seen": 1117070336 }, { "epoch": 3.03, "learning_rate": 0.0003341223671013039, "loss": 3.0846, "theoretical_loss": 3.611315182130726, "tokens_seen": 1117135872 }, { "epoch": 3.03, "learning_rate": 0.0003341123370110331, "loss": 2.8871, "theoretical_loss": 3.611295431967794, "tokens_seen": 1117201408 }, { "epoch": 3.03, "learning_rate": 0.00033410230692076226, "loss": 2.706, "theoretical_loss": 3.611275683287766, "tokens_seen": 1117266944 }, { "epoch": 3.03, "learning_rate": 0.0003340922768304915, "loss": 2.991, "theoretical_loss": 3.6112559360904424, "tokens_seen": 1117332480 }, { "epoch": 3.03, "objective/train/docs_used": 1792364, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.657228708267212, "objective/train/theoretical_loss": 3.6112509995227606, "objective/train/tokens_used": 1137808864, "theoretical_loss": 3.6112509995227606, "tokens_seen": 1117348864 }, { "epoch": 3.03, "learning_rate": 0.0003340822467402207, "loss": 2.7967, "theoretical_loss": 3.6112361903756254, "tokens_seen": 1117398016 }, { "epoch": 3.03, "learning_rate": 0.00033407221664994986, "loss": 2.9931, "theoretical_loss": 3.611216446143117, "tokens_seen": 1117463552 }, { "epoch": 3.03, "learning_rate": 0.0003340621865596791, "loss": 2.91, "theoretical_loss": 3.611196703392719, "tokens_seen": 1117529088 }, { "epoch": 3.03, "learning_rate": 0.0003340521564694082, "loss": 2.9613, "theoretical_loss": 3.6111769621242322, "tokens_seen": 1117594624 }, { "epoch": 3.03, "learning_rate": 0.00033404212637913746, "loss": 2.8507, "theoretical_loss": 3.61115722233746, "tokens_seen": 1117660160 }, { "epoch": 3.03, "learning_rate": 0.0003340320962888666, "loss": 2.9666, "theoretical_loss": 3.611137484032203, "tokens_seen": 1117725696 }, { "epoch": 3.03, "learning_rate": 0.0003340220661985958, "loss": 2.7809, "theoretical_loss": 3.6111177472082643, "tokens_seen": 1117791232 }, { "epoch": 3.03, "learning_rate": 0.000334012036108325, "loss": 2.9514, "theoretical_loss": 3.611098011865445, "tokens_seen": 1117856768 }, { "epoch": 3.03, "learning_rate": 0.0003340020060180542, "loss": 2.9609, "theoretical_loss": 3.611078278003548, "tokens_seen": 1117922304 }, { "epoch": 3.03, "learning_rate": 0.00033399197592778336, "loss": 2.9525, "theoretical_loss": 3.6110585456223747, "tokens_seen": 1117987840 }, { "epoch": 3.03, "learning_rate": 0.00033398194583751255, "loss": 3.0142, "theoretical_loss": 3.6110388147217267, "tokens_seen": 1118053376 }, { "epoch": 3.03, "learning_rate": 0.0003339719157472417, "loss": 2.896, "theoretical_loss": 3.6110190853014075, "tokens_seen": 1118118912 }, { "epoch": 3.03, "learning_rate": 0.00033396188565697096, "loss": 2.8084, "theoretical_loss": 3.610999357361218, "tokens_seen": 1118184448 }, { "epoch": 3.03, "learning_rate": 0.0003339518555667001, "loss": 2.9355, "theoretical_loss": 3.6109796309009616, "tokens_seen": 1118249984 }, { "epoch": 3.03, "learning_rate": 0.0003339418254764293, "loss": 2.6865, "theoretical_loss": 3.6109599059204394, "tokens_seen": 1118315520 }, { "epoch": 3.03, "learning_rate": 0.00033393179538615845, "loss": 2.8882, "theoretical_loss": 3.610940182419455, "tokens_seen": 1118381056 }, { "epoch": 3.03, "learning_rate": 0.0003339217652958877, "loss": 2.9161, "theoretical_loss": 3.610920460397809, "tokens_seen": 1118446592 }, { "epoch": 3.03, "learning_rate": 0.00033391173520561687, "loss": 2.9151, "theoretical_loss": 3.610900739855305, "tokens_seen": 1118512128 }, { "epoch": 3.03, "learning_rate": 0.00033390170511534605, "loss": 2.9681, "theoretical_loss": 3.6108810207917457, "tokens_seen": 1118577664 }, { "epoch": 3.03, "learning_rate": 0.00033389167502507523, "loss": 2.8844, "theoretical_loss": 3.6108613032069328, "tokens_seen": 1118643200 }, { "epoch": 3.03, "learning_rate": 0.00033388164493480447, "loss": 2.7164, "theoretical_loss": 3.6108415871006687, "tokens_seen": 1118708736 }, { "epoch": 3.03, "learning_rate": 0.0003338716148445336, "loss": 2.7464, "theoretical_loss": 3.6108218724727568, "tokens_seen": 1118774272 }, { "epoch": 3.03, "learning_rate": 0.00033386158475426283, "loss": 2.9235, "theoretical_loss": 3.6108021593229984, "tokens_seen": 1118839808 }, { "epoch": 3.03, "learning_rate": 0.00033385155466399195, "loss": 2.9311, "theoretical_loss": 3.610782447651197, "tokens_seen": 1118905344 }, { "epoch": 3.03, "learning_rate": 0.0003338415245737212, "loss": 2.8086, "theoretical_loss": 3.610762737457155, "tokens_seen": 1118970880 }, { "epoch": 3.03, "objective/train/docs_used": 1794356, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.833271026611328, "objective/train/theoretical_loss": 3.6107578101395212, "objective/train/tokens_used": 1139447264, "theoretical_loss": 3.6107578101395212, "tokens_seen": 1118987264 }, { "epoch": 3.03, "learning_rate": 0.00033383149448345037, "loss": 2.7927, "theoretical_loss": 3.610743028740675, "tokens_seen": 1119036416 }, { "epoch": 3.03, "learning_rate": 0.00033382146439317955, "loss": 2.8367, "theoretical_loss": 3.6107233215015597, "tokens_seen": 1119101952 }, { "epoch": 3.03, "learning_rate": 0.00033381143430290873, "loss": 2.9904, "theoretical_loss": 3.610703615739612, "tokens_seen": 1119167488 }, { "epoch": 3.03, "learning_rate": 0.0003338014042126379, "loss": 3.0214, "theoretical_loss": 3.610683911454635, "tokens_seen": 1119233024 }, { "epoch": 3.03, "learning_rate": 0.0003337913741223671, "loss": 2.7769, "theoretical_loss": 3.6106642086464316, "tokens_seen": 1119298560 }, { "epoch": 3.03, "learning_rate": 0.00033378134403209633, "loss": 2.8976, "theoretical_loss": 3.6106445073148032, "tokens_seen": 1119364096 }, { "epoch": 3.03, "learning_rate": 0.00033377131394182546, "loss": 2.9967, "theoretical_loss": 3.610624807459554, "tokens_seen": 1119429632 }, { "epoch": 3.03, "learning_rate": 0.0003337612838515547, "loss": 2.798, "theoretical_loss": 3.6106051090804865, "tokens_seen": 1119495168 }, { "epoch": 3.03, "learning_rate": 0.0003337512537612838, "loss": 2.8459, "theoretical_loss": 3.6105854121774046, "tokens_seen": 1119560704 }, { "epoch": 3.03, "learning_rate": 0.00033374122367101306, "loss": 2.8379, "theoretical_loss": 3.61056571675011, "tokens_seen": 1119626240 }, { "epoch": 3.03, "learning_rate": 0.00033373119358074224, "loss": 2.7394, "theoretical_loss": 3.610546022798406, "tokens_seen": 1119691776 }, { "epoch": 3.03, "learning_rate": 0.0003337211634904714, "loss": 2.8851, "theoretical_loss": 3.6105263303220965, "tokens_seen": 1119757312 }, { "epoch": 3.03, "learning_rate": 0.0003337111334002006, "loss": 2.849, "theoretical_loss": 3.610506639320984, "tokens_seen": 1119822848 }, { "epoch": 3.03, "learning_rate": 0.00033370110330992983, "loss": 2.9629, "theoretical_loss": 3.610486949794872, "tokens_seen": 1119888384 }, { "epoch": 3.03, "learning_rate": 0.00033369107321965896, "loss": 2.8778, "theoretical_loss": 3.6104672617435627, "tokens_seen": 1119953920 }, { "epoch": 3.03, "learning_rate": 0.0003336810431293882, "loss": 2.8233, "theoretical_loss": 3.610447575166861, "tokens_seen": 1120019456 }, { "epoch": 3.03, "learning_rate": 0.0003336710130391173, "loss": 2.6962, "theoretical_loss": 3.610427890064569, "tokens_seen": 1120084992 }, { "epoch": 3.03, "learning_rate": 0.00033366098294884656, "loss": 2.8555, "theoretical_loss": 3.6104082064364906, "tokens_seen": 1120150528 }, { "epoch": 3.03, "learning_rate": 0.00033365095285857574, "loss": 3.0343, "theoretical_loss": 3.6103885242824285, "tokens_seen": 1120216064 }, { "epoch": 3.03, "learning_rate": 0.0003336409227683049, "loss": 2.8034, "theoretical_loss": 3.6103688436021866, "tokens_seen": 1120281600 }, { "epoch": 3.03, "learning_rate": 0.0003336308926780341, "loss": 2.9315, "theoretical_loss": 3.6103491643955685, "tokens_seen": 1120347136 }, { "epoch": 3.03, "learning_rate": 0.0003336208625877633, "loss": 2.802, "theoretical_loss": 3.610329486662377, "tokens_seen": 1120412672 }, { "epoch": 3.03, "learning_rate": 0.00033361083249749246, "loss": 2.8968, "theoretical_loss": 3.6103098104024163, "tokens_seen": 1120478208 }, { "epoch": 3.03, "learning_rate": 0.0003336008024072217, "loss": 2.984, "theoretical_loss": 3.6102901356154895, "tokens_seen": 1120543744 }, { "epoch": 3.03, "learning_rate": 0.0003335907723169508, "loss": 2.9171, "theoretical_loss": 3.6102704623014, "tokens_seen": 1120609280 }, { "epoch": 3.03, "objective/train/docs_used": 1797089, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.049738645553589, "objective/train/theoretical_loss": 3.610265544202986, "objective/train/tokens_used": 1141085664, "theoretical_loss": 3.610265544202986, "tokens_seen": 1120625664 }, { "epoch": 3.03, "learning_rate": 0.00033358074222668006, "loss": 3.0277, "theoretical_loss": 3.610250790459952, "tokens_seen": 1120674816 }, { "epoch": 3.03, "learning_rate": 0.00033357071213640924, "loss": 2.811, "theoretical_loss": 3.610231120090949, "tokens_seen": 1120740352 }, { "epoch": 3.03, "learning_rate": 0.0003335606820461384, "loss": 2.8575, "theoretical_loss": 3.6102114511941945, "tokens_seen": 1120805888 }, { "epoch": 3.03, "learning_rate": 0.0003335506519558676, "loss": 2.8006, "theoretical_loss": 3.6101917837694923, "tokens_seen": 1120871424 }, { "epoch": 3.03, "learning_rate": 0.0003335406218655968, "loss": 2.9088, "theoretical_loss": 3.610172117816646, "tokens_seen": 1120936960 }, { "epoch": 3.03, "learning_rate": 0.00033353059177532597, "loss": 2.9176, "theoretical_loss": 3.61015245333546, "tokens_seen": 1121002496 }, { "epoch": 3.03, "learning_rate": 0.0003335205616850552, "loss": 2.7223, "theoretical_loss": 3.6101327903257374, "tokens_seen": 1121068032 }, { "epoch": 3.03, "learning_rate": 0.00033351053159478433, "loss": 2.9546, "theoretical_loss": 3.6101131287872823, "tokens_seen": 1121133568 }, { "epoch": 3.03, "learning_rate": 0.00033350050150451356, "loss": 2.8212, "theoretical_loss": 3.6100934687198993, "tokens_seen": 1121199104 }, { "epoch": 3.03, "learning_rate": 0.0003334904714142427, "loss": 2.9276, "theoretical_loss": 3.6100738101233913, "tokens_seen": 1121264640 }, { "epoch": 3.03, "learning_rate": 0.0003334804413239719, "loss": 2.8369, "theoretical_loss": 3.610054152997563, "tokens_seen": 1121330176 }, { "epoch": 3.03, "learning_rate": 0.0003334704112337011, "loss": 2.9464, "theoretical_loss": 3.6100344973422183, "tokens_seen": 1121395712 }, { "epoch": 3.03, "learning_rate": 0.0003334603811434303, "loss": 2.7382, "theoretical_loss": 3.610014843157161, "tokens_seen": 1121461248 }, { "epoch": 3.03, "learning_rate": 0.00033345035105315947, "loss": 2.828, "theoretical_loss": 3.609995190442196, "tokens_seen": 1121526784 }, { "epoch": 3.03, "learning_rate": 0.00033344032096288865, "loss": 2.7583, "theoretical_loss": 3.609975539197126, "tokens_seen": 1121592320 }, { "epoch": 3.03, "learning_rate": 0.00033343029087261783, "loss": 2.8898, "theoretical_loss": 3.609955889421756, "tokens_seen": 1121657856 }, { "epoch": 3.03, "learning_rate": 0.00033342026078234707, "loss": 2.8404, "theoretical_loss": 3.609936241115891, "tokens_seen": 1121723392 }, { "epoch": 3.03, "learning_rate": 0.0003334102306920762, "loss": 2.9108, "theoretical_loss": 3.6099165942793343, "tokens_seen": 1121788928 }, { "epoch": 3.03, "learning_rate": 0.00033340020060180543, "loss": 2.9864, "theoretical_loss": 3.6098969489118904, "tokens_seen": 1121854464 }, { "epoch": 3.03, "learning_rate": 0.0003333901705115346, "loss": 2.9903, "theoretical_loss": 3.6098773050133635, "tokens_seen": 1121920000 }, { "epoch": 3.03, "learning_rate": 0.0003333801404212638, "loss": 2.7407, "theoretical_loss": 3.6098576625835577, "tokens_seen": 1121985536 }, { "epoch": 3.03, "learning_rate": 0.000333370110330993, "loss": 2.856, "theoretical_loss": 3.6098380216222785, "tokens_seen": 1122051072 }, { "epoch": 3.03, "learning_rate": 0.00033336008024072215, "loss": 2.6675, "theoretical_loss": 3.6098183821293297, "tokens_seen": 1122116608 }, { "epoch": 3.03, "learning_rate": 0.00033335005015045134, "loss": 2.9239, "theoretical_loss": 3.6097987441045154, "tokens_seen": 1122182144 }, { "epoch": 3.03, "learning_rate": 0.00033334002006018057, "loss": 2.8805, "theoretical_loss": 3.6097791075476406, "tokens_seen": 1122247680 }, { "epoch": 3.03, "objective/train/docs_used": 1799759, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0512828826904297, "objective/train/theoretical_loss": 3.6097741986377647, "objective/train/tokens_used": 1142724064, "theoretical_loss": 3.6097741986377647, "tokens_seen": 1122264064 }, { "epoch": 3.03, "learning_rate": 0.00033332998996990975, "loss": 2.8788, "theoretical_loss": 3.60975947245851, "tokens_seen": 1122313216 }, { "epoch": 3.03, "learning_rate": 0.00033331995987963893, "loss": 2.9588, "theoretical_loss": 3.6097398388369273, "tokens_seen": 1122378752 }, { "epoch": 3.03, "learning_rate": 0.0003333099297893681, "loss": 2.9459, "theoretical_loss": 3.609720206682698, "tokens_seen": 1122444288 }, { "epoch": 3.03, "learning_rate": 0.0003332998996990973, "loss": 2.9237, "theoretical_loss": 3.6097005759956264, "tokens_seen": 1122509824 }, { "epoch": 3.03, "learning_rate": 0.00033328986960882653, "loss": 2.8508, "theoretical_loss": 3.6096809467755175, "tokens_seen": 1122575360 }, { "epoch": 3.03, "learning_rate": 0.00033327983951855566, "loss": 2.942, "theoretical_loss": 3.6096613190221762, "tokens_seen": 1122640896 }, { "epoch": 3.03, "learning_rate": 0.0003332698094282849, "loss": 2.8458, "theoretical_loss": 3.6096416927354067, "tokens_seen": 1122706432 }, { "epoch": 3.03, "learning_rate": 0.000333259779338014, "loss": 2.8914, "theoretical_loss": 3.609622067915014, "tokens_seen": 1122771968 }, { "epoch": 3.03, "learning_rate": 0.00033324974924774326, "loss": 2.8368, "theoretical_loss": 3.609602444560803, "tokens_seen": 1122837504 }, { "epoch": 3.03, "learning_rate": 0.00033323971915747244, "loss": 2.9141, "theoretical_loss": 3.609582822672579, "tokens_seen": 1122903040 }, { "epoch": 3.03, "learning_rate": 0.0003332296890672016, "loss": 2.7688, "theoretical_loss": 3.609563202250146, "tokens_seen": 1122968576 }, { "epoch": 3.03, "learning_rate": 0.0003332196589769308, "loss": 2.8566, "theoretical_loss": 3.60954358329331, "tokens_seen": 1123034112 }, { "epoch": 3.03, "learning_rate": 0.00033320962888666003, "loss": 2.7902, "theoretical_loss": 3.6095239658018756, "tokens_seen": 1123099648 }, { "epoch": 3.03, "learning_rate": 0.00033319959879638916, "loss": 2.8912, "theoretical_loss": 3.6095043497756474, "tokens_seen": 1123165184 }, { "epoch": 3.03, "learning_rate": 0.0003331895687061184, "loss": 2.9165, "theoretical_loss": 3.6094847352144313, "tokens_seen": 1123230720 }, { "epoch": 3.03, "learning_rate": 0.0003331795386158475, "loss": 2.76, "theoretical_loss": 3.6094651221180314, "tokens_seen": 1123296256 }, { "epoch": 3.03, "learning_rate": 0.00033316950852557676, "loss": 2.9324, "theoretical_loss": 3.609445510486254, "tokens_seen": 1123361792 }, { "epoch": 3.03, "learning_rate": 0.00033315947843530594, "loss": 2.874, "theoretical_loss": 3.6094259003189038, "tokens_seen": 1123427328 }, { "epoch": 3.03, "learning_rate": 0.0003331494483450351, "loss": 2.9747, "theoretical_loss": 3.6094062916157856, "tokens_seen": 1123492864 }, { "epoch": 3.03, "learning_rate": 0.0003331394182547643, "loss": 2.9624, "theoretical_loss": 3.609386684376705, "tokens_seen": 1123558400 }, { "epoch": 3.03, "learning_rate": 0.0003331293881644935, "loss": 2.8146, "theoretical_loss": 3.6093670786014673, "tokens_seen": 1123623936 }, { "epoch": 3.03, "learning_rate": 0.00033311935807422266, "loss": 2.9443, "theoretical_loss": 3.609347474289878, "tokens_seen": 1123689472 }, { "epoch": 3.03, "learning_rate": 0.0003331093279839519, "loss": 2.7748, "theoretical_loss": 3.6093278714417423, "tokens_seen": 1123755008 }, { "epoch": 3.03, "learning_rate": 0.000333099297893681, "loss": 2.9041, "theoretical_loss": 3.609308270056866, "tokens_seen": 1123820544 }, { "epoch": 3.03, "learning_rate": 0.00033308926780341026, "loss": 2.8048, "theoretical_loss": 3.6092886701350535, "tokens_seen": 1123886080 }, { "epoch": 3.03, "objective/train/docs_used": 1802618, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7901110649108887, "objective/train/theoretical_loss": 3.6092837703831817, "objective/train/tokens_used": 1144362464, "theoretical_loss": 3.6092837703831817, "tokens_seen": 1123902464 }, { "epoch": 3.03, "learning_rate": 0.00033307923771313944, "loss": 2.8228, "theoretical_loss": 3.6092690716761116, "tokens_seen": 1123951616 }, { "epoch": 3.03, "learning_rate": 0.0003330692076228686, "loss": 2.6464, "theoretical_loss": 3.6092494746798454, "tokens_seen": 1124017152 }, { "epoch": 3.03, "learning_rate": 0.0003330591775325978, "loss": 2.9437, "theoretical_loss": 3.60922987914606, "tokens_seen": 1124082688 }, { "epoch": 3.03, "learning_rate": 0.000333049147442327, "loss": 2.8995, "theoretical_loss": 3.6092102850745613, "tokens_seen": 1124148224 }, { "epoch": 3.03, "learning_rate": 0.00033303911735205617, "loss": 2.797, "theoretical_loss": 3.6091906924651553, "tokens_seen": 1124213760 }, { "epoch": 3.03, "learning_rate": 0.0003330290872617854, "loss": 2.9081, "theoretical_loss": 3.6091711013176466, "tokens_seen": 1124279296 }, { "epoch": 3.03, "learning_rate": 0.00033301905717151453, "loss": 2.9262, "theoretical_loss": 3.6091515116318424, "tokens_seen": 1124344832 }, { "epoch": 3.03, "learning_rate": 0.00033300902708124376, "loss": 2.7167, "theoretical_loss": 3.6091319234075474, "tokens_seen": 1124410368 }, { "epoch": 3.03, "learning_rate": 0.0003329989969909729, "loss": 2.7396, "theoretical_loss": 3.609112336644568, "tokens_seen": 1124475904 }, { "epoch": 3.03, "learning_rate": 0.0003329889669007021, "loss": 2.8217, "theoretical_loss": 3.6090927513427093, "tokens_seen": 1124541440 }, { "epoch": 3.03, "learning_rate": 0.0003329789368104313, "loss": 2.8952, "theoretical_loss": 3.6090731675017773, "tokens_seen": 1124606976 }, { "epoch": 3.03, "learning_rate": 0.0003329689067201605, "loss": 2.8474, "theoretical_loss": 3.609053585121579, "tokens_seen": 1124672512 }, { "epoch": 3.03, "learning_rate": 0.00033295887662988967, "loss": 2.8494, "theoretical_loss": 3.609034004201919, "tokens_seen": 1124738048 }, { "epoch": 3.03, "learning_rate": 0.00033294884653961885, "loss": 2.974, "theoretical_loss": 3.609014424742604, "tokens_seen": 1124803584 }, { "epoch": 3.03, "learning_rate": 0.00033293881644934803, "loss": 2.8473, "theoretical_loss": 3.6089948467434394, "tokens_seen": 1124869120 }, { "epoch": 3.03, "learning_rate": 0.00033292878635907727, "loss": 2.8612, "theoretical_loss": 3.608975270204232, "tokens_seen": 1124934656 }, { "epoch": 3.03, "learning_rate": 0.0003329187562688064, "loss": 2.8389, "theoretical_loss": 3.6089556951247874, "tokens_seen": 1125000192 }, { "epoch": 3.03, "learning_rate": 0.00033290872617853563, "loss": 2.8773, "theoretical_loss": 3.608936121504912, "tokens_seen": 1125065728 }, { "epoch": 3.03, "learning_rate": 0.0003328986960882648, "loss": 2.8645, "theoretical_loss": 3.6089165493444115, "tokens_seen": 1125131264 }, { "epoch": 3.03, "learning_rate": 0.000332888665997994, "loss": 2.9184, "theoretical_loss": 3.6088969786430924, "tokens_seen": 1125196800 }, { "epoch": 3.03, "learning_rate": 0.0003328786359077232, "loss": 2.6595, "theoretical_loss": 3.6088774094007614, "tokens_seen": 1125262336 }, { "epoch": 3.03, "learning_rate": 0.00033286860581745235, "loss": 2.6999, "theoretical_loss": 3.6088578416172243, "tokens_seen": 1125327872 }, { "epoch": 3.03, "learning_rate": 0.00033285857572718154, "loss": 2.8965, "theoretical_loss": 3.608838275292287, "tokens_seen": 1125393408 }, { "epoch": 3.03, "learning_rate": 0.00033284854563691077, "loss": 3.0449, "theoretical_loss": 3.6088187104257563, "tokens_seen": 1125458944 }, { "epoch": 3.03, "learning_rate": 0.0003328385155466399, "loss": 2.793, "theoretical_loss": 3.6087991470174385, "tokens_seen": 1125524480 }, { "epoch": 3.03, "objective/train/docs_used": 1803910, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7907650470733643, "objective/train/theoretical_loss": 3.6087942563931823, "objective/train/tokens_used": 1146000864, "theoretical_loss": 3.6087942563931823, "tokens_seen": 1125540864 }, { "epoch": 3.03, "learning_rate": 0.00033282848545636913, "loss": 2.8279, "theoretical_loss": 3.6087795850671402, "tokens_seen": 1125590016 }, { "epoch": 3.03, "learning_rate": 0.00033281845536609826, "loss": 2.7831, "theoretical_loss": 3.608760024574668, "tokens_seen": 1125655552 }, { "epoch": 3.03, "learning_rate": 0.0003328084252758275, "loss": 3.004, "theoretical_loss": 3.6087404655398276, "tokens_seen": 1125721088 }, { "epoch": 3.03, "learning_rate": 0.0003327983951855567, "loss": 2.8711, "theoretical_loss": 3.6087209079624265, "tokens_seen": 1125786624 }, { "epoch": 3.03, "learning_rate": 0.00033278836509528586, "loss": 3.0606, "theoretical_loss": 3.608701351842271, "tokens_seen": 1125852160 }, { "epoch": 3.03, "learning_rate": 0.00033277833500501504, "loss": 2.8037, "theoretical_loss": 3.6086817971791665, "tokens_seen": 1125917696 }, { "epoch": 3.03, "learning_rate": 0.0003327683049147442, "loss": 2.822, "theoretical_loss": 3.6086622439729212, "tokens_seen": 1125983232 }, { "epoch": 3.03, "learning_rate": 0.0003327582748244734, "loss": 2.9632, "theoretical_loss": 3.6086426922233414, "tokens_seen": 1126048768 }, { "epoch": 3.03, "learning_rate": 0.00033274824473420264, "loss": 2.931, "theoretical_loss": 3.6086231419302335, "tokens_seen": 1126114304 }, { "epoch": 3.03, "learning_rate": 0.00033273821464393176, "loss": 2.9608, "theoretical_loss": 3.608603593093404, "tokens_seen": 1126179840 }, { "epoch": 3.03, "learning_rate": 0.000332728184553661, "loss": 2.6884, "theoretical_loss": 3.608584045712661, "tokens_seen": 1126245376 }, { "epoch": 3.03, "learning_rate": 0.0003327181544633902, "loss": 2.8782, "theoretical_loss": 3.6085644997878097, "tokens_seen": 1126310912 }, { "epoch": 3.03, "learning_rate": 0.00033270812437311936, "loss": 2.7894, "theoretical_loss": 3.608544955318658, "tokens_seen": 1126376448 }, { "epoch": 3.03, "learning_rate": 0.00033269809428284854, "loss": 2.7567, "theoretical_loss": 3.608525412305012, "tokens_seen": 1126441984 }, { "epoch": 3.03, "learning_rate": 0.0003326880641925777, "loss": 2.9165, "theoretical_loss": 3.6085058707466793, "tokens_seen": 1126507520 }, { "epoch": 3.03, "learning_rate": 0.0003326780341023069, "loss": 2.976, "theoretical_loss": 3.608486330643467, "tokens_seen": 1126573056 }, { "epoch": 3.03, "learning_rate": 0.00033266800401203614, "loss": 2.7936, "theoretical_loss": 3.6084667919951814, "tokens_seen": 1126638592 }, { "epoch": 3.03, "learning_rate": 0.00033265797392176527, "loss": 2.9763, "theoretical_loss": 3.60844725480163, "tokens_seen": 1126704128 }, { "epoch": 3.03, "learning_rate": 0.0003326479438314945, "loss": 2.9359, "theoretical_loss": 3.60842771906262, "tokens_seen": 1126769664 }, { "epoch": 3.03, "learning_rate": 0.00033263791374122363, "loss": 2.8058, "theoretical_loss": 3.608408184777958, "tokens_seen": 1126835200 }, { "epoch": 3.03, "learning_rate": 0.00033262788365095286, "loss": 2.7676, "theoretical_loss": 3.6083886519474513, "tokens_seen": 1126900736 }, { "epoch": 3.03, "learning_rate": 0.00033261785356068205, "loss": 2.7105, "theoretical_loss": 3.6083691205709076, "tokens_seen": 1126966272 }, { "epoch": 3.03, "learning_rate": 0.0003326078234704112, "loss": 2.8182, "theoretical_loss": 3.608349590648134, "tokens_seen": 1127031808 }, { "epoch": 3.03, "learning_rate": 0.0003325977933801404, "loss": 2.6489, "theoretical_loss": 3.608330062178937, "tokens_seen": 1127097344 }, { "epoch": 3.03, "learning_rate": 0.00033258776328986964, "loss": 2.9689, "theoretical_loss": 3.608310535163125, "tokens_seen": 1127162880 }, { "epoch": 3.03, "objective/train/docs_used": 1806806, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9636917114257812, "objective/train/theoretical_loss": 3.608305653636241, "objective/train/tokens_used": 1147639264, "theoretical_loss": 3.608305653636241, "tokens_seen": 1127179264 }, { "epoch": 3.03, "learning_rate": 0.0003325777331995988, "loss": 2.9001, "theoretical_loss": 3.6082910096005048, "tokens_seen": 1127228416 }, { "epoch": 3.03, "learning_rate": 0.000332567703109328, "loss": 2.9294, "theoretical_loss": 3.6082714854908833, "tokens_seen": 1127293952 }, { "epoch": 3.03, "learning_rate": 0.0003325576730190572, "loss": 2.8744, "theoretical_loss": 3.608251962834069, "tokens_seen": 1127359488 }, { "epoch": 3.03, "learning_rate": 0.00033254764292878637, "loss": 2.7669, "theoretical_loss": 3.6082324416298683, "tokens_seen": 1127425024 }, { "epoch": 3.03, "learning_rate": 0.0003325376128385156, "loss": 2.7313, "theoretical_loss": 3.6082129218780894, "tokens_seen": 1127490560 }, { "epoch": 3.03, "learning_rate": 0.00033252758274824473, "loss": 2.9095, "theoretical_loss": 3.6081934035785395, "tokens_seen": 1127556096 }, { "epoch": 3.03, "learning_rate": 0.00033251755265797396, "loss": 2.8424, "theoretical_loss": 3.608173886731026, "tokens_seen": 1127621632 }, { "epoch": 3.03, "learning_rate": 0.0003325075225677031, "loss": 2.876, "theoretical_loss": 3.608154371335357, "tokens_seen": 1127687168 }, { "epoch": 3.03, "learning_rate": 0.00033249749247743233, "loss": 2.893, "theoretical_loss": 3.6081348573913394, "tokens_seen": 1127752704 }, { "epoch": 3.03, "learning_rate": 0.0003324874623871615, "loss": 2.8318, "theoretical_loss": 3.6081153448987813, "tokens_seen": 1127818240 }, { "epoch": 3.03, "learning_rate": 0.0003324774322968907, "loss": 2.7587, "theoretical_loss": 3.608095833857491, "tokens_seen": 1127883776 }, { "epoch": 3.03, "learning_rate": 0.00033246740220661987, "loss": 2.8482, "theoretical_loss": 3.6080763242672753, "tokens_seen": 1127949312 }, { "epoch": 3.03, "learning_rate": 0.00033245737211634905, "loss": 2.8445, "theoretical_loss": 3.6080568161279425, "tokens_seen": 1128014848 }, { "epoch": 3.03, "learning_rate": 0.00033244734202607823, "loss": 2.771, "theoretical_loss": 3.6080373094393003, "tokens_seen": 1128080384 }, { "epoch": 3.03, "learning_rate": 0.00033243731193580747, "loss": 2.7466, "theoretical_loss": 3.6080178042011566, "tokens_seen": 1128145920 }, { "epoch": 3.03, "learning_rate": 0.0003324272818455366, "loss": 2.8409, "theoretical_loss": 3.6079983004133185, "tokens_seen": 1128211456 }, { "epoch": 3.03, "learning_rate": 0.00033241725175526583, "loss": 2.9642, "theoretical_loss": 3.607978798075595, "tokens_seen": 1128276992 }, { "epoch": 3.03, "learning_rate": 0.000332407221664995, "loss": 2.8849, "theoretical_loss": 3.607959297187794, "tokens_seen": 1128342528 }, { "epoch": 3.03, "learning_rate": 0.0003323971915747242, "loss": 2.7687, "theoretical_loss": 3.607939797749723, "tokens_seen": 1128408064 }, { "epoch": 3.03, "learning_rate": 0.0003323871614844534, "loss": 3.0501, "theoretical_loss": 3.60792029976119, "tokens_seen": 1128473600 }, { "epoch": 3.03, "learning_rate": 0.00033237713139418255, "loss": 2.9404, "theoretical_loss": 3.6079008032220035, "tokens_seen": 1128539136 }, { "epoch": 3.03, "learning_rate": 0.00033236710130391174, "loss": 2.8125, "theoretical_loss": 3.6078813081319714, "tokens_seen": 1128604672 }, { "epoch": 3.03, "learning_rate": 0.00033235707121364097, "loss": 2.8065, "theoretical_loss": 3.607861814490902, "tokens_seen": 1128670208 }, { "epoch": 3.03, "learning_rate": 0.0003323470411233701, "loss": 2.7332, "theoretical_loss": 3.6078423222986036, "tokens_seen": 1128735744 }, { "epoch": 3.03, "learning_rate": 0.00033233701103309933, "loss": 2.927, "theoretical_loss": 3.6078228315548833, "tokens_seen": 1128801280 }, { "epoch": 3.03, "objective/train/docs_used": 1809677, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2526793479919434, "objective/train/theoretical_loss": 3.6078179590952715, "objective/train/tokens_used": 1149277664, "theoretical_loss": 3.6078179590952715, "tokens_seen": 1128817664 }, { "epoch": 3.03, "learning_rate": 0.00033232698094282846, "loss": 3.1223, "theoretical_loss": 3.607803342259551, "tokens_seen": 1128866816 }, { "epoch": 3.03, "learning_rate": 0.0003323169508525577, "loss": 2.7999, "theoretical_loss": 3.607783854412414, "tokens_seen": 1128932352 }, { "epoch": 3.03, "learning_rate": 0.0003323069207622869, "loss": 2.8463, "theoretical_loss": 3.6077643680132807, "tokens_seen": 1128997888 }, { "epoch": 3.03, "learning_rate": 0.00033229689067201606, "loss": 2.8725, "theoretical_loss": 3.60774488306196, "tokens_seen": 1129063424 }, { "epoch": 3.03, "learning_rate": 0.00033228686058174524, "loss": 2.8544, "theoretical_loss": 3.6077253995582588, "tokens_seen": 1129128960 }, { "epoch": 3.03, "learning_rate": 0.0003322768304914744, "loss": 2.9021, "theoretical_loss": 3.6077059175019874, "tokens_seen": 1129194496 }, { "epoch": 3.03, "learning_rate": 0.0003322668004012036, "loss": 2.9146, "theoretical_loss": 3.607686436892954, "tokens_seen": 1129260032 }, { "epoch": 3.03, "learning_rate": 0.00033225677031093284, "loss": 2.8976, "theoretical_loss": 3.6076669577309657, "tokens_seen": 1129325568 }, { "epoch": 3.03, "learning_rate": 0.00033224674022066196, "loss": 2.9707, "theoretical_loss": 3.6076474800158325, "tokens_seen": 1129391104 }, { "epoch": 3.03, "learning_rate": 0.0003322367101303912, "loss": 2.9357, "theoretical_loss": 3.6076280037473625, "tokens_seen": 1129456640 }, { "epoch": 3.03, "learning_rate": 0.0003322266800401204, "loss": 2.8154, "theoretical_loss": 3.607608528925364, "tokens_seen": 1129522176 }, { "epoch": 3.03, "learning_rate": 0.00033221664994984956, "loss": 2.8184, "theoretical_loss": 3.6075890555496457, "tokens_seen": 1129587712 }, { "epoch": 3.03, "learning_rate": 0.00033220661985957874, "loss": 2.9826, "theoretical_loss": 3.607569583620017, "tokens_seen": 1129653248 }, { "epoch": 3.03, "learning_rate": 0.0003321965897693079, "loss": 2.8587, "theoretical_loss": 3.607550113136286, "tokens_seen": 1129718784 }, { "epoch": 3.03, "learning_rate": 0.0003321865596790371, "loss": 2.863, "theoretical_loss": 3.6075306440982615, "tokens_seen": 1129784320 }, { "epoch": 3.03, "learning_rate": 0.00033217652958876634, "loss": 2.88, "theoretical_loss": 3.607511176505753, "tokens_seen": 1129849856 }, { "epoch": 3.03, "learning_rate": 0.00033216649949849547, "loss": 2.9413, "theoretical_loss": 3.607491710358568, "tokens_seen": 1129915392 }, { "epoch": 3.03, "learning_rate": 0.0003321564694082247, "loss": 2.7301, "theoretical_loss": 3.607472245656516, "tokens_seen": 1129980928 }, { "epoch": 3.03, "learning_rate": 0.00033214643931795383, "loss": 2.7672, "theoretical_loss": 3.607452782399407, "tokens_seen": 1130046464 }, { "epoch": 3.03, "learning_rate": 0.00033213640922768306, "loss": 2.86, "theoretical_loss": 3.607433320587048, "tokens_seen": 1130112000 }, { "epoch": 3.03, "learning_rate": 0.00033212637913741225, "loss": 2.7713, "theoretical_loss": 3.6074138602192494, "tokens_seen": 1130177536 }, { "epoch": 3.03, "learning_rate": 0.0003321163490471414, "loss": 2.8072, "theoretical_loss": 3.60739440129582, "tokens_seen": 1130243072 }, { "epoch": 3.03, "learning_rate": 0.0003321063189568706, "loss": 2.8474, "theoretical_loss": 3.607374943816568, "tokens_seen": 1130308608 }, { "epoch": 3.03, "learning_rate": 0.00033209628886659984, "loss": 2.7828, "theoretical_loss": 3.607355487781304, "tokens_seen": 1130374144 }, { "epoch": 3.03, "learning_rate": 0.00033208625877632897, "loss": 2.8888, "theoretical_loss": 3.607336033189836, "tokens_seen": 1130439680 }, { "epoch": 3.03, "objective/train/docs_used": 1812529, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0236549377441406, "objective/train/theoretical_loss": 3.6073311697675394, "objective/train/tokens_used": 1150916064, "theoretical_loss": 3.6073311697675394, "tokens_seen": 1130456064 }, { "epoch": 3.03, "learning_rate": 0.0003320762286860582, "loss": 2.8935, "theoretical_loss": 3.607316580041973, "tokens_seen": 1130505216 }, { "epoch": 3.03, "learning_rate": 0.00033206619859578733, "loss": 2.8172, "theoretical_loss": 3.607297128337525, "tokens_seen": 1130570752 }, { "epoch": 3.03, "learning_rate": 0.00033205616850551657, "loss": 2.8566, "theoretical_loss": 3.6072776780763007, "tokens_seen": 1130636288 }, { "epoch": 3.03, "learning_rate": 0.00033204613841524575, "loss": 3.016, "theoretical_loss": 3.60725822925811, "tokens_seen": 1130701824 }, { "epoch": 3.03, "learning_rate": 0.00033203610832497493, "loss": 2.9739, "theoretical_loss": 3.607238781882761, "tokens_seen": 1130767360 }, { "epoch": 3.03, "learning_rate": 0.0003320260782347041, "loss": 2.6546, "theoretical_loss": 3.6072193359500644, "tokens_seen": 1130832896 }, { "epoch": 3.03, "learning_rate": 0.0003320160481444333, "loss": 2.839, "theoretical_loss": 3.6071998914598287, "tokens_seen": 1130898432 }, { "epoch": 3.03, "learning_rate": 0.0003320060180541625, "loss": 2.8764, "theoretical_loss": 3.6071804484118637, "tokens_seen": 1130963968 }, { "epoch": 3.03, "learning_rate": 0.0003319959879638917, "loss": 3.04, "theoretical_loss": 3.607161006805979, "tokens_seen": 1131029504 }, { "epoch": 3.03, "learning_rate": 0.00033198595787362084, "loss": 2.704, "theoretical_loss": 3.6071415666419835, "tokens_seen": 1131095040 }, { "epoch": 3.03, "learning_rate": 0.00033197592778335007, "loss": 2.8895, "theoretical_loss": 3.6071221279196872, "tokens_seen": 1131160576 }, { "epoch": 3.03, "learning_rate": 0.0003319658976930792, "loss": 2.7449, "theoretical_loss": 3.6071026906388997, "tokens_seen": 1131226112 }, { "epoch": 3.03, "learning_rate": 0.00033195586760280843, "loss": 2.8239, "theoretical_loss": 3.6070832547994303, "tokens_seen": 1131291648 }, { "epoch": 3.03, "learning_rate": 0.0003319458375125376, "loss": 2.9317, "theoretical_loss": 3.6070638204010885, "tokens_seen": 1131357184 }, { "epoch": 3.03, "learning_rate": 0.0003319358074222668, "loss": 2.7689, "theoretical_loss": 3.607044387443685, "tokens_seen": 1131422720 }, { "epoch": 3.03, "learning_rate": 0.000331925777331996, "loss": 2.8811, "theoretical_loss": 3.607024955927028, "tokens_seen": 1131488256 }, { "epoch": 3.03, "learning_rate": 0.0003319157472417252, "loss": 2.8999, "theoretical_loss": 3.6070055258509286, "tokens_seen": 1131553792 }, { "epoch": 3.03, "learning_rate": 0.00033190571715145434, "loss": 2.9045, "theoretical_loss": 3.606986097215196, "tokens_seen": 1131619328 }, { "epoch": 3.03, "learning_rate": 0.0003318956870611836, "loss": 2.7095, "theoretical_loss": 3.6069666700196397, "tokens_seen": 1131684864 }, { "epoch": 3.03, "learning_rate": 0.0003318856569709127, "loss": 2.9583, "theoretical_loss": 3.6069472442640693, "tokens_seen": 1131750400 }, { "epoch": 3.03, "learning_rate": 0.00033187562688064194, "loss": 2.704, "theoretical_loss": 3.606927819948296, "tokens_seen": 1131815936 }, { "epoch": 3.03, "learning_rate": 0.0003318655967903711, "loss": 2.7441, "theoretical_loss": 3.606908397072129, "tokens_seen": 1131881472 }, { "epoch": 3.03, "learning_rate": 0.0003318555667001003, "loss": 2.7713, "theoretical_loss": 3.6068889756353784, "tokens_seen": 1131947008 }, { "epoch": 3.03, "learning_rate": 0.0003318455366098295, "loss": 2.9021, "theoretical_loss": 3.6068695556378536, "tokens_seen": 1132012544 }, { "epoch": 3.03, "learning_rate": 0.00033183550651955866, "loss": 2.7955, "theoretical_loss": 3.6068501370793653, "tokens_seen": 1132078080 }, { "epoch": 3.03, "objective/train/docs_used": 1813896, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.624011993408203, "objective/train/theoretical_loss": 3.60684528266457, "objective/train/tokens_used": 1152554464, "theoretical_loss": 3.60684528266457, "tokens_seen": 1132094464 }, { "epoch": 3.03, "learning_rate": 0.0003318254764292879, "loss": 2.6393, "theoretical_loss": 3.606830719959723, "tokens_seen": 1132143616 }, { "epoch": 3.03, "learning_rate": 0.0003318154463390171, "loss": 2.8808, "theoretical_loss": 3.6068113042787373, "tokens_seen": 1132209152 }, { "epoch": 3.03, "learning_rate": 0.00033180541624874626, "loss": 2.8217, "theoretical_loss": 3.6067918900362184, "tokens_seen": 1132274688 }, { "epoch": 3.03, "learning_rate": 0.00033179538615847544, "loss": 2.9873, "theoretical_loss": 3.606772477231976, "tokens_seen": 1132340224 }, { "epoch": 3.03, "learning_rate": 0.0003317853560682046, "loss": 2.9542, "theoretical_loss": 3.6067530658658207, "tokens_seen": 1132405760 }, { "epoch": 3.03, "learning_rate": 0.0003317753259779338, "loss": 2.7544, "theoretical_loss": 3.6067336559375627, "tokens_seen": 1132471296 }, { "epoch": 3.03, "learning_rate": 0.00033176529588766304, "loss": 2.8458, "theoretical_loss": 3.606714247447012, "tokens_seen": 1132536832 }, { "epoch": 3.03, "learning_rate": 0.00033175526579739216, "loss": 2.7915, "theoretical_loss": 3.6066948403939794, "tokens_seen": 1132602368 }, { "epoch": 3.03, "learning_rate": 0.0003317452357071214, "loss": 2.8406, "theoretical_loss": 3.606675434778275, "tokens_seen": 1132667904 }, { "epoch": 3.03, "learning_rate": 0.0003317352056168506, "loss": 2.8004, "theoretical_loss": 3.6066560305997086, "tokens_seen": 1132733440 }, { "epoch": 3.03, "learning_rate": 0.00033172517552657976, "loss": 2.8034, "theoretical_loss": 3.606636627858092, "tokens_seen": 1132798976 }, { "epoch": 3.03, "learning_rate": 0.00033171514543630894, "loss": 2.808, "theoretical_loss": 3.606617226553235, "tokens_seen": 1132864512 }, { "epoch": 3.03, "learning_rate": 0.0003317051153460381, "loss": 3.0128, "theoretical_loss": 3.606597826684947, "tokens_seen": 1132930048 }, { "epoch": 3.03, "learning_rate": 0.0003316950852557673, "loss": 3.0538, "theoretical_loss": 3.60657842825304, "tokens_seen": 1132995584 }, { "epoch": 3.03, "learning_rate": 0.00033168505516549654, "loss": 2.7376, "theoretical_loss": 3.6065590312573246, "tokens_seen": 1133061120 }, { "epoch": 3.03, "learning_rate": 0.00033167502507522567, "loss": 2.8341, "theoretical_loss": 3.6065396356976103, "tokens_seen": 1133126656 }, { "epoch": 3.03, "learning_rate": 0.0003316649949849549, "loss": 3.0225, "theoretical_loss": 3.6065202415737083, "tokens_seen": 1133192192 }, { "epoch": 3.03, "learning_rate": 0.00033165496489468403, "loss": 2.8213, "theoretical_loss": 3.60650084888543, "tokens_seen": 1133257728 }, { "epoch": 3.03, "learning_rate": 0.00033164493480441326, "loss": 3.0076, "theoretical_loss": 3.6064814576325848, "tokens_seen": 1133323264 }, { "epoch": 3.03, "learning_rate": 0.00033163490471414245, "loss": 2.7983, "theoretical_loss": 3.6064620678149844, "tokens_seen": 1133388800 }, { "epoch": 3.03, "learning_rate": 0.0003316248746238716, "loss": 2.9103, "theoretical_loss": 3.6064426794324396, "tokens_seen": 1133454336 }, { "epoch": 3.03, "learning_rate": 0.0003316148445336008, "loss": 2.7782, "theoretical_loss": 3.6064232924847603, "tokens_seen": 1133519872 }, { "epoch": 3.03, "learning_rate": 0.00033160481444333004, "loss": 2.7168, "theoretical_loss": 3.6064039069717584, "tokens_seen": 1133585408 }, { "epoch": 3.03, "learning_rate": 0.00033159478435305917, "loss": 2.8502, "theoretical_loss": 3.606384522893244, "tokens_seen": 1133650944 }, { "epoch": 3.03, "learning_rate": 0.0003315847542627884, "loss": 2.9145, "theoretical_loss": 3.6063651402490287, "tokens_seen": 1133716480 }, { "epoch": 3.03, "objective/train/docs_used": 1816707, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.071863889694214, "objective/train/theoretical_loss": 3.606360294812062, "objective/train/tokens_used": 1154192864, "theoretical_loss": 3.606360294812062, "tokens_seen": 1133732864 }, { "epoch": 3.03, "learning_rate": 0.00033157472417251753, "loss": 2.8564, "theoretical_loss": 3.6063457590389234, "tokens_seen": 1133782016 }, { "epoch": 3.03, "learning_rate": 0.00033156469408224677, "loss": 2.7085, "theoretical_loss": 3.6063263792627382, "tokens_seen": 1133847552 }, { "epoch": 3.03, "learning_rate": 0.00033155466399197595, "loss": 2.9067, "theoretical_loss": 3.6063070009202853, "tokens_seen": 1133913088 }, { "epoch": 3.03, "learning_rate": 0.00033154463390170513, "loss": 2.9345, "theoretical_loss": 3.606287624011375, "tokens_seen": 1133978624 }, { "epoch": 3.03, "learning_rate": 0.0003315346038114343, "loss": 2.8305, "theoretical_loss": 3.6062682485358195, "tokens_seen": 1134044160 }, { "epoch": 3.03, "learning_rate": 0.0003315245737211635, "loss": 2.9283, "theoretical_loss": 3.6062488744934287, "tokens_seen": 1134109696 }, { "epoch": 3.03, "learning_rate": 0.0003315145436308927, "loss": 2.78, "theoretical_loss": 3.606229501884014, "tokens_seen": 1134175232 }, { "epoch": 3.03, "learning_rate": 0.0003315045135406219, "loss": 2.8358, "theoretical_loss": 3.606210130707387, "tokens_seen": 1134240768 }, { "epoch": 3.03, "learning_rate": 0.00033149448345035104, "loss": 2.7218, "theoretical_loss": 3.6061907609633588, "tokens_seen": 1134306304 }, { "epoch": 3.03, "learning_rate": 0.00033148445336008027, "loss": 2.9122, "theoretical_loss": 3.6061713926517407, "tokens_seen": 1134371840 }, { "epoch": 3.03, "learning_rate": 0.0003314744232698094, "loss": 2.8043, "theoretical_loss": 3.606152025772344, "tokens_seen": 1134437376 }, { "epoch": 3.03, "learning_rate": 0.00033146439317953863, "loss": 2.788, "theoretical_loss": 3.60613266032498, "tokens_seen": 1134502912 }, { "epoch": 3.03, "learning_rate": 0.0003314543630892678, "loss": 2.886, "theoretical_loss": 3.6061132963094606, "tokens_seen": 1134568448 }, { "epoch": 3.03, "learning_rate": 0.000331444332998997, "loss": 2.6128, "theoretical_loss": 3.606093933725597, "tokens_seen": 1134633984 }, { "epoch": 3.03, "learning_rate": 0.0003314343029087262, "loss": 2.7219, "theoretical_loss": 3.6060745725732, "tokens_seen": 1134699520 }, { "epoch": 3.03, "learning_rate": 0.0003314242728184554, "loss": 2.7577, "theoretical_loss": 3.6060552128520813, "tokens_seen": 1134765056 }, { "epoch": 3.03, "learning_rate": 0.00033141424272818454, "loss": 2.8214, "theoretical_loss": 3.6060358545620534, "tokens_seen": 1134830592 }, { "epoch": 3.03, "learning_rate": 0.0003314042126379138, "loss": 2.9761, "theoretical_loss": 3.606016497702927, "tokens_seen": 1134896128 }, { "epoch": 3.03, "learning_rate": 0.0003313941825476429, "loss": 2.8637, "theoretical_loss": 3.605997142274514, "tokens_seen": 1134961664 }, { "epoch": 3.03, "learning_rate": 0.00033138415245737214, "loss": 2.8146, "theoretical_loss": 3.605977788276626, "tokens_seen": 1135027200 }, { "epoch": 3.03, "learning_rate": 0.0003313741223671013, "loss": 2.8115, "theoretical_loss": 3.6059584357090744, "tokens_seen": 1135092736 }, { "epoch": 3.03, "learning_rate": 0.0003313640922768305, "loss": 2.9422, "theoretical_loss": 3.6059390845716717, "tokens_seen": 1135158272 }, { "epoch": 3.03, "learning_rate": 0.0003313540621865597, "loss": 2.6816, "theoretical_loss": 3.6059197348642282, "tokens_seen": 1135223808 }, { "epoch": 3.03, "learning_rate": 0.00033134403209628886, "loss": 2.7707, "theoretical_loss": 3.6059003865865575, "tokens_seen": 1135289344 }, { "epoch": 3.03, "learning_rate": 0.00033133400200601804, "loss": 2.9036, "theoretical_loss": 3.6058810397384704, "tokens_seen": 1135354880 }, { "epoch": 3.03, "objective/train/docs_used": 1819465, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.425114870071411, "objective/train/theoretical_loss": 3.605876203249799, "objective/train/tokens_used": 1155831264, "theoretical_loss": 3.605876203249799, "tokens_seen": 1135371264 }, { "epoch": 3.03, "learning_rate": 0.0003313239719157473, "loss": 2.8147, "theoretical_loss": 3.605861694319779, "tokens_seen": 1135420416 }, { "epoch": 3.03, "learning_rate": 0.0003313139418254764, "loss": 2.9658, "theoretical_loss": 3.6058423503302945, "tokens_seen": 1135485952 }, { "epoch": 3.03, "learning_rate": 0.00033130391173520564, "loss": 2.5761, "theoretical_loss": 3.60582300776983, "tokens_seen": 1135551488 }, { "epoch": 3.03, "learning_rate": 0.00033129388164493477, "loss": 2.7339, "theoretical_loss": 3.605803666638197, "tokens_seen": 1135617024 }, { "epoch": 3.03, "learning_rate": 0.000331283851554664, "loss": 2.8871, "theoretical_loss": 3.6057843269352072, "tokens_seen": 1135682560 }, { "epoch": 3.03, "learning_rate": 0.0003312738214643932, "loss": 2.9028, "theoretical_loss": 3.605764988660673, "tokens_seen": 1135748096 }, { "epoch": 3.03, "learning_rate": 0.00033126379137412236, "loss": 2.8083, "theoretical_loss": 3.6057456518144066, "tokens_seen": 1135813632 }, { "epoch": 3.03, "learning_rate": 0.00033125376128385155, "loss": 2.8547, "theoretical_loss": 3.605726316396219, "tokens_seen": 1135879168 }, { "epoch": 3.03, "learning_rate": 0.0003312437311935808, "loss": 2.9512, "theoretical_loss": 3.605706982405924, "tokens_seen": 1135944704 }, { "epoch": 3.03, "learning_rate": 0.0003312337011033099, "loss": 2.8091, "theoretical_loss": 3.6056876498433326, "tokens_seen": 1136010240 }, { "epoch": 3.03, "learning_rate": 0.00033122367101303914, "loss": 2.8329, "theoretical_loss": 3.6056683187082577, "tokens_seen": 1136075776 }, { "epoch": 3.03, "learning_rate": 0.00033121364092276827, "loss": 2.8203, "theoretical_loss": 3.605648989000511, "tokens_seen": 1136141312 }, { "epoch": 3.03, "learning_rate": 0.0003312036108324975, "loss": 2.896, "theoretical_loss": 3.605629660719905, "tokens_seen": 1136206848 }, { "epoch": 3.03, "learning_rate": 0.0003311935807422267, "loss": 2.8116, "theoretical_loss": 3.605610333866252, "tokens_seen": 1136272384 }, { "epoch": 3.03, "learning_rate": 0.00033118355065195587, "loss": 3.0039, "theoretical_loss": 3.6055910084393643, "tokens_seen": 1136337920 }, { "epoch": 3.03, "learning_rate": 0.00033117352056168505, "loss": 2.6676, "theoretical_loss": 3.605571684439055, "tokens_seen": 1136403456 }, { "epoch": 3.03, "learning_rate": 0.00033116349047141423, "loss": 2.691, "theoretical_loss": 3.6055523618651355, "tokens_seen": 1136468992 }, { "epoch": 3.03, "learning_rate": 0.0003311534603811434, "loss": 2.9183, "theoretical_loss": 3.605533040717419, "tokens_seen": 1136534528 }, { "epoch": 3.03, "learning_rate": 0.00033114343029087265, "loss": 2.8198, "theoretical_loss": 3.6055137209957175, "tokens_seen": 1136600064 }, { "epoch": 3.03, "learning_rate": 0.00033113340020060177, "loss": 2.9445, "theoretical_loss": 3.6054944026998434, "tokens_seen": 1136665600 }, { "epoch": 3.03, "learning_rate": 0.000331123370110331, "loss": 2.8503, "theoretical_loss": 3.6054750858296103, "tokens_seen": 1136731136 }, { "epoch": 3.03, "learning_rate": 0.00033111334002006013, "loss": 2.7637, "theoretical_loss": 3.60545577038483, "tokens_seen": 1136796672 }, { "epoch": 3.03, "learning_rate": 0.00033110330992978937, "loss": 2.8491, "theoretical_loss": 3.6054364563653154, "tokens_seen": 1136862208 }, { "epoch": 3.03, "learning_rate": 0.00033109327983951855, "loss": 2.6452, "theoretical_loss": 3.6054171437708784, "tokens_seen": 1136927744 }, { "epoch": 3.03, "learning_rate": 0.00033108324974924773, "loss": 2.7694, "theoretical_loss": 3.6053978326013327, "tokens_seen": 1136993280 }, { "epoch": 3.03, "objective/train/docs_used": 1821895, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.885347843170166, "objective/train/theoretical_loss": 3.605393005031564, "objective/train/tokens_used": 1157469664, "theoretical_loss": 3.605393005031564, "tokens_seen": 1137009664 }, { "epoch": 3.03, "learning_rate": 0.00033107321965897697, "loss": 2.9272, "theoretical_loss": 3.6053785228564914, "tokens_seen": 1137058816 }, { "epoch": 3.03, "learning_rate": 0.00033106318956870615, "loss": 2.8672, "theoretical_loss": 3.6053592145361657, "tokens_seen": 1137124352 }, { "epoch": 3.03, "learning_rate": 0.00033105315947843533, "loss": 2.9028, "theoretical_loss": 3.6053399076401695, "tokens_seen": 1137189888 }, { "epoch": 3.03, "learning_rate": 0.0003310431293881645, "loss": 2.7739, "theoretical_loss": 3.605320602168316, "tokens_seen": 1137255424 }, { "epoch": 3.03, "learning_rate": 0.0003310330992978937, "loss": 2.8532, "theoretical_loss": 3.605301298120417, "tokens_seen": 1137320960 }, { "epoch": 3.03, "learning_rate": 0.0003310230692076229, "loss": 2.9746, "theoretical_loss": 3.605281995496286, "tokens_seen": 1137386496 }, { "epoch": 3.03, "learning_rate": 0.0003310130391173521, "loss": 3.04, "theoretical_loss": 3.605262694295736, "tokens_seen": 1137452032 }, { "epoch": 3.03, "learning_rate": 0.00033100300902708124, "loss": 2.8746, "theoretical_loss": 3.6052433945185802, "tokens_seen": 1137517568 }, { "epoch": 3.03, "learning_rate": 0.00033099297893681047, "loss": 2.7751, "theoretical_loss": 3.6052240961646316, "tokens_seen": 1137583104 }, { "epoch": 3.03, "learning_rate": 0.0003309829488465396, "loss": 2.5668, "theoretical_loss": 3.605204799233703, "tokens_seen": 1137648640 }, { "epoch": 3.03, "learning_rate": 0.00033097291875626883, "loss": 2.8013, "theoretical_loss": 3.605185503725607, "tokens_seen": 1137714176 }, { "epoch": 3.03, "learning_rate": 0.000330962888665998, "loss": 2.9119, "theoretical_loss": 3.6051662096401573, "tokens_seen": 1137779712 }, { "epoch": 3.03, "learning_rate": 0.0003309528585757272, "loss": 2.9383, "theoretical_loss": 3.605146916977168, "tokens_seen": 1137845248 }, { "epoch": 3.03, "learning_rate": 0.0003309428284854564, "loss": 2.8932, "theoretical_loss": 3.6051276257364506, "tokens_seen": 1137910784 }, { "epoch": 3.03, "learning_rate": 0.0003309327983951856, "loss": 2.8514, "theoretical_loss": 3.6051083359178193, "tokens_seen": 1137976320 }, { "epoch": 3.03, "learning_rate": 0.00033092276830491474, "loss": 2.9716, "theoretical_loss": 3.6050890475210875, "tokens_seen": 1138041856 }, { "epoch": 3.03, "learning_rate": 0.000330912738214644, "loss": 3.0181, "theoretical_loss": 3.605069760546068, "tokens_seen": 1138107392 }, { "epoch": 3.03, "learning_rate": 0.0003309027081243731, "loss": 2.835, "theoretical_loss": 3.6050504749925745, "tokens_seen": 1138172928 }, { "epoch": 3.03, "learning_rate": 0.00033089267803410234, "loss": 3.001, "theoretical_loss": 3.6050311908604202, "tokens_seen": 1138238464 }, { "epoch": 3.03, "learning_rate": 0.0003308826479438315, "loss": 2.9653, "theoretical_loss": 3.605011908149418, "tokens_seen": 1138304000 }, { "epoch": 3.03, "learning_rate": 0.0003308726178535607, "loss": 2.8798, "theoretical_loss": 3.6049926268593824, "tokens_seen": 1138369536 }, { "epoch": 3.03, "learning_rate": 0.0003308625877632899, "loss": 2.6133, "theoretical_loss": 3.6049733469901266, "tokens_seen": 1138435072 }, { "epoch": 3.03, "learning_rate": 0.00033085255767301906, "loss": 2.8299, "theoretical_loss": 3.604954068541464, "tokens_seen": 1138500608 }, { "epoch": 3.03, "learning_rate": 0.00033084252758274824, "loss": 2.9657, "theoretical_loss": 3.604934791513208, "tokens_seen": 1138566144 }, { "epoch": 3.03, "learning_rate": 0.0003308324974924775, "loss": 2.8808, "theoretical_loss": 3.604915515905172, "tokens_seen": 1138631680 }, { "epoch": 3.03, "objective/train/docs_used": 1824687, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.910226345062256, "objective/train/theoretical_loss": 3.6049106972250504, "objective/train/tokens_used": 1159108064, "theoretical_loss": 3.6049106972250504, "tokens_seen": 1138648064 }, { "epoch": 3.03, "learning_rate": 0.0003308224674022066, "loss": 2.8488, "theoretical_loss": 3.6048962417171704, "tokens_seen": 1138697216 }, { "epoch": 3.03, "learning_rate": 0.00033081243731193584, "loss": 2.9246, "theoretical_loss": 3.604876968949016, "tokens_seen": 1138762752 }, { "epoch": 3.03, "learning_rate": 0.00033080240722166497, "loss": 2.8768, "theoretical_loss": 3.604857697600523, "tokens_seen": 1138828288 }, { "epoch": 3.03, "learning_rate": 0.0003307923771313942, "loss": 2.7665, "theoretical_loss": 3.6048384276715053, "tokens_seen": 1138893824 }, { "epoch": 3.03, "learning_rate": 0.0003307823470411234, "loss": 2.8492, "theoretical_loss": 3.604819159161776, "tokens_seen": 1138959360 }, { "epoch": 3.03, "learning_rate": 0.00033077231695085256, "loss": 2.7873, "theoretical_loss": 3.60479989207115, "tokens_seen": 1139024896 }, { "epoch": 3.03, "learning_rate": 0.00033076228686058175, "loss": 2.8823, "theoretical_loss": 3.6047806263994397, "tokens_seen": 1139090432 }, { "epoch": 3.03, "learning_rate": 0.000330752256770311, "loss": 2.9043, "theoretical_loss": 3.6047613621464607, "tokens_seen": 1139155968 }, { "epoch": 3.03, "learning_rate": 0.0003307422266800401, "loss": 2.9021, "theoretical_loss": 3.604742099312025, "tokens_seen": 1139221504 }, { "epoch": 3.03, "learning_rate": 0.00033073219658976934, "loss": 2.7392, "theoretical_loss": 3.6047228378959484, "tokens_seen": 1139287040 }, { "epoch": 3.03, "learning_rate": 0.00033072216649949847, "loss": 2.8142, "theoretical_loss": 3.6047035778980434, "tokens_seen": 1139352576 }, { "epoch": 3.03, "learning_rate": 0.0003307121364092277, "loss": 2.7853, "theoretical_loss": 3.6046843193181246, "tokens_seen": 1139418112 }, { "epoch": 3.03, "learning_rate": 0.0003307021063189569, "loss": 2.7914, "theoretical_loss": 3.6046650621560063, "tokens_seen": 1139483648 }, { "epoch": 3.03, "learning_rate": 0.00033069207622868607, "loss": 2.7871, "theoretical_loss": 3.6046458064115026, "tokens_seen": 1139549184 }, { "epoch": 3.03, "learning_rate": 0.00033068204613841525, "loss": 2.6171, "theoretical_loss": 3.6046265520844267, "tokens_seen": 1139614720 }, { "epoch": 3.03, "learning_rate": 0.00033067201604814443, "loss": 2.9973, "theoretical_loss": 3.604607299174594, "tokens_seen": 1139680256 }, { "epoch": 3.03, "learning_rate": 0.0003306619859578736, "loss": 2.8165, "theoretical_loss": 3.604588047681818, "tokens_seen": 1139745792 }, { "epoch": 3.03, "learning_rate": 0.00033065195586760285, "loss": 2.7276, "theoretical_loss": 3.6045687976059133, "tokens_seen": 1139811328 }, { "epoch": 3.03, "learning_rate": 0.00033064192577733197, "loss": 2.8055, "theoretical_loss": 3.604549548946694, "tokens_seen": 1139876864 }, { "epoch": 3.03, "learning_rate": 0.0003306318956870612, "loss": 2.9943, "theoretical_loss": 3.6045303017039734, "tokens_seen": 1139942400 }, { "epoch": 3.03, "learning_rate": 0.00033062186559679034, "loss": 2.8361, "theoretical_loss": 3.6045110558775675, "tokens_seen": 1140007936 }, { "epoch": 3.03, "learning_rate": 0.00033061183550651957, "loss": 2.6816, "theoretical_loss": 3.60449181146729, "tokens_seen": 1140073472 }, { "epoch": 3.03, "learning_rate": 0.00033060180541624875, "loss": 2.8215, "theoretical_loss": 3.604472568472955, "tokens_seen": 1140139008 }, { "epoch": 3.03, "learning_rate": 0.00033059177532597793, "loss": 2.9155, "theoretical_loss": 3.604453326894377, "tokens_seen": 1140204544 }, { "epoch": 3.03, "learning_rate": 0.0003305817452357071, "loss": 2.8998, "theoretical_loss": 3.604434086731371, "tokens_seen": 1140270080 }, { "epoch": 3.03, "objective/train/docs_used": 1827284, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6508848667144775, "objective/train/theoretical_loss": 3.6044292769117807, "objective/train/tokens_used": 1160746464, "theoretical_loss": 3.6044292769117807, "tokens_seen": 1140286464 }, { "epoch": 3.03, "learning_rate": 0.00033057171514543635, "loss": 2.8466, "theoretical_loss": 3.604414847983751, "tokens_seen": 1140335616 }, { "epoch": 3.03, "learning_rate": 0.0003305616850551655, "loss": 3.0493, "theoretical_loss": 3.6043956106513315, "tokens_seen": 1140401152 }, { "epoch": 3.03, "learning_rate": 0.0003305516549648947, "loss": 2.8102, "theoretical_loss": 3.6043763747339277, "tokens_seen": 1140466688 }, { "epoch": 3.03, "learning_rate": 0.00033054162487462384, "loss": 2.6645, "theoretical_loss": 3.604357140231354, "tokens_seen": 1140532224 }, { "epoch": 3.03, "learning_rate": 0.0003305315947843531, "loss": 2.8584, "theoretical_loss": 3.604337907143424, "tokens_seen": 1140597760 }, { "epoch": 3.04, "learning_rate": 0.00033052156469408225, "loss": 2.8109, "theoretical_loss": 3.604318675469954, "tokens_seen": 1140663296 }, { "epoch": 3.04, "learning_rate": 0.00033051153460381144, "loss": 2.8933, "theoretical_loss": 3.6042994452107573, "tokens_seen": 1140728832 }, { "epoch": 3.04, "learning_rate": 0.0003305015045135406, "loss": 2.9418, "theoretical_loss": 3.6042802163656495, "tokens_seen": 1140794368 }, { "epoch": 3.04, "learning_rate": 0.0003304914744232698, "loss": 2.9256, "theoretical_loss": 3.6042609889344455, "tokens_seen": 1140859904 }, { "epoch": 3.04, "learning_rate": 0.000330481444332999, "loss": 3.0369, "theoretical_loss": 3.60424176291696, "tokens_seen": 1140925440 }, { "epoch": 3.04, "learning_rate": 0.0003304714142427282, "loss": 2.784, "theoretical_loss": 3.6042225383130067, "tokens_seen": 1140990976 }, { "epoch": 3.04, "learning_rate": 0.00033046138415245734, "loss": 2.768, "theoretical_loss": 3.604203315122402, "tokens_seen": 1141056512 }, { "epoch": 3.04, "learning_rate": 0.0003304513540621866, "loss": 2.8452, "theoretical_loss": 3.6041840933449603, "tokens_seen": 1141122048 }, { "epoch": 3.04, "learning_rate": 0.0003304413239719157, "loss": 2.9905, "theoretical_loss": 3.6041648729804967, "tokens_seen": 1141187584 }, { "epoch": 3.04, "learning_rate": 0.00033043129388164494, "loss": 2.8652, "theoretical_loss": 3.6041456540288257, "tokens_seen": 1141253120 }, { "epoch": 3.04, "learning_rate": 0.0003304212637913741, "loss": 2.7252, "theoretical_loss": 3.6041264364897634, "tokens_seen": 1141318656 }, { "epoch": 3.04, "learning_rate": 0.0003304112337011033, "loss": 2.9104, "theoretical_loss": 3.6041072203631233, "tokens_seen": 1141384192 }, { "epoch": 3.04, "learning_rate": 0.0003304012036108325, "loss": 2.8408, "theoretical_loss": 3.6040880056487223, "tokens_seen": 1141449728 }, { "epoch": 3.04, "learning_rate": 0.0003303911735205617, "loss": 2.7949, "theoretical_loss": 3.6040687923463737, "tokens_seen": 1141515264 }, { "epoch": 3.04, "learning_rate": 0.00033038114343029084, "loss": 2.9001, "theoretical_loss": 3.604049580455894, "tokens_seen": 1141580800 }, { "epoch": 3.04, "learning_rate": 0.0003303711133400201, "loss": 2.8991, "theoretical_loss": 3.6040303699770977, "tokens_seen": 1141646336 }, { "epoch": 3.04, "learning_rate": 0.0003303610832497492, "loss": 2.7893, "theoretical_loss": 3.6040111609098004, "tokens_seen": 1141711872 }, { "epoch": 3.04, "learning_rate": 0.00033035105315947844, "loss": 2.817, "theoretical_loss": 3.6039919532538174, "tokens_seen": 1141777408 }, { "epoch": 3.04, "learning_rate": 0.0003303410230692076, "loss": 2.7568, "theoretical_loss": 3.6039727470089638, "tokens_seen": 1141842944 }, { "epoch": 3.04, "learning_rate": 0.0003303309929789368, "loss": 2.8561, "theoretical_loss": 3.6039535421750553, "tokens_seen": 1141908480 }, { "epoch": 3.04, "objective/train/docs_used": 1830220, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9589056968688965, "objective/train/theoretical_loss": 3.6039487411870166, "objective/train/tokens_used": 1162384864, "theoretical_loss": 3.6039487411870166, "tokens_seen": 1141924864 }, { "epoch": 3.04, "learning_rate": 0.00033032096288866604, "loss": 2.796, "theoretical_loss": 3.6039343387519067, "tokens_seen": 1141974016 }, { "epoch": 3.04, "learning_rate": 0.00033031093279839517, "loss": 2.9092, "theoretical_loss": 3.603915136739334, "tokens_seen": 1142039552 }, { "epoch": 3.04, "learning_rate": 0.0003303009027081244, "loss": 2.7043, "theoretical_loss": 3.6038959361371523, "tokens_seen": 1142105088 }, { "epoch": 3.04, "learning_rate": 0.0003302908726178536, "loss": 2.9277, "theoretical_loss": 3.603876736945177, "tokens_seen": 1142170624 }, { "epoch": 3.04, "learning_rate": 0.00033028084252758276, "loss": 2.899, "theoretical_loss": 3.603857539163224, "tokens_seen": 1142236160 }, { "epoch": 3.04, "learning_rate": 0.00033027081243731195, "loss": 2.8479, "theoretical_loss": 3.6038383427911085, "tokens_seen": 1142301696 }, { "epoch": 3.04, "learning_rate": 0.0003302607823470412, "loss": 2.928, "theoretical_loss": 3.6038191478286463, "tokens_seen": 1142367232 }, { "epoch": 3.04, "learning_rate": 0.0003302507522567703, "loss": 2.8633, "theoretical_loss": 3.6037999542756527, "tokens_seen": 1142432768 }, { "epoch": 3.04, "learning_rate": 0.00033024072216649954, "loss": 2.9199, "theoretical_loss": 3.6037807621319438, "tokens_seen": 1142498304 }, { "epoch": 3.04, "learning_rate": 0.00033023069207622867, "loss": 2.7898, "theoretical_loss": 3.603761571397335, "tokens_seen": 1142563840 }, { "epoch": 3.04, "learning_rate": 0.0003302206619859579, "loss": 2.8375, "theoretical_loss": 3.6037423820716428, "tokens_seen": 1142629376 }, { "epoch": 3.04, "learning_rate": 0.0003302106318956871, "loss": 2.7935, "theoretical_loss": 3.603723194154682, "tokens_seen": 1142694912 }, { "epoch": 3.04, "learning_rate": 0.00033020060180541627, "loss": 2.9658, "theoretical_loss": 3.603704007646268, "tokens_seen": 1142760448 }, { "epoch": 3.04, "learning_rate": 0.00033019057171514545, "loss": 2.7127, "theoretical_loss": 3.603684822546218, "tokens_seen": 1142825984 }, { "epoch": 3.04, "learning_rate": 0.00033018054162487463, "loss": 2.8209, "theoretical_loss": 3.6036656388543475, "tokens_seen": 1142891520 }, { "epoch": 3.04, "learning_rate": 0.0003301705115346038, "loss": 2.9426, "theoretical_loss": 3.6036464565704716, "tokens_seen": 1142957056 }, { "epoch": 3.04, "learning_rate": 0.00033016048144433305, "loss": 2.9615, "theoretical_loss": 3.6036272756944063, "tokens_seen": 1143022592 }, { "epoch": 3.04, "learning_rate": 0.0003301504513540622, "loss": 2.9111, "theoretical_loss": 3.603608096225969, "tokens_seen": 1143088128 }, { "epoch": 3.04, "learning_rate": 0.0003301404212637914, "loss": 2.897, "theoretical_loss": 3.603588918164974, "tokens_seen": 1143153664 }, { "epoch": 3.04, "learning_rate": 0.00033013039117352054, "loss": 2.8347, "theoretical_loss": 3.603569741511238, "tokens_seen": 1143219200 }, { "epoch": 3.04, "learning_rate": 0.00033012036108324977, "loss": 2.7385, "theoretical_loss": 3.6035505662645777, "tokens_seen": 1143284736 }, { "epoch": 3.04, "learning_rate": 0.00033011033099297895, "loss": 2.83, "theoretical_loss": 3.6035313924248085, "tokens_seen": 1143350272 }, { "epoch": 3.04, "learning_rate": 0.00033010030090270813, "loss": 3.0239, "theoretical_loss": 3.603512219991746, "tokens_seen": 1143415808 }, { "epoch": 3.04, "learning_rate": 0.0003300902708124373, "loss": 2.8969, "theoretical_loss": 3.6034930489652073, "tokens_seen": 1143481344 }, { "epoch": 3.04, "learning_rate": 0.00033008024072216655, "loss": 3.1242, "theoretical_loss": 3.6034738793450085, "tokens_seen": 1143546880 }, { "epoch": 3.04, "objective/train/docs_used": 1831670, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.690896511077881, "objective/train/theoretical_loss": 3.603469087159678, "objective/train/tokens_used": 1164023264, "theoretical_loss": 3.603469087159678, "tokens_seen": 1143563264 }, { "epoch": 3.04, "learning_rate": 0.0003300702106318957, "loss": 2.7795, "theoretical_loss": 3.603454711130966, "tokens_seen": 1143612416 }, { "epoch": 3.04, "learning_rate": 0.0003300601805416249, "loss": 2.7582, "theoretical_loss": 3.6034355443228954, "tokens_seen": 1143677952 }, { "epoch": 3.04, "learning_rate": 0.00033005015045135404, "loss": 2.8446, "theoretical_loss": 3.603416378920614, "tokens_seen": 1143743488 }, { "epoch": 3.04, "learning_rate": 0.0003300401203610833, "loss": 2.8506, "theoretical_loss": 3.603397214923937, "tokens_seen": 1143809024 }, { "epoch": 3.04, "learning_rate": 0.00033003009027081245, "loss": 2.9191, "theoretical_loss": 3.6033780523326815, "tokens_seen": 1143874560 }, { "epoch": 3.04, "learning_rate": 0.00033002006018054164, "loss": 2.7914, "theoretical_loss": 3.6033588911466636, "tokens_seen": 1143940096 }, { "epoch": 3.04, "learning_rate": 0.0003300100300902708, "loss": 2.8547, "theoretical_loss": 3.6033397313657, "tokens_seen": 1144005632 }, { "epoch": 3.04, "learning_rate": 0.00033, "loss": 2.7371, "theoretical_loss": 3.603320572989607, "tokens_seen": 1144071168 }, { "epoch": 3.04, "learning_rate": 0.0003299899699097292, "loss": 2.8866, "theoretical_loss": 3.6033014160182013, "tokens_seen": 1144136704 }, { "epoch": 3.04, "learning_rate": 0.0003299799398194584, "loss": 2.8764, "theoretical_loss": 3.6032822604512997, "tokens_seen": 1144202240 }, { "epoch": 3.04, "learning_rate": 0.00032996990972918754, "loss": 2.8127, "theoretical_loss": 3.6032631062887184, "tokens_seen": 1144267776 }, { "epoch": 3.04, "learning_rate": 0.0003299598796389168, "loss": 2.8721, "theoretical_loss": 3.603243953530274, "tokens_seen": 1144333312 }, { "epoch": 3.04, "learning_rate": 0.0003299498495486459, "loss": 2.7249, "theoretical_loss": 3.6032248021757827, "tokens_seen": 1144398848 }, { "epoch": 3.04, "learning_rate": 0.00032993981945837514, "loss": 2.8041, "theoretical_loss": 3.6032056522250624, "tokens_seen": 1144464384 }, { "epoch": 3.04, "learning_rate": 0.0003299297893681043, "loss": 2.7572, "theoretical_loss": 3.603186503677929, "tokens_seen": 1144529920 }, { "epoch": 3.04, "learning_rate": 0.0003299197592778335, "loss": 2.7473, "theoretical_loss": 3.6031673565341995, "tokens_seen": 1144595456 }, { "epoch": 3.04, "learning_rate": 0.0003299097291875627, "loss": 2.7508, "theoretical_loss": 3.603148210793691, "tokens_seen": 1144660992 }, { "epoch": 3.04, "learning_rate": 0.0003298996990972919, "loss": 2.7301, "theoretical_loss": 3.603129066456219, "tokens_seen": 1144726528 }, { "epoch": 3.04, "learning_rate": 0.00032988966900702104, "loss": 2.73, "theoretical_loss": 3.603109923521602, "tokens_seen": 1144792064 }, { "epoch": 3.04, "learning_rate": 0.0003298796389167503, "loss": 2.8014, "theoretical_loss": 3.603090781989656, "tokens_seen": 1144857600 }, { "epoch": 3.04, "learning_rate": 0.0003298696088264794, "loss": 3.0115, "theoretical_loss": 3.6030716418601987, "tokens_seen": 1144923136 }, { "epoch": 3.04, "learning_rate": 0.00032985957873620864, "loss": 2.9313, "theoretical_loss": 3.6030525031330463, "tokens_seen": 1144988672 }, { "epoch": 3.04, "learning_rate": 0.0003298495486459378, "loss": 2.901, "theoretical_loss": 3.603033365808016, "tokens_seen": 1145054208 }, { "epoch": 3.04, "learning_rate": 0.000329839518555667, "loss": 2.7973, "theoretical_loss": 3.603014229884925, "tokens_seen": 1145119744 }, { "epoch": 3.04, "learning_rate": 0.0003298294884653962, "loss": 2.9911, "theoretical_loss": 3.60299509536359, "tokens_seen": 1145185280 }, { "epoch": 3.04, "objective/train/docs_used": 1835787, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7594590187072754, "objective/train/theoretical_loss": 3.6029903119522593, "objective/train/tokens_used": 1165661664, "theoretical_loss": 3.6029903119522593, "tokens_seen": 1145201664 }, { "epoch": 3.04, "learning_rate": 0.00032981945837512537, "loss": 2.9174, "theoretical_loss": 3.6029759622438284, "tokens_seen": 1145250816 }, { "epoch": 3.04, "learning_rate": 0.00032980942828485455, "loss": 2.876, "theoretical_loss": 3.6029568305254576, "tokens_seen": 1145316352 }, { "epoch": 3.04, "learning_rate": 0.0003297993981945838, "loss": 2.8821, "theoretical_loss": 3.6029377002082943, "tokens_seen": 1145381888 }, { "epoch": 3.04, "learning_rate": 0.0003297893681043129, "loss": 2.7227, "theoretical_loss": 3.6029185712921556, "tokens_seen": 1145447424 }, { "epoch": 3.04, "learning_rate": 0.00032977933801404215, "loss": 2.7912, "theoretical_loss": 3.6028994437768596, "tokens_seen": 1145512960 }, { "epoch": 3.04, "learning_rate": 0.0003297693079237713, "loss": 2.7805, "theoretical_loss": 3.602880317662223, "tokens_seen": 1145578496 }, { "epoch": 3.04, "learning_rate": 0.0003297592778335005, "loss": 2.7235, "theoretical_loss": 3.6028611929480627, "tokens_seen": 1145644032 }, { "epoch": 3.04, "learning_rate": 0.0003297492477432297, "loss": 2.9912, "theoretical_loss": 3.602842069634197, "tokens_seen": 1145709568 }, { "epoch": 3.04, "learning_rate": 0.00032973921765295887, "loss": 2.9072, "theoretical_loss": 3.6028229477204423, "tokens_seen": 1145775104 }, { "epoch": 3.04, "learning_rate": 0.00032972918756268805, "loss": 2.9436, "theoretical_loss": 3.602803827206617, "tokens_seen": 1145840640 }, { "epoch": 3.04, "learning_rate": 0.0003297191574724173, "loss": 3.0652, "theoretical_loss": 3.6027847080925377, "tokens_seen": 1145906176 }, { "epoch": 3.04, "learning_rate": 0.0003297091273821464, "loss": 2.9467, "theoretical_loss": 3.6027655903780222, "tokens_seen": 1145971712 }, { "epoch": 3.04, "learning_rate": 0.00032969909729187565, "loss": 3.0528, "theoretical_loss": 3.6027464740628883, "tokens_seen": 1146037248 }, { "epoch": 3.04, "learning_rate": 0.0003296890672016048, "loss": 2.8158, "theoretical_loss": 3.602727359146953, "tokens_seen": 1146102784 }, { "epoch": 3.04, "learning_rate": 0.000329679037111334, "loss": 2.8417, "theoretical_loss": 3.6027082456300343, "tokens_seen": 1146168320 }, { "epoch": 3.04, "learning_rate": 0.0003296690070210632, "loss": 2.7865, "theoretical_loss": 3.6026891335119497, "tokens_seen": 1146233856 }, { "epoch": 3.04, "learning_rate": 0.0003296589769307924, "loss": 2.7922, "theoretical_loss": 3.602670022792517, "tokens_seen": 1146299392 }, { "epoch": 3.04, "learning_rate": 0.00032964894684052155, "loss": 2.8433, "theoretical_loss": 3.602650913471554, "tokens_seen": 1146364928 }, { "epoch": 3.04, "learning_rate": 0.00032963891675025074, "loss": 2.8475, "theoretical_loss": 3.602631805548878, "tokens_seen": 1146430464 }, { "epoch": 3.04, "learning_rate": 0.0003296288866599799, "loss": 2.8377, "theoretical_loss": 3.6026126990243066, "tokens_seen": 1146496000 }, { "epoch": 3.04, "learning_rate": 0.00032961885656970915, "loss": 2.7566, "theoretical_loss": 3.602593593897658, "tokens_seen": 1146561536 }, { "epoch": 3.04, "learning_rate": 0.0003296088264794383, "loss": 2.8966, "theoretical_loss": 3.6025744901687498, "tokens_seen": 1146627072 }, { "epoch": 3.04, "learning_rate": 0.0003295987963891675, "loss": 2.7771, "theoretical_loss": 3.6025553878374, "tokens_seen": 1146692608 }, { "epoch": 3.04, "learning_rate": 0.0003295887662988967, "loss": 3.011, "theoretical_loss": 3.602536286903427, "tokens_seen": 1146758144 }, { "epoch": 3.04, "learning_rate": 0.0003295787362086259, "loss": 2.799, "theoretical_loss": 3.602517187366648, "tokens_seen": 1146823680 }, { "debugging/Self-BLEU-5": 0.5689140825155978, "debugging/distinct-1-grams": 0.7733108506809848, "debugging/distinct-2-grams": 0.9599143247081116, "debugging/entropy-1-grams": 6.365451167547745, "debugging/entropy-2-grams": 7.376540094457866, "debugging/length": 517.9166666666666, "debugging/num_segments": 24, "epoch": 3.04, "objective/train/docs_used": 1837145, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5178656578063965, "objective/train/theoretical_loss": 3.6025124127007437, "objective/train/tokens_used": 1167300064, "theoretical_loss": 3.6025124127007437, "tokens_seen": 1146840064 }, { "epoch": 3.04, "learning_rate": 0.0003295687061183551, "loss": 2.7941, "theoretical_loss": 3.6024980892268816, "tokens_seen": 1146889216 }, { "epoch": 3.04, "learning_rate": 0.00032955867602808424, "loss": 2.8656, "theoretical_loss": 3.602478992483945, "tokens_seen": 1146954752 }, { "epoch": 3.04, "learning_rate": 0.0003295486459378135, "loss": 2.7461, "theoretical_loss": 3.6024598971376562, "tokens_seen": 1147020288 }, { "epoch": 3.04, "learning_rate": 0.00032953861584754265, "loss": 2.9075, "theoretical_loss": 3.6024408031878346, "tokens_seen": 1147085824 }, { "epoch": 3.04, "learning_rate": 0.00032952858575727184, "loss": 2.7939, "theoretical_loss": 3.602421710634297, "tokens_seen": 1147151360 }, { "epoch": 3.04, "learning_rate": 0.000329518555667001, "loss": 2.8976, "theoretical_loss": 3.602402619476862, "tokens_seen": 1147216896 }, { "epoch": 3.04, "learning_rate": 0.0003295085255767302, "loss": 2.8969, "theoretical_loss": 3.6023835297153473, "tokens_seen": 1147282432 }, { "epoch": 3.04, "learning_rate": 0.0003294984954864594, "loss": 2.8692, "theoretical_loss": 3.6023644413495717, "tokens_seen": 1147347968 }, { "epoch": 3.04, "learning_rate": 0.0003294884653961886, "loss": 2.8022, "theoretical_loss": 3.602345354379353, "tokens_seen": 1147413504 }, { "epoch": 3.04, "learning_rate": 0.00032947843530591774, "loss": 2.8835, "theoretical_loss": 3.602326268804511, "tokens_seen": 1147479040 }, { "epoch": 3.04, "learning_rate": 0.000329468405215647, "loss": 2.8803, "theoretical_loss": 3.6023071846248613, "tokens_seen": 1147544576 }, { "epoch": 3.04, "learning_rate": 0.0003294583751253761, "loss": 2.8649, "theoretical_loss": 3.602288101840224, "tokens_seen": 1147610112 }, { "epoch": 3.04, "learning_rate": 0.00032944834503510534, "loss": 2.8792, "theoretical_loss": 3.602269020450417, "tokens_seen": 1147675648 }, { "epoch": 3.04, "learning_rate": 0.0003294383149448345, "loss": 2.9116, "theoretical_loss": 3.602249940455259, "tokens_seen": 1147741184 }, { "epoch": 3.04, "learning_rate": 0.0003294282848545637, "loss": 2.7827, "theoretical_loss": 3.6022308618545686, "tokens_seen": 1147806720 }, { "epoch": 3.04, "learning_rate": 0.0003294182547642929, "loss": 2.875, "theoretical_loss": 3.602211784648164, "tokens_seen": 1147872256 }, { "epoch": 3.04, "learning_rate": 0.0003294082246740221, "loss": 2.9556, "theoretical_loss": 3.602192708835863, "tokens_seen": 1147937792 }, { "epoch": 3.04, "learning_rate": 0.00032939819458375124, "loss": 2.8024, "theoretical_loss": 3.6021736344174853, "tokens_seen": 1148003328 }, { "epoch": 3.04, "learning_rate": 0.0003293881644934805, "loss": 2.8439, "theoretical_loss": 3.6021545613928483, "tokens_seen": 1148068864 }, { "epoch": 3.04, "learning_rate": 0.0003293781344032096, "loss": 2.8947, "theoretical_loss": 3.602135489761772, "tokens_seen": 1148134400 }, { "epoch": 3.04, "learning_rate": 0.00032936810431293884, "loss": 2.9798, "theoretical_loss": 3.602116419524074, "tokens_seen": 1148199936 }, { "epoch": 3.04, "learning_rate": 0.000329358074222668, "loss": 2.8497, "theoretical_loss": 3.6020973506795735, "tokens_seen": 1148265472 }, { "epoch": 3.04, "learning_rate": 0.0003293480441323972, "loss": 2.9387, "theoretical_loss": 3.6020782832280887, "tokens_seen": 1148331008 }, { "epoch": 3.04, "learning_rate": 0.0003293380140421264, "loss": 2.7479, "theoretical_loss": 3.6020592171694386, "tokens_seen": 1148396544 }, { "epoch": 3.04, "learning_rate": 0.00032932798395185557, "loss": 3.0205, "theoretical_loss": 3.6020401525034424, "tokens_seen": 1148462080 }, { "epoch": 3.04, "objective/train/docs_used": 1839880, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6183512210845947, "objective/train/theoretical_loss": 3.6020353865545243, "objective/train/tokens_used": 1168938464, "theoretical_loss": 3.6020353865545243, "tokens_seen": 1148478464 }, { "epoch": 3.04, "learning_rate": 0.00032931795386158475, "loss": 2.794, "theoretical_loss": 3.6020210892299183, "tokens_seen": 1148527616 }, { "epoch": 3.04, "learning_rate": 0.000329307923771314, "loss": 2.6601, "theoretical_loss": 3.6020020273486857, "tokens_seen": 1148593152 }, { "epoch": 3.04, "learning_rate": 0.0003292978936810431, "loss": 2.8233, "theoretical_loss": 3.6019829668595627, "tokens_seen": 1148658688 }, { "epoch": 3.04, "learning_rate": 0.00032928786359077235, "loss": 2.7837, "theoretical_loss": 3.6019639077623693, "tokens_seen": 1148724224 }, { "epoch": 3.04, "learning_rate": 0.0003292778335005015, "loss": 2.9452, "theoretical_loss": 3.6019448500569236, "tokens_seen": 1148789760 }, { "epoch": 3.04, "learning_rate": 0.0003292678034102307, "loss": 3.0197, "theoretical_loss": 3.6019257937430447, "tokens_seen": 1148855296 }, { "epoch": 3.04, "learning_rate": 0.0003292577733199599, "loss": 2.9453, "theoretical_loss": 3.601906738820552, "tokens_seen": 1148920832 }, { "epoch": 3.04, "learning_rate": 0.00032924774322968907, "loss": 2.9906, "theoretical_loss": 3.601887685289264, "tokens_seen": 1148986368 }, { "epoch": 3.04, "learning_rate": 0.00032923771313941825, "loss": 2.8003, "theoretical_loss": 3.601868633149, "tokens_seen": 1149051904 }, { "epoch": 3.04, "learning_rate": 0.0003292276830491475, "loss": 2.7198, "theoretical_loss": 3.6018495823995798, "tokens_seen": 1149117440 }, { "epoch": 3.04, "learning_rate": 0.0003292176529588766, "loss": 2.9, "theoretical_loss": 3.6018305330408213, "tokens_seen": 1149182976 }, { "epoch": 3.04, "learning_rate": 0.00032920762286860585, "loss": 2.8064, "theoretical_loss": 3.6018114850725444, "tokens_seen": 1149248512 }, { "epoch": 3.04, "learning_rate": 0.000329197592778335, "loss": 2.8122, "theoretical_loss": 3.6017924384945683, "tokens_seen": 1149314048 }, { "epoch": 3.04, "learning_rate": 0.0003291875626880642, "loss": 2.9537, "theoretical_loss": 3.6017733933067126, "tokens_seen": 1149379584 }, { "epoch": 3.04, "learning_rate": 0.0003291775325977934, "loss": 2.9057, "theoretical_loss": 3.601754349508796, "tokens_seen": 1149445120 }, { "epoch": 3.04, "learning_rate": 0.0003291675025075226, "loss": 2.8258, "theoretical_loss": 3.601735307100638, "tokens_seen": 1149510656 }, { "epoch": 3.04, "learning_rate": 0.00032915747241725175, "loss": 2.8566, "theoretical_loss": 3.601716266082058, "tokens_seen": 1149576192 }, { "epoch": 3.04, "learning_rate": 0.00032914744232698094, "loss": 2.9839, "theoretical_loss": 3.6016972264528753, "tokens_seen": 1149641728 }, { "epoch": 3.04, "learning_rate": 0.0003291374122367101, "loss": 2.8697, "theoretical_loss": 3.6016781882129094, "tokens_seen": 1149707264 }, { "epoch": 3.04, "learning_rate": 0.00032912738214643935, "loss": 2.8546, "theoretical_loss": 3.6016591513619796, "tokens_seen": 1149772800 }, { "epoch": 3.04, "learning_rate": 0.0003291173520561685, "loss": 2.8724, "theoretical_loss": 3.601640115899906, "tokens_seen": 1149838336 }, { "epoch": 3.04, "learning_rate": 0.0003291073219658977, "loss": 2.7771, "theoretical_loss": 3.601621081826507, "tokens_seen": 1149903872 }, { "epoch": 3.04, "learning_rate": 0.0003290972918756269, "loss": 2.7692, "theoretical_loss": 3.6016020491416034, "tokens_seen": 1149969408 }, { "epoch": 3.04, "learning_rate": 0.0003290872617853561, "loss": 2.8669, "theoretical_loss": 3.601583017845014, "tokens_seen": 1150034944 }, { "epoch": 3.04, "learning_rate": 0.00032907723169508526, "loss": 2.745, "theoretical_loss": 3.6015639879365584, "tokens_seen": 1150100480 }, { "epoch": 3.04, "objective/train/docs_used": 1842635, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.503868818283081, "objective/train/theoretical_loss": 3.60155923067632, "objective/train/tokens_used": 1170576864, "theoretical_loss": 3.60155923067632, "tokens_seen": 1150116864 }, { "epoch": 3.04, "learning_rate": 0.00032906720160481444, "loss": 2.8588, "theoretical_loss": 3.601544959416057, "tokens_seen": 1150166016 }, { "epoch": 3.04, "learning_rate": 0.0003290571715145436, "loss": 2.935, "theoretical_loss": 3.6015259322833284, "tokens_seen": 1150231552 }, { "epoch": 3.04, "learning_rate": 0.00032904714142427286, "loss": 2.7491, "theoretical_loss": 3.601506906538193, "tokens_seen": 1150297088 }, { "epoch": 3.04, "learning_rate": 0.000329037111334002, "loss": 2.8716, "theoretical_loss": 3.601487882180471, "tokens_seen": 1150362624 }, { "epoch": 3.04, "learning_rate": 0.0003290270812437312, "loss": 2.875, "theoretical_loss": 3.601468859209981, "tokens_seen": 1150428160 }, { "epoch": 3.04, "learning_rate": 0.00032901705115346034, "loss": 2.6441, "theoretical_loss": 3.601449837626544, "tokens_seen": 1150493696 }, { "epoch": 3.04, "learning_rate": 0.0003290070210631896, "loss": 2.8992, "theoretical_loss": 3.6014308174299794, "tokens_seen": 1150559232 }, { "epoch": 3.04, "learning_rate": 0.00032899699097291876, "loss": 2.891, "theoretical_loss": 3.6014117986201066, "tokens_seen": 1150624768 }, { "epoch": 3.04, "learning_rate": 0.00032898696088264794, "loss": 2.8874, "theoretical_loss": 3.601392781196746, "tokens_seen": 1150690304 }, { "epoch": 3.04, "learning_rate": 0.0003289769307923771, "loss": 2.8953, "theoretical_loss": 3.601373765159718, "tokens_seen": 1150755840 }, { "epoch": 3.04, "learning_rate": 0.0003289669007021063, "loss": 2.8212, "theoretical_loss": 3.601354750508842, "tokens_seen": 1150821376 }, { "epoch": 3.04, "learning_rate": 0.0003289568706118355, "loss": 2.9124, "theoretical_loss": 3.601335737243938, "tokens_seen": 1150886912 }, { "epoch": 3.04, "learning_rate": 0.0003289468405215647, "loss": 2.8963, "theoretical_loss": 3.601316725364826, "tokens_seen": 1150952448 }, { "epoch": 3.04, "learning_rate": 0.00032893681043129385, "loss": 2.7364, "theoretical_loss": 3.6012977148713268, "tokens_seen": 1151017984 }, { "epoch": 3.04, "learning_rate": 0.0003289267803410231, "loss": 2.7903, "theoretical_loss": 3.60127870576326, "tokens_seen": 1151083520 }, { "epoch": 3.04, "learning_rate": 0.00032891675025075226, "loss": 2.8385, "theoretical_loss": 3.6012596980404457, "tokens_seen": 1151149056 }, { "epoch": 3.04, "learning_rate": 0.00032890672016048145, "loss": 2.8828, "theoretical_loss": 3.6012406917027038, "tokens_seen": 1151214592 }, { "epoch": 3.04, "learning_rate": 0.0003288966900702106, "loss": 2.9143, "theoretical_loss": 3.6012216867498554, "tokens_seen": 1151280128 }, { "epoch": 3.04, "learning_rate": 0.0003288866599799398, "loss": 2.9274, "theoretical_loss": 3.6012026831817203, "tokens_seen": 1151345664 }, { "epoch": 3.04, "learning_rate": 0.000328876629889669, "loss": 2.937, "theoretical_loss": 3.601183680998119, "tokens_seen": 1151411200 }, { "epoch": 3.04, "learning_rate": 0.0003288665997993982, "loss": 2.9445, "theoretical_loss": 3.6011646801988713, "tokens_seen": 1151476736 }, { "epoch": 3.04, "learning_rate": 0.00032885656970912735, "loss": 2.8842, "theoretical_loss": 3.601145680783798, "tokens_seen": 1151542272 }, { "epoch": 3.04, "learning_rate": 0.0003288465396188566, "loss": 2.8329, "theoretical_loss": 3.601126682752719, "tokens_seen": 1151607808 }, { "epoch": 3.04, "learning_rate": 0.0003288365095285857, "loss": 2.9388, "theoretical_loss": 3.601107686105456, "tokens_seen": 1151673344 }, { "epoch": 3.04, "learning_rate": 0.00032882647943831495, "loss": 2.8093, "theoretical_loss": 3.601088690841828, "tokens_seen": 1151738880 }, { "epoch": 3.04, "objective/train/docs_used": 1845503, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8894810676574707, "objective/train/theoretical_loss": 3.6010839422420933, "objective/train/tokens_used": 1172215264, "theoretical_loss": 3.6010839422420933, "tokens_seen": 1151755264 }, { "epoch": 3.04, "learning_rate": 0.0003288164493480442, "loss": 2.913, "theoretical_loss": 3.6010696969616562, "tokens_seen": 1151804416 }, { "epoch": 3.04, "learning_rate": 0.0003288064192577733, "loss": 2.9056, "theoretical_loss": 3.601050704464761, "tokens_seen": 1151869952 }, { "epoch": 3.04, "learning_rate": 0.00032879638916750255, "loss": 2.7946, "theoretical_loss": 3.601031713350963, "tokens_seen": 1151935488 }, { "epoch": 3.04, "learning_rate": 0.0003287863590772317, "loss": 2.9235, "theoretical_loss": 3.601012723620083, "tokens_seen": 1152001024 }, { "epoch": 3.04, "learning_rate": 0.0003287763289869609, "loss": 2.7607, "theoretical_loss": 3.600993735271941, "tokens_seen": 1152066560 }, { "epoch": 3.04, "learning_rate": 0.0003287662988966901, "loss": 2.7524, "theoretical_loss": 3.600974748306359, "tokens_seen": 1152132096 }, { "epoch": 3.04, "learning_rate": 0.00032875626880641927, "loss": 2.9081, "theoretical_loss": 3.6009557627231557, "tokens_seen": 1152197632 }, { "epoch": 3.04, "learning_rate": 0.00032874623871614845, "loss": 2.8766, "theoretical_loss": 3.6009367785221533, "tokens_seen": 1152263168 }, { "epoch": 3.04, "learning_rate": 0.0003287362086258777, "loss": 2.9421, "theoretical_loss": 3.6009177957031726, "tokens_seen": 1152328704 }, { "epoch": 3.04, "learning_rate": 0.0003287261785356068, "loss": 2.8911, "theoretical_loss": 3.6008988142660336, "tokens_seen": 1152394240 }, { "epoch": 3.04, "learning_rate": 0.00032871614844533605, "loss": 2.9475, "theoretical_loss": 3.6008798342105575, "tokens_seen": 1152459776 }, { "epoch": 3.04, "learning_rate": 0.0003287061183550652, "loss": 2.9016, "theoretical_loss": 3.6008608555365655, "tokens_seen": 1152525312 }, { "epoch": 3.04, "learning_rate": 0.0003286960882647944, "loss": 3.013, "theoretical_loss": 3.600841878243878, "tokens_seen": 1152590848 }, { "epoch": 3.04, "learning_rate": 0.0003286860581745236, "loss": 2.881, "theoretical_loss": 3.6008229023323164, "tokens_seen": 1152656384 }, { "epoch": 3.04, "learning_rate": 0.0003286760280842528, "loss": 2.9736, "theoretical_loss": 3.600803927801701, "tokens_seen": 1152721920 }, { "epoch": 3.04, "learning_rate": 0.00032866599799398195, "loss": 2.822, "theoretical_loss": 3.6007849546518536, "tokens_seen": 1152787456 }, { "epoch": 3.04, "learning_rate": 0.00032865596790371114, "loss": 3.0119, "theoretical_loss": 3.6007659828825944, "tokens_seen": 1152852992 }, { "epoch": 3.04, "learning_rate": 0.0003286459378134403, "loss": 2.8684, "theoretical_loss": 3.600747012493745, "tokens_seen": 1152918528 }, { "epoch": 3.04, "learning_rate": 0.00032863590772316955, "loss": 2.7857, "theoretical_loss": 3.6007280434851268, "tokens_seen": 1152984064 }, { "epoch": 3.04, "learning_rate": 0.0003286258776328987, "loss": 2.9292, "theoretical_loss": 3.6007090758565603, "tokens_seen": 1153049600 }, { "epoch": 3.04, "learning_rate": 0.0003286158475426279, "loss": 2.9226, "theoretical_loss": 3.600690109607867, "tokens_seen": 1153115136 }, { "epoch": 3.04, "learning_rate": 0.0003286058174523571, "loss": 2.7262, "theoretical_loss": 3.6006711447388673, "tokens_seen": 1153180672 }, { "epoch": 3.04, "learning_rate": 0.0003285957873620863, "loss": 2.8472, "theoretical_loss": 3.6006521812493837, "tokens_seen": 1153246208 }, { "epoch": 3.04, "learning_rate": 0.00032858575727181546, "loss": 2.9117, "theoretical_loss": 3.600633219139237, "tokens_seen": 1153311744 }, { "epoch": 3.04, "learning_rate": 0.00032857572718154464, "loss": 2.8404, "theoretical_loss": 3.600614258408249, "tokens_seen": 1153377280 }, { "epoch": 3.04, "objective/train/docs_used": 1847928, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8266193866729736, "objective/train/theoretical_loss": 3.600609518440974, "objective/train/tokens_used": 1173853664, "theoretical_loss": 3.600609518440974, "tokens_seen": 1153393664 }, { "epoch": 3.04, "learning_rate": 0.0003285656970912738, "loss": 3.1043, "theoretical_loss": 3.6005952990562395, "tokens_seen": 1153442816 }, { "epoch": 3.04, "learning_rate": 0.00032855566700100306, "loss": 3.0137, "theoretical_loss": 3.6005763410830305, "tokens_seen": 1153508352 }, { "epoch": 3.04, "learning_rate": 0.0003285456369107322, "loss": 2.8069, "theoretical_loss": 3.600557384488445, "tokens_seen": 1153573888 }, { "epoch": 3.04, "learning_rate": 0.0003285356068204614, "loss": 2.9747, "theoretical_loss": 3.6005384292723024, "tokens_seen": 1153639424 }, { "epoch": 3.04, "learning_rate": 0.00032852557673019054, "loss": 3.0663, "theoretical_loss": 3.6005194754344245, "tokens_seen": 1153704960 }, { "epoch": 3.04, "learning_rate": 0.0003285155466399198, "loss": 2.8488, "theoretical_loss": 3.6005005229746336, "tokens_seen": 1153770496 }, { "epoch": 3.04, "learning_rate": 0.00032850551654964896, "loss": 2.8886, "theoretical_loss": 3.600481571892751, "tokens_seen": 1153836032 }, { "epoch": 3.04, "learning_rate": 0.00032849548645937814, "loss": 2.7676, "theoretical_loss": 3.600462622188598, "tokens_seen": 1153901568 }, { "epoch": 3.04, "learning_rate": 0.0003284854563691073, "loss": 2.893, "theoretical_loss": 3.6004436738619963, "tokens_seen": 1153967104 }, { "epoch": 3.04, "learning_rate": 0.0003284754262788365, "loss": 2.9588, "theoretical_loss": 3.6004247269127676, "tokens_seen": 1154032640 }, { "epoch": 3.04, "learning_rate": 0.0003284653961885657, "loss": 2.8118, "theoretical_loss": 3.6004057813407337, "tokens_seen": 1154098176 }, { "epoch": 3.04, "learning_rate": 0.0003284553660982949, "loss": 2.7919, "theoretical_loss": 3.6003868371457157, "tokens_seen": 1154163712 }, { "epoch": 3.04, "learning_rate": 0.00032844533600802405, "loss": 2.7919, "theoretical_loss": 3.600367894327536, "tokens_seen": 1154229248 }, { "epoch": 3.04, "learning_rate": 0.0003284353059177533, "loss": 2.922, "theoretical_loss": 3.600348952886016, "tokens_seen": 1154294784 }, { "epoch": 3.04, "learning_rate": 0.00032842527582748246, "loss": 2.8844, "theoretical_loss": 3.6003300128209776, "tokens_seen": 1154360320 }, { "epoch": 3.04, "learning_rate": 0.00032841524573721165, "loss": 2.8048, "theoretical_loss": 3.600311074132243, "tokens_seen": 1154425856 }, { "epoch": 3.04, "learning_rate": 0.0003284052156469408, "loss": 2.7899, "theoretical_loss": 3.6002921368196334, "tokens_seen": 1154491392 }, { "epoch": 3.04, "learning_rate": 0.00032839518555667, "loss": 2.819, "theoretical_loss": 3.6002732008829708, "tokens_seen": 1154556928 }, { "epoch": 3.04, "learning_rate": 0.0003283851554663992, "loss": 2.9661, "theoretical_loss": 3.600254266322078, "tokens_seen": 1154622464 }, { "epoch": 3.04, "learning_rate": 0.0003283751253761284, "loss": 2.8072, "theoretical_loss": 3.6002353331367756, "tokens_seen": 1154688000 }, { "epoch": 3.04, "learning_rate": 0.00032836509528585755, "loss": 3.0106, "theoretical_loss": 3.600216401326887, "tokens_seen": 1154753536 }, { "epoch": 3.04, "learning_rate": 0.0003283550651955868, "loss": 2.88, "theoretical_loss": 3.6001974708922324, "tokens_seen": 1154819072 }, { "epoch": 3.04, "learning_rate": 0.0003283450351053159, "loss": 2.9766, "theoretical_loss": 3.600178541832636, "tokens_seen": 1154884608 }, { "epoch": 3.04, "learning_rate": 0.00032833500501504515, "loss": 2.7809, "theoretical_loss": 3.600159614147919, "tokens_seen": 1154950144 }, { "epoch": 3.04, "learning_rate": 0.00032832497492477433, "loss": 2.7772, "theoretical_loss": 3.6001406878379028, "tokens_seen": 1155015680 }, { "epoch": 3.04, "objective/train/docs_used": 1850685, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8569583892822266, "objective/train/theoretical_loss": 3.6001359564751754, "objective/train/tokens_used": 1175492064, "theoretical_loss": 3.6001359564751754, "tokens_seen": 1155032064 }, { "epoch": 3.04, "learning_rate": 0.0003283149448345035, "loss": 2.8287, "theoretical_loss": 3.6001217629024103, "tokens_seen": 1155081216 }, { "epoch": 3.04, "learning_rate": 0.0003283049147442327, "loss": 2.9249, "theoretical_loss": 3.6001028393412637, "tokens_seen": 1155146752 }, { "epoch": 3.04, "learning_rate": 0.0003282948846539619, "loss": 2.9011, "theoretical_loss": 3.6000839171542856, "tokens_seen": 1155212288 }, { "epoch": 3.04, "learning_rate": 0.00032828485456369105, "loss": 2.8116, "theoretical_loss": 3.600064996341297, "tokens_seen": 1155277824 }, { "epoch": 3.04, "learning_rate": 0.0003282748244734203, "loss": 2.7909, "theoretical_loss": 3.6000460769021214, "tokens_seen": 1155343360 }, { "epoch": 3.04, "learning_rate": 0.0003282647943831494, "loss": 2.8433, "theoretical_loss": 3.6000271588365806, "tokens_seen": 1155408896 }, { "epoch": 3.04, "learning_rate": 0.00032825476429287865, "loss": 2.923, "theoretical_loss": 3.600008242144497, "tokens_seen": 1155474432 }, { "epoch": 3.04, "learning_rate": 0.00032824473420260783, "loss": 2.8416, "theoretical_loss": 3.5999893268256935, "tokens_seen": 1155539968 }, { "epoch": 3.04, "learning_rate": 0.000328234704112337, "loss": 2.7713, "theoretical_loss": 3.5999704128799914, "tokens_seen": 1155605504 }, { "epoch": 3.04, "learning_rate": 0.0003282246740220662, "loss": 2.7801, "theoretical_loss": 3.5999515003072142, "tokens_seen": 1155671040 }, { "epoch": 3.04, "learning_rate": 0.0003282146439317954, "loss": 2.8052, "theoretical_loss": 3.5999325891071843, "tokens_seen": 1155736576 }, { "epoch": 3.04, "learning_rate": 0.00032820461384152456, "loss": 2.8855, "theoretical_loss": 3.599913679279724, "tokens_seen": 1155802112 }, { "epoch": 3.04, "learning_rate": 0.0003281945837512538, "loss": 2.7904, "theoretical_loss": 3.5998947708246556, "tokens_seen": 1155867648 }, { "epoch": 3.04, "learning_rate": 0.0003281845536609829, "loss": 2.8808, "theoretical_loss": 3.599875863741802, "tokens_seen": 1155933184 }, { "epoch": 3.04, "learning_rate": 0.00032817452357071215, "loss": 2.8094, "theoretical_loss": 3.5998569580309856, "tokens_seen": 1155998720 }, { "epoch": 3.04, "learning_rate": 0.0003281644934804413, "loss": 2.8405, "theoretical_loss": 3.5998380536920296, "tokens_seen": 1156064256 }, { "epoch": 3.04, "learning_rate": 0.0003281544633901705, "loss": 3.1018, "theoretical_loss": 3.5998191507247563, "tokens_seen": 1156129792 }, { "epoch": 3.04, "learning_rate": 0.0003281444332998997, "loss": 2.8755, "theoretical_loss": 3.5998002491289878, "tokens_seen": 1156195328 }, { "epoch": 3.04, "learning_rate": 0.0003281344032096289, "loss": 2.6292, "theoretical_loss": 3.599781348904548, "tokens_seen": 1156260864 }, { "epoch": 3.04, "learning_rate": 0.00032812437311935806, "loss": 3.0038, "theoretical_loss": 3.599762450051259, "tokens_seen": 1156326400 }, { "epoch": 3.04, "learning_rate": 0.0003281143430290873, "loss": 2.8054, "theoretical_loss": 3.5997435525689436, "tokens_seen": 1156391936 }, { "epoch": 3.04, "learning_rate": 0.0003281043129388164, "loss": 2.8269, "theoretical_loss": 3.5997246564574255, "tokens_seen": 1156457472 }, { "epoch": 3.04, "learning_rate": 0.00032809428284854566, "loss": 2.8443, "theoretical_loss": 3.5997057617165265, "tokens_seen": 1156523008 }, { "epoch": 3.04, "learning_rate": 0.0003280842527582748, "loss": 2.7522, "theoretical_loss": 3.5996868683460708, "tokens_seen": 1156588544 }, { "epoch": 3.04, "learning_rate": 0.000328074222668004, "loss": 3.0209, "theoretical_loss": 3.5996679763458794, "tokens_seen": 1156654080 }, { "epoch": 3.04, "objective/train/docs_used": 1853532, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.766611099243164, "objective/train/theoretical_loss": 3.599663253559915, "objective/train/tokens_used": 1177130464, "theoretical_loss": 3.599663253559915, "tokens_seen": 1156670464 }, { "epoch": 3.04, "learning_rate": 0.00032806419257773326, "loss": 2.7697, "theoretical_loss": 3.599649085715777, "tokens_seen": 1156719616 }, { "epoch": 3.04, "learning_rate": 0.0003280541624874624, "loss": 2.7006, "theoretical_loss": 3.5996301964555864, "tokens_seen": 1156785152 }, { "epoch": 3.04, "learning_rate": 0.0003280441323971916, "loss": 2.7483, "theoretical_loss": 3.5996113085651293, "tokens_seen": 1156850688 }, { "epoch": 3.04, "learning_rate": 0.00032803410230692074, "loss": 2.852, "theoretical_loss": 3.5995924220442306, "tokens_seen": 1156916224 }, { "epoch": 3.04, "learning_rate": 0.00032802407221665, "loss": 2.7739, "theoretical_loss": 3.5995735368927124, "tokens_seen": 1156981760 }, { "epoch": 3.04, "learning_rate": 0.00032801404212637916, "loss": 2.9992, "theoretical_loss": 3.599554653110398, "tokens_seen": 1157047296 }, { "epoch": 3.04, "learning_rate": 0.00032800401203610834, "loss": 2.8117, "theoretical_loss": 3.599535770697111, "tokens_seen": 1157112832 }, { "epoch": 3.04, "learning_rate": 0.0003279939819458375, "loss": 2.8122, "theoretical_loss": 3.5995168896526737, "tokens_seen": 1157178368 }, { "epoch": 3.04, "learning_rate": 0.0003279839518555667, "loss": 2.8469, "theoretical_loss": 3.5994980099769105, "tokens_seen": 1157243904 }, { "epoch": 3.04, "learning_rate": 0.0003279739217652959, "loss": 2.8878, "theoretical_loss": 3.5994791316696437, "tokens_seen": 1157309440 }, { "epoch": 3.04, "learning_rate": 0.0003279638916750251, "loss": 2.8934, "theoretical_loss": 3.599460254730697, "tokens_seen": 1157374976 }, { "epoch": 3.04, "learning_rate": 0.00032795386158475425, "loss": 2.7234, "theoretical_loss": 3.5994413791598943, "tokens_seen": 1157440512 }, { "epoch": 3.04, "learning_rate": 0.0003279438314944835, "loss": 3.0275, "theoretical_loss": 3.599422504957058, "tokens_seen": 1157506048 }, { "epoch": 3.04, "learning_rate": 0.00032793380140421266, "loss": 2.9582, "theoretical_loss": 3.599403632122012, "tokens_seen": 1157571584 }, { "epoch": 3.04, "learning_rate": 0.00032792377131394185, "loss": 2.9294, "theoretical_loss": 3.59938476065458, "tokens_seen": 1157637120 }, { "epoch": 3.04, "learning_rate": 0.000327913741223671, "loss": 2.7827, "theoretical_loss": 3.5993658905545844, "tokens_seen": 1157702656 }, { "epoch": 3.04, "learning_rate": 0.0003279037111334002, "loss": 2.8408, "theoretical_loss": 3.599347021821851, "tokens_seen": 1157768192 }, { "epoch": 3.04, "learning_rate": 0.0003278936810431294, "loss": 2.8976, "theoretical_loss": 3.5993281544562006, "tokens_seen": 1157833728 }, { "epoch": 3.04, "learning_rate": 0.0003278836509528586, "loss": 2.8286, "theoretical_loss": 3.5993092884574587, "tokens_seen": 1157899264 }, { "epoch": 3.04, "learning_rate": 0.00032787362086258775, "loss": 2.9458, "theoretical_loss": 3.5992904238254475, "tokens_seen": 1157964800 }, { "epoch": 3.04, "learning_rate": 0.000327863590772317, "loss": 2.9424, "theoretical_loss": 3.599271560559992, "tokens_seen": 1158030336 }, { "epoch": 3.04, "learning_rate": 0.0003278535606820461, "loss": 2.9187, "theoretical_loss": 3.5992526986609157, "tokens_seen": 1158095872 }, { "epoch": 3.04, "learning_rate": 0.00032784353059177535, "loss": 2.7487, "theoretical_loss": 3.5992338381280415, "tokens_seen": 1158161408 }, { "epoch": 3.04, "learning_rate": 0.00032783350050150453, "loss": 2.8639, "theoretical_loss": 3.5992149789611934, "tokens_seen": 1158226944 }, { "epoch": 3.04, "learning_rate": 0.0003278234704112337, "loss": 2.7573, "theoretical_loss": 3.599196121160195, "tokens_seen": 1158292480 }, { "epoch": 3.04, "objective/train/docs_used": 1856400, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.953104019165039, "objective/train/theoretical_loss": 3.599191406923339, "objective/train/tokens_used": 1178768864, "theoretical_loss": 3.599191406923339, "tokens_seen": 1158308864 }, { "epoch": 3.04, "learning_rate": 0.0003278134403209629, "loss": 2.8882, "theoretical_loss": 3.5991772647248714, "tokens_seen": 1158358016 }, { "epoch": 3.04, "learning_rate": 0.00032780341023069213, "loss": 2.994, "theoretical_loss": 3.5991584096550446, "tokens_seen": 1158423552 }, { "epoch": 3.04, "learning_rate": 0.00032779338014042125, "loss": 3.0006, "theoretical_loss": 3.5991395559505395, "tokens_seen": 1158489088 }, { "epoch": 3.04, "learning_rate": 0.0003277833500501505, "loss": 2.93, "theoretical_loss": 3.59912070361118, "tokens_seen": 1158554624 }, { "epoch": 3.04, "learning_rate": 0.0003277733199598796, "loss": 2.9467, "theoretical_loss": 3.5991018526367897, "tokens_seen": 1158620160 }, { "epoch": 3.04, "learning_rate": 0.00032776328986960885, "loss": 2.8353, "theoretical_loss": 3.599083003027193, "tokens_seen": 1158685696 }, { "epoch": 3.04, "learning_rate": 0.00032775325977933803, "loss": 2.7511, "theoretical_loss": 3.599064154782214, "tokens_seen": 1158751232 }, { "epoch": 3.04, "learning_rate": 0.0003277432296890672, "loss": 2.938, "theoretical_loss": 3.5990453079016755, "tokens_seen": 1158816768 }, { "epoch": 3.04, "learning_rate": 0.0003277331995987964, "loss": 2.747, "theoretical_loss": 3.599026462385403, "tokens_seen": 1158882304 }, { "epoch": 3.04, "learning_rate": 0.0003277231695085256, "loss": 2.9422, "theoretical_loss": 3.5990076182332205, "tokens_seen": 1158947840 }, { "epoch": 3.04, "learning_rate": 0.00032771313941825476, "loss": 2.9175, "theoretical_loss": 3.598988775444951, "tokens_seen": 1159013376 }, { "epoch": 3.04, "learning_rate": 0.000327703109327984, "loss": 2.7732, "theoretical_loss": 3.5989699340204195, "tokens_seen": 1159078912 }, { "epoch": 3.04, "learning_rate": 0.0003276930792377131, "loss": 2.827, "theoretical_loss": 3.5989510939594505, "tokens_seen": 1159144448 }, { "epoch": 3.04, "learning_rate": 0.00032768304914744235, "loss": 2.7943, "theoretical_loss": 3.598932255261867, "tokens_seen": 1159209984 }, { "epoch": 3.04, "learning_rate": 0.0003276730190571715, "loss": 2.8032, "theoretical_loss": 3.598913417927495, "tokens_seen": 1159275520 }, { "epoch": 3.04, "learning_rate": 0.0003276629889669007, "loss": 2.9004, "theoretical_loss": 3.5988945819561575, "tokens_seen": 1159341056 }, { "epoch": 3.04, "learning_rate": 0.0003276529588766299, "loss": 2.7775, "theoretical_loss": 3.5988757473476793, "tokens_seen": 1159406592 }, { "epoch": 3.04, "learning_rate": 0.0003276429287863591, "loss": 2.8384, "theoretical_loss": 3.5988569141018845, "tokens_seen": 1159472128 }, { "epoch": 3.04, "learning_rate": 0.00032763289869608826, "loss": 2.8391, "theoretical_loss": 3.598838082218598, "tokens_seen": 1159537664 }, { "epoch": 3.04, "learning_rate": 0.0003276228686058175, "loss": 2.7649, "theoretical_loss": 3.5988192516976434, "tokens_seen": 1159603200 }, { "epoch": 3.04, "learning_rate": 0.0003276128385155466, "loss": 2.8151, "theoretical_loss": 3.598800422538846, "tokens_seen": 1159668736 }, { "epoch": 3.04, "learning_rate": 0.00032760280842527586, "loss": 2.8664, "theoretical_loss": 3.5987815947420296, "tokens_seen": 1159734272 }, { "epoch": 3.04, "learning_rate": 0.000327592778335005, "loss": 2.6866, "theoretical_loss": 3.5987627683070196, "tokens_seen": 1159799808 }, { "epoch": 3.04, "learning_rate": 0.0003275827482447342, "loss": 2.7615, "theoretical_loss": 3.5987439432336394, "tokens_seen": 1159865344 }, { "epoch": 3.04, "learning_rate": 0.0003275727181544634, "loss": 2.9314, "theoretical_loss": 3.598725119521715, "tokens_seen": 1159930880 }, { "epoch": 3.04, "objective/train/docs_used": 1857916, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9079790115356445, "objective/train/theoretical_loss": 3.598720413806441, "objective/train/tokens_used": 1180407264, "theoretical_loss": 3.598720413806441, "tokens_seen": 1159947264 }, { "epoch": 3.04, "learning_rate": 0.0003275626880641926, "loss": 2.9178, "theoretical_loss": 3.59870629717107, "tokens_seen": 1159996416 }, { "epoch": 3.04, "learning_rate": 0.00032755265797392176, "loss": 2.7578, "theoretical_loss": 3.598687476181529, "tokens_seen": 1160061952 }, { "epoch": 3.04, "learning_rate": 0.00032754262788365094, "loss": 2.7885, "theoretical_loss": 3.5986686565529173, "tokens_seen": 1160127488 }, { "epoch": 3.04, "learning_rate": 0.0003275325977933801, "loss": 2.7116, "theoretical_loss": 3.598649838285059, "tokens_seen": 1160193024 }, { "epoch": 3.04, "learning_rate": 0.00032752256770310936, "loss": 2.7141, "theoretical_loss": 3.5986310213777797, "tokens_seen": 1160258560 }, { "epoch": 3.04, "learning_rate": 0.0003275125376128385, "loss": 2.9268, "theoretical_loss": 3.598612205830903, "tokens_seen": 1160324096 }, { "epoch": 3.04, "learning_rate": 0.0003275025075225677, "loss": 2.8674, "theoretical_loss": 3.598593391644255, "tokens_seen": 1160389632 }, { "epoch": 3.04, "learning_rate": 0.00032749247743229685, "loss": 2.8576, "theoretical_loss": 3.5985745788176597, "tokens_seen": 1160455168 }, { "epoch": 3.04, "learning_rate": 0.0003274824473420261, "loss": 2.8044, "theoretical_loss": 3.598555767350942, "tokens_seen": 1160520704 }, { "epoch": 3.04, "learning_rate": 0.00032747241725175527, "loss": 2.9703, "theoretical_loss": 3.5985369572439274, "tokens_seen": 1160586240 }, { "epoch": 3.04, "learning_rate": 0.00032746238716148445, "loss": 2.8259, "theoretical_loss": 3.5985181484964404, "tokens_seen": 1160651776 }, { "epoch": 3.04, "learning_rate": 0.00032745235707121363, "loss": 2.938, "theoretical_loss": 3.5984993411083055, "tokens_seen": 1160717312 }, { "epoch": 3.04, "learning_rate": 0.00032744232698094286, "loss": 2.983, "theoretical_loss": 3.598480535079349, "tokens_seen": 1160782848 }, { "epoch": 3.04, "learning_rate": 0.000327432296890672, "loss": 3.0, "theoretical_loss": 3.598461730409395, "tokens_seen": 1160848384 }, { "epoch": 3.04, "learning_rate": 0.0003274222668004012, "loss": 2.9533, "theoretical_loss": 3.5984429270982687, "tokens_seen": 1160913920 }, { "epoch": 3.04, "learning_rate": 0.00032741223671013035, "loss": 2.8992, "theoretical_loss": 3.5984241251457956, "tokens_seen": 1160979456 }, { "epoch": 3.04, "learning_rate": 0.0003274022066198596, "loss": 2.8739, "theoretical_loss": 3.5984053245518, "tokens_seen": 1161044992 }, { "epoch": 3.04, "learning_rate": 0.00032739217652958877, "loss": 2.8526, "theoretical_loss": 3.5983865253161085, "tokens_seen": 1161110528 }, { "epoch": 3.04, "learning_rate": 0.00032738214643931795, "loss": 2.8804, "theoretical_loss": 3.5983677274385446, "tokens_seen": 1161176064 }, { "epoch": 3.04, "learning_rate": 0.00032737211634904713, "loss": 2.8415, "theoretical_loss": 3.5983489309189345, "tokens_seen": 1161241600 }, { "epoch": 3.04, "learning_rate": 0.0003273620862587763, "loss": 2.8545, "theoretical_loss": 3.598330135757103, "tokens_seen": 1161307136 }, { "epoch": 3.04, "learning_rate": 0.0003273520561685055, "loss": 2.9091, "theoretical_loss": 3.5983113419528765, "tokens_seen": 1161372672 }, { "epoch": 3.04, "learning_rate": 0.00032734202607823473, "loss": 2.7522, "theoretical_loss": 3.5982925495060796, "tokens_seen": 1161438208 }, { "epoch": 3.04, "learning_rate": 0.00032733199598796386, "loss": 2.8316, "theoretical_loss": 3.5982737584165374, "tokens_seen": 1161503744 }, { "epoch": 3.04, "learning_rate": 0.0003273219658976931, "loss": 3.0131, "theoretical_loss": 3.598254968684076, "tokens_seen": 1161569280 }, { "epoch": 3.04, "objective/train/docs_used": 1860918, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8824880123138428, "objective/train/theoretical_loss": 3.598250271462984, "objective/train/tokens_used": 1182045664, "theoretical_loss": 3.598250271462984, "tokens_seen": 1161585664 }, { "epoch": 3.04, "learning_rate": 0.00032731193580742233, "loss": 2.937, "theoretical_loss": 3.5982361803085197, "tokens_seen": 1161634816 }, { "epoch": 3.04, "learning_rate": 0.00032730190571715145, "loss": 2.837, "theoretical_loss": 3.598217393289695, "tokens_seen": 1161700352 }, { "epoch": 3.04, "learning_rate": 0.0003272918756268807, "loss": 2.8622, "theoretical_loss": 3.598198607627427, "tokens_seen": 1161765888 }, { "epoch": 3.04, "learning_rate": 0.0003272818455366098, "loss": 3.0634, "theoretical_loss": 3.598179823321541, "tokens_seen": 1161831424 }, { "epoch": 3.04, "learning_rate": 0.00032727181544633905, "loss": 2.988, "theoretical_loss": 3.5981610403718634, "tokens_seen": 1161896960 }, { "epoch": 3.04, "learning_rate": 0.00032726178535606823, "loss": 2.8675, "theoretical_loss": 3.5981422587782186, "tokens_seen": 1161962496 }, { "epoch": 3.04, "learning_rate": 0.0003272517552657974, "loss": 2.8682, "theoretical_loss": 3.5981234785404337, "tokens_seen": 1162028032 }, { "epoch": 3.04, "learning_rate": 0.0003272417251755266, "loss": 2.9391, "theoretical_loss": 3.5981046996583332, "tokens_seen": 1162093568 }, { "epoch": 3.04, "learning_rate": 0.0003272316950852558, "loss": 2.8673, "theoretical_loss": 3.5980859221317427, "tokens_seen": 1162159104 }, { "epoch": 3.04, "learning_rate": 0.00032722166499498496, "loss": 2.8942, "theoretical_loss": 3.598067145960489, "tokens_seen": 1162224640 }, { "epoch": 3.04, "learning_rate": 0.0003272116349047142, "loss": 2.7314, "theoretical_loss": 3.5980483711443965, "tokens_seen": 1162290176 }, { "epoch": 3.04, "learning_rate": 0.0003272016048144433, "loss": 2.9428, "theoretical_loss": 3.5980295976832926, "tokens_seen": 1162355712 }, { "epoch": 3.04, "learning_rate": 0.00032719157472417256, "loss": 2.8706, "theoretical_loss": 3.5980108255770014, "tokens_seen": 1162421248 }, { "epoch": 3.04, "learning_rate": 0.0003271815446339017, "loss": 2.7607, "theoretical_loss": 3.59799205482535, "tokens_seen": 1162486784 }, { "epoch": 3.04, "learning_rate": 0.0003271715145436309, "loss": 3.059, "theoretical_loss": 3.597973285428164, "tokens_seen": 1162552320 }, { "epoch": 3.04, "learning_rate": 0.0003271614844533601, "loss": 2.7525, "theoretical_loss": 3.5979545173852685, "tokens_seen": 1162617856 }, { "epoch": 3.04, "learning_rate": 0.0003271514543630893, "loss": 2.8571, "theoretical_loss": 3.597935750696491, "tokens_seen": 1162683392 }, { "epoch": 3.04, "learning_rate": 0.00032714142427281846, "loss": 2.8308, "theoretical_loss": 3.597916985361656, "tokens_seen": 1162748928 }, { "epoch": 3.04, "learning_rate": 0.0003271313941825477, "loss": 3.0927, "theoretical_loss": 3.59789822138059, "tokens_seen": 1162814464 }, { "epoch": 3.04, "learning_rate": 0.0003271213640922768, "loss": 2.8448, "theoretical_loss": 3.5978794587531198, "tokens_seen": 1162880000 }, { "epoch": 3.04, "learning_rate": 0.00032711133400200606, "loss": 2.8188, "theoretical_loss": 3.597860697479071, "tokens_seen": 1162945536 }, { "epoch": 3.04, "learning_rate": 0.0003271013039117352, "loss": 2.9602, "theoretical_loss": 3.5978419375582686, "tokens_seen": 1163011072 }, { "epoch": 3.04, "learning_rate": 0.0003270912738214644, "loss": 2.9491, "theoretical_loss": 3.59782317899054, "tokens_seen": 1163076608 }, { "epoch": 3.04, "learning_rate": 0.0003270812437311936, "loss": 2.9957, "theoretical_loss": 3.5978044217757112, "tokens_seen": 1163142144 }, { "epoch": 3.04, "learning_rate": 0.0003270712136409228, "loss": 2.8621, "theoretical_loss": 3.5977856659136087, "tokens_seen": 1163207680 }, { "epoch": 3.04, "objective/train/docs_used": 1864384, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.141984701156616, "objective/train/theoretical_loss": 3.597780977159426, "objective/train/tokens_used": 1183684064, "theoretical_loss": 3.597780977159426, "tokens_seen": 1163224064 }, { "epoch": 3.04, "learning_rate": 0.00032706118355065196, "loss": 2.9682, "theoretical_loss": 3.597766911404058, "tokens_seen": 1163273216 }, { "epoch": 3.04, "learning_rate": 0.00032705115346038114, "loss": 2.795, "theoretical_loss": 3.597748158246886, "tokens_seen": 1163338752 }, { "epoch": 3.04, "learning_rate": 0.0003270411233701103, "loss": 2.8707, "theoretical_loss": 3.5977294064419176, "tokens_seen": 1163404288 }, { "epoch": 3.04, "learning_rate": 0.00032703109327983956, "loss": 2.848, "theoretical_loss": 3.5977106559889815, "tokens_seen": 1163469824 }, { "epoch": 3.04, "learning_rate": 0.0003270210631895687, "loss": 2.7414, "theoretical_loss": 3.5976919068879023, "tokens_seen": 1163535360 }, { "epoch": 3.04, "learning_rate": 0.0003270110330992979, "loss": 2.8882, "theoretical_loss": 3.597673159138507, "tokens_seen": 1163600896 }, { "epoch": 3.04, "learning_rate": 0.00032700100300902705, "loss": 2.9584, "theoretical_loss": 3.5976544127406216, "tokens_seen": 1163666432 }, { "epoch": 3.04, "learning_rate": 0.0003269909729187563, "loss": 2.8375, "theoretical_loss": 3.5976356676940733, "tokens_seen": 1163731968 }, { "epoch": 3.04, "learning_rate": 0.00032698094282848547, "loss": 2.8289, "theoretical_loss": 3.5976169239986877, "tokens_seen": 1163797504 }, { "epoch": 3.04, "learning_rate": 0.00032697091273821465, "loss": 2.8481, "theoretical_loss": 3.597598181654292, "tokens_seen": 1163863040 }, { "epoch": 3.04, "learning_rate": 0.00032696088264794383, "loss": 2.9594, "theoretical_loss": 3.5975794406607133, "tokens_seen": 1163928576 }, { "epoch": 3.04, "learning_rate": 0.00032695085255767306, "loss": 2.8478, "theoretical_loss": 3.597560701017777, "tokens_seen": 1163994112 }, { "epoch": 3.04, "learning_rate": 0.0003269408224674022, "loss": 2.8304, "theoretical_loss": 3.59754196272531, "tokens_seen": 1164059648 }, { "epoch": 3.04, "learning_rate": 0.0003269307923771314, "loss": 2.9283, "theoretical_loss": 3.5975232257831395, "tokens_seen": 1164125184 }, { "epoch": 3.04, "learning_rate": 0.00032692076228686055, "loss": 2.7851, "theoretical_loss": 3.5975044901910915, "tokens_seen": 1164190720 }, { "epoch": 3.04, "learning_rate": 0.0003269107321965898, "loss": 2.8169, "theoretical_loss": 3.597485755948993, "tokens_seen": 1164256256 }, { "epoch": 3.04, "learning_rate": 0.00032690070210631897, "loss": 2.6546, "theoretical_loss": 3.597467023056671, "tokens_seen": 1164321792 }, { "epoch": 3.04, "learning_rate": 0.00032689067201604815, "loss": 2.8757, "theoretical_loss": 3.597448291513952, "tokens_seen": 1164387328 }, { "epoch": 3.04, "learning_rate": 0.00032688064192577733, "loss": 2.9068, "theoretical_loss": 3.5974295613206633, "tokens_seen": 1164452864 }, { "epoch": 3.04, "learning_rate": 0.0003268706118355065, "loss": 2.875, "theoretical_loss": 3.5974108324766307, "tokens_seen": 1164518400 }, { "epoch": 3.04, "learning_rate": 0.0003268605817452357, "loss": 2.9806, "theoretical_loss": 3.597392104981682, "tokens_seen": 1164583936 }, { "epoch": 3.04, "learning_rate": 0.00032685055165496493, "loss": 2.9438, "theoretical_loss": 3.5973733788356443, "tokens_seen": 1164649472 }, { "epoch": 3.04, "learning_rate": 0.00032684052156469406, "loss": 2.8533, "theoretical_loss": 3.5973546540383436, "tokens_seen": 1164715008 }, { "epoch": 3.04, "learning_rate": 0.0003268304914744233, "loss": 2.8704, "theoretical_loss": 3.5973359305896078, "tokens_seen": 1164780544 }, { "epoch": 3.04, "learning_rate": 0.0003268204613841524, "loss": 3.0148, "theoretical_loss": 3.5973172084892635, "tokens_seen": 1164846080 }, { "epoch": 3.04, "objective/train/docs_used": 1865774, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.83148193359375, "objective/train/theoretical_loss": 3.597312528174843, "objective/train/tokens_used": 1185322464, "theoretical_loss": 3.597312528174843, "tokens_seen": 1164862464 }, { "epoch": 3.04, "learning_rate": 0.00032681043129388165, "loss": 2.8237, "theoretical_loss": 3.5972984877371372, "tokens_seen": 1164911616 }, { "epoch": 3.04, "learning_rate": 0.00032680040120361084, "loss": 2.8553, "theoretical_loss": 3.597279768333057, "tokens_seen": 1164977152 }, { "epoch": 3.04, "learning_rate": 0.00032679037111334, "loss": 2.9444, "theoretical_loss": 3.597261050276849, "tokens_seen": 1165042688 }, { "epoch": 3.04, "learning_rate": 0.0003267803410230692, "loss": 2.864, "theoretical_loss": 3.597242333568341, "tokens_seen": 1165108224 }, { "epoch": 3.04, "learning_rate": 0.00032677031093279843, "loss": 2.917, "theoretical_loss": 3.5972236182073605, "tokens_seen": 1165173760 }, { "epoch": 3.04, "learning_rate": 0.00032676028084252756, "loss": 2.744, "theoretical_loss": 3.5972049041937337, "tokens_seen": 1165239296 }, { "epoch": 3.04, "learning_rate": 0.0003267502507522568, "loss": 2.8181, "theoretical_loss": 3.597186191527288, "tokens_seen": 1165304832 }, { "epoch": 3.04, "learning_rate": 0.0003267402206619859, "loss": 2.8911, "theoretical_loss": 3.5971674802078515, "tokens_seen": 1165370368 }, { "epoch": 3.04, "learning_rate": 0.00032673019057171516, "loss": 2.8212, "theoretical_loss": 3.597148770235251, "tokens_seen": 1165435904 }, { "epoch": 3.04, "learning_rate": 0.00032672016048144434, "loss": 2.9081, "theoretical_loss": 3.5971300616093136, "tokens_seen": 1165501440 }, { "epoch": 3.04, "learning_rate": 0.0003267101303911735, "loss": 2.7901, "theoretical_loss": 3.5971113543298667, "tokens_seen": 1165566976 }, { "epoch": 3.04, "learning_rate": 0.0003267001003009027, "loss": 2.8948, "theoretical_loss": 3.5970926483967385, "tokens_seen": 1165632512 }, { "epoch": 3.04, "learning_rate": 0.0003266900702106319, "loss": 2.8797, "theoretical_loss": 3.597073943809755, "tokens_seen": 1165698048 }, { "epoch": 3.04, "learning_rate": 0.00032668004012036106, "loss": 2.9164, "theoretical_loss": 3.597055240568745, "tokens_seen": 1165763584 }, { "epoch": 3.04, "learning_rate": 0.0003266700100300903, "loss": 2.6888, "theoretical_loss": 3.5970365386735352, "tokens_seen": 1165829120 }, { "epoch": 3.04, "learning_rate": 0.0003266599799398194, "loss": 2.8056, "theoretical_loss": 3.5970178381239535, "tokens_seen": 1165894656 }, { "epoch": 3.04, "learning_rate": 0.00032664994984954866, "loss": 2.8133, "theoretical_loss": 3.596999138919827, "tokens_seen": 1165960192 }, { "epoch": 3.04, "learning_rate": 0.00032663991975927784, "loss": 2.8217, "theoretical_loss": 3.596980441060984, "tokens_seen": 1166025728 }, { "epoch": 3.04, "learning_rate": 0.000326629889669007, "loss": 2.8774, "theoretical_loss": 3.5969617445472517, "tokens_seen": 1166091264 }, { "epoch": 3.04, "learning_rate": 0.0003266198595787362, "loss": 2.8713, "theoretical_loss": 3.5969430493784573, "tokens_seen": 1166156800 }, { "epoch": 3.04, "learning_rate": 0.0003266098294884654, "loss": 2.8491, "theoretical_loss": 3.596924355554429, "tokens_seen": 1166222336 }, { "epoch": 3.04, "learning_rate": 0.00032659979939819457, "loss": 2.8826, "theoretical_loss": 3.5969056630749945, "tokens_seen": 1166287872 }, { "epoch": 3.04, "learning_rate": 0.0003265897693079238, "loss": 2.8478, "theoretical_loss": 3.5968869719399814, "tokens_seen": 1166353408 }, { "epoch": 3.04, "learning_rate": 0.00032657973921765293, "loss": 2.8897, "theoretical_loss": 3.5968682821492175, "tokens_seen": 1166418944 }, { "epoch": 3.04, "learning_rate": 0.00032656970912738216, "loss": 2.902, "theoretical_loss": 3.596849593702531, "tokens_seen": 1166484480 }, { "epoch": 3.04, "objective/train/docs_used": 1868615, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6722476482391357, "objective/train/theoretical_loss": 3.596844921800851, "objective/train/tokens_used": 1186960864, "theoretical_loss": 3.596844921800851, "tokens_seen": 1166500864 }, { "epoch": 3.04, "learning_rate": 0.00032655967903711135, "loss": 2.6119, "theoretical_loss": 3.596830906599749, "tokens_seen": 1166550016 }, { "epoch": 3.04, "learning_rate": 0.0003265496489468405, "loss": 2.857, "theoretical_loss": 3.5968122208407, "tokens_seen": 1166615552 }, { "epoch": 3.04, "learning_rate": 0.00032653961885656976, "loss": 2.8546, "theoretical_loss": 3.5967935364252113, "tokens_seen": 1166681088 }, { "epoch": 3.04, "learning_rate": 0.0003265295887662989, "loss": 2.8899, "theoretical_loss": 3.596774853353111, "tokens_seen": 1166746624 }, { "epoch": 3.04, "learning_rate": 0.0003265195586760281, "loss": 2.7808, "theoretical_loss": 3.596756171624228, "tokens_seen": 1166812160 }, { "epoch": 3.04, "learning_rate": 0.00032650952858575725, "loss": 2.7966, "theoretical_loss": 3.5967374912383887, "tokens_seen": 1166877696 }, { "epoch": 3.04, "learning_rate": 0.0003264994984954865, "loss": 2.8869, "theoretical_loss": 3.596718812195422, "tokens_seen": 1166943232 }, { "epoch": 3.04, "learning_rate": 0.00032648946840521567, "loss": 2.7789, "theoretical_loss": 3.5967001344951566, "tokens_seen": 1167008768 }, { "epoch": 3.04, "learning_rate": 0.00032647943831494485, "loss": 2.9022, "theoretical_loss": 3.5966814581374194, "tokens_seen": 1167074304 }, { "epoch": 3.04, "learning_rate": 0.00032646940822467403, "loss": 2.9395, "theoretical_loss": 3.5966627831220395, "tokens_seen": 1167139840 }, { "epoch": 3.04, "learning_rate": 0.00032645937813440326, "loss": 2.8551, "theoretical_loss": 3.5966441094488437, "tokens_seen": 1167205376 }, { "epoch": 3.04, "learning_rate": 0.0003264493480441324, "loss": 2.9012, "theoretical_loss": 3.5966254371176616, "tokens_seen": 1167270912 }, { "epoch": 3.04, "learning_rate": 0.0003264393179538616, "loss": 2.8967, "theoretical_loss": 3.5966067661283208, "tokens_seen": 1167336448 }, { "epoch": 3.04, "learning_rate": 0.00032642928786359075, "loss": 2.8626, "theoretical_loss": 3.5965880964806494, "tokens_seen": 1167401984 }, { "epoch": 3.04, "learning_rate": 0.00032641925777332, "loss": 2.7754, "theoretical_loss": 3.596569428174476, "tokens_seen": 1167467520 }, { "epoch": 3.04, "learning_rate": 0.00032640922768304917, "loss": 2.899, "theoretical_loss": 3.5965507612096292, "tokens_seen": 1167533056 }, { "epoch": 3.04, "learning_rate": 0.00032639919759277835, "loss": 2.8165, "theoretical_loss": 3.5965320955859363, "tokens_seen": 1167598592 }, { "epoch": 3.04, "learning_rate": 0.00032638916750250753, "loss": 2.914, "theoretical_loss": 3.5965134313032268, "tokens_seen": 1167664128 }, { "epoch": 3.04, "learning_rate": 0.0003263791374122367, "loss": 2.8666, "theoretical_loss": 3.596494768361328, "tokens_seen": 1167729664 }, { "epoch": 3.04, "learning_rate": 0.0003263691073219659, "loss": 2.9111, "theoretical_loss": 3.5964761067600692, "tokens_seen": 1167795200 }, { "epoch": 3.04, "learning_rate": 0.00032635907723169513, "loss": 2.8341, "theoretical_loss": 3.5964574464992793, "tokens_seen": 1167860736 }, { "epoch": 3.04, "learning_rate": 0.00032634904714142426, "loss": 2.7871, "theoretical_loss": 3.5964387875787853, "tokens_seen": 1167926272 }, { "epoch": 3.04, "learning_rate": 0.0003263390170511535, "loss": 3.023, "theoretical_loss": 3.5964201299984166, "tokens_seen": 1167991808 }, { "epoch": 3.04, "learning_rate": 0.0003263289869608826, "loss": 2.7814, "theoretical_loss": 3.5964014737580023, "tokens_seen": 1168057344 }, { "epoch": 3.04, "learning_rate": 0.00032631895687061185, "loss": 2.922, "theoretical_loss": 3.5963828188573697, "tokens_seen": 1168122880 }, { "epoch": 3.04, "objective/train/docs_used": 1871360, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.851871967315674, "objective/train/theoretical_loss": 3.596378155341533, "objective/train/tokens_used": 1188599264, "theoretical_loss": 3.596378155341533, "tokens_seen": 1168139264 }, { "epoch": 3.04, "learning_rate": 0.00032630892678034104, "loss": 2.9312, "theoretical_loss": 3.5963641652963485, "tokens_seen": 1168188416 }, { "epoch": 3.04, "learning_rate": 0.0003262988966900702, "loss": 2.9505, "theoretical_loss": 3.596345513074767, "tokens_seen": 1168253952 }, { "epoch": 3.04, "learning_rate": 0.0003262888665997994, "loss": 2.7903, "theoretical_loss": 3.596326862192454, "tokens_seen": 1168319488 }, { "epoch": 3.04, "learning_rate": 0.00032627883650952863, "loss": 2.9689, "theoretical_loss": 3.5963082126492383, "tokens_seen": 1168385024 }, { "epoch": 3.04, "learning_rate": 0.00032626880641925776, "loss": 2.8127, "theoretical_loss": 3.596289564444948, "tokens_seen": 1168450560 }, { "epoch": 3.04, "learning_rate": 0.000326258776328987, "loss": 2.8894, "theoretical_loss": 3.596270917579412, "tokens_seen": 1168516096 }, { "epoch": 3.04, "learning_rate": 0.0003262487462387161, "loss": 2.8454, "theoretical_loss": 3.5962522720524603, "tokens_seen": 1168581632 }, { "epoch": 3.04, "learning_rate": 0.00032623871614844536, "loss": 2.6952, "theoretical_loss": 3.5962336278639206, "tokens_seen": 1168647168 }, { "epoch": 3.04, "learning_rate": 0.00032622868605817454, "loss": 2.8934, "theoretical_loss": 3.5962149850136216, "tokens_seen": 1168712704 }, { "epoch": 3.04, "learning_rate": 0.0003262186559679037, "loss": 2.7455, "theoretical_loss": 3.5961963435013935, "tokens_seen": 1168778240 }, { "epoch": 3.04, "learning_rate": 0.0003262086258776329, "loss": 2.939, "theoretical_loss": 3.596177703327064, "tokens_seen": 1168843776 }, { "epoch": 3.04, "learning_rate": 0.0003261985957873621, "loss": 2.9174, "theoretical_loss": 3.5961590644904624, "tokens_seen": 1168909312 }, { "epoch": 3.04, "learning_rate": 0.00032618856569709126, "loss": 2.8383, "theoretical_loss": 3.596140426991418, "tokens_seen": 1168974848 }, { "epoch": 3.04, "learning_rate": 0.0003261785356068205, "loss": 2.9556, "theoretical_loss": 3.596121790829759, "tokens_seen": 1169040384 }, { "epoch": 3.04, "learning_rate": 0.0003261685055165496, "loss": 2.8968, "theoretical_loss": 3.596103156005316, "tokens_seen": 1169105920 }, { "epoch": 3.04, "learning_rate": 0.00032615847542627886, "loss": 2.8691, "theoretical_loss": 3.5960845225179168, "tokens_seen": 1169171456 }, { "epoch": 3.04, "learning_rate": 0.00032614844533600804, "loss": 2.8862, "theoretical_loss": 3.5960658903673908, "tokens_seen": 1169236992 }, { "epoch": 3.04, "learning_rate": 0.0003261384152457372, "loss": 2.9196, "theoretical_loss": 3.5960472595535675, "tokens_seen": 1169302528 }, { "epoch": 3.04, "learning_rate": 0.0003261283851554664, "loss": 2.9067, "theoretical_loss": 3.596028630076276, "tokens_seen": 1169368064 }, { "epoch": 3.04, "learning_rate": 0.0003261183550651956, "loss": 2.7533, "theoretical_loss": 3.5960100019353454, "tokens_seen": 1169433600 }, { "epoch": 3.04, "learning_rate": 0.00032610832497492477, "loss": 2.8114, "theoretical_loss": 3.595991375130605, "tokens_seen": 1169499136 }, { "epoch": 3.04, "learning_rate": 0.000326098294884654, "loss": 2.9526, "theoretical_loss": 3.595972749661884, "tokens_seen": 1169564672 }, { "epoch": 3.04, "learning_rate": 0.00032608826479438313, "loss": 2.8739, "theoretical_loss": 3.5959541255290115, "tokens_seen": 1169630208 }, { "epoch": 3.04, "learning_rate": 0.00032607823470411236, "loss": 3.0373, "theoretical_loss": 3.595935502731817, "tokens_seen": 1169695744 }, { "epoch": 3.04, "learning_rate": 0.0003260682046138415, "loss": 2.75, "theoretical_loss": 3.5959168812701305, "tokens_seen": 1169761280 }, { "epoch": 3.04, "objective/train/docs_used": 1874343, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.859848737716675, "objective/train/theoretical_loss": 3.595912226113362, "objective/train/tokens_used": 1190237664, "theoretical_loss": 3.595912226113362, "tokens_seen": 1169777664 }, { "epoch": 3.04, "learning_rate": 0.0003260581745235707, "loss": 2.9007, "theoretical_loss": 3.595898261143781, "tokens_seen": 1169826816 }, { "epoch": 3.04, "learning_rate": 0.0003260481444332999, "loss": 2.8545, "theoretical_loss": 3.5958796423525974, "tokens_seen": 1169892352 }, { "epoch": 3.04, "learning_rate": 0.0003260381143430291, "loss": 2.9597, "theoretical_loss": 3.5958610248964096, "tokens_seen": 1169957888 }, { "epoch": 3.04, "learning_rate": 0.00032602808425275827, "loss": 2.6007, "theoretical_loss": 3.595842408775048, "tokens_seen": 1170023424 }, { "epoch": 3.04, "learning_rate": 0.00032601805416248745, "loss": 2.7249, "theoretical_loss": 3.59582379398834, "tokens_seen": 1170088960 }, { "epoch": 3.04, "learning_rate": 0.00032600802407221663, "loss": 2.9003, "theoretical_loss": 3.595805180536117, "tokens_seen": 1170154496 }, { "epoch": 3.04, "learning_rate": 0.00032599799398194587, "loss": 2.7923, "theoretical_loss": 3.595786568418208, "tokens_seen": 1170220032 }, { "epoch": 3.04, "learning_rate": 0.000325987963891675, "loss": 2.854, "theoretical_loss": 3.5957679576344432, "tokens_seen": 1170285568 }, { "epoch": 3.04, "learning_rate": 0.00032597793380140423, "loss": 2.8569, "theoretical_loss": 3.5957493481846514, "tokens_seen": 1170351104 }, { "epoch": 3.04, "learning_rate": 0.0003259679037111334, "loss": 2.6746, "theoretical_loss": 3.5957307400686624, "tokens_seen": 1170416640 }, { "epoch": 3.04, "learning_rate": 0.0003259578736208626, "loss": 2.932, "theoretical_loss": 3.5957121332863062, "tokens_seen": 1170482176 }, { "epoch": 3.04, "learning_rate": 0.0003259478435305918, "loss": 2.9687, "theoretical_loss": 3.5956935278374127, "tokens_seen": 1170547712 }, { "epoch": 3.04, "learning_rate": 0.00032593781344032095, "loss": 2.8418, "theoretical_loss": 3.5956749237218113, "tokens_seen": 1170613248 }, { "epoch": 3.04, "learning_rate": 0.00032592778335005014, "loss": 2.8919, "theoretical_loss": 3.595656320939332, "tokens_seen": 1170678784 }, { "epoch": 3.04, "learning_rate": 0.00032591775325977937, "loss": 2.9159, "theoretical_loss": 3.595637719489805, "tokens_seen": 1170744320 }, { "epoch": 3.04, "learning_rate": 0.0003259077231695085, "loss": 2.8228, "theoretical_loss": 3.5956191193730596, "tokens_seen": 1170809856 }, { "epoch": 3.04, "learning_rate": 0.00032589769307923773, "loss": 2.7883, "theoretical_loss": 3.595600520588926, "tokens_seen": 1170875392 }, { "epoch": 3.04, "learning_rate": 0.00032588766298896686, "loss": 2.9107, "theoretical_loss": 3.5955819231372343, "tokens_seen": 1170940928 }, { "epoch": 3.04, "learning_rate": 0.0003258776328986961, "loss": 2.9097, "theoretical_loss": 3.595563327017814, "tokens_seen": 1171006464 }, { "epoch": 3.04, "learning_rate": 0.0003258676028084253, "loss": 2.7805, "theoretical_loss": 3.5955447322304956, "tokens_seen": 1171072000 }, { "epoch": 3.04, "learning_rate": 0.00032585757271815446, "loss": 2.881, "theoretical_loss": 3.595526138775109, "tokens_seen": 1171137536 }, { "epoch": 3.04, "learning_rate": 0.00032584754262788364, "loss": 2.8276, "theoretical_loss": 3.5955075466514836, "tokens_seen": 1171203072 }, { "epoch": 3.04, "learning_rate": 0.0003258375125376128, "loss": 2.7579, "theoretical_loss": 3.5954889558594507, "tokens_seen": 1171268608 }, { "epoch": 3.04, "learning_rate": 0.000325827482447342, "loss": 2.8357, "theoretical_loss": 3.59547036639884, "tokens_seen": 1171334144 }, { "epoch": 3.04, "learning_rate": 0.00032581745235707124, "loss": 2.8185, "theoretical_loss": 3.5954517782694806, "tokens_seen": 1171399680 }, { "epoch": 3.04, "objective/train/docs_used": 1876977, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7813780307769775, "objective/train/theoretical_loss": 3.5954471314451295, "objective/train/tokens_used": 1191876064, "theoretical_loss": 3.5954471314451295, "tokens_seen": 1171416064 }, { "epoch": 3.04, "learning_rate": 0.0003258074222668004, "loss": 2.866, "theoretical_loss": 3.595433191471204, "tokens_seen": 1171465216 }, { "epoch": 3.04, "learning_rate": 0.0003257973921765296, "loss": 2.9924, "theoretical_loss": 3.5954146060038408, "tokens_seen": 1171530752 }, { "epoch": 3.04, "learning_rate": 0.00032578736208625883, "loss": 2.908, "theoretical_loss": 3.5953960218672196, "tokens_seen": 1171596288 }, { "epoch": 3.04, "learning_rate": 0.00032577733199598796, "loss": 2.8161, "theoretical_loss": 3.5953774390611715, "tokens_seen": 1171661824 }, { "epoch": 3.04, "learning_rate": 0.0003257673019057172, "loss": 2.8304, "theoretical_loss": 3.595358857585527, "tokens_seen": 1171727360 }, { "epoch": 3.04, "learning_rate": 0.0003257572718154463, "loss": 3.0144, "theoretical_loss": 3.595340277440117, "tokens_seen": 1171792896 }, { "epoch": 3.04, "learning_rate": 0.00032574724172517556, "loss": 2.8833, "theoretical_loss": 3.5953216986247707, "tokens_seen": 1171858432 }, { "epoch": 3.04, "learning_rate": 0.00032573721163490474, "loss": 2.8517, "theoretical_loss": 3.5953031211393185, "tokens_seen": 1171923968 }, { "epoch": 3.04, "learning_rate": 0.0003257271815446339, "loss": 2.7745, "theoretical_loss": 3.595284544983592, "tokens_seen": 1171989504 }, { "epoch": 3.04, "learning_rate": 0.0003257171514543631, "loss": 3.0098, "theoretical_loss": 3.5952659701574206, "tokens_seen": 1172055040 }, { "epoch": 3.04, "learning_rate": 0.0003257071213640923, "loss": 2.9587, "theoretical_loss": 3.5952473966606355, "tokens_seen": 1172120576 }, { "epoch": 3.04, "learning_rate": 0.00032569709127382146, "loss": 2.9533, "theoretical_loss": 3.5952288244930672, "tokens_seen": 1172186112 }, { "epoch": 3.04, "learning_rate": 0.0003256870611835507, "loss": 2.9271, "theoretical_loss": 3.5952102536545456, "tokens_seen": 1172251648 }, { "epoch": 3.04, "learning_rate": 0.0003256770310932798, "loss": 2.9281, "theoretical_loss": 3.5951916841449023, "tokens_seen": 1172317184 }, { "epoch": 3.04, "learning_rate": 0.00032566700100300906, "loss": 2.9038, "theoretical_loss": 3.5951731159639664, "tokens_seen": 1172382720 }, { "epoch": 3.04, "learning_rate": 0.00032565697091273824, "loss": 2.7898, "theoretical_loss": 3.5951545491115704, "tokens_seen": 1172448256 }, { "epoch": 3.04, "learning_rate": 0.0003256469408224674, "loss": 2.8632, "theoretical_loss": 3.595135983587544, "tokens_seen": 1172513792 }, { "epoch": 3.04, "learning_rate": 0.0003256369107321966, "loss": 2.7943, "theoretical_loss": 3.5951174193917175, "tokens_seen": 1172579328 }, { "epoch": 3.04, "learning_rate": 0.0003256268806419258, "loss": 2.65, "theoretical_loss": 3.595098856523922, "tokens_seen": 1172644864 }, { "epoch": 3.04, "learning_rate": 0.00032561685055165497, "loss": 2.9666, "theoretical_loss": 3.5950802949839895, "tokens_seen": 1172710400 }, { "epoch": 3.04, "learning_rate": 0.0003256068204613842, "loss": 2.7378, "theoretical_loss": 3.595061734771749, "tokens_seen": 1172775936 }, { "epoch": 3.04, "learning_rate": 0.00032559679037111333, "loss": 2.9035, "theoretical_loss": 3.5950431758870325, "tokens_seen": 1172841472 }, { "epoch": 3.04, "learning_rate": 0.00032558676028084256, "loss": 2.9321, "theoretical_loss": 3.5950246183296706, "tokens_seen": 1172907008 }, { "epoch": 3.04, "learning_rate": 0.0003255767301905717, "loss": 2.7802, "theoretical_loss": 3.5950060620994937, "tokens_seen": 1172972544 }, { "epoch": 3.04, "learning_rate": 0.0003255667001003009, "loss": 2.6625, "theoretical_loss": 3.5949875071963335, "tokens_seen": 1173038080 }, { "epoch": 3.04, "objective/train/docs_used": 1879951, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.729292392730713, "objective/train/theoretical_loss": 3.5949828686778695, "objective/train/tokens_used": 1193514464, "theoretical_loss": 3.5949828686778695, "tokens_seen": 1173054464 }, { "epoch": 3.04, "learning_rate": 0.0003255566700100301, "loss": 2.8865, "theoretical_loss": 3.59496895362002, "tokens_seen": 1173103616 }, { "epoch": 3.04, "learning_rate": 0.0003255466399197593, "loss": 2.9418, "theoretical_loss": 3.5949504013703857, "tokens_seen": 1173169152 }, { "epoch": 3.04, "learning_rate": 0.00032553660982948847, "loss": 2.8848, "theoretical_loss": 3.59493185044726, "tokens_seen": 1173234688 }, { "epoch": 3.04, "learning_rate": 0.00032552657973921765, "loss": 2.8553, "theoretical_loss": 3.5949133008504752, "tokens_seen": 1173300224 }, { "epoch": 3.04, "learning_rate": 0.00032551654964894683, "loss": 2.963, "theoretical_loss": 3.594894752579862, "tokens_seen": 1173365760 }, { "epoch": 3.04, "learning_rate": 0.00032550651955867607, "loss": 2.9143, "theoretical_loss": 3.5948762056352512, "tokens_seen": 1173431296 }, { "epoch": 3.04, "learning_rate": 0.0003254964894684052, "loss": 2.873, "theoretical_loss": 3.5948576600164746, "tokens_seen": 1173496832 }, { "epoch": 3.04, "learning_rate": 0.00032548645937813443, "loss": 2.714, "theoretical_loss": 3.594839115723362, "tokens_seen": 1173562368 }, { "epoch": 3.05, "learning_rate": 0.0003254764292878636, "loss": 2.8892, "theoretical_loss": 3.5948205727557463, "tokens_seen": 1173627904 }, { "epoch": 3.05, "learning_rate": 0.0003254663991975928, "loss": 3.0613, "theoretical_loss": 3.594802031113458, "tokens_seen": 1173693440 }, { "epoch": 3.05, "learning_rate": 0.000325456369107322, "loss": 2.9833, "theoretical_loss": 3.5947834907963285, "tokens_seen": 1173758976 }, { "epoch": 3.05, "learning_rate": 0.00032544633901705115, "loss": 2.8011, "theoretical_loss": 3.594764951804189, "tokens_seen": 1173824512 }, { "epoch": 3.05, "learning_rate": 0.00032543630892678034, "loss": 2.9809, "theoretical_loss": 3.594746414136871, "tokens_seen": 1173890048 }, { "epoch": 3.05, "learning_rate": 0.00032542627883650957, "loss": 2.8316, "theoretical_loss": 3.594727877794205, "tokens_seen": 1173955584 }, { "epoch": 3.05, "learning_rate": 0.0003254162487462387, "loss": 2.7968, "theoretical_loss": 3.5947093427760244, "tokens_seen": 1174021120 }, { "epoch": 3.05, "learning_rate": 0.00032540621865596793, "loss": 2.7107, "theoretical_loss": 3.5946908090821585, "tokens_seen": 1174086656 }, { "epoch": 3.05, "learning_rate": 0.00032539618856569706, "loss": 2.9261, "theoretical_loss": 3.5946722767124397, "tokens_seen": 1174152192 }, { "epoch": 3.05, "learning_rate": 0.0003253861584754263, "loss": 2.7391, "theoretical_loss": 3.5946537456667, "tokens_seen": 1174217728 }, { "epoch": 3.05, "learning_rate": 0.0003253761283851555, "loss": 2.8409, "theoretical_loss": 3.59463521594477, "tokens_seen": 1174283264 }, { "epoch": 3.05, "learning_rate": 0.00032536609829488466, "loss": 3.0014, "theoretical_loss": 3.5946166875464813, "tokens_seen": 1174348800 }, { "epoch": 3.05, "learning_rate": 0.00032535606820461384, "loss": 2.8512, "theoretical_loss": 3.5945981604716666, "tokens_seen": 1174414336 }, { "epoch": 3.05, "learning_rate": 0.000325346038114343, "loss": 2.9239, "theoretical_loss": 3.594579634720156, "tokens_seen": 1174479872 }, { "epoch": 3.05, "learning_rate": 0.0003253360080240722, "loss": 2.8224, "theoretical_loss": 3.5945611102917825, "tokens_seen": 1174545408 }, { "epoch": 3.05, "learning_rate": 0.00032532597793380144, "loss": 2.958, "theoretical_loss": 3.5945425871863765, "tokens_seen": 1174610944 }, { "epoch": 3.05, "learning_rate": 0.00032531594784353056, "loss": 2.8125, "theoretical_loss": 3.594524065403771, "tokens_seen": 1174676480 }, { "epoch": 3.05, "objective/train/docs_used": 1881236, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.098384141921997, "objective/train/theoretical_loss": 3.594519435164787, "objective/train/tokens_used": 1195152864, "theoretical_loss": 3.594519435164787, "tokens_seen": 1174692864 }, { "epoch": 3.05, "learning_rate": 0.0003253059177532598, "loss": 2.9322, "theoretical_loss": 3.5945055449437966, "tokens_seen": 1174742016 }, { "epoch": 3.05, "learning_rate": 0.000325295887662989, "loss": 3.0184, "theoretical_loss": 3.594487025806286, "tokens_seen": 1174807552 }, { "epoch": 3.05, "learning_rate": 0.00032528585757271816, "loss": 2.628, "theoretical_loss": 3.5944685079910705, "tokens_seen": 1174873088 }, { "epoch": 3.05, "learning_rate": 0.00032527582748244734, "loss": 2.8955, "theoretical_loss": 3.5944499914979824, "tokens_seen": 1174938624 }, { "epoch": 3.05, "learning_rate": 0.0003252657973921765, "loss": 2.9109, "theoretical_loss": 3.5944314763268523, "tokens_seen": 1175004160 }, { "epoch": 3.05, "learning_rate": 0.0003252557673019057, "loss": 2.9303, "theoretical_loss": 3.594412962477514, "tokens_seen": 1175069696 }, { "epoch": 3.05, "learning_rate": 0.00032524573721163494, "loss": 2.7196, "theoretical_loss": 3.5943944499497977, "tokens_seen": 1175135232 }, { "epoch": 3.05, "learning_rate": 0.00032523570712136407, "loss": 2.8911, "theoretical_loss": 3.5943759387435366, "tokens_seen": 1175200768 }, { "epoch": 3.05, "learning_rate": 0.0003252256770310933, "loss": 2.9309, "theoretical_loss": 3.5943574288585616, "tokens_seen": 1175266304 }, { "epoch": 3.05, "learning_rate": 0.00032521564694082243, "loss": 2.8967, "theoretical_loss": 3.594338920294706, "tokens_seen": 1175331840 }, { "epoch": 3.05, "learning_rate": 0.00032520561685055166, "loss": 2.8381, "theoretical_loss": 3.5943204130518005, "tokens_seen": 1175397376 }, { "epoch": 3.05, "learning_rate": 0.00032519558676028084, "loss": 2.976, "theoretical_loss": 3.594301907129678, "tokens_seen": 1175462912 }, { "epoch": 3.05, "learning_rate": 0.00032518555667001, "loss": 2.8676, "theoretical_loss": 3.5942834025281707, "tokens_seen": 1175528448 }, { "epoch": 3.05, "learning_rate": 0.0003251755265797392, "loss": 2.9886, "theoretical_loss": 3.59426489924711, "tokens_seen": 1175593984 }, { "epoch": 3.05, "learning_rate": 0.00032516549648946844, "loss": 2.8539, "theoretical_loss": 3.5942463972863288, "tokens_seen": 1175659520 }, { "epoch": 3.05, "learning_rate": 0.00032515546639919757, "loss": 2.9354, "theoretical_loss": 3.594227896645659, "tokens_seen": 1175725056 }, { "epoch": 3.05, "learning_rate": 0.0003251454363089268, "loss": 2.7935, "theoretical_loss": 3.594209397324933, "tokens_seen": 1175790592 }, { "epoch": 3.05, "learning_rate": 0.00032513540621865593, "loss": 2.9035, "theoretical_loss": 3.5941908993239826, "tokens_seen": 1175856128 }, { "epoch": 3.05, "learning_rate": 0.00032512537612838517, "loss": 2.688, "theoretical_loss": 3.5941724026426414, "tokens_seen": 1175921664 }, { "epoch": 3.05, "learning_rate": 0.00032511534603811435, "loss": 2.8263, "theoretical_loss": 3.59415390728074, "tokens_seen": 1175987200 }, { "epoch": 3.05, "learning_rate": 0.00032510531594784353, "loss": 2.8295, "theoretical_loss": 3.5941354132381114, "tokens_seen": 1176052736 }, { "epoch": 3.05, "learning_rate": 0.0003250952858575727, "loss": 2.8336, "theoretical_loss": 3.594116920514588, "tokens_seen": 1176118272 }, { "epoch": 3.05, "learning_rate": 0.0003250852557673019, "loss": 2.7522, "theoretical_loss": 3.594098429110003, "tokens_seen": 1176183808 }, { "epoch": 3.05, "learning_rate": 0.00032507522567703107, "loss": 2.9811, "theoretical_loss": 3.594079939024188, "tokens_seen": 1176249344 }, { "epoch": 3.05, "learning_rate": 0.0003250651955867603, "loss": 2.8608, "theoretical_loss": 3.5940614502569757, "tokens_seen": 1176314880 }, { "epoch": 3.05, "objective/train/docs_used": 1884052, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9219634532928467, "objective/train/theoretical_loss": 3.5940568282711842, "objective/train/tokens_used": 1196791264, "theoretical_loss": 3.5940568282711842, "tokens_seen": 1176331264 }, { "epoch": 3.05, "learning_rate": 0.0003250551654964895, "loss": 2.7628, "theoretical_loss": 3.5940429628081985, "tokens_seen": 1176380416 }, { "epoch": 3.05, "learning_rate": 0.00032504513540621867, "loss": 2.7418, "theoretical_loss": 3.5940244766776885, "tokens_seen": 1176445952 }, { "epoch": 3.05, "learning_rate": 0.00032503510531594785, "loss": 2.9338, "theoretical_loss": 3.594005991865279, "tokens_seen": 1176511488 }, { "epoch": 3.05, "learning_rate": 0.00032502507522567703, "loss": 2.7561, "theoretical_loss": 3.5939875083708026, "tokens_seen": 1176577024 }, { "epoch": 3.05, "learning_rate": 0.00032501504513540627, "loss": 2.8268, "theoretical_loss": 3.5939690261940918, "tokens_seen": 1176642560 }, { "epoch": 3.05, "learning_rate": 0.0003250050150451354, "loss": 2.8118, "theoretical_loss": 3.593950545334979, "tokens_seen": 1176708096 }, { "epoch": 3.05, "learning_rate": 0.00032499498495486463, "loss": 2.9314, "theoretical_loss": 3.593932065793297, "tokens_seen": 1176773632 }, { "epoch": 3.05, "learning_rate": 0.0003249849548645938, "loss": 2.8643, "theoretical_loss": 3.5939135875688786, "tokens_seen": 1176839168 }, { "epoch": 3.05, "learning_rate": 0.000324974924774323, "loss": 2.8537, "theoretical_loss": 3.593895110661557, "tokens_seen": 1176904704 }, { "epoch": 3.05, "learning_rate": 0.0003249648946840522, "loss": 2.7863, "theoretical_loss": 3.593876635071164, "tokens_seen": 1176970240 }, { "epoch": 3.05, "learning_rate": 0.00032495486459378135, "loss": 2.822, "theoretical_loss": 3.5938581607975335, "tokens_seen": 1177035776 }, { "epoch": 3.05, "learning_rate": 0.00032494483450351054, "loss": 2.8501, "theoretical_loss": 3.5938396878404975, "tokens_seen": 1177101312 }, { "epoch": 3.05, "learning_rate": 0.00032493480441323977, "loss": 2.9133, "theoretical_loss": 3.593821216199889, "tokens_seen": 1177166848 }, { "epoch": 3.05, "learning_rate": 0.0003249247743229689, "loss": 2.9008, "theoretical_loss": 3.593802745875541, "tokens_seen": 1177232384 }, { "epoch": 3.05, "learning_rate": 0.00032491474423269813, "loss": 2.8384, "theoretical_loss": 3.5937842768672867, "tokens_seen": 1177297920 }, { "epoch": 3.05, "learning_rate": 0.00032490471414242726, "loss": 2.7623, "theoretical_loss": 3.593765809174959, "tokens_seen": 1177363456 }, { "epoch": 3.05, "learning_rate": 0.0003248946840521565, "loss": 2.8575, "theoretical_loss": 3.59374734279839, "tokens_seen": 1177428992 }, { "epoch": 3.05, "learning_rate": 0.0003248846539618857, "loss": 2.7475, "theoretical_loss": 3.593728877737415, "tokens_seen": 1177494528 }, { "epoch": 3.05, "learning_rate": 0.00032487462387161486, "loss": 2.7378, "theoretical_loss": 3.5937104139918645, "tokens_seen": 1177560064 }, { "epoch": 3.05, "learning_rate": 0.00032486459378134404, "loss": 2.967, "theoretical_loss": 3.5936919515615724, "tokens_seen": 1177625600 }, { "epoch": 3.05, "learning_rate": 0.0003248545636910732, "loss": 2.662, "theoretical_loss": 3.5936734904463727, "tokens_seen": 1177691136 }, { "epoch": 3.05, "learning_rate": 0.0003248445336008024, "loss": 2.8845, "theoretical_loss": 3.593655030646098, "tokens_seen": 1177756672 }, { "epoch": 3.05, "learning_rate": 0.00032483450351053164, "loss": 2.8533, "theoretical_loss": 3.593636572160581, "tokens_seen": 1177822208 }, { "epoch": 3.05, "learning_rate": 0.00032482447342026076, "loss": 2.8502, "theoretical_loss": 3.5936181149896553, "tokens_seen": 1177887744 }, { "epoch": 3.05, "learning_rate": 0.00032481444332999, "loss": 2.8252, "theoretical_loss": 3.593599659133155, "tokens_seen": 1177953280 }, { "epoch": 3.05, "objective/train/docs_used": 1886939, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8688771724700928, "objective/train/theoretical_loss": 3.593595045374389, "objective/train/tokens_used": 1198429664, "theoretical_loss": 3.593595045374389, "tokens_seen": 1177969664 }, { "epoch": 3.05, "learning_rate": 0.0003248044132397192, "loss": 2.7793, "theoretical_loss": 3.5935812045909117, "tokens_seen": 1178018816 }, { "epoch": 3.05, "learning_rate": 0.00032479438314944836, "loss": 2.7981, "theoretical_loss": 3.59356275136276, "tokens_seen": 1178084352 }, { "epoch": 3.05, "learning_rate": 0.00032478435305917754, "loss": 2.9477, "theoretical_loss": 3.5935442994485323, "tokens_seen": 1178149888 }, { "epoch": 3.05, "learning_rate": 0.0003247743229689067, "loss": 2.8434, "theoretical_loss": 3.5935258488480626, "tokens_seen": 1178215424 }, { "epoch": 3.05, "learning_rate": 0.0003247642928786359, "loss": 2.7204, "theoretical_loss": 3.5935073995611844, "tokens_seen": 1178280960 }, { "epoch": 3.05, "learning_rate": 0.00032475426278836514, "loss": 2.963, "theoretical_loss": 3.5934889515877306, "tokens_seen": 1178346496 }, { "epoch": 3.05, "learning_rate": 0.00032474423269809427, "loss": 2.823, "theoretical_loss": 3.5934705049275353, "tokens_seen": 1178412032 }, { "epoch": 3.05, "learning_rate": 0.0003247342026078235, "loss": 2.8266, "theoretical_loss": 3.5934520595804313, "tokens_seen": 1178477568 }, { "epoch": 3.05, "learning_rate": 0.00032472417251755263, "loss": 2.922, "theoretical_loss": 3.5934336155462523, "tokens_seen": 1178543104 }, { "epoch": 3.05, "learning_rate": 0.00032471414242728186, "loss": 2.9834, "theoretical_loss": 3.593415172824832, "tokens_seen": 1178608640 }, { "epoch": 3.05, "learning_rate": 0.00032470411233701104, "loss": 2.8503, "theoretical_loss": 3.593396731416004, "tokens_seen": 1178674176 }, { "epoch": 3.05, "learning_rate": 0.0003246940822467402, "loss": 2.9306, "theoretical_loss": 3.5933782913196017, "tokens_seen": 1178739712 }, { "epoch": 3.05, "learning_rate": 0.0003246840521564694, "loss": 2.8919, "theoretical_loss": 3.593359852535459, "tokens_seen": 1178805248 }, { "epoch": 3.05, "learning_rate": 0.00032467402206619864, "loss": 2.9224, "theoretical_loss": 3.5933414150634095, "tokens_seen": 1178870784 }, { "epoch": 3.05, "learning_rate": 0.00032466399197592777, "loss": 2.9495, "theoretical_loss": 3.593322978903287, "tokens_seen": 1178936320 }, { "epoch": 3.05, "learning_rate": 0.000324653961885657, "loss": 2.939, "theoretical_loss": 3.5933045440549245, "tokens_seen": 1179001856 }, { "epoch": 3.05, "learning_rate": 0.00032464393179538613, "loss": 2.7751, "theoretical_loss": 3.5932861105181573, "tokens_seen": 1179067392 }, { "epoch": 3.05, "learning_rate": 0.00032463390170511537, "loss": 2.9402, "theoretical_loss": 3.5932676782928175, "tokens_seen": 1179132928 }, { "epoch": 3.05, "learning_rate": 0.00032462387161484455, "loss": 2.8377, "theoretical_loss": 3.59324924737874, "tokens_seen": 1179198464 }, { "epoch": 3.05, "learning_rate": 0.00032461384152457373, "loss": 2.8108, "theoretical_loss": 3.593230817775758, "tokens_seen": 1179264000 }, { "epoch": 3.05, "learning_rate": 0.0003246038114343029, "loss": 2.8634, "theoretical_loss": 3.5932123894837056, "tokens_seen": 1179329536 }, { "epoch": 3.05, "learning_rate": 0.0003245937813440321, "loss": 3.0235, "theoretical_loss": 3.5931939625024167, "tokens_seen": 1179395072 }, { "epoch": 3.05, "learning_rate": 0.00032458375125376127, "loss": 2.8023, "theoretical_loss": 3.5931755368317257, "tokens_seen": 1179460608 }, { "epoch": 3.05, "learning_rate": 0.0003245737211634905, "loss": 2.7741, "theoretical_loss": 3.593157112471466, "tokens_seen": 1179526144 }, { "epoch": 3.05, "learning_rate": 0.00032456369107321963, "loss": 2.8353, "theoretical_loss": 3.593138689421472, "tokens_seen": 1179591680 }, { "epoch": 3.05, "objective/train/docs_used": 1889250, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9162614345550537, "objective/train/theoretical_loss": 3.593134083863683, "objective/train/tokens_used": 1200068064, "theoretical_loss": 3.593134083863683, "tokens_seen": 1179608064 }, { "epoch": 3.05, "learning_rate": 0.00032455366098294887, "loss": 2.9513, "theoretical_loss": 3.593120267681577, "tokens_seen": 1179657216 }, { "epoch": 3.05, "learning_rate": 0.000324543630892678, "loss": 2.967, "theoretical_loss": 3.5931018472516163, "tokens_seen": 1179722752 }, { "epoch": 3.05, "learning_rate": 0.00032453360080240723, "loss": 2.7112, "theoretical_loss": 3.593083428131423, "tokens_seen": 1179788288 }, { "epoch": 3.05, "learning_rate": 0.0003245235707121364, "loss": 2.8317, "theoretical_loss": 3.5930650103208315, "tokens_seen": 1179853824 }, { "epoch": 3.05, "learning_rate": 0.0003245135406218656, "loss": 2.738, "theoretical_loss": 3.5930465938196754, "tokens_seen": 1179919360 }, { "epoch": 3.05, "learning_rate": 0.0003245035105315948, "loss": 2.7325, "theoretical_loss": 3.5930281786277902, "tokens_seen": 1179984896 }, { "epoch": 3.05, "learning_rate": 0.000324493480441324, "loss": 2.8524, "theoretical_loss": 3.593009764745009, "tokens_seen": 1180050432 }, { "epoch": 3.05, "learning_rate": 0.00032448345035105314, "loss": 2.8032, "theoretical_loss": 3.5929913521711665, "tokens_seen": 1180115968 }, { "epoch": 3.05, "learning_rate": 0.0003244734202607824, "loss": 2.7704, "theoretical_loss": 3.5929729409060966, "tokens_seen": 1180181504 }, { "epoch": 3.05, "learning_rate": 0.0003244633901705115, "loss": 2.9251, "theoretical_loss": 3.5929545309496342, "tokens_seen": 1180247040 }, { "epoch": 3.05, "learning_rate": 0.00032445336008024074, "loss": 2.9936, "theoretical_loss": 3.592936122301613, "tokens_seen": 1180312576 }, { "epoch": 3.05, "learning_rate": 0.0003244433299899699, "loss": 2.9337, "theoretical_loss": 3.592917714961868, "tokens_seen": 1180378112 }, { "epoch": 3.05, "learning_rate": 0.0003244332998996991, "loss": 2.8193, "theoretical_loss": 3.592899308930233, "tokens_seen": 1180443648 }, { "epoch": 3.05, "learning_rate": 0.0003244232698094283, "loss": 2.7879, "theoretical_loss": 3.5928809042065426, "tokens_seen": 1180509184 }, { "epoch": 3.05, "learning_rate": 0.00032441323971915746, "loss": 2.9081, "theoretical_loss": 3.5928625007906314, "tokens_seen": 1180574720 }, { "epoch": 3.05, "learning_rate": 0.00032440320962888664, "loss": 2.9148, "theoretical_loss": 3.592844098682334, "tokens_seen": 1180640256 }, { "epoch": 3.05, "learning_rate": 0.0003243931795386159, "loss": 2.835, "theoretical_loss": 3.5928256978814845, "tokens_seen": 1180705792 }, { "epoch": 3.05, "learning_rate": 0.000324383149448345, "loss": 2.8557, "theoretical_loss": 3.5928072983879176, "tokens_seen": 1180771328 }, { "epoch": 3.05, "learning_rate": 0.00032437311935807424, "loss": 2.9217, "theoretical_loss": 3.5927889002014686, "tokens_seen": 1180836864 }, { "epoch": 3.05, "learning_rate": 0.00032436308926780337, "loss": 2.9754, "theoretical_loss": 3.5927705033219706, "tokens_seen": 1180902400 }, { "epoch": 3.05, "learning_rate": 0.0003243530591775326, "loss": 2.8771, "theoretical_loss": 3.5927521077492597, "tokens_seen": 1180967936 }, { "epoch": 3.05, "learning_rate": 0.0003243430290872618, "loss": 2.8408, "theoretical_loss": 3.5927337134831694, "tokens_seen": 1181033472 }, { "epoch": 3.05, "learning_rate": 0.00032433299899699096, "loss": 2.8729, "theoretical_loss": 3.5927153205235354, "tokens_seen": 1181099008 }, { "epoch": 3.05, "learning_rate": 0.0003243229689067202, "loss": 2.7053, "theoretical_loss": 3.5926969288701915, "tokens_seen": 1181164544 }, { "epoch": 3.05, "learning_rate": 0.0003243129388164494, "loss": 2.8457, "theoretical_loss": 3.5926785385229736, "tokens_seen": 1181230080 }, { "epoch": 3.05, "objective/train/docs_used": 1892226, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.617610454559326, "objective/train/theoretical_loss": 3.592673941140232, "objective/train/tokens_used": 1201706464, "theoretical_loss": 3.592673941140232, "tokens_seen": 1181246464 }, { "epoch": 3.05, "learning_rate": 0.00032430290872617856, "loss": 2.7834, "theoretical_loss": 3.5926601494817154, "tokens_seen": 1181295616 }, { "epoch": 3.05, "learning_rate": 0.00032429287863590774, "loss": 3.0094, "theoretical_loss": 3.5926417617462523, "tokens_seen": 1181361152 }, { "epoch": 3.05, "learning_rate": 0.0003242828485456369, "loss": 2.6908, "theoretical_loss": 3.5926233753164185, "tokens_seen": 1181426688 }, { "epoch": 3.05, "learning_rate": 0.0003242728184553661, "loss": 2.741, "theoretical_loss": 3.59260499019205, "tokens_seen": 1181492224 }, { "epoch": 3.05, "learning_rate": 0.00032426278836509534, "loss": 2.7695, "theoretical_loss": 3.5925866063729806, "tokens_seen": 1181557760 }, { "epoch": 3.05, "learning_rate": 0.00032425275827482447, "loss": 2.9387, "theoretical_loss": 3.592568223859046, "tokens_seen": 1181623296 }, { "epoch": 3.05, "learning_rate": 0.0003242427281845537, "loss": 2.9188, "theoretical_loss": 3.5925498426500804, "tokens_seen": 1181688832 }, { "epoch": 3.05, "learning_rate": 0.00032423269809428283, "loss": 2.9804, "theoretical_loss": 3.5925314627459195, "tokens_seen": 1181754368 }, { "epoch": 3.05, "learning_rate": 0.00032422266800401206, "loss": 2.7413, "theoretical_loss": 3.592513084146398, "tokens_seen": 1181819904 }, { "epoch": 3.05, "learning_rate": 0.00032421263791374125, "loss": 3.056, "theoretical_loss": 3.592494706851351, "tokens_seen": 1181885440 }, { "epoch": 3.05, "learning_rate": 0.0003242026078234704, "loss": 2.8172, "theoretical_loss": 3.592476330860614, "tokens_seen": 1181950976 }, { "epoch": 3.05, "learning_rate": 0.0003241925777331996, "loss": 2.8365, "theoretical_loss": 3.5924579561740213, "tokens_seen": 1182016512 }, { "epoch": 3.05, "learning_rate": 0.00032418254764292884, "loss": 2.7786, "theoretical_loss": 3.5924395827914086, "tokens_seen": 1182082048 }, { "epoch": 3.05, "learning_rate": 0.00032417251755265797, "loss": 2.7877, "theoretical_loss": 3.592421210712611, "tokens_seen": 1182147584 }, { "epoch": 3.05, "learning_rate": 0.0003241624874623872, "loss": 3.0609, "theoretical_loss": 3.5924028399374635, "tokens_seen": 1182213120 }, { "epoch": 3.05, "learning_rate": 0.00032415245737211633, "loss": 2.9018, "theoretical_loss": 3.5923844704658014, "tokens_seen": 1182278656 }, { "epoch": 3.05, "learning_rate": 0.00032414242728184557, "loss": 2.8063, "theoretical_loss": 3.5923661022974605, "tokens_seen": 1182344192 }, { "epoch": 3.05, "learning_rate": 0.00032413239719157475, "loss": 2.8903, "theoretical_loss": 3.5923477354322753, "tokens_seen": 1182409728 }, { "epoch": 3.05, "learning_rate": 0.00032412236710130393, "loss": 2.7139, "theoretical_loss": 3.5923293698700816, "tokens_seen": 1182475264 }, { "epoch": 3.05, "learning_rate": 0.0003241123370110331, "loss": 3.0111, "theoretical_loss": 3.5923110056107146, "tokens_seen": 1182540800 }, { "epoch": 3.05, "learning_rate": 0.0003241023069207623, "loss": 2.8702, "theoretical_loss": 3.5922926426540096, "tokens_seen": 1182606336 }, { "epoch": 3.05, "learning_rate": 0.00032409227683049147, "loss": 2.8169, "theoretical_loss": 3.592274280999802, "tokens_seen": 1182671872 }, { "epoch": 3.05, "learning_rate": 0.0003240822467402207, "loss": 2.8743, "theoretical_loss": 3.5922559206479274, "tokens_seen": 1182737408 }, { "epoch": 3.05, "learning_rate": 0.00032407221664994984, "loss": 2.8809, "theoretical_loss": 3.5922375615982216, "tokens_seen": 1182802944 }, { "epoch": 3.05, "learning_rate": 0.00032406218655967907, "loss": 2.9239, "theoretical_loss": 3.5922192038505196, "tokens_seen": 1182868480 }, { "epoch": 3.05, "objective/train/docs_used": 1895058, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9884111881256104, "objective/train/theoretical_loss": 3.592214614617013, "objective/train/tokens_used": 1203344864, "theoretical_loss": 3.592214614617013, "tokens_seen": 1182884864 }, { "epoch": 3.05, "learning_rate": 0.0003240521564694082, "loss": 2.7738, "theoretical_loss": 3.5922008474046567, "tokens_seen": 1182934016 }, { "epoch": 3.05, "learning_rate": 0.00032404212637913743, "loss": 2.8583, "theoretical_loss": 3.59218249226047, "tokens_seen": 1182999552 }, { "epoch": 3.05, "learning_rate": 0.0003240320962888666, "loss": 2.9912, "theoretical_loss": 3.5921641384177927, "tokens_seen": 1183065088 }, { "epoch": 3.05, "learning_rate": 0.0003240220661985958, "loss": 2.9694, "theoretical_loss": 3.592145785876462, "tokens_seen": 1183130624 }, { "epoch": 3.05, "learning_rate": 0.000324012036108325, "loss": 2.7974, "theoretical_loss": 3.5921274346363132, "tokens_seen": 1183196160 }, { "epoch": 3.05, "learning_rate": 0.0003240020060180542, "loss": 2.9921, "theoretical_loss": 3.5921090846971824, "tokens_seen": 1183261696 }, { "epoch": 3.05, "learning_rate": 0.00032399197592778334, "loss": 2.9561, "theoretical_loss": 3.5920907360589043, "tokens_seen": 1183327232 }, { "epoch": 3.05, "learning_rate": 0.0003239819458375126, "loss": 2.8737, "theoretical_loss": 3.592072388721316, "tokens_seen": 1183392768 }, { "epoch": 3.05, "learning_rate": 0.0003239719157472417, "loss": 2.9443, "theoretical_loss": 3.5920540426842518, "tokens_seen": 1183458304 }, { "epoch": 3.05, "learning_rate": 0.00032396188565697094, "loss": 2.8214, "theoretical_loss": 3.5920356979475487, "tokens_seen": 1183523840 }, { "epoch": 3.05, "learning_rate": 0.0003239518555667001, "loss": 2.8383, "theoretical_loss": 3.592017354511042, "tokens_seen": 1183589376 }, { "epoch": 3.05, "learning_rate": 0.0003239418254764293, "loss": 2.9722, "theoretical_loss": 3.5919990123745675, "tokens_seen": 1183654912 }, { "epoch": 3.05, "learning_rate": 0.0003239317953861585, "loss": 2.8025, "theoretical_loss": 3.5919806715379616, "tokens_seen": 1183720448 }, { "epoch": 3.05, "learning_rate": 0.00032392176529588766, "loss": 3.007, "theoretical_loss": 3.591962332001059, "tokens_seen": 1183785984 }, { "epoch": 3.05, "learning_rate": 0.00032391173520561684, "loss": 2.8156, "theoretical_loss": 3.591943993763697, "tokens_seen": 1183851520 }, { "epoch": 3.05, "learning_rate": 0.0003239017051153461, "loss": 2.797, "theoretical_loss": 3.591925656825711, "tokens_seen": 1183917056 }, { "epoch": 3.05, "learning_rate": 0.0003238916750250752, "loss": 2.8137, "theoretical_loss": 3.5919073211869375, "tokens_seen": 1183982592 }, { "epoch": 3.05, "learning_rate": 0.00032388164493480444, "loss": 2.9023, "theoretical_loss": 3.591888986847212, "tokens_seen": 1184048128 }, { "epoch": 3.05, "learning_rate": 0.00032387161484453357, "loss": 2.8542, "theoretical_loss": 3.5918706538063705, "tokens_seen": 1184113664 }, { "epoch": 3.05, "learning_rate": 0.0003238615847542628, "loss": 2.9198, "theoretical_loss": 3.5918523220642493, "tokens_seen": 1184179200 }, { "epoch": 3.05, "learning_rate": 0.000323851554663992, "loss": 2.8943, "theoretical_loss": 3.5918339916206845, "tokens_seen": 1184244736 }, { "epoch": 3.05, "learning_rate": 0.00032384152457372116, "loss": 2.7526, "theoretical_loss": 3.591815662475512, "tokens_seen": 1184310272 }, { "epoch": 3.05, "learning_rate": 0.00032383149448345034, "loss": 2.9807, "theoretical_loss": 3.591797334628569, "tokens_seen": 1184375808 }, { "epoch": 3.05, "learning_rate": 0.0003238214643931796, "loss": 2.8042, "theoretical_loss": 3.5917790080796905, "tokens_seen": 1184441344 }, { "epoch": 3.05, "learning_rate": 0.0003238114343029087, "loss": 2.9762, "theoretical_loss": 3.5917606828287134, "tokens_seen": 1184506880 }, { "epoch": 3.05, "objective/train/docs_used": 1898140, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9372475147247314, "objective/train/theoretical_loss": 3.5917561017187474, "objective/train/tokens_used": 1204983264, "theoretical_loss": 3.5917561017187474, "tokens_seen": 1184523264 }, { "epoch": 3.05, "learning_rate": 0.00032380140421263794, "loss": 2.7964, "theoretical_loss": 3.591742358875474, "tokens_seen": 1184572416 }, { "epoch": 3.05, "learning_rate": 0.00032379137412236707, "loss": 2.9269, "theoretical_loss": 3.591724036219808, "tokens_seen": 1184637952 }, { "epoch": 3.05, "learning_rate": 0.0003237813440320963, "loss": 2.8328, "theoretical_loss": 3.5917057148615528, "tokens_seen": 1184703488 }, { "epoch": 3.05, "learning_rate": 0.0003237713139418255, "loss": 2.7683, "theoretical_loss": 3.5916873948005437, "tokens_seen": 1184769024 }, { "epoch": 3.05, "learning_rate": 0.00032376128385155467, "loss": 2.9255, "theoretical_loss": 3.591669076036618, "tokens_seen": 1184834560 }, { "epoch": 3.05, "learning_rate": 0.00032375125376128385, "loss": 2.8501, "theoretical_loss": 3.591650758569611, "tokens_seen": 1184900096 }, { "epoch": 3.05, "learning_rate": 0.00032374122367101303, "loss": 2.7789, "theoretical_loss": 3.5916324423993604, "tokens_seen": 1184965632 }, { "epoch": 3.05, "learning_rate": 0.0003237311935807422, "loss": 2.9406, "theoretical_loss": 3.591614127525702, "tokens_seen": 1185031168 }, { "epoch": 3.05, "learning_rate": 0.00032372116349047145, "loss": 2.8282, "theoretical_loss": 3.591595813948472, "tokens_seen": 1185096704 }, { "epoch": 3.05, "learning_rate": 0.00032371113340020057, "loss": 3.0593, "theoretical_loss": 3.591577501667508, "tokens_seen": 1185162240 }, { "epoch": 3.05, "learning_rate": 0.0003237011033099298, "loss": 2.999, "theoretical_loss": 3.591559190682646, "tokens_seen": 1185227776 }, { "epoch": 3.05, "learning_rate": 0.00032369107321965893, "loss": 2.9709, "theoretical_loss": 3.5915408809937217, "tokens_seen": 1185293312 }, { "epoch": 3.05, "learning_rate": 0.00032368104312938817, "loss": 2.7934, "theoretical_loss": 3.5915225726005735, "tokens_seen": 1185358848 }, { "epoch": 3.05, "learning_rate": 0.00032367101303911735, "loss": 2.981, "theoretical_loss": 3.591504265503037, "tokens_seen": 1185424384 }, { "epoch": 3.05, "learning_rate": 0.00032366098294884653, "loss": 2.7759, "theoretical_loss": 3.5914859597009485, "tokens_seen": 1185489920 }, { "epoch": 3.05, "learning_rate": 0.0003236509528585757, "loss": 2.9366, "theoretical_loss": 3.591467655194146, "tokens_seen": 1185555456 }, { "epoch": 3.05, "learning_rate": 0.00032364092276830495, "loss": 2.9494, "theoretical_loss": 3.5914493519824653, "tokens_seen": 1185620992 }, { "epoch": 3.05, "learning_rate": 0.0003236308926780341, "loss": 2.9795, "theoretical_loss": 3.591431050065743, "tokens_seen": 1185686528 }, { "epoch": 3.05, "learning_rate": 0.0003236208625877633, "loss": 2.9249, "theoretical_loss": 3.591412749443817, "tokens_seen": 1185752064 }, { "epoch": 3.05, "learning_rate": 0.00032361083249749244, "loss": 2.9589, "theoretical_loss": 3.591394450116523, "tokens_seen": 1185817600 }, { "epoch": 3.05, "learning_rate": 0.0003236008024072217, "loss": 2.7762, "theoretical_loss": 3.5913761520836984, "tokens_seen": 1185883136 }, { "epoch": 3.05, "learning_rate": 0.00032359077231695085, "loss": 2.8903, "theoretical_loss": 3.5913578553451804, "tokens_seen": 1185948672 }, { "epoch": 3.05, "learning_rate": 0.00032358074222668004, "loss": 2.687, "theoretical_loss": 3.5913395599008053, "tokens_seen": 1186014208 }, { "epoch": 3.05, "learning_rate": 0.00032357071213640927, "loss": 2.8403, "theoretical_loss": 3.5913212657504103, "tokens_seen": 1186079744 }, { "epoch": 3.05, "learning_rate": 0.0003235606820461384, "loss": 2.7969, "theoretical_loss": 3.591302972893833, "tokens_seen": 1186145280 }, { "epoch": 3.05, "objective/train/docs_used": 1901076, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0487306118011475, "objective/train/theoretical_loss": 3.591298399881828, "objective/train/tokens_used": 1206621664, "theoretical_loss": 3.591298399881828, "tokens_seen": 1186161664 }, { "epoch": 3.05, "learning_rate": 0.00032355065195586763, "loss": 3.0085, "theoretical_loss": 3.591284681330909, "tokens_seen": 1186210816 }, { "epoch": 3.05, "learning_rate": 0.0003235406218655968, "loss": 2.9662, "theoretical_loss": 3.5912663910614766, "tokens_seen": 1186276352 }, { "epoch": 3.05, "learning_rate": 0.000323530591775326, "loss": 2.7199, "theoretical_loss": 3.5912481020853724, "tokens_seen": 1186341888 }, { "epoch": 3.05, "learning_rate": 0.0003235205616850552, "loss": 2.9256, "theoretical_loss": 3.5912298144024337, "tokens_seen": 1186407424 }, { "epoch": 3.05, "learning_rate": 0.0003235105315947844, "loss": 2.64, "theoretical_loss": 3.5912115280124977, "tokens_seen": 1186472960 }, { "epoch": 3.05, "learning_rate": 0.00032350050150451354, "loss": 2.9888, "theoretical_loss": 3.591193242915401, "tokens_seen": 1186538496 }, { "epoch": 3.05, "learning_rate": 0.0003234904714142428, "loss": 2.9452, "theoretical_loss": 3.5911749591109814, "tokens_seen": 1186604032 }, { "epoch": 3.05, "learning_rate": 0.0003234804413239719, "loss": 2.8518, "theoretical_loss": 3.591156676599076, "tokens_seen": 1186669568 }, { "epoch": 3.05, "learning_rate": 0.00032347041123370114, "loss": 2.7515, "theoretical_loss": 3.591138395379522, "tokens_seen": 1186735104 }, { "epoch": 3.05, "learning_rate": 0.0003234603811434303, "loss": 3.0433, "theoretical_loss": 3.5911201154521564, "tokens_seen": 1186800640 }, { "epoch": 3.05, "learning_rate": 0.0003234503510531595, "loss": 2.9094, "theoretical_loss": 3.591101836816817, "tokens_seen": 1186866176 }, { "epoch": 3.05, "learning_rate": 0.0003234403209628887, "loss": 2.8627, "theoretical_loss": 3.591083559473341, "tokens_seen": 1186931712 }, { "epoch": 3.05, "learning_rate": 0.00032343029087261786, "loss": 3.0311, "theoretical_loss": 3.5910652834215657, "tokens_seen": 1186997248 }, { "epoch": 3.05, "learning_rate": 0.00032342026078234704, "loss": 2.9155, "theoretical_loss": 3.591047008661328, "tokens_seen": 1187062784 }, { "epoch": 3.05, "learning_rate": 0.0003234102306920763, "loss": 3.021, "theoretical_loss": 3.5910287351924666, "tokens_seen": 1187128320 }, { "epoch": 3.05, "learning_rate": 0.0003234002006018054, "loss": 2.9172, "theoretical_loss": 3.5910104630148174, "tokens_seen": 1187193856 }, { "epoch": 3.05, "learning_rate": 0.00032339017051153464, "loss": 2.9556, "theoretical_loss": 3.590992192128219, "tokens_seen": 1187259392 }, { "epoch": 3.05, "learning_rate": 0.00032338014042126377, "loss": 2.8604, "theoretical_loss": 3.590973922532509, "tokens_seen": 1187324928 }, { "epoch": 3.05, "learning_rate": 0.000323370110330993, "loss": 2.7532, "theoretical_loss": 3.5909556542275243, "tokens_seen": 1187390464 }, { "epoch": 3.05, "learning_rate": 0.0003233600802407222, "loss": 2.8179, "theoretical_loss": 3.5909373872131027, "tokens_seen": 1187456000 }, { "epoch": 3.05, "learning_rate": 0.00032335005015045136, "loss": 2.847, "theoretical_loss": 3.5909191214890814, "tokens_seen": 1187521536 }, { "epoch": 3.05, "learning_rate": 0.00032334002006018054, "loss": 2.9675, "theoretical_loss": 3.590900857055299, "tokens_seen": 1187587072 }, { "epoch": 3.05, "learning_rate": 0.0003233299899699098, "loss": 2.7137, "theoretical_loss": 3.590882593911593, "tokens_seen": 1187652608 }, { "epoch": 3.05, "learning_rate": 0.0003233199598796389, "loss": 2.8745, "theoretical_loss": 3.5908643320578, "tokens_seen": 1187718144 }, { "epoch": 3.05, "learning_rate": 0.00032330992978936814, "loss": 2.857, "theoretical_loss": 3.5908460714937593, "tokens_seen": 1187783680 }, { "epoch": 3.05, "objective/train/docs_used": 1903128, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0602121353149414, "objective/train/theoretical_loss": 3.5908415065542534, "objective/train/tokens_used": 1208260064, "theoretical_loss": 3.5908415065542534, "tokens_seen": 1187800064 }, { "epoch": 3.05, "learning_rate": 0.00032329989969909727, "loss": 2.8592, "theoretical_loss": 3.590827812219307, "tokens_seen": 1187849216 }, { "epoch": 3.05, "learning_rate": 0.0003232898696088265, "loss": 2.8837, "theoretical_loss": 3.590809554234282, "tokens_seen": 1187914752 }, { "epoch": 3.05, "learning_rate": 0.0003232798395185557, "loss": 2.8459, "theoretical_loss": 3.590791297538522, "tokens_seen": 1187980288 }, { "epoch": 3.05, "learning_rate": 0.00032326980942828487, "loss": 2.742, "theoretical_loss": 3.5907730421318647, "tokens_seen": 1188045824 }, { "epoch": 3.05, "learning_rate": 0.00032325977933801405, "loss": 2.8443, "theoretical_loss": 3.5907547880141477, "tokens_seen": 1188111360 }, { "epoch": 3.05, "learning_rate": 0.00032324974924774323, "loss": 2.9536, "theoretical_loss": 3.5907365351852096, "tokens_seen": 1188176896 }, { "epoch": 3.05, "learning_rate": 0.0003232397191574724, "loss": 2.8263, "theoretical_loss": 3.5907182836448874, "tokens_seen": 1188242432 }, { "epoch": 3.05, "learning_rate": 0.00032322968906720165, "loss": 2.9292, "theoretical_loss": 3.59070003339302, "tokens_seen": 1188307968 }, { "epoch": 3.05, "learning_rate": 0.00032321965897693077, "loss": 3.0071, "theoretical_loss": 3.590681784429444, "tokens_seen": 1188373504 }, { "epoch": 3.05, "learning_rate": 0.00032320962888666, "loss": 2.8783, "theoretical_loss": 3.5906635367539996, "tokens_seen": 1188439040 }, { "epoch": 3.05, "learning_rate": 0.00032319959879638913, "loss": 2.9152, "theoretical_loss": 3.5906452903665227, "tokens_seen": 1188504576 }, { "epoch": 3.05, "learning_rate": 0.00032318956870611837, "loss": 2.8729, "theoretical_loss": 3.590627045266853, "tokens_seen": 1188570112 }, { "epoch": 3.05, "learning_rate": 0.00032317953861584755, "loss": 2.7818, "theoretical_loss": 3.5906088014548274, "tokens_seen": 1188635648 }, { "epoch": 3.05, "learning_rate": 0.00032316950852557673, "loss": 3.0034, "theoretical_loss": 3.590590558930285, "tokens_seen": 1188701184 }, { "epoch": 3.05, "learning_rate": 0.0003231594784353059, "loss": 2.9618, "theoretical_loss": 3.590572317693063, "tokens_seen": 1188766720 }, { "epoch": 3.05, "learning_rate": 0.00032314944834503515, "loss": 2.9026, "theoretical_loss": 3.590554077743, "tokens_seen": 1188832256 }, { "epoch": 3.05, "learning_rate": 0.0003231394182547643, "loss": 2.7322, "theoretical_loss": 3.5905358390799345, "tokens_seen": 1188897792 }, { "epoch": 3.05, "learning_rate": 0.0003231293881644935, "loss": 2.8823, "theoretical_loss": 3.590517601703705, "tokens_seen": 1188963328 }, { "epoch": 3.05, "learning_rate": 0.00032311935807422264, "loss": 2.9933, "theoretical_loss": 3.590499365614149, "tokens_seen": 1189028864 }, { "epoch": 3.05, "learning_rate": 0.0003231093279839519, "loss": 2.825, "theoretical_loss": 3.590481130811105, "tokens_seen": 1189094400 }, { "epoch": 3.05, "learning_rate": 0.00032309929789368105, "loss": 2.8343, "theoretical_loss": 3.5904628972944117, "tokens_seen": 1189159936 }, { "epoch": 3.05, "learning_rate": 0.00032308926780341024, "loss": 2.9231, "theoretical_loss": 3.5904446650639072, "tokens_seen": 1189225472 }, { "epoch": 3.05, "learning_rate": 0.0003230792377131394, "loss": 2.7729, "theoretical_loss": 3.59042643411943, "tokens_seen": 1189291008 }, { "epoch": 3.05, "learning_rate": 0.0003230692076228686, "loss": 2.9087, "theoretical_loss": 3.590408204460819, "tokens_seen": 1189356544 }, { "epoch": 3.05, "learning_rate": 0.0003230591775325978, "loss": 2.8204, "theoretical_loss": 3.590389976087912, "tokens_seen": 1189422080 }, { "epoch": 3.05, "objective/train/docs_used": 1906045, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8355515003204346, "objective/train/theoretical_loss": 3.590385419195557, "objective/train/tokens_used": 1209898464, "theoretical_loss": 3.590385419195557, "tokens_seen": 1189438464 }, { "epoch": 3.05, "learning_rate": 0.000323049147442327, "loss": 2.6861, "theoretical_loss": 3.590371749000547, "tokens_seen": 1189487616 }, { "epoch": 3.05, "learning_rate": 0.00032303911735205614, "loss": 2.8039, "theoretical_loss": 3.590353523198564, "tokens_seen": 1189553152 }, { "epoch": 3.05, "learning_rate": 0.0003230290872617854, "loss": 2.9288, "theoretical_loss": 3.5903352986818, "tokens_seen": 1189618688 }, { "epoch": 3.05, "learning_rate": 0.0003230190571715145, "loss": 2.7351, "theoretical_loss": 3.590317075450095, "tokens_seen": 1189684224 }, { "epoch": 3.05, "learning_rate": 0.00032300902708124374, "loss": 2.9437, "theoretical_loss": 3.5902988535032865, "tokens_seen": 1189749760 }, { "epoch": 3.05, "learning_rate": 0.0003229989969909729, "loss": 2.7252, "theoretical_loss": 3.590280632841213, "tokens_seen": 1189815296 }, { "epoch": 3.05, "learning_rate": 0.0003229889669007021, "loss": 2.8298, "theoretical_loss": 3.590262413463715, "tokens_seen": 1189880832 }, { "epoch": 3.05, "learning_rate": 0.0003229789368104313, "loss": 2.768, "theoretical_loss": 3.590244195370629, "tokens_seen": 1189946368 }, { "epoch": 3.05, "learning_rate": 0.0003229689067201605, "loss": 2.8799, "theoretical_loss": 3.5902259785617945, "tokens_seen": 1190011904 }, { "epoch": 3.05, "learning_rate": 0.00032295887662988964, "loss": 3.014, "theoretical_loss": 3.590207763037051, "tokens_seen": 1190077440 }, { "epoch": 3.05, "learning_rate": 0.0003229488465396189, "loss": 2.8409, "theoretical_loss": 3.5901895487962365, "tokens_seen": 1190142976 }, { "epoch": 3.05, "learning_rate": 0.000322938816449348, "loss": 2.7993, "theoretical_loss": 3.5901713358391896, "tokens_seen": 1190208512 }, { "epoch": 3.05, "learning_rate": 0.00032292878635907724, "loss": 2.8645, "theoretical_loss": 3.5901531241657496, "tokens_seen": 1190274048 }, { "epoch": 3.05, "learning_rate": 0.0003229187562688064, "loss": 2.8065, "theoretical_loss": 3.5901349137757554, "tokens_seen": 1190339584 }, { "epoch": 3.05, "learning_rate": 0.0003229087261785356, "loss": 2.7714, "theoretical_loss": 3.590116704669046, "tokens_seen": 1190405120 }, { "epoch": 3.05, "learning_rate": 0.0003228986960882648, "loss": 2.8289, "theoretical_loss": 3.5900984968454592, "tokens_seen": 1190470656 }, { "epoch": 3.05, "learning_rate": 0.00032288866599799397, "loss": 2.6696, "theoretical_loss": 3.5900802903048357, "tokens_seen": 1190536192 }, { "epoch": 3.05, "learning_rate": 0.00032287863590772315, "loss": 2.9565, "theoretical_loss": 3.5900620850470135, "tokens_seen": 1190601728 }, { "epoch": 3.05, "learning_rate": 0.0003228686058174524, "loss": 2.8571, "theoretical_loss": 3.5900438810718316, "tokens_seen": 1190667264 }, { "epoch": 3.05, "learning_rate": 0.0003228585757271815, "loss": 2.9831, "theoretical_loss": 3.590025678379129, "tokens_seen": 1190732800 }, { "epoch": 3.05, "learning_rate": 0.00032284854563691074, "loss": 2.9073, "theoretical_loss": 3.5900074769687453, "tokens_seen": 1190798336 }, { "epoch": 3.05, "learning_rate": 0.0003228385155466399, "loss": 2.7997, "theoretical_loss": 3.589989276840519, "tokens_seen": 1190863872 }, { "epoch": 3.05, "learning_rate": 0.0003228284854563691, "loss": 2.8584, "theoretical_loss": 3.5899710779942895, "tokens_seen": 1190929408 }, { "epoch": 3.05, "learning_rate": 0.00032281845536609834, "loss": 2.8577, "theoretical_loss": 3.589952880429896, "tokens_seen": 1190994944 }, { "epoch": 3.05, "learning_rate": 0.00032280842527582747, "loss": 2.6651, "theoretical_loss": 3.5899346841471775, "tokens_seen": 1191060480 }, { "epoch": 3.05, "objective/train/docs_used": 1908714, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8588204383850098, "objective/train/theoretical_loss": 3.5899301352767408, "objective/train/tokens_used": 1211536864, "theoretical_loss": 3.5899301352767408, "tokens_seen": 1191076864 }, { "epoch": 3.05, "learning_rate": 0.0003227983951855567, "loss": 2.8462, "theoretical_loss": 3.589916489145973, "tokens_seen": 1191126016 }, { "epoch": 3.05, "learning_rate": 0.0003227883650952859, "loss": 2.9599, "theoretical_loss": 3.589898295426123, "tokens_seen": 1191191552 }, { "epoch": 3.05, "learning_rate": 0.00032277833500501507, "loss": 2.8689, "theoretical_loss": 3.589880102987465, "tokens_seen": 1191257088 }, { "epoch": 3.05, "learning_rate": 0.00032276830491474425, "loss": 2.9181, "theoretical_loss": 3.58986191182984, "tokens_seen": 1191322624 }, { "epoch": 3.05, "learning_rate": 0.00032275827482447343, "loss": 2.9412, "theoretical_loss": 3.589843721953086, "tokens_seen": 1191388160 }, { "epoch": 3.05, "learning_rate": 0.0003227482447342026, "loss": 2.9879, "theoretical_loss": 3.5898255333570424, "tokens_seen": 1191453696 }, { "epoch": 3.05, "learning_rate": 0.00032273821464393185, "loss": 2.8375, "theoretical_loss": 3.5898073460415496, "tokens_seen": 1191519232 }, { "epoch": 3.05, "learning_rate": 0.00032272818455366097, "loss": 2.7372, "theoretical_loss": 3.5897891600064464, "tokens_seen": 1191584768 }, { "epoch": 3.05, "learning_rate": 0.0003227181544633902, "loss": 2.9367, "theoretical_loss": 3.5897709752515725, "tokens_seen": 1191650304 }, { "epoch": 3.05, "learning_rate": 0.00032270812437311933, "loss": 2.754, "theoretical_loss": 3.5897527917767666, "tokens_seen": 1191715840 }, { "epoch": 3.05, "learning_rate": 0.00032269809428284857, "loss": 2.8592, "theoretical_loss": 3.5897346095818694, "tokens_seen": 1191781376 }, { "epoch": 3.05, "learning_rate": 0.00032268806419257775, "loss": 3.0064, "theoretical_loss": 3.589716428666719, "tokens_seen": 1191846912 }, { "epoch": 3.05, "learning_rate": 0.00032267803410230693, "loss": 2.9594, "theoretical_loss": 3.589698249031157, "tokens_seen": 1191912448 }, { "epoch": 3.05, "learning_rate": 0.0003226680040120361, "loss": 2.7387, "theoretical_loss": 3.589680070675021, "tokens_seen": 1191977984 }, { "epoch": 3.05, "learning_rate": 0.00032265797392176535, "loss": 2.925, "theoretical_loss": 3.5896618935981515, "tokens_seen": 1192043520 }, { "epoch": 3.05, "learning_rate": 0.0003226479438314945, "loss": 2.8936, "theoretical_loss": 3.5896437178003877, "tokens_seen": 1192109056 }, { "epoch": 3.05, "learning_rate": 0.0003226379137412237, "loss": 2.9623, "theoretical_loss": 3.58962554328157, "tokens_seen": 1192174592 }, { "epoch": 3.05, "learning_rate": 0.00032262788365095284, "loss": 2.9033, "theoretical_loss": 3.589607370041538, "tokens_seen": 1192240128 }, { "epoch": 3.05, "learning_rate": 0.0003226178535606821, "loss": 2.7388, "theoretical_loss": 3.5895891980801307, "tokens_seen": 1192305664 }, { "epoch": 3.05, "learning_rate": 0.00032260782347041125, "loss": 3.0489, "theoretical_loss": 3.5895710273971884, "tokens_seen": 1192371200 }, { "epoch": 3.05, "learning_rate": 0.00032259779338014044, "loss": 2.8598, "theoretical_loss": 3.589552857992551, "tokens_seen": 1192436736 }, { "epoch": 3.05, "learning_rate": 0.0003225877632898696, "loss": 2.7676, "theoretical_loss": 3.589534689866058, "tokens_seen": 1192502272 }, { "epoch": 3.05, "learning_rate": 0.0003225777331995988, "loss": 2.8894, "theoretical_loss": 3.5895165230175494, "tokens_seen": 1192567808 }, { "epoch": 3.05, "learning_rate": 0.000322567703109328, "loss": 2.9053, "theoretical_loss": 3.5894983574468653, "tokens_seen": 1192633344 }, { "epoch": 3.05, "learning_rate": 0.0003225576730190572, "loss": 2.962, "theoretical_loss": 3.589480193153845, "tokens_seen": 1192698880 }, { "epoch": 3.05, "objective/train/docs_used": 1911629, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.868990898132324, "objective/train/theoretical_loss": 3.5894756522802065, "objective/train/tokens_used": 1213175264, "theoretical_loss": 3.5894756522802065, "tokens_seen": 1192715264 }, { "epoch": 3.05, "learning_rate": 0.00032254764292878634, "loss": 2.8771, "theoretical_loss": 3.5894620301383293, "tokens_seen": 1192764416 }, { "epoch": 3.05, "learning_rate": 0.0003225376128385156, "loss": 2.9432, "theoretical_loss": 3.5894438684001573, "tokens_seen": 1192829952 }, { "epoch": 3.05, "learning_rate": 0.0003225275827482447, "loss": 2.9451, "theoretical_loss": 3.58942570793917, "tokens_seen": 1192895488 }, { "epoch": 3.05, "learning_rate": 0.00032251755265797394, "loss": 2.88, "theoretical_loss": 3.589407548755206, "tokens_seen": 1192961024 }, { "epoch": 3.05, "learning_rate": 0.0003225075225677031, "loss": 2.9928, "theoretical_loss": 3.589389390848107, "tokens_seen": 1193026560 }, { "epoch": 3.05, "learning_rate": 0.0003224974924774323, "loss": 2.8816, "theoretical_loss": 3.5893712342177118, "tokens_seen": 1193092096 }, { "epoch": 3.05, "learning_rate": 0.0003224874623871615, "loss": 2.8982, "theoretical_loss": 3.5893530788638612, "tokens_seen": 1193157632 }, { "epoch": 3.05, "learning_rate": 0.0003224774322968907, "loss": 2.7746, "theoretical_loss": 3.589334924786395, "tokens_seen": 1193223168 }, { "epoch": 3.05, "learning_rate": 0.00032246740220661984, "loss": 2.8735, "theoretical_loss": 3.5893167719851533, "tokens_seen": 1193288704 }, { "epoch": 3.05, "learning_rate": 0.0003224573721163491, "loss": 2.7934, "theoretical_loss": 3.589298620459977, "tokens_seen": 1193354240 }, { "epoch": 3.05, "learning_rate": 0.0003224473420260782, "loss": 2.9788, "theoretical_loss": 3.5892804702107055, "tokens_seen": 1193419776 }, { "epoch": 3.05, "learning_rate": 0.00032243731193580744, "loss": 2.9195, "theoretical_loss": 3.5892623212371793, "tokens_seen": 1193485312 }, { "epoch": 3.05, "learning_rate": 0.0003224272818455366, "loss": 2.9413, "theoretical_loss": 3.589244173539239, "tokens_seen": 1193550848 }, { "epoch": 3.05, "learning_rate": 0.0003224172517552658, "loss": 2.833, "theoretical_loss": 3.5892260271167244, "tokens_seen": 1193616384 }, { "epoch": 3.05, "learning_rate": 0.000322407221664995, "loss": 2.8698, "theoretical_loss": 3.589207881969476, "tokens_seen": 1193681920 }, { "epoch": 3.05, "learning_rate": 0.00032239719157472417, "loss": 2.9021, "theoretical_loss": 3.5891897380973345, "tokens_seen": 1193747456 }, { "epoch": 3.05, "learning_rate": 0.00032238716148445335, "loss": 2.9403, "theoretical_loss": 3.58917159550014, "tokens_seen": 1193812992 }, { "epoch": 3.05, "learning_rate": 0.0003223771313941826, "loss": 2.9823, "theoretical_loss": 3.5891534541777332, "tokens_seen": 1193878528 }, { "epoch": 3.05, "learning_rate": 0.0003223671013039117, "loss": 2.9507, "theoretical_loss": 3.5891353141299547, "tokens_seen": 1193944064 }, { "epoch": 3.05, "learning_rate": 0.00032235707121364095, "loss": 2.8868, "theoretical_loss": 3.5891171753566438, "tokens_seen": 1194009600 }, { "epoch": 3.05, "learning_rate": 0.0003223470411233701, "loss": 2.7532, "theoretical_loss": 3.5890990378576424, "tokens_seen": 1194075136 }, { "epoch": 3.05, "learning_rate": 0.0003223370110330993, "loss": 2.8568, "theoretical_loss": 3.5890809016327903, "tokens_seen": 1194140672 }, { "epoch": 3.05, "learning_rate": 0.0003223269809428285, "loss": 3.0156, "theoretical_loss": 3.5890627666819284, "tokens_seen": 1194206208 }, { "epoch": 3.05, "learning_rate": 0.00032231695085255767, "loss": 2.8708, "theoretical_loss": 3.589044633004897, "tokens_seen": 1194271744 }, { "epoch": 3.05, "learning_rate": 0.00032230692076228685, "loss": 2.7637, "theoretical_loss": 3.589026500601537, "tokens_seen": 1194337280 }, { "epoch": 3.05, "objective/train/docs_used": 1912989, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9554383754730225, "objective/train/theoretical_loss": 3.58902196769969, "objective/train/tokens_used": 1214813664, "theoretical_loss": 3.58902196769969, "tokens_seen": 1194353664 }, { "epoch": 3.05, "learning_rate": 0.0003222968906720161, "loss": 2.9167, "theoretical_loss": 3.5890083694716894, "tokens_seen": 1194402816 }, { "epoch": 3.05, "learning_rate": 0.0003222868605817452, "loss": 2.7981, "theoretical_loss": 3.5889902396151943, "tokens_seen": 1194468352 }, { "epoch": 3.05, "learning_rate": 0.00032227683049147445, "loss": 2.8435, "theoretical_loss": 3.588972111031892, "tokens_seen": 1194533888 }, { "epoch": 3.05, "learning_rate": 0.0003222668004012036, "loss": 2.9817, "theoretical_loss": 3.588953983721624, "tokens_seen": 1194599424 }, { "epoch": 3.05, "learning_rate": 0.0003222567703109328, "loss": 2.8673, "theoretical_loss": 3.5889358576842314, "tokens_seen": 1194664960 }, { "epoch": 3.05, "learning_rate": 0.000322246740220662, "loss": 2.8069, "theoretical_loss": 3.588917732919554, "tokens_seen": 1194730496 }, { "epoch": 3.05, "learning_rate": 0.00032223671013039117, "loss": 2.8597, "theoretical_loss": 3.5888996094274335, "tokens_seen": 1194796032 }, { "epoch": 3.05, "learning_rate": 0.00032222668004012035, "loss": 2.8257, "theoretical_loss": 3.5888814872077104, "tokens_seen": 1194861568 }, { "epoch": 3.05, "learning_rate": 0.00032221664994984953, "loss": 2.9564, "theoretical_loss": 3.5888633662602247, "tokens_seen": 1194927104 }, { "epoch": 3.05, "learning_rate": 0.0003222066198595787, "loss": 2.8282, "theoretical_loss": 3.5888452465848193, "tokens_seen": 1194992640 }, { "epoch": 3.05, "learning_rate": 0.00032219658976930795, "loss": 2.7952, "theoretical_loss": 3.5888271281813333, "tokens_seen": 1195058176 }, { "epoch": 3.05, "learning_rate": 0.0003221865596790371, "loss": 2.8129, "theoretical_loss": 3.5888090110496087, "tokens_seen": 1195123712 }, { "epoch": 3.05, "learning_rate": 0.0003221765295887663, "loss": 2.7444, "theoretical_loss": 3.5887908951894865, "tokens_seen": 1195189248 }, { "epoch": 3.05, "learning_rate": 0.0003221664994984955, "loss": 2.9722, "theoretical_loss": 3.5887727806008067, "tokens_seen": 1195254784 }, { "epoch": 3.05, "learning_rate": 0.0003221564694082247, "loss": 2.9337, "theoretical_loss": 3.5887546672834114, "tokens_seen": 1195320320 }, { "epoch": 3.05, "learning_rate": 0.00032214643931795386, "loss": 2.9209, "theoretical_loss": 3.5887365552371415, "tokens_seen": 1195385856 }, { "epoch": 3.05, "learning_rate": 0.00032213640922768304, "loss": 2.703, "theoretical_loss": 3.588718444461838, "tokens_seen": 1195451392 }, { "epoch": 3.05, "learning_rate": 0.0003221263791374122, "loss": 3.0166, "theoretical_loss": 3.588700334957342, "tokens_seen": 1195516928 }, { "epoch": 3.05, "learning_rate": 0.00032211634904714145, "loss": 2.9096, "theoretical_loss": 3.5886822267234946, "tokens_seen": 1195582464 }, { "epoch": 3.05, "learning_rate": 0.0003221063189568706, "loss": 2.7059, "theoretical_loss": 3.5886641197601374, "tokens_seen": 1195648000 }, { "epoch": 3.05, "learning_rate": 0.0003220962888665998, "loss": 2.7881, "theoretical_loss": 3.5886460140671113, "tokens_seen": 1195713536 }, { "epoch": 3.05, "learning_rate": 0.00032208625877632894, "loss": 2.9842, "theoretical_loss": 3.5886279096442575, "tokens_seen": 1195779072 }, { "epoch": 3.05, "learning_rate": 0.0003220762286860582, "loss": 2.8883, "theoretical_loss": 3.5886098064914176, "tokens_seen": 1195844608 }, { "epoch": 3.05, "learning_rate": 0.0003220661985957874, "loss": 2.8828, "theoretical_loss": 3.588591704608432, "tokens_seen": 1195910144 }, { "epoch": 3.05, "learning_rate": 0.00032205616850551654, "loss": 2.8582, "theoretical_loss": 3.588573603995144, "tokens_seen": 1195975680 }, { "epoch": 3.05, "objective/train/docs_used": 1915819, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6918792724609375, "objective/train/theoretical_loss": 3.5885690790401927, "objective/train/tokens_used": 1216452064, "theoretical_loss": 3.5885690790401927, "tokens_seen": 1195992064 }, { "epoch": 3.05, "learning_rate": 0.0003220461384152458, "loss": 2.8494, "theoretical_loss": 3.588555504651392, "tokens_seen": 1196041216 }, { "epoch": 3.05, "learning_rate": 0.0003220361083249749, "loss": 3.014, "theoretical_loss": 3.5885374065770206, "tokens_seen": 1196106752 }, { "epoch": 3.05, "learning_rate": 0.00032202607823470414, "loss": 3.002, "theoretical_loss": 3.5885193097718693, "tokens_seen": 1196172288 }, { "epoch": 3.05, "learning_rate": 0.0003220160481444333, "loss": 2.7601, "theoretical_loss": 3.58850121423578, "tokens_seen": 1196237824 }, { "epoch": 3.05, "learning_rate": 0.0003220060180541625, "loss": 2.8695, "theoretical_loss": 3.5884831199685943, "tokens_seen": 1196303360 }, { "epoch": 3.05, "learning_rate": 0.0003219959879638917, "loss": 2.627, "theoretical_loss": 3.5884650269701535, "tokens_seen": 1196368896 }, { "epoch": 3.05, "learning_rate": 0.0003219859578736209, "loss": 2.9182, "theoretical_loss": 3.5884469352402992, "tokens_seen": 1196434432 }, { "epoch": 3.05, "learning_rate": 0.00032197592778335004, "loss": 2.8315, "theoretical_loss": 3.588428844778873, "tokens_seen": 1196499968 }, { "epoch": 3.05, "learning_rate": 0.0003219658976930793, "loss": 2.8222, "theoretical_loss": 3.5884107555857168, "tokens_seen": 1196565504 }, { "epoch": 3.05, "learning_rate": 0.0003219558676028084, "loss": 2.6763, "theoretical_loss": 3.588392667660672, "tokens_seen": 1196631040 }, { "epoch": 3.05, "learning_rate": 0.00032194583751253764, "loss": 2.8775, "theoretical_loss": 3.58837458100358, "tokens_seen": 1196696576 }, { "epoch": 3.05, "learning_rate": 0.0003219358074222668, "loss": 2.8357, "theoretical_loss": 3.5883564956142826, "tokens_seen": 1196762112 }, { "epoch": 3.05, "learning_rate": 0.000321925777331996, "loss": 2.9314, "theoretical_loss": 3.5883384114926216, "tokens_seen": 1196827648 }, { "epoch": 3.05, "learning_rate": 0.0003219157472417252, "loss": 2.9962, "theoretical_loss": 3.588320328638439, "tokens_seen": 1196893184 }, { "epoch": 3.05, "learning_rate": 0.00032190571715145437, "loss": 2.9558, "theoretical_loss": 3.588302247051576, "tokens_seen": 1196958720 }, { "epoch": 3.05, "learning_rate": 0.00032189568706118355, "loss": 2.9201, "theoretical_loss": 3.5882841667318752, "tokens_seen": 1197024256 }, { "epoch": 3.05, "learning_rate": 0.0003218856569709128, "loss": 2.814, "theoretical_loss": 3.588266087679178, "tokens_seen": 1197089792 }, { "epoch": 3.05, "learning_rate": 0.0003218756268806419, "loss": 3.004, "theoretical_loss": 3.5882480098933263, "tokens_seen": 1197155328 }, { "epoch": 3.05, "learning_rate": 0.00032186559679037115, "loss": 2.9296, "theoretical_loss": 3.588229933374161, "tokens_seen": 1197220864 }, { "epoch": 3.05, "learning_rate": 0.0003218555667001003, "loss": 2.9597, "theoretical_loss": 3.588211858121526, "tokens_seen": 1197286400 }, { "epoch": 3.05, "learning_rate": 0.0003218455366098295, "loss": 2.9378, "theoretical_loss": 3.5881937841352616, "tokens_seen": 1197351936 }, { "epoch": 3.05, "learning_rate": 0.0003218355065195587, "loss": 3.0144, "theoretical_loss": 3.588175711415211, "tokens_seen": 1197417472 }, { "epoch": 3.05, "learning_rate": 0.00032182547642928787, "loss": 2.7357, "theoretical_loss": 3.5881576399612145, "tokens_seen": 1197483008 }, { "epoch": 3.05, "learning_rate": 0.00032181544633901705, "loss": 2.8803, "theoretical_loss": 3.5881395697731158, "tokens_seen": 1197548544 }, { "epoch": 3.05, "learning_rate": 0.0003218054162487463, "loss": 2.9078, "theoretical_loss": 3.5881215008507557, "tokens_seen": 1197614080 }, { "epoch": 3.05, "objective/train/docs_used": 1918337, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6916067600250244, "objective/train/theoretical_loss": 3.5881169838179194, "objective/train/tokens_used": 1218090464, "theoretical_loss": 3.5881169838179194, "tokens_seen": 1197630464 }, { "epoch": 3.05, "learning_rate": 0.0003217953861584754, "loss": 2.8546, "theoretical_loss": 3.588103433193978, "tokens_seen": 1197679616 }, { "epoch": 3.05, "learning_rate": 0.00032178535606820465, "loss": 2.9416, "theoretical_loss": 3.5880853668026225, "tokens_seen": 1197745152 }, { "epoch": 3.05, "learning_rate": 0.0003217753259779338, "loss": 2.8284, "theoretical_loss": 3.588067301676533, "tokens_seen": 1197810688 }, { "epoch": 3.05, "learning_rate": 0.000321765295887663, "loss": 2.9333, "theoretical_loss": 3.588049237815552, "tokens_seen": 1197876224 }, { "epoch": 3.05, "learning_rate": 0.0003217552657973922, "loss": 2.8555, "theoretical_loss": 3.5880311752195198, "tokens_seen": 1197941760 }, { "epoch": 3.05, "learning_rate": 0.00032174523570712137, "loss": 2.8644, "theoretical_loss": 3.5880131138882803, "tokens_seen": 1198007296 }, { "epoch": 3.05, "learning_rate": 0.00032173520561685055, "loss": 2.8999, "theoretical_loss": 3.5879950538216754, "tokens_seen": 1198072832 }, { "epoch": 3.05, "learning_rate": 0.00032172517552657974, "loss": 2.8708, "theoretical_loss": 3.5879769950195466, "tokens_seen": 1198138368 }, { "epoch": 3.05, "learning_rate": 0.0003217151454363089, "loss": 2.8595, "theoretical_loss": 3.587958937481737, "tokens_seen": 1198203904 }, { "epoch": 3.05, "learning_rate": 0.00032170511534603815, "loss": 2.7828, "theoretical_loss": 3.5879408812080893, "tokens_seen": 1198269440 }, { "epoch": 3.05, "learning_rate": 0.0003216950852557673, "loss": 2.8231, "theoretical_loss": 3.5879228261984446, "tokens_seen": 1198334976 }, { "epoch": 3.05, "learning_rate": 0.0003216850551654965, "loss": 2.9625, "theoretical_loss": 3.5879047724526463, "tokens_seen": 1198400512 }, { "epoch": 3.05, "learning_rate": 0.0003216750250752257, "loss": 2.8047, "theoretical_loss": 3.5878867199705367, "tokens_seen": 1198466048 }, { "epoch": 3.05, "learning_rate": 0.0003216649949849549, "loss": 2.713, "theoretical_loss": 3.587868668751957, "tokens_seen": 1198531584 }, { "epoch": 3.05, "learning_rate": 0.00032165496489468406, "loss": 2.923, "theoretical_loss": 3.587850618796752, "tokens_seen": 1198597120 }, { "epoch": 3.05, "learning_rate": 0.00032164493480441324, "loss": 2.8768, "theoretical_loss": 3.587832570104762, "tokens_seen": 1198662656 }, { "epoch": 3.05, "learning_rate": 0.0003216349047141424, "loss": 2.8692, "theoretical_loss": 3.5878145226758313, "tokens_seen": 1198728192 }, { "epoch": 3.05, "learning_rate": 0.00032162487462387165, "loss": 2.8005, "theoretical_loss": 3.5877964765098014, "tokens_seen": 1198793728 }, { "epoch": 3.05, "learning_rate": 0.0003216148445336008, "loss": 2.904, "theoretical_loss": 3.587778431606515, "tokens_seen": 1198859264 }, { "epoch": 3.05, "learning_rate": 0.00032160481444333, "loss": 2.9227, "theoretical_loss": 3.587760387965815, "tokens_seen": 1198924800 }, { "epoch": 3.05, "learning_rate": 0.00032159478435305914, "loss": 3.0263, "theoretical_loss": 3.587742345587544, "tokens_seen": 1198990336 }, { "epoch": 3.05, "learning_rate": 0.0003215847542627884, "loss": 3.0073, "theoretical_loss": 3.587724304471544, "tokens_seen": 1199055872 }, { "epoch": 3.05, "learning_rate": 0.00032157472417251756, "loss": 2.9634, "theoretical_loss": 3.587706264617659, "tokens_seen": 1199121408 }, { "epoch": 3.05, "learning_rate": 0.00032156469408224674, "loss": 2.8394, "theoretical_loss": 3.5876882260257306, "tokens_seen": 1199186944 }, { "epoch": 3.05, "learning_rate": 0.0003215546639919759, "loss": 2.9628, "theoretical_loss": 3.5876701886956024, "tokens_seen": 1199252480 }, { "epoch": 3.05, "objective/train/docs_used": 1921136, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9944987297058105, "objective/train/theoretical_loss": 3.5876656795602084, "objective/train/tokens_used": 1219728864, "theoretical_loss": 3.5876656795602084, "tokens_seen": 1199268864 }, { "epoch": 3.05, "learning_rate": 0.0003215446339017051, "loss": 2.9547, "theoretical_loss": 3.5876521526271166, "tokens_seen": 1199318016 }, { "epoch": 3.05, "learning_rate": 0.0003215346038114343, "loss": 2.9331, "theoretical_loss": 3.587634117820116, "tokens_seen": 1199383552 }, { "epoch": 3.05, "learning_rate": 0.0003215245737211635, "loss": 2.9155, "theoretical_loss": 3.587616084274444, "tokens_seen": 1199449088 }, { "epoch": 3.05, "learning_rate": 0.00032151454363089265, "loss": 2.9802, "theoretical_loss": 3.587598051989943, "tokens_seen": 1199514624 }, { "epoch": 3.05, "learning_rate": 0.0003215045135406219, "loss": 2.9023, "theoretical_loss": 3.587580020966456, "tokens_seen": 1199580160 }, { "epoch": 3.05, "learning_rate": 0.00032149448345035106, "loss": 2.9775, "theoretical_loss": 3.5875619912038257, "tokens_seen": 1199645696 }, { "epoch": 3.05, "learning_rate": 0.00032148445336008024, "loss": 2.9642, "theoretical_loss": 3.5875439627018957, "tokens_seen": 1199711232 }, { "epoch": 3.05, "learning_rate": 0.0003214744232698094, "loss": 2.8627, "theoretical_loss": 3.5875259354605085, "tokens_seen": 1199776768 }, { "epoch": 3.05, "learning_rate": 0.0003214643931795386, "loss": 2.7681, "theoretical_loss": 3.5875079094795073, "tokens_seen": 1199842304 }, { "epoch": 3.05, "learning_rate": 0.0003214543630892678, "loss": 2.9608, "theoretical_loss": 3.587489884758735, "tokens_seen": 1199907840 }, { "epoch": 3.05, "learning_rate": 0.000321444332998997, "loss": 2.9183, "theoretical_loss": 3.5874718612980345, "tokens_seen": 1199973376 }, { "epoch": 3.05, "learning_rate": 0.00032143430290872615, "loss": 2.9384, "theoretical_loss": 3.5874538390972495, "tokens_seen": 1200038912 }, { "epoch": 3.05, "learning_rate": 0.0003214242728184554, "loss": 2.9021, "theoretical_loss": 3.5874358181562225, "tokens_seen": 1200104448 }, { "epoch": 3.05, "learning_rate": 0.0003214142427281845, "loss": 2.8365, "theoretical_loss": 3.587417798474797, "tokens_seen": 1200169984 }, { "epoch": 3.05, "learning_rate": 0.00032140421263791375, "loss": 2.8391, "theoretical_loss": 3.587399780052816, "tokens_seen": 1200235520 }, { "epoch": 3.05, "learning_rate": 0.00032139418254764293, "loss": 2.9201, "theoretical_loss": 3.5873817628901232, "tokens_seen": 1200301056 }, { "epoch": 3.05, "learning_rate": 0.0003213841524573721, "loss": 2.9547, "theoretical_loss": 3.5873637469865614, "tokens_seen": 1200366592 }, { "epoch": 3.05, "learning_rate": 0.0003213741223671013, "loss": 2.8963, "theoretical_loss": 3.5873457323419737, "tokens_seen": 1200432128 }, { "epoch": 3.05, "learning_rate": 0.0003213640922768305, "loss": 3.0144, "theoretical_loss": 3.587327718956203, "tokens_seen": 1200497664 }, { "epoch": 3.05, "learning_rate": 0.00032135406218655965, "loss": 2.8993, "theoretical_loss": 3.5873097068290942, "tokens_seen": 1200563200 }, { "epoch": 3.05, "learning_rate": 0.0003213440320962889, "loss": 2.9215, "theoretical_loss": 3.5872916959604897, "tokens_seen": 1200628736 }, { "epoch": 3.05, "learning_rate": 0.000321334002006018, "loss": 2.81, "theoretical_loss": 3.5872736863502324, "tokens_seen": 1200694272 }, { "epoch": 3.05, "learning_rate": 0.00032132397191574725, "loss": 2.8233, "theoretical_loss": 3.587255677998167, "tokens_seen": 1200759808 }, { "epoch": 3.05, "learning_rate": 0.0003213139418254765, "loss": 2.908, "theoretical_loss": 3.587237670904135, "tokens_seen": 1200825344 }, { "epoch": 3.05, "learning_rate": 0.0003213039117352056, "loss": 2.9675, "theoretical_loss": 3.5872196650679813, "tokens_seen": 1200890880 }, { "epoch": 3.05, "objective/train/docs_used": 1923865, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2222867012023926, "objective/train/theoretical_loss": 3.5872151638054683, "objective/train/tokens_used": 1221367264, "theoretical_loss": 3.5872151638054683, "tokens_seen": 1200907264 }, { "epoch": 3.05, "learning_rate": 0.00032129388164493485, "loss": 2.8966, "theoretical_loss": 3.5872016604895496, "tokens_seen": 1200956416 }, { "epoch": 3.05, "learning_rate": 0.000321283851554664, "loss": 2.8034, "theoretical_loss": 3.5871836571686826, "tokens_seen": 1201021952 }, { "epoch": 3.05, "learning_rate": 0.0003212738214643932, "loss": 2.8467, "theoretical_loss": 3.587165655105224, "tokens_seen": 1201087488 }, { "epoch": 3.05, "learning_rate": 0.0003212637913741224, "loss": 2.6437, "theoretical_loss": 3.5871476542990175, "tokens_seen": 1201153024 }, { "epoch": 3.05, "learning_rate": 0.0003212537612838516, "loss": 2.7632, "theoretical_loss": 3.587129654749907, "tokens_seen": 1201218560 }, { "epoch": 3.05, "learning_rate": 0.00032124373119358075, "loss": 2.9036, "theoretical_loss": 3.587111656457736, "tokens_seen": 1201284096 }, { "epoch": 3.05, "learning_rate": 0.00032123370110330994, "loss": 3.0295, "theoretical_loss": 3.5870936594223477, "tokens_seen": 1201349632 }, { "epoch": 3.05, "learning_rate": 0.0003212236710130391, "loss": 2.6419, "theoretical_loss": 3.5870756636435863, "tokens_seen": 1201415168 }, { "epoch": 3.05, "learning_rate": 0.00032121364092276835, "loss": 2.9782, "theoretical_loss": 3.587057669121295, "tokens_seen": 1201480704 }, { "epoch": 3.05, "learning_rate": 0.0003212036108324975, "loss": 2.9261, "theoretical_loss": 3.5870396758553182, "tokens_seen": 1201546240 }, { "epoch": 3.05, "learning_rate": 0.0003211935807422267, "loss": 2.85, "theoretical_loss": 3.5870216838454994, "tokens_seen": 1201611776 }, { "epoch": 3.05, "learning_rate": 0.0003211835506519559, "loss": 2.7867, "theoretical_loss": 3.587003693091682, "tokens_seen": 1201677312 }, { "epoch": 3.05, "learning_rate": 0.0003211735205616851, "loss": 2.979, "theoretical_loss": 3.5869857035937107, "tokens_seen": 1201742848 }, { "epoch": 3.05, "learning_rate": 0.00032116349047141426, "loss": 2.91, "theoretical_loss": 3.5869677153514283, "tokens_seen": 1201808384 }, { "epoch": 3.05, "learning_rate": 0.00032115346038114344, "loss": 2.7404, "theoretical_loss": 3.58694972836468, "tokens_seen": 1201873920 }, { "epoch": 3.05, "learning_rate": 0.0003211434302908726, "loss": 2.842, "theoretical_loss": 3.5869317426333085, "tokens_seen": 1201939456 }, { "epoch": 3.05, "learning_rate": 0.00032113340020060185, "loss": 2.8103, "theoretical_loss": 3.5869137581571575, "tokens_seen": 1202004992 }, { "epoch": 3.05, "learning_rate": 0.000321123370110331, "loss": 2.9972, "theoretical_loss": 3.5868957749360724, "tokens_seen": 1202070528 }, { "epoch": 3.05, "learning_rate": 0.0003211133400200602, "loss": 2.9204, "theoretical_loss": 3.5868777929698963, "tokens_seen": 1202136064 }, { "epoch": 3.05, "learning_rate": 0.00032110330992978934, "loss": 2.8973, "theoretical_loss": 3.5868598122584734, "tokens_seen": 1202201600 }, { "epoch": 3.05, "learning_rate": 0.0003210932798395186, "loss": 2.7829, "theoretical_loss": 3.586841832801648, "tokens_seen": 1202267136 }, { "epoch": 3.05, "learning_rate": 0.00032108324974924776, "loss": 2.9692, "theoretical_loss": 3.5868238545992632, "tokens_seen": 1202332672 }, { "epoch": 3.05, "learning_rate": 0.00032107321965897694, "loss": 2.8622, "theoretical_loss": 3.586805877651164, "tokens_seen": 1202398208 }, { "epoch": 3.05, "learning_rate": 0.0003210631895687061, "loss": 2.8216, "theoretical_loss": 3.5867879019571944, "tokens_seen": 1202463744 }, { "epoch": 3.05, "learning_rate": 0.0003210531594784353, "loss": 2.8499, "theoretical_loss": 3.5867699275171985, "tokens_seen": 1202529280 }, { "epoch": 3.05, "objective/train/docs_used": 1926800, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.621340274810791, "objective/train/theoretical_loss": 3.586765434103115, "objective/train/tokens_used": 1223005664, "theoretical_loss": 3.586765434103115, "tokens_seen": 1202545664 }, { "epoch": 3.05, "learning_rate": 0.0003210431293881645, "loss": 2.8587, "theoretical_loss": 3.586751954331021, "tokens_seen": 1202594816 }, { "epoch": 3.05, "learning_rate": 0.0003210330992978937, "loss": 3.0213, "theoretical_loss": 3.586733982398505, "tokens_seen": 1202660352 }, { "epoch": 3.05, "learning_rate": 0.00032102306920762285, "loss": 2.8656, "theoretical_loss": 3.5867160117194956, "tokens_seen": 1202725888 }, { "epoch": 3.05, "learning_rate": 0.0003210130391173521, "loss": 2.7987, "theoretical_loss": 3.5866980422938366, "tokens_seen": 1202791424 }, { "epoch": 3.05, "learning_rate": 0.00032100300902708126, "loss": 2.8445, "theoretical_loss": 3.5866800741213725, "tokens_seen": 1202856960 }, { "epoch": 3.05, "learning_rate": 0.00032099297893681044, "loss": 2.7865, "theoretical_loss": 3.586662107201948, "tokens_seen": 1202922496 }, { "epoch": 3.05, "learning_rate": 0.0003209829488465396, "loss": 2.8398, "theoretical_loss": 3.586644141535407, "tokens_seen": 1202988032 }, { "epoch": 3.05, "learning_rate": 0.0003209729187562688, "loss": 2.7972, "theoretical_loss": 3.5866261771215937, "tokens_seen": 1203053568 }, { "epoch": 3.05, "learning_rate": 0.000320962888665998, "loss": 2.9088, "theoretical_loss": 3.5866082139603535, "tokens_seen": 1203119104 }, { "epoch": 3.05, "learning_rate": 0.0003209528585757272, "loss": 2.7898, "theoretical_loss": 3.5865902520515296, "tokens_seen": 1203184640 }, { "epoch": 3.05, "learning_rate": 0.00032094282848545635, "loss": 2.89, "theoretical_loss": 3.586572291394967, "tokens_seen": 1203250176 }, { "epoch": 3.05, "learning_rate": 0.0003209327983951856, "loss": 2.7278, "theoretical_loss": 3.58655433199051, "tokens_seen": 1203315712 }, { "epoch": 3.05, "learning_rate": 0.0003209227683049147, "loss": 2.8542, "theoretical_loss": 3.586536373838004, "tokens_seen": 1203381248 }, { "epoch": 3.05, "learning_rate": 0.00032091273821464395, "loss": 2.8586, "theoretical_loss": 3.586518416937293, "tokens_seen": 1203446784 }, { "epoch": 3.05, "learning_rate": 0.00032090270812437313, "loss": 2.9393, "theoretical_loss": 3.586500461288221, "tokens_seen": 1203512320 }, { "epoch": 3.05, "learning_rate": 0.0003208926780341023, "loss": 2.7542, "theoretical_loss": 3.586482506890633, "tokens_seen": 1203577856 }, { "epoch": 3.05, "learning_rate": 0.0003208826479438315, "loss": 2.877, "theoretical_loss": 3.5864645537443742, "tokens_seen": 1203643392 }, { "epoch": 3.05, "learning_rate": 0.0003208726178535607, "loss": 2.9576, "theoretical_loss": 3.586446601849288, "tokens_seen": 1203708928 }, { "epoch": 3.05, "learning_rate": 0.00032086258776328985, "loss": 2.8107, "theoretical_loss": 3.586428651205221, "tokens_seen": 1203774464 }, { "epoch": 3.05, "learning_rate": 0.0003208525576730191, "loss": 2.8226, "theoretical_loss": 3.5864107018120164, "tokens_seen": 1203840000 }, { "epoch": 3.05, "learning_rate": 0.0003208425275827482, "loss": 2.8898, "theoretical_loss": 3.586392753669519, "tokens_seen": 1203905536 }, { "epoch": 3.05, "learning_rate": 0.00032083249749247745, "loss": 2.8528, "theoretical_loss": 3.586374806777574, "tokens_seen": 1203971072 }, { "epoch": 3.05, "learning_rate": 0.00032082246740220663, "loss": 2.9154, "theoretical_loss": 3.5863568611360264, "tokens_seen": 1204036608 }, { "epoch": 3.05, "learning_rate": 0.0003208124373119358, "loss": 2.8806, "theoretical_loss": 3.5863389167447206, "tokens_seen": 1204102144 }, { "epoch": 3.05, "learning_rate": 0.000320802407221665, "loss": 3.0542, "theoretical_loss": 3.5863209736035015, "tokens_seen": 1204167680 }, { "epoch": 3.05, "objective/train/docs_used": 1929726, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.992762804031372, "objective/train/theoretical_loss": 3.5863164880135043, "objective/train/tokens_used": 1224644064, "theoretical_loss": 3.5863164880135043, "tokens_seen": 1204184064 }, { "epoch": 3.05, "learning_rate": 0.0003207923771313942, "loss": 2.8534, "theoretical_loss": 3.586303031712214, "tokens_seen": 1204233216 }, { "epoch": 3.05, "learning_rate": 0.00032078234704112336, "loss": 2.9666, "theoretical_loss": 3.5862850910707036, "tokens_seen": 1204298752 }, { "epoch": 3.05, "learning_rate": 0.0003207723169508526, "loss": 2.905, "theoretical_loss": 3.5862671516788143, "tokens_seen": 1204364288 }, { "epoch": 3.05, "learning_rate": 0.0003207622868605817, "loss": 2.9166, "theoretical_loss": 3.5862492135363917, "tokens_seen": 1204429824 }, { "epoch": 3.05, "learning_rate": 0.00032075225677031095, "loss": 2.997, "theoretical_loss": 3.5862312766432805, "tokens_seen": 1204495360 }, { "epoch": 3.05, "learning_rate": 0.0003207422266800401, "loss": 2.8455, "theoretical_loss": 3.586213340999326, "tokens_seen": 1204560896 }, { "epoch": 3.05, "learning_rate": 0.0003207321965897693, "loss": 2.8546, "theoretical_loss": 3.5861954066043733, "tokens_seen": 1204626432 }, { "epoch": 3.05, "learning_rate": 0.0003207221664994985, "loss": 2.8963, "theoretical_loss": 3.5861774734582665, "tokens_seen": 1204691968 }, { "epoch": 3.05, "learning_rate": 0.0003207121364092277, "loss": 2.7237, "theoretical_loss": 3.5861595415608525, "tokens_seen": 1204757504 }, { "epoch": 3.05, "learning_rate": 0.00032070210631895686, "loss": 2.9595, "theoretical_loss": 3.586141610911975, "tokens_seen": 1204823040 }, { "epoch": 3.05, "learning_rate": 0.0003206920762286861, "loss": 2.8851, "theoretical_loss": 3.586123681511479, "tokens_seen": 1204888576 }, { "epoch": 3.05, "learning_rate": 0.0003206820461384152, "loss": 2.8065, "theoretical_loss": 3.586105753359211, "tokens_seen": 1204954112 }, { "epoch": 3.05, "learning_rate": 0.00032067201604814446, "loss": 2.8144, "theoretical_loss": 3.586087826455015, "tokens_seen": 1205019648 }, { "epoch": 3.05, "learning_rate": 0.0003206619859578736, "loss": 2.931, "theoretical_loss": 3.586069900798737, "tokens_seen": 1205085184 }, { "epoch": 3.05, "learning_rate": 0.0003206519558676028, "loss": 2.7407, "theoretical_loss": 3.5860519763902223, "tokens_seen": 1205150720 }, { "epoch": 3.05, "learning_rate": 0.000320641925777332, "loss": 2.9134, "theoretical_loss": 3.5860340532293153, "tokens_seen": 1205216256 }, { "epoch": 3.05, "learning_rate": 0.0003206318956870612, "loss": 2.9527, "theoretical_loss": 3.586016131315862, "tokens_seen": 1205281792 }, { "epoch": 3.05, "learning_rate": 0.00032062186559679036, "loss": 2.8263, "theoretical_loss": 3.5859982106497084, "tokens_seen": 1205347328 }, { "epoch": 3.05, "learning_rate": 0.00032061183550651954, "loss": 2.9145, "theoretical_loss": 3.5859802912306984, "tokens_seen": 1205412864 }, { "epoch": 3.05, "learning_rate": 0.0003206018054162487, "loss": 2.9886, "theoretical_loss": 3.5859623730586785, "tokens_seen": 1205478400 }, { "epoch": 3.05, "learning_rate": 0.00032059177532597796, "loss": 2.9669, "theoretical_loss": 3.585944456133493, "tokens_seen": 1205543936 }, { "epoch": 3.05, "learning_rate": 0.0003205817452357071, "loss": 2.9309, "theoretical_loss": 3.5859265404549894, "tokens_seen": 1205609472 }, { "epoch": 3.05, "learning_rate": 0.0003205717151454363, "loss": 2.7744, "theoretical_loss": 3.5859086260230115, "tokens_seen": 1205675008 }, { "epoch": 3.05, "learning_rate": 0.0003205616850551655, "loss": 2.9873, "theoretical_loss": 3.585890712837405, "tokens_seen": 1205740544 }, { "epoch": 3.05, "learning_rate": 0.0003205516549648947, "loss": 2.8288, "theoretical_loss": 3.585872800898016, "tokens_seen": 1205806080 }, { "epoch": 3.05, "objective/train/docs_used": 1932020, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8672702312469482, "objective/train/theoretical_loss": 3.585868323107872, "objective/train/tokens_used": 1226282464, "theoretical_loss": 3.585868323107872, "tokens_seen": 1205822464 }, { "epoch": 3.05, "learning_rate": 0.0003205416248746239, "loss": 2.809, "theoretical_loss": 3.58585489020469, "tokens_seen": 1205871616 }, { "epoch": 3.05, "learning_rate": 0.00032053159478435305, "loss": 2.9457, "theoretical_loss": 3.5858369807572723, "tokens_seen": 1205937152 }, { "epoch": 3.05, "learning_rate": 0.0003205215646940823, "loss": 2.9644, "theoretical_loss": 3.5858190725556085, "tokens_seen": 1206002688 }, { "epoch": 3.05, "learning_rate": 0.00032051153460381146, "loss": 2.7715, "theoretical_loss": 3.5858011655995443, "tokens_seen": 1206068224 }, { "epoch": 3.05, "learning_rate": 0.00032050150451354064, "loss": 2.9566, "theoretical_loss": 3.5857832598889257, "tokens_seen": 1206133760 }, { "epoch": 3.05, "learning_rate": 0.0003204914744232698, "loss": 2.9538, "theoretical_loss": 3.5857653554235984, "tokens_seen": 1206199296 }, { "epoch": 3.05, "learning_rate": 0.000320481444332999, "loss": 2.9093, "theoretical_loss": 3.5857474522034076, "tokens_seen": 1206264832 }, { "epoch": 3.05, "learning_rate": 0.0003204714142427282, "loss": 2.878, "theoretical_loss": 3.5857295502281996, "tokens_seen": 1206330368 }, { "epoch": 3.05, "learning_rate": 0.0003204613841524574, "loss": 2.859, "theoretical_loss": 3.58571164949782, "tokens_seen": 1206395904 }, { "epoch": 3.05, "learning_rate": 0.00032045135406218655, "loss": 2.9294, "theoretical_loss": 3.585693750012114, "tokens_seen": 1206461440 }, { "epoch": 3.05, "learning_rate": 0.0003204413239719158, "loss": 2.9319, "theoretical_loss": 3.5856758517709286, "tokens_seen": 1206526976 }, { "epoch": 3.05, "learning_rate": 0.0003204312938816449, "loss": 2.8573, "theoretical_loss": 3.58565795477411, "tokens_seen": 1206592512 }, { "epoch": 3.06, "learning_rate": 0.00032042126379137415, "loss": 2.9242, "theoretical_loss": 3.585640059021502, "tokens_seen": 1206658048 }, { "epoch": 3.06, "learning_rate": 0.00032041123370110333, "loss": 2.8628, "theoretical_loss": 3.5856221645129525, "tokens_seen": 1206723584 }, { "epoch": 3.06, "learning_rate": 0.0003204012036108325, "loss": 2.9523, "theoretical_loss": 3.5856042712483065, "tokens_seen": 1206789120 }, { "epoch": 3.06, "learning_rate": 0.0003203911735205617, "loss": 2.7983, "theoretical_loss": 3.58558637922741, "tokens_seen": 1206854656 }, { "epoch": 3.06, "learning_rate": 0.0003203811434302909, "loss": 2.7518, "theoretical_loss": 3.5855684884501096, "tokens_seen": 1206920192 }, { "epoch": 3.06, "learning_rate": 0.00032037111334002005, "loss": 2.9228, "theoretical_loss": 3.585550598916251, "tokens_seen": 1206985728 }, { "epoch": 3.06, "learning_rate": 0.0003203610832497493, "loss": 2.8482, "theoretical_loss": 3.5855327106256802, "tokens_seen": 1207051264 }, { "epoch": 3.06, "learning_rate": 0.0003203510531594784, "loss": 2.8747, "theoretical_loss": 3.5855148235782437, "tokens_seen": 1207116800 }, { "epoch": 3.06, "learning_rate": 0.00032034102306920765, "loss": 2.8048, "theoretical_loss": 3.5854969377737866, "tokens_seen": 1207182336 }, { "epoch": 3.06, "learning_rate": 0.00032033099297893683, "loss": 2.8065, "theoretical_loss": 3.5854790532121563, "tokens_seen": 1207247872 }, { "epoch": 3.06, "learning_rate": 0.000320320962888666, "loss": 2.9496, "theoretical_loss": 3.585461169893198, "tokens_seen": 1207313408 }, { "epoch": 3.06, "learning_rate": 0.0003203109327983952, "loss": 2.8699, "theoretical_loss": 3.5854432878167586, "tokens_seen": 1207378944 }, { "epoch": 3.06, "learning_rate": 0.0003203009027081244, "loss": 3.0041, "theoretical_loss": 3.5854254069826843, "tokens_seen": 1207444480 }, { "epoch": 3.06, "objective/train/docs_used": 1934942, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.055860757827759, "objective/train/theoretical_loss": 3.5854209369682675, "objective/train/tokens_used": 1227920864, "theoretical_loss": 3.5854209369682675, "tokens_seen": 1207460864 }, { "epoch": 3.06, "learning_rate": 0.00032029087261785356, "loss": 2.8647, "theoretical_loss": 3.585407527390821, "tokens_seen": 1207510016 }, { "epoch": 3.06, "learning_rate": 0.0003202808425275828, "loss": 3.0595, "theoretical_loss": 3.585389649041015, "tokens_seen": 1207575552 }, { "epoch": 3.06, "learning_rate": 0.0003202708124373119, "loss": 2.7445, "theoretical_loss": 3.5853717719331124, "tokens_seen": 1207641088 }, { "epoch": 3.06, "learning_rate": 0.00032026078234704115, "loss": 2.8862, "theoretical_loss": 3.5853538960669606, "tokens_seen": 1207706624 }, { "epoch": 3.06, "learning_rate": 0.0003202507522567703, "loss": 2.8242, "theoretical_loss": 3.585336021442405, "tokens_seen": 1207772160 }, { "epoch": 3.06, "learning_rate": 0.0003202407221664995, "loss": 2.8444, "theoretical_loss": 3.585318148059292, "tokens_seen": 1207837696 }, { "epoch": 3.06, "learning_rate": 0.0003202306920762287, "loss": 2.8348, "theoretical_loss": 3.5853002759174686, "tokens_seen": 1207903232 }, { "epoch": 3.06, "learning_rate": 0.0003202206619859579, "loss": 2.716, "theoretical_loss": 3.585282405016781, "tokens_seen": 1207968768 }, { "epoch": 3.06, "learning_rate": 0.00032021063189568706, "loss": 2.835, "theoretical_loss": 3.585264535357075, "tokens_seen": 1208034304 }, { "epoch": 3.06, "learning_rate": 0.0003202006018054163, "loss": 2.9049, "theoretical_loss": 3.585246666938198, "tokens_seen": 1208099840 }, { "epoch": 3.06, "learning_rate": 0.0003201905717151454, "loss": 2.9006, "theoretical_loss": 3.5852287997599968, "tokens_seen": 1208165376 }, { "epoch": 3.06, "learning_rate": 0.00032018054162487466, "loss": 2.8803, "theoretical_loss": 3.585210933822317, "tokens_seen": 1208230912 }, { "epoch": 3.06, "learning_rate": 0.0003201705115346038, "loss": 2.7075, "theoretical_loss": 3.5851930691250056, "tokens_seen": 1208296448 }, { "epoch": 3.06, "learning_rate": 0.000320160481444333, "loss": 2.8085, "theoretical_loss": 3.585175205667909, "tokens_seen": 1208361984 }, { "epoch": 3.06, "learning_rate": 0.0003201504513540622, "loss": 2.9255, "theoretical_loss": 3.5851573434508746, "tokens_seen": 1208427520 }, { "epoch": 3.06, "learning_rate": 0.0003201404212637914, "loss": 2.8359, "theoretical_loss": 3.585139482473748, "tokens_seen": 1208493056 }, { "epoch": 3.06, "learning_rate": 0.00032013039117352056, "loss": 2.9354, "theoretical_loss": 3.5851216227363767, "tokens_seen": 1208558592 }, { "epoch": 3.06, "learning_rate": 0.00032012036108324974, "loss": 2.8645, "theoretical_loss": 3.585103764238607, "tokens_seen": 1208624128 }, { "epoch": 3.06, "learning_rate": 0.0003201103309929789, "loss": 2.8023, "theoretical_loss": 3.585085906980286, "tokens_seen": 1208689664 }, { "epoch": 3.06, "learning_rate": 0.00032010030090270816, "loss": 2.7286, "theoretical_loss": 3.5850680509612607, "tokens_seen": 1208755200 }, { "epoch": 3.06, "learning_rate": 0.0003200902708124373, "loss": 2.8319, "theoretical_loss": 3.5850501961813768, "tokens_seen": 1208820736 }, { "epoch": 3.06, "learning_rate": 0.0003200802407221665, "loss": 2.8125, "theoretical_loss": 3.585032342640482, "tokens_seen": 1208886272 }, { "epoch": 3.06, "learning_rate": 0.00032007021063189565, "loss": 2.8748, "theoretical_loss": 3.5850144903384233, "tokens_seen": 1208951808 }, { "epoch": 3.06, "learning_rate": 0.0003200601805416249, "loss": 2.7114, "theoretical_loss": 3.584996639275047, "tokens_seen": 1209017344 }, { "epoch": 3.06, "learning_rate": 0.00032005015045135407, "loss": 2.8209, "theoretical_loss": 3.5849787894502003, "tokens_seen": 1209082880 }, { "epoch": 3.06, "objective/train/docs_used": 1937725, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6510515213012695, "objective/train/theoretical_loss": 3.584974327187491, "objective/train/tokens_used": 1229559264, "theoretical_loss": 3.584974327187491, "tokens_seen": 1209099264 }, { "epoch": 3.06, "learning_rate": 0.00032004012036108325, "loss": 2.914, "theoretical_loss": 3.5849609408637306, "tokens_seen": 1209148416 }, { "epoch": 3.06, "learning_rate": 0.00032003009027081243, "loss": 2.9606, "theoretical_loss": 3.5849430935154842, "tokens_seen": 1209213952 }, { "epoch": 3.06, "learning_rate": 0.00032002006018054166, "loss": 2.9041, "theoretical_loss": 3.584925247405308, "tokens_seen": 1209279488 }, { "epoch": 3.06, "learning_rate": 0.0003200100300902708, "loss": 2.8359, "theoretical_loss": 3.5849074025330494, "tokens_seen": 1209345024 }, { "epoch": 3.06, "learning_rate": 0.00032, "loss": 3.0154, "theoretical_loss": 3.5848895588985554, "tokens_seen": 1209410560 }, { "epoch": 3.06, "learning_rate": 0.00031998996990972915, "loss": 2.656, "theoretical_loss": 3.5848717165016737, "tokens_seen": 1209476096 }, { "epoch": 3.06, "learning_rate": 0.0003199799398194584, "loss": 2.8799, "theoretical_loss": 3.58485387534225, "tokens_seen": 1209541632 }, { "epoch": 3.06, "learning_rate": 0.00031996990972918757, "loss": 2.741, "theoretical_loss": 3.5848360354201327, "tokens_seen": 1209607168 }, { "epoch": 3.06, "learning_rate": 0.00031995987963891675, "loss": 2.8953, "theoretical_loss": 3.5848181967351684, "tokens_seen": 1209672704 }, { "epoch": 3.06, "learning_rate": 0.00031994984954864593, "loss": 2.9234, "theoretical_loss": 3.584800359287205, "tokens_seen": 1209738240 }, { "epoch": 3.06, "learning_rate": 0.0003199398194583751, "loss": 2.9279, "theoretical_loss": 3.584782523076088, "tokens_seen": 1209803776 }, { "epoch": 3.06, "learning_rate": 0.0003199297893681043, "loss": 2.9126, "theoretical_loss": 3.584764688101666, "tokens_seen": 1209869312 }, { "epoch": 3.06, "learning_rate": 0.00031991975927783353, "loss": 2.794, "theoretical_loss": 3.5847468543637864, "tokens_seen": 1209934848 }, { "epoch": 3.06, "learning_rate": 0.00031990972918756266, "loss": 2.9813, "theoretical_loss": 3.5847290218622962, "tokens_seen": 1210000384 }, { "epoch": 3.06, "learning_rate": 0.0003198996990972919, "loss": 2.8497, "theoretical_loss": 3.584711190597042, "tokens_seen": 1210065920 }, { "epoch": 3.06, "learning_rate": 0.000319889669007021, "loss": 2.9245, "theoretical_loss": 3.584693360567872, "tokens_seen": 1210131456 }, { "epoch": 3.06, "learning_rate": 0.00031987963891675025, "loss": 3.0662, "theoretical_loss": 3.584675531774634, "tokens_seen": 1210196992 }, { "epoch": 3.06, "learning_rate": 0.00031986960882647943, "loss": 2.8487, "theoretical_loss": 3.584657704217174, "tokens_seen": 1210262528 }, { "epoch": 3.06, "learning_rate": 0.0003198595787362086, "loss": 2.7895, "theoretical_loss": 3.5846398778953406, "tokens_seen": 1210328064 }, { "epoch": 3.06, "learning_rate": 0.0003198495486459378, "loss": 2.8096, "theoretical_loss": 3.5846220528089807, "tokens_seen": 1210393600 }, { "epoch": 3.06, "learning_rate": 0.00031983951855566703, "loss": 2.8213, "theoretical_loss": 3.584604228957942, "tokens_seen": 1210459136 }, { "epoch": 3.06, "learning_rate": 0.00031982948846539616, "loss": 2.8795, "theoretical_loss": 3.584586406342072, "tokens_seen": 1210524672 }, { "epoch": 3.06, "learning_rate": 0.0003198194583751254, "loss": 2.897, "theoretical_loss": 3.584568584961218, "tokens_seen": 1210590208 }, { "epoch": 3.06, "learning_rate": 0.0003198094282848546, "loss": 2.7994, "theoretical_loss": 3.584550764815228, "tokens_seen": 1210655744 }, { "epoch": 3.06, "learning_rate": 0.00031979939819458376, "loss": 2.8255, "theoretical_loss": 3.584532945903949, "tokens_seen": 1210721280 }, { "epoch": 3.06, "objective/train/docs_used": 1939010, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.827955961227417, "objective/train/theoretical_loss": 3.584528491369035, "objective/train/tokens_used": 1231197664, "theoretical_loss": 3.584528491369035, "tokens_seen": 1210737664 }, { "epoch": 3.06, "learning_rate": 0.000319789368104313, "loss": 2.9049, "theoretical_loss": 3.5845151282272294, "tokens_seen": 1210786816 }, { "epoch": 3.06, "learning_rate": 0.0003197793380140421, "loss": 2.8006, "theoretical_loss": 3.584497311784916, "tokens_seen": 1210852352 }, { "epoch": 3.06, "learning_rate": 0.00031976930792377135, "loss": 2.9039, "theoretical_loss": 3.584479496576857, "tokens_seen": 1210917888 }, { "epoch": 3.06, "learning_rate": 0.0003197592778335005, "loss": 2.8528, "theoretical_loss": 3.5844616826029005, "tokens_seen": 1210983424 }, { "epoch": 3.06, "learning_rate": 0.0003197492477432297, "loss": 2.9169, "theoretical_loss": 3.5844438698628935, "tokens_seen": 1211048960 }, { "epoch": 3.06, "learning_rate": 0.0003197392176529589, "loss": 2.8929, "theoretical_loss": 3.5844260583566836, "tokens_seen": 1211114496 }, { "epoch": 3.06, "learning_rate": 0.0003197291875626881, "loss": 2.7152, "theoretical_loss": 3.5844082480841197, "tokens_seen": 1211180032 }, { "epoch": 3.06, "learning_rate": 0.00031971915747241726, "loss": 2.9049, "theoretical_loss": 3.5843904390450483, "tokens_seen": 1211245568 }, { "epoch": 3.06, "learning_rate": 0.0003197091273821465, "loss": 2.7932, "theoretical_loss": 3.5843726312393183, "tokens_seen": 1211311104 }, { "epoch": 3.06, "learning_rate": 0.0003196990972918756, "loss": 2.7109, "theoretical_loss": 3.584354824666777, "tokens_seen": 1211376640 }, { "epoch": 3.06, "learning_rate": 0.00031968906720160486, "loss": 2.7891, "theoretical_loss": 3.584337019327272, "tokens_seen": 1211442176 }, { "epoch": 3.06, "learning_rate": 0.000319679037111334, "loss": 2.9411, "theoretical_loss": 3.5843192152206518, "tokens_seen": 1211507712 }, { "epoch": 3.06, "learning_rate": 0.0003196690070210632, "loss": 2.8986, "theoretical_loss": 3.5843014123467647, "tokens_seen": 1211573248 }, { "epoch": 3.06, "learning_rate": 0.0003196589769307924, "loss": 2.8703, "theoretical_loss": 3.5842836107054574, "tokens_seen": 1211638784 }, { "epoch": 3.06, "learning_rate": 0.0003196489468405216, "loss": 2.8538, "theoretical_loss": 3.584265810296579, "tokens_seen": 1211704320 }, { "epoch": 3.06, "learning_rate": 0.00031963891675025076, "loss": 2.8209, "theoretical_loss": 3.584248011119977, "tokens_seen": 1211769856 }, { "epoch": 3.06, "learning_rate": 0.00031962888665997994, "loss": 2.878, "theoretical_loss": 3.5842302131755, "tokens_seen": 1211835392 }, { "epoch": 3.06, "learning_rate": 0.0003196188565697091, "loss": 2.8914, "theoretical_loss": 3.584212416462995, "tokens_seen": 1211900928 }, { "epoch": 3.06, "learning_rate": 0.00031960882647943836, "loss": 2.7509, "theoretical_loss": 3.5841946209823115, "tokens_seen": 1211966464 }, { "epoch": 3.06, "learning_rate": 0.0003195987963891675, "loss": 2.8095, "theoretical_loss": 3.5841768267332967, "tokens_seen": 1212032000 }, { "epoch": 3.06, "learning_rate": 0.0003195887662988967, "loss": 2.8372, "theoretical_loss": 3.5841590337157987, "tokens_seen": 1212097536 }, { "epoch": 3.06, "learning_rate": 0.00031957873620862585, "loss": 2.9379, "theoretical_loss": 3.5841412419296663, "tokens_seen": 1212163072 }, { "epoch": 3.06, "learning_rate": 0.0003195687061183551, "loss": 2.8339, "theoretical_loss": 3.5841234513747477, "tokens_seen": 1212228608 }, { "epoch": 3.06, "learning_rate": 0.00031955867602808427, "loss": 2.7627, "theoretical_loss": 3.5841056620508907, "tokens_seen": 1212294144 }, { "epoch": 3.06, "learning_rate": 0.00031954864593781345, "loss": 2.8976, "theoretical_loss": 3.5840878739579436, "tokens_seen": 1212359680 }, { "epoch": 3.06, "objective/train/docs_used": 1941813, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4692506790161133, "objective/train/theoretical_loss": 3.5840834271270188, "objective/train/tokens_used": 1232836064, "theoretical_loss": 3.5840834271270188, "tokens_seen": 1212376064 }, { "epoch": 3.06, "learning_rate": 0.00031953861584754263, "loss": 2.9123, "theoretical_loss": 3.5840700870957547, "tokens_seen": 1212425216 }, { "epoch": 3.06, "learning_rate": 0.00031952858575727186, "loss": 2.7909, "theoretical_loss": 3.5840523014641725, "tokens_seen": 1212490752 }, { "epoch": 3.06, "learning_rate": 0.000319518555667001, "loss": 2.8951, "theoretical_loss": 3.5840345170630457, "tokens_seen": 1212556288 }, { "epoch": 3.06, "learning_rate": 0.0003195085255767302, "loss": 2.8443, "theoretical_loss": 3.5840167338922218, "tokens_seen": 1212621824 }, { "epoch": 3.06, "learning_rate": 0.00031949849548645935, "loss": 2.8098, "theoretical_loss": 3.58399895195155, "tokens_seen": 1212687360 }, { "epoch": 3.06, "learning_rate": 0.0003194884653961886, "loss": 2.7547, "theoretical_loss": 3.583981171240878, "tokens_seen": 1212752896 }, { "epoch": 3.06, "learning_rate": 0.00031947843530591777, "loss": 2.9403, "theoretical_loss": 3.583963391760055, "tokens_seen": 1212818432 }, { "epoch": 3.06, "learning_rate": 0.00031946840521564695, "loss": 2.8355, "theoretical_loss": 3.583945613508929, "tokens_seen": 1212883968 }, { "epoch": 3.06, "learning_rate": 0.00031945837512537613, "loss": 2.8452, "theoretical_loss": 3.583927836487349, "tokens_seen": 1212949504 }, { "epoch": 3.06, "learning_rate": 0.0003194483450351053, "loss": 2.7747, "theoretical_loss": 3.583910060695162, "tokens_seen": 1213015040 }, { "epoch": 3.06, "learning_rate": 0.0003194383149448345, "loss": 2.8146, "theoretical_loss": 3.583892286132219, "tokens_seen": 1213080576 }, { "epoch": 3.06, "learning_rate": 0.00031942828485456373, "loss": 2.7602, "theoretical_loss": 3.583874512798367, "tokens_seen": 1213146112 }, { "epoch": 3.06, "learning_rate": 0.00031941825476429286, "loss": 2.7948, "theoretical_loss": 3.583856740693455, "tokens_seen": 1213211648 }, { "epoch": 3.06, "learning_rate": 0.0003194082246740221, "loss": 2.9541, "theoretical_loss": 3.5838389698173314, "tokens_seen": 1213277184 }, { "epoch": 3.06, "learning_rate": 0.0003193981945837512, "loss": 2.9359, "theoretical_loss": 3.5838212001698455, "tokens_seen": 1213342720 }, { "epoch": 3.06, "learning_rate": 0.00031938816449348045, "loss": 2.9021, "theoretical_loss": 3.5838034317508454, "tokens_seen": 1213408256 }, { "epoch": 3.06, "learning_rate": 0.00031937813440320964, "loss": 2.8743, "theoretical_loss": 3.5837856645601804, "tokens_seen": 1213473792 }, { "epoch": 3.06, "learning_rate": 0.0003193681043129388, "loss": 2.7812, "theoretical_loss": 3.5837678985976984, "tokens_seen": 1213539328 }, { "epoch": 3.06, "learning_rate": 0.000319358074222668, "loss": 2.8134, "theoretical_loss": 3.5837501338632487, "tokens_seen": 1213604864 }, { "epoch": 3.06, "learning_rate": 0.00031934804413239723, "loss": 2.8933, "theoretical_loss": 3.583732370356681, "tokens_seen": 1213670400 }, { "epoch": 3.06, "learning_rate": 0.00031933801404212636, "loss": 2.785, "theoretical_loss": 3.583714608077842, "tokens_seen": 1213735936 }, { "epoch": 3.06, "learning_rate": 0.0003193279839518556, "loss": 2.8866, "theoretical_loss": 3.583696847026582, "tokens_seen": 1213801472 }, { "epoch": 3.06, "learning_rate": 0.0003193179538615847, "loss": 2.8677, "theoretical_loss": 3.58367908720275, "tokens_seen": 1213867008 }, { "epoch": 3.06, "learning_rate": 0.00031930792377131396, "loss": 2.9172, "theoretical_loss": 3.583661328606194, "tokens_seen": 1213932544 }, { "epoch": 3.06, "learning_rate": 0.00031929789368104314, "loss": 2.859, "theoretical_loss": 3.583643571236764, "tokens_seen": 1213998080 }, { "epoch": 3.06, "objective/train/docs_used": 1944383, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0708305835723877, "objective/train/theoretical_loss": 3.583639132086127, "objective/train/tokens_used": 1234474464, "theoretical_loss": 3.583639132086127, "tokens_seen": 1214014464 }, { "epoch": 3.06, "learning_rate": 0.0003192878635907723, "loss": 3.0225, "theoretical_loss": 3.583625815094308, "tokens_seen": 1214063616 }, { "epoch": 3.06, "learning_rate": 0.0003192778335005015, "loss": 2.829, "theoretical_loss": 3.583608060178676, "tokens_seen": 1214129152 }, { "epoch": 3.06, "learning_rate": 0.0003192678034102307, "loss": 2.824, "theoretical_loss": 3.5835903064897163, "tokens_seen": 1214194688 }, { "epoch": 3.06, "learning_rate": 0.00031925777331995986, "loss": 2.8785, "theoretical_loss": 3.5835725540272785, "tokens_seen": 1214260224 }, { "epoch": 3.06, "learning_rate": 0.0003192477432296891, "loss": 3.0311, "theoretical_loss": 3.5835548027912107, "tokens_seen": 1214325760 }, { "epoch": 3.06, "learning_rate": 0.0003192377131394182, "loss": 2.9709, "theoretical_loss": 3.583537052781363, "tokens_seen": 1214391296 }, { "epoch": 3.06, "learning_rate": 0.00031922768304914746, "loss": 2.8453, "theoretical_loss": 3.5835193039975834, "tokens_seen": 1214456832 }, { "epoch": 3.06, "learning_rate": 0.0003192176529588766, "loss": 2.9002, "theoretical_loss": 3.583501556439723, "tokens_seen": 1214522368 }, { "epoch": 3.06, "learning_rate": 0.0003192076228686058, "loss": 2.9438, "theoretical_loss": 3.5834838101076287, "tokens_seen": 1214587904 }, { "epoch": 3.06, "learning_rate": 0.000319197592778335, "loss": 2.7904, "theoretical_loss": 3.583466065001151, "tokens_seen": 1214653440 }, { "epoch": 3.06, "learning_rate": 0.0003191875626880642, "loss": 2.872, "theoretical_loss": 3.583448321120139, "tokens_seen": 1214718976 }, { "epoch": 3.06, "learning_rate": 0.00031917753259779337, "loss": 2.9639, "theoretical_loss": 3.5834305784644425, "tokens_seen": 1214784512 }, { "epoch": 3.06, "learning_rate": 0.0003191675025075226, "loss": 2.9028, "theoretical_loss": 3.58341283703391, "tokens_seen": 1214850048 }, { "epoch": 3.06, "learning_rate": 0.00031915747241725173, "loss": 2.8325, "theoretical_loss": 3.58339509682839, "tokens_seen": 1214915584 }, { "epoch": 3.06, "learning_rate": 0.00031914744232698096, "loss": 2.9841, "theoretical_loss": 3.5833773578477333, "tokens_seen": 1214981120 }, { "epoch": 3.06, "learning_rate": 0.0003191374122367101, "loss": 2.9743, "theoretical_loss": 3.583359620091789, "tokens_seen": 1215046656 }, { "epoch": 3.06, "learning_rate": 0.0003191273821464393, "loss": 2.7009, "theoretical_loss": 3.583341883560406, "tokens_seen": 1215112192 }, { "epoch": 3.06, "learning_rate": 0.0003191173520561685, "loss": 3.0391, "theoretical_loss": 3.5833241482534346, "tokens_seen": 1215177728 }, { "epoch": 3.06, "learning_rate": 0.0003191073219658977, "loss": 2.9727, "theoretical_loss": 3.583306414170723, "tokens_seen": 1215243264 }, { "epoch": 3.06, "learning_rate": 0.00031909729187562687, "loss": 2.7728, "theoretical_loss": 3.583288681312122, "tokens_seen": 1215308800 }, { "epoch": 3.06, "learning_rate": 0.00031908726178535605, "loss": 2.7804, "theoretical_loss": 3.58327094967748, "tokens_seen": 1215374336 }, { "epoch": 3.06, "learning_rate": 0.00031907723169508523, "loss": 2.8948, "theoretical_loss": 3.5832532192666466, "tokens_seen": 1215439872 }, { "epoch": 3.06, "learning_rate": 0.00031906720160481447, "loss": 2.8549, "theoretical_loss": 3.5832354900794723, "tokens_seen": 1215505408 }, { "epoch": 3.06, "learning_rate": 0.00031905717151454365, "loss": 2.9903, "theoretical_loss": 3.583217762115806, "tokens_seen": 1215570944 }, { "epoch": 3.06, "learning_rate": 0.00031904714142427283, "loss": 2.925, "theoretical_loss": 3.583200035375497, "tokens_seen": 1215636480 }, { "epoch": 3.06, "objective/train/docs_used": 1947233, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6473023891448975, "objective/train/theoretical_loss": 3.583195603881552, "objective/train/tokens_used": 1236112864, "theoretical_loss": 3.583195603881552, "tokens_seen": 1215652864 }, { "epoch": 3.06, "learning_rate": 0.00031903711133400206, "loss": 2.8788, "theoretical_loss": 3.5831823098583957, "tokens_seen": 1215702016 }, { "epoch": 3.06, "learning_rate": 0.0003190270812437312, "loss": 2.8491, "theoretical_loss": 3.583164585564351, "tokens_seen": 1215767552 }, { "epoch": 3.06, "learning_rate": 0.0003190170511534604, "loss": 2.8728, "theoretical_loss": 3.583146862493213, "tokens_seen": 1215833088 }, { "epoch": 3.06, "learning_rate": 0.00031900702106318955, "loss": 2.7667, "theoretical_loss": 3.5831291406448322, "tokens_seen": 1215898624 }, { "epoch": 3.06, "learning_rate": 0.0003189969909729188, "loss": 2.9473, "theoretical_loss": 3.5831114200190566, "tokens_seen": 1215964160 }, { "epoch": 3.06, "learning_rate": 0.00031898696088264797, "loss": 2.8492, "theoretical_loss": 3.5830937006157373, "tokens_seen": 1216029696 }, { "epoch": 3.06, "learning_rate": 0.00031897693079237715, "loss": 2.9344, "theoretical_loss": 3.583075982434724, "tokens_seen": 1216095232 }, { "epoch": 3.06, "learning_rate": 0.00031896690070210633, "loss": 2.8893, "theoretical_loss": 3.5830582654758656, "tokens_seen": 1216160768 }, { "epoch": 3.06, "learning_rate": 0.0003189568706118355, "loss": 2.7298, "theoretical_loss": 3.583040549739013, "tokens_seen": 1216226304 }, { "epoch": 3.06, "learning_rate": 0.0003189468405215647, "loss": 2.741, "theoretical_loss": 3.583022835224015, "tokens_seen": 1216291840 }, { "epoch": 3.06, "learning_rate": 0.00031893681043129393, "loss": 3.0042, "theoretical_loss": 3.583005121930723, "tokens_seen": 1216357376 }, { "epoch": 3.06, "learning_rate": 0.00031892678034102306, "loss": 2.8253, "theoretical_loss": 3.5829874098589856, "tokens_seen": 1216422912 }, { "epoch": 3.06, "learning_rate": 0.0003189167502507523, "loss": 2.7347, "theoretical_loss": 3.582969699008653, "tokens_seen": 1216488448 }, { "epoch": 3.06, "learning_rate": 0.0003189067201604814, "loss": 2.915, "theoretical_loss": 3.5829519893795756, "tokens_seen": 1216553984 }, { "epoch": 3.06, "learning_rate": 0.00031889669007021065, "loss": 3.0375, "theoretical_loss": 3.582934280971603, "tokens_seen": 1216619520 }, { "epoch": 3.06, "learning_rate": 0.00031888665997993984, "loss": 2.9369, "theoretical_loss": 3.582916573784586, "tokens_seen": 1216685056 }, { "epoch": 3.06, "learning_rate": 0.000318876629889669, "loss": 2.8818, "theoretical_loss": 3.5828988678183737, "tokens_seen": 1216750592 }, { "epoch": 3.06, "learning_rate": 0.0003188665997993982, "loss": 2.9297, "theoretical_loss": 3.5828811630728166, "tokens_seen": 1216816128 }, { "epoch": 3.06, "learning_rate": 0.00031885656970912743, "loss": 2.7341, "theoretical_loss": 3.582863459547765, "tokens_seen": 1216881664 }, { "epoch": 3.06, "learning_rate": 0.00031884653961885656, "loss": 2.7759, "theoretical_loss": 3.582845757243068, "tokens_seen": 1216947200 }, { "epoch": 3.06, "learning_rate": 0.0003188365095285858, "loss": 2.9023, "theoretical_loss": 3.5828280561585775, "tokens_seen": 1217012736 }, { "epoch": 3.06, "learning_rate": 0.0003188264794383149, "loss": 2.8332, "theoretical_loss": 3.582810356294143, "tokens_seen": 1217078272 }, { "epoch": 3.06, "learning_rate": 0.00031881644934804416, "loss": 2.9109, "theoretical_loss": 3.5827926576496134, "tokens_seen": 1217143808 }, { "epoch": 3.06, "learning_rate": 0.00031880641925777334, "loss": 2.9682, "theoretical_loss": 3.5827749602248407, "tokens_seen": 1217209344 }, { "epoch": 3.06, "learning_rate": 0.0003187963891675025, "loss": 2.8569, "theoretical_loss": 3.5827572640196745, "tokens_seen": 1217274880 }, { "epoch": 3.06, "objective/train/docs_used": 1949834, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8495869636535645, "objective/train/theoretical_loss": 3.582752840158929, "objective/train/tokens_used": 1237751264, "theoretical_loss": 3.582752840158929, "tokens_seen": 1217291264 }, { "epoch": 3.06, "learning_rate": 0.0003187863590772317, "loss": 2.8502, "theoretical_loss": 3.582739569033965, "tokens_seen": 1217340416 }, { "epoch": 3.06, "learning_rate": 0.0003187763289869609, "loss": 2.6703, "theoretical_loss": 3.5827218752675627, "tokens_seen": 1217405952 }, { "epoch": 3.06, "learning_rate": 0.00031876629889669006, "loss": 2.8458, "theoretical_loss": 3.5827041827203177, "tokens_seen": 1217471488 }, { "epoch": 3.06, "learning_rate": 0.0003187562688064193, "loss": 2.7052, "theoretical_loss": 3.5826864913920806, "tokens_seen": 1217537024 }, { "epoch": 3.06, "learning_rate": 0.0003187462387161484, "loss": 2.9079, "theoretical_loss": 3.582668801282702, "tokens_seen": 1217602560 }, { "epoch": 3.06, "learning_rate": 0.00031873620862587766, "loss": 2.8308, "theoretical_loss": 3.582651112392032, "tokens_seen": 1217668096 }, { "epoch": 3.06, "learning_rate": 0.0003187261785356068, "loss": 2.8578, "theoretical_loss": 3.582633424719921, "tokens_seen": 1217733632 }, { "epoch": 3.06, "learning_rate": 0.000318716148445336, "loss": 2.7259, "theoretical_loss": 3.5826157382662194, "tokens_seen": 1217799168 }, { "epoch": 3.06, "learning_rate": 0.0003187061183550652, "loss": 2.7024, "theoretical_loss": 3.582598053030778, "tokens_seen": 1217864704 }, { "epoch": 3.06, "learning_rate": 0.0003186960882647944, "loss": 2.7634, "theoretical_loss": 3.582580369013448, "tokens_seen": 1217930240 }, { "epoch": 3.06, "learning_rate": 0.00031868605817452357, "loss": 2.9249, "theoretical_loss": 3.5825626862140787, "tokens_seen": 1217995776 }, { "epoch": 3.06, "learning_rate": 0.0003186760280842528, "loss": 2.8629, "theoretical_loss": 3.582545004632521, "tokens_seen": 1218061312 }, { "epoch": 3.06, "learning_rate": 0.00031866599799398193, "loss": 2.6375, "theoretical_loss": 3.582527324268626, "tokens_seen": 1218126848 }, { "epoch": 3.06, "learning_rate": 0.00031865596790371116, "loss": 2.9091, "theoretical_loss": 3.582509645122244, "tokens_seen": 1218192384 }, { "epoch": 3.06, "learning_rate": 0.0003186459378134403, "loss": 2.8975, "theoretical_loss": 3.5824919671932256, "tokens_seen": 1218257920 }, { "epoch": 3.06, "learning_rate": 0.0003186359077231695, "loss": 2.8703, "theoretical_loss": 3.5824742904814215, "tokens_seen": 1218323456 }, { "epoch": 3.06, "learning_rate": 0.0003186258776328987, "loss": 2.757, "theoretical_loss": 3.582456614986683, "tokens_seen": 1218388992 }, { "epoch": 3.06, "learning_rate": 0.0003186158475426279, "loss": 2.8147, "theoretical_loss": 3.58243894070886, "tokens_seen": 1218454528 }, { "epoch": 3.06, "learning_rate": 0.00031860581745235707, "loss": 2.853, "theoretical_loss": 3.582421267647804, "tokens_seen": 1218520064 }, { "epoch": 3.06, "learning_rate": 0.00031859578736208625, "loss": 2.8175, "theoretical_loss": 3.5824035958033655, "tokens_seen": 1218585600 }, { "epoch": 3.06, "learning_rate": 0.00031858575727181543, "loss": 2.9833, "theoretical_loss": 3.5823859251753953, "tokens_seen": 1218651136 }, { "epoch": 3.06, "learning_rate": 0.00031857572718154467, "loss": 2.8774, "theoretical_loss": 3.5823682557637437, "tokens_seen": 1218716672 }, { "epoch": 3.06, "learning_rate": 0.0003185656970912738, "loss": 2.6859, "theoretical_loss": 3.5823505875682624, "tokens_seen": 1218782208 }, { "epoch": 3.06, "learning_rate": 0.00031855566700100303, "loss": 2.8762, "theoretical_loss": 3.582332920588802, "tokens_seen": 1218847744 }, { "epoch": 3.06, "learning_rate": 0.0003185456369107322, "loss": 2.8825, "theoretical_loss": 3.5823152548252137, "tokens_seen": 1218913280 }, { "epoch": 3.06, "objective/train/docs_used": 1952726, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6885409355163574, "objective/train/theoretical_loss": 3.582310838574279, "objective/train/tokens_used": 1239389664, "theoretical_loss": 3.582310838574279, "tokens_seen": 1218929664 }, { "epoch": 3.06, "learning_rate": 0.0003185356068204614, "loss": 2.8559, "theoretical_loss": 3.582297590277348, "tokens_seen": 1218978816 }, { "epoch": 3.06, "learning_rate": 0.00031852557673019057, "loss": 2.7829, "theoretical_loss": 3.582279926945056, "tokens_seen": 1219044352 }, { "epoch": 3.06, "learning_rate": 0.00031851554663991975, "loss": 2.8183, "theoretical_loss": 3.5822622648281888, "tokens_seen": 1219109888 }, { "epoch": 3.06, "learning_rate": 0.00031850551654964893, "loss": 2.8343, "theoretical_loss": 3.582244603926598, "tokens_seen": 1219175424 }, { "epoch": 3.06, "learning_rate": 0.00031849548645937817, "loss": 2.879, "theoretical_loss": 3.582226944240133, "tokens_seen": 1219240960 }, { "epoch": 3.06, "learning_rate": 0.0003184854563691073, "loss": 2.8923, "theoretical_loss": 3.5822092857686467, "tokens_seen": 1219306496 }, { "epoch": 3.06, "learning_rate": 0.00031847542627883653, "loss": 2.743, "theoretical_loss": 3.5821916285119895, "tokens_seen": 1219372032 }, { "epoch": 3.06, "learning_rate": 0.00031846539618856566, "loss": 2.8035, "theoretical_loss": 3.582173972470012, "tokens_seen": 1219437568 }, { "epoch": 3.06, "learning_rate": 0.0003184553660982949, "loss": 2.912, "theoretical_loss": 3.582156317642567, "tokens_seen": 1219503104 }, { "epoch": 3.06, "learning_rate": 0.0003184453360080241, "loss": 2.9151, "theoretical_loss": 3.5821386640295034, "tokens_seen": 1219568640 }, { "epoch": 3.06, "learning_rate": 0.00031843530591775326, "loss": 2.8518, "theoretical_loss": 3.582121011630674, "tokens_seen": 1219634176 }, { "epoch": 3.06, "learning_rate": 0.00031842527582748244, "loss": 2.8698, "theoretical_loss": 3.58210336044593, "tokens_seen": 1219699712 }, { "epoch": 3.06, "learning_rate": 0.0003184152457372116, "loss": 2.9272, "theoretical_loss": 3.5820857104751216, "tokens_seen": 1219765248 }, { "epoch": 3.06, "learning_rate": 0.0003184052156469408, "loss": 2.7484, "theoretical_loss": 3.5820680617181013, "tokens_seen": 1219830784 }, { "epoch": 3.06, "learning_rate": 0.00031839518555667004, "loss": 2.7222, "theoretical_loss": 3.58205041417472, "tokens_seen": 1219896320 }, { "epoch": 3.06, "learning_rate": 0.00031838515546639916, "loss": 2.8511, "theoretical_loss": 3.5820327678448294, "tokens_seen": 1219961856 }, { "epoch": 3.06, "learning_rate": 0.0003183751253761284, "loss": 2.9481, "theoretical_loss": 3.5820151227282797, "tokens_seen": 1220027392 }, { "epoch": 3.06, "learning_rate": 0.0003183650952858576, "loss": 2.8673, "theoretical_loss": 3.581997478824923, "tokens_seen": 1220092928 }, { "epoch": 3.06, "learning_rate": 0.00031835506519558676, "loss": 2.841, "theoretical_loss": 3.5819798361346114, "tokens_seen": 1220158464 }, { "epoch": 3.06, "learning_rate": 0.00031834503510531594, "loss": 2.9997, "theoretical_loss": 3.5819621946571956, "tokens_seen": 1220224000 }, { "epoch": 3.06, "learning_rate": 0.0003183350050150451, "loss": 2.8085, "theoretical_loss": 3.581944554392527, "tokens_seen": 1220289536 }, { "epoch": 3.06, "learning_rate": 0.0003183249749247743, "loss": 2.8985, "theoretical_loss": 3.5819269153404574, "tokens_seen": 1220355072 }, { "epoch": 3.06, "learning_rate": 0.00031831494483450354, "loss": 2.7827, "theoretical_loss": 3.5819092775008383, "tokens_seen": 1220420608 }, { "epoch": 3.06, "learning_rate": 0.0003183049147442327, "loss": 2.9012, "theoretical_loss": 3.581891640873521, "tokens_seen": 1220486144 }, { "epoch": 3.06, "learning_rate": 0.0003182948846539619, "loss": 2.6739, "theoretical_loss": 3.5818740054583578, "tokens_seen": 1220551680 }, { "epoch": 3.06, "objective/train/docs_used": 1955470, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7112009525299072, "objective/train/theoretical_loss": 3.5818695967939487, "objective/train/tokens_used": 1241028064, "theoretical_loss": 3.5818695967939487, "tokens_seen": 1220568064 }, { "epoch": 3.06, "learning_rate": 0.0003182848545636911, "loss": 2.947, "theoretical_loss": 3.581856371255199, "tokens_seen": 1220617216 }, { "epoch": 3.06, "learning_rate": 0.00031827482447342026, "loss": 2.9268, "theoretical_loss": 3.5818387382638983, "tokens_seen": 1220682752 }, { "epoch": 3.06, "learning_rate": 0.0003182647943831495, "loss": 2.686, "theoretical_loss": 3.5818211064843055, "tokens_seen": 1220748288 }, { "epoch": 3.06, "learning_rate": 0.0003182547642928786, "loss": 2.8017, "theoretical_loss": 3.5818034759162725, "tokens_seen": 1220813824 }, { "epoch": 3.06, "learning_rate": 0.00031824473420260786, "loss": 2.7709, "theoretical_loss": 3.581785846559652, "tokens_seen": 1220879360 }, { "epoch": 3.06, "learning_rate": 0.000318234704112337, "loss": 2.7224, "theoretical_loss": 3.581768218414295, "tokens_seen": 1220944896 }, { "epoch": 3.06, "learning_rate": 0.0003182246740220662, "loss": 2.9585, "theoretical_loss": 3.5817505914800534, "tokens_seen": 1221010432 }, { "epoch": 3.06, "learning_rate": 0.0003182146439317954, "loss": 2.8102, "theoretical_loss": 3.581732965756779, "tokens_seen": 1221075968 }, { "epoch": 3.06, "learning_rate": 0.0003182046138415246, "loss": 2.8315, "theoretical_loss": 3.5817153412443234, "tokens_seen": 1221141504 }, { "epoch": 3.06, "learning_rate": 0.00031819458375125377, "loss": 2.6924, "theoretical_loss": 3.5816977179425393, "tokens_seen": 1221207040 }, { "epoch": 3.06, "learning_rate": 0.000318184553660983, "loss": 2.7158, "theoretical_loss": 3.5816800958512776, "tokens_seen": 1221272576 }, { "epoch": 3.06, "learning_rate": 0.00031817452357071213, "loss": 2.9574, "theoretical_loss": 3.5816624749703907, "tokens_seen": 1221338112 }, { "epoch": 3.06, "learning_rate": 0.00031816449348044136, "loss": 2.8431, "theoretical_loss": 3.5816448552997304, "tokens_seen": 1221403648 }, { "epoch": 3.06, "learning_rate": 0.0003181544633901705, "loss": 2.9402, "theoretical_loss": 3.5816272368391484, "tokens_seen": 1221469184 }, { "epoch": 3.06, "learning_rate": 0.0003181444332998997, "loss": 2.8562, "theoretical_loss": 3.581609619588497, "tokens_seen": 1221534720 }, { "epoch": 3.06, "learning_rate": 0.0003181344032096289, "loss": 2.9124, "theoretical_loss": 3.5815920035476285, "tokens_seen": 1221600256 }, { "epoch": 3.06, "learning_rate": 0.0003181243731193581, "loss": 2.9693, "theoretical_loss": 3.581574388716394, "tokens_seen": 1221665792 }, { "epoch": 3.06, "learning_rate": 0.00031811434302908727, "loss": 2.8712, "theoretical_loss": 3.5815567750946466, "tokens_seen": 1221731328 }, { "epoch": 3.06, "learning_rate": 0.00031810431293881645, "loss": 2.8939, "theoretical_loss": 3.5815391626822377, "tokens_seen": 1221796864 }, { "epoch": 3.06, "learning_rate": 0.00031809428284854563, "loss": 2.7797, "theoretical_loss": 3.5815215514790193, "tokens_seen": 1221862400 }, { "epoch": 3.06, "learning_rate": 0.00031808425275827487, "loss": 2.8082, "theoretical_loss": 3.5815039414848444, "tokens_seen": 1221927936 }, { "epoch": 3.06, "learning_rate": 0.000318074222668004, "loss": 2.8565, "theoretical_loss": 3.5814863326995643, "tokens_seen": 1221993472 }, { "epoch": 3.06, "learning_rate": 0.00031806419257773323, "loss": 2.8524, "theoretical_loss": 3.581468725123031, "tokens_seen": 1222059008 }, { "epoch": 3.06, "learning_rate": 0.0003180541624874624, "loss": 2.9041, "theoretical_loss": 3.5814511187550977, "tokens_seen": 1222124544 }, { "epoch": 3.06, "learning_rate": 0.0003180441323971916, "loss": 2.8848, "theoretical_loss": 3.5814335135956163, "tokens_seen": 1222190080 }, { "epoch": 3.06, "objective/train/docs_used": 1956960, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9708824157714844, "objective/train/theoretical_loss": 3.581429112494549, "objective/train/tokens_used": 1242666464, "theoretical_loss": 3.581429112494549, "tokens_seen": 1222206464 }, { "epoch": 3.06, "learning_rate": 0.00031803410230692077, "loss": 2.9942, "theoretical_loss": 3.5814159096444387, "tokens_seen": 1222255616 }, { "epoch": 3.06, "learning_rate": 0.00031802407221664995, "loss": 2.917, "theoretical_loss": 3.5813983069014173, "tokens_seen": 1222321152 }, { "epoch": 3.06, "learning_rate": 0.00031801404212637913, "loss": 2.9221, "theoretical_loss": 3.5813807053664046, "tokens_seen": 1222386688 }, { "epoch": 3.06, "learning_rate": 0.00031800401203610837, "loss": 2.9117, "theoretical_loss": 3.5813631050392525, "tokens_seen": 1222452224 }, { "epoch": 3.06, "learning_rate": 0.0003179939819458375, "loss": 2.7042, "theoretical_loss": 3.581345505919814, "tokens_seen": 1222517760 }, { "epoch": 3.06, "learning_rate": 0.00031798395185556673, "loss": 2.703, "theoretical_loss": 3.5813279080079408, "tokens_seen": 1222583296 }, { "epoch": 3.06, "learning_rate": 0.00031797392176529586, "loss": 2.9329, "theoretical_loss": 3.5813103113034863, "tokens_seen": 1222648832 }, { "epoch": 3.06, "learning_rate": 0.0003179638916750251, "loss": 2.9079, "theoretical_loss": 3.5812927158063017, "tokens_seen": 1222714368 }, { "epoch": 3.06, "learning_rate": 0.0003179538615847543, "loss": 2.8908, "theoretical_loss": 3.5812751215162404, "tokens_seen": 1222779904 }, { "epoch": 3.06, "learning_rate": 0.00031794383149448346, "loss": 2.9685, "theoretical_loss": 3.581257528433155, "tokens_seen": 1222845440 }, { "epoch": 3.06, "learning_rate": 0.00031793380140421264, "loss": 2.8193, "theoretical_loss": 3.581239936556897, "tokens_seen": 1222910976 }, { "epoch": 3.06, "learning_rate": 0.0003179237713139418, "loss": 2.9292, "theoretical_loss": 3.58122234588732, "tokens_seen": 1222976512 }, { "epoch": 3.06, "learning_rate": 0.000317913741223671, "loss": 2.8524, "theoretical_loss": 3.5812047564242757, "tokens_seen": 1223042048 }, { "epoch": 3.06, "learning_rate": 0.00031790371113340024, "loss": 2.9017, "theoretical_loss": 3.5811871681676175, "tokens_seen": 1223107584 }, { "epoch": 3.06, "learning_rate": 0.00031789368104312936, "loss": 2.9633, "theoretical_loss": 3.581169581117198, "tokens_seen": 1223173120 }, { "epoch": 3.06, "learning_rate": 0.0003178836509528586, "loss": 2.866, "theoretical_loss": 3.5811519952728688, "tokens_seen": 1223238656 }, { "epoch": 3.06, "learning_rate": 0.0003178736208625878, "loss": 3.0302, "theoretical_loss": 3.581134410634484, "tokens_seen": 1223304192 }, { "epoch": 3.06, "learning_rate": 0.00031786359077231696, "loss": 2.8078, "theoretical_loss": 3.581116827201895, "tokens_seen": 1223369728 }, { "epoch": 3.06, "learning_rate": 0.00031785356068204614, "loss": 2.9645, "theoretical_loss": 3.581099244974956, "tokens_seen": 1223435264 }, { "epoch": 3.06, "learning_rate": 0.0003178435305917753, "loss": 2.8741, "theoretical_loss": 3.581081663953518, "tokens_seen": 1223500800 }, { "epoch": 3.06, "learning_rate": 0.0003178335005015045, "loss": 2.9405, "theoretical_loss": 3.5810640841374353, "tokens_seen": 1223566336 }, { "epoch": 3.06, "learning_rate": 0.00031782347041123374, "loss": 2.9568, "theoretical_loss": 3.58104650552656, "tokens_seen": 1223631872 }, { "epoch": 3.06, "learning_rate": 0.00031781344032096287, "loss": 2.7272, "theoretical_loss": 3.5810289281207446, "tokens_seen": 1223697408 }, { "epoch": 3.06, "learning_rate": 0.0003178034102306921, "loss": 2.7976, "theoretical_loss": 3.5810113519198428, "tokens_seen": 1223762944 }, { "epoch": 3.06, "learning_rate": 0.00031779338014042123, "loss": 2.9463, "theoretical_loss": 3.5809937769237066, "tokens_seen": 1223828480 }, { "epoch": 3.06, "objective/train/docs_used": 1959765, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.853987216949463, "objective/train/theoretical_loss": 3.5809893833629003, "objective/train/tokens_used": 1244304864, "theoretical_loss": 3.5809893833629003, "tokens_seen": 1223844864 }, { "epoch": 3.06, "learning_rate": 0.00031778335005015046, "loss": 2.8965, "theoretical_loss": 3.5809762031321895, "tokens_seen": 1223894016 }, { "epoch": 3.06, "learning_rate": 0.00031777331995987964, "loss": 2.7966, "theoretical_loss": 3.580958630545145, "tokens_seen": 1223959552 }, { "epoch": 3.06, "learning_rate": 0.0003177632898696088, "loss": 2.89, "theoretical_loss": 3.5809410591624244, "tokens_seen": 1224025088 }, { "epoch": 3.06, "learning_rate": 0.000317753259779338, "loss": 2.8543, "theoretical_loss": 3.580923488983882, "tokens_seen": 1224090624 }, { "epoch": 3.06, "learning_rate": 0.0003177432296890672, "loss": 2.878, "theoretical_loss": 3.5809059200093705, "tokens_seen": 1224156160 }, { "epoch": 3.06, "learning_rate": 0.00031773319959879637, "loss": 2.8112, "theoretical_loss": 3.580888352238743, "tokens_seen": 1224221696 }, { "epoch": 3.06, "learning_rate": 0.0003177231695085256, "loss": 2.9163, "theoretical_loss": 3.5808707856718525, "tokens_seen": 1224287232 }, { "epoch": 3.06, "learning_rate": 0.00031771313941825473, "loss": 2.842, "theoretical_loss": 3.580853220308552, "tokens_seen": 1224352768 }, { "epoch": 3.06, "learning_rate": 0.00031770310932798397, "loss": 2.8634, "theoretical_loss": 3.580835656148695, "tokens_seen": 1224418304 }, { "epoch": 3.06, "learning_rate": 0.00031769307923771315, "loss": 2.9166, "theoretical_loss": 3.580818093192134, "tokens_seen": 1224483840 }, { "epoch": 3.06, "learning_rate": 0.00031768304914744233, "loss": 2.829, "theoretical_loss": 3.5808005314387223, "tokens_seen": 1224549376 }, { "epoch": 3.06, "learning_rate": 0.0003176730190571715, "loss": 2.7603, "theoretical_loss": 3.5807829708883134, "tokens_seen": 1224614912 }, { "epoch": 3.06, "learning_rate": 0.0003176629889669007, "loss": 2.8999, "theoretical_loss": 3.5807654115407606, "tokens_seen": 1224680448 }, { "epoch": 3.06, "learning_rate": 0.00031765295887662987, "loss": 2.8851, "theoretical_loss": 3.580747853395917, "tokens_seen": 1224745984 }, { "epoch": 3.06, "learning_rate": 0.0003176429287863591, "loss": 2.6752, "theoretical_loss": 3.580730296453636, "tokens_seen": 1224811520 }, { "epoch": 3.06, "learning_rate": 0.00031763289869608823, "loss": 2.9888, "theoretical_loss": 3.58071274071377, "tokens_seen": 1224877056 }, { "epoch": 3.06, "learning_rate": 0.00031762286860581747, "loss": 2.9011, "theoretical_loss": 3.5806951861761736, "tokens_seen": 1224942592 }, { "epoch": 3.06, "learning_rate": 0.0003176128385155466, "loss": 2.8232, "theoretical_loss": 3.5806776328406995, "tokens_seen": 1225008128 }, { "epoch": 3.06, "learning_rate": 0.00031760280842527583, "loss": 2.8127, "theoretical_loss": 3.580660080707201, "tokens_seen": 1225073664 }, { "epoch": 3.06, "learning_rate": 0.000317592778335005, "loss": 2.8851, "theoretical_loss": 3.580642529775532, "tokens_seen": 1225139200 }, { "epoch": 3.06, "learning_rate": 0.0003175827482447342, "loss": 2.9245, "theoretical_loss": 3.5806249800455454, "tokens_seen": 1225204736 }, { "epoch": 3.06, "learning_rate": 0.0003175727181544634, "loss": 2.8352, "theoretical_loss": 3.5806074315170946, "tokens_seen": 1225270272 }, { "epoch": 3.06, "learning_rate": 0.0003175626880641926, "loss": 2.9542, "theoretical_loss": 3.580589884190034, "tokens_seen": 1225335808 }, { "epoch": 3.06, "learning_rate": 0.0003175526579739218, "loss": 2.8108, "theoretical_loss": 3.580572338064216, "tokens_seen": 1225401344 }, { "epoch": 3.06, "learning_rate": 0.00031754262788365097, "loss": 2.7916, "theoretical_loss": 3.5805547931394948, "tokens_seen": 1225466880 }, { "epoch": 3.06, "objective/train/docs_used": 1962462, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8813467025756836, "objective/train/theoretical_loss": 3.580550407095968, "objective/train/tokens_used": 1245943264, "theoretical_loss": 3.580550407095968, "tokens_seen": 1225483264 }, { "epoch": 3.06, "learning_rate": 0.00031753259779338015, "loss": 2.7545, "theoretical_loss": 3.5805372494157233, "tokens_seen": 1225532416 }, { "epoch": 3.06, "learning_rate": 0.00031752256770310933, "loss": 2.8629, "theoretical_loss": 3.5805197068927557, "tokens_seen": 1225597952 }, { "epoch": 3.06, "learning_rate": 0.00031751253761283857, "loss": 2.8637, "theoretical_loss": 3.5805021655704454, "tokens_seen": 1225663488 }, { "epoch": 3.06, "learning_rate": 0.0003175025075225677, "loss": 2.9135, "theoretical_loss": 3.580484625448646, "tokens_seen": 1225729024 }, { "epoch": 3.06, "learning_rate": 0.00031749247743229693, "loss": 2.9184, "theoretical_loss": 3.580467086527211, "tokens_seen": 1225794560 }, { "epoch": 3.06, "learning_rate": 0.00031748244734202606, "loss": 2.8279, "theoretical_loss": 3.5804495488059946, "tokens_seen": 1225860096 }, { "epoch": 3.06, "learning_rate": 0.0003174724172517553, "loss": 2.7973, "theoretical_loss": 3.58043201228485, "tokens_seen": 1225925632 }, { "epoch": 3.06, "learning_rate": 0.0003174623871614845, "loss": 2.8474, "theoretical_loss": 3.5804144769636315, "tokens_seen": 1225991168 }, { "epoch": 3.06, "learning_rate": 0.00031745235707121366, "loss": 2.9404, "theoretical_loss": 3.5803969428421922, "tokens_seen": 1226056704 }, { "epoch": 3.06, "learning_rate": 0.00031744232698094284, "loss": 2.9202, "theoretical_loss": 3.580379409920386, "tokens_seen": 1226122240 }, { "epoch": 3.06, "learning_rate": 0.000317432296890672, "loss": 2.7535, "theoretical_loss": 3.580361878198067, "tokens_seen": 1226187776 }, { "epoch": 3.06, "learning_rate": 0.0003174222668004012, "loss": 2.7563, "theoretical_loss": 3.580344347675089, "tokens_seen": 1226253312 }, { "epoch": 3.06, "learning_rate": 0.00031741223671013044, "loss": 2.996, "theoretical_loss": 3.5803268183513053, "tokens_seen": 1226318848 }, { "epoch": 3.06, "learning_rate": 0.00031740220661985956, "loss": 2.8604, "theoretical_loss": 3.5803092902265705, "tokens_seen": 1226384384 }, { "epoch": 3.06, "learning_rate": 0.0003173921765295888, "loss": 2.8115, "theoretical_loss": 3.5802917633007385, "tokens_seen": 1226449920 }, { "epoch": 3.06, "learning_rate": 0.000317382146439318, "loss": 2.876, "theoretical_loss": 3.580274237573663, "tokens_seen": 1226515456 }, { "epoch": 3.06, "learning_rate": 0.00031737211634904716, "loss": 2.7304, "theoretical_loss": 3.5802567130451974, "tokens_seen": 1226580992 }, { "epoch": 3.06, "learning_rate": 0.00031736208625877634, "loss": 2.7892, "theoretical_loss": 3.580239189715197, "tokens_seen": 1226646528 }, { "epoch": 3.06, "learning_rate": 0.0003173520561685055, "loss": 2.943, "theoretical_loss": 3.5802216675835146, "tokens_seen": 1226712064 }, { "epoch": 3.06, "learning_rate": 0.0003173420260782347, "loss": 2.8232, "theoretical_loss": 3.580204146650005, "tokens_seen": 1226777600 }, { "epoch": 3.06, "learning_rate": 0.00031733199598796394, "loss": 2.6933, "theoretical_loss": 3.580186626914522, "tokens_seen": 1226843136 }, { "epoch": 3.06, "learning_rate": 0.00031732196589769307, "loss": 2.804, "theoretical_loss": 3.5801691083769196, "tokens_seen": 1226908672 }, { "epoch": 3.06, "learning_rate": 0.0003173119358074223, "loss": 2.856, "theoretical_loss": 3.580151591037052, "tokens_seen": 1226974208 }, { "epoch": 3.06, "learning_rate": 0.00031730190571715143, "loss": 2.7746, "theoretical_loss": 3.580134074894773, "tokens_seen": 1227039744 }, { "epoch": 3.06, "learning_rate": 0.00031729187562688066, "loss": 2.9381, "theoretical_loss": 3.580116559949938, "tokens_seen": 1227105280 }, { "epoch": 3.06, "objective/train/docs_used": 1965393, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0982131958007812, "objective/train/theoretical_loss": 3.5801121814008123, "objective/train/tokens_used": 1247581664, "theoretical_loss": 3.5801121814008123, "tokens_seen": 1227121664 }, { "epoch": 3.06, "learning_rate": 0.00031728184553660984, "loss": 2.8896, "theoretical_loss": 3.5800990462024, "tokens_seen": 1227170816 }, { "epoch": 3.06, "learning_rate": 0.000317271815446339, "loss": 2.8509, "theoretical_loss": 3.580081533652013, "tokens_seen": 1227236352 }, { "epoch": 3.06, "learning_rate": 0.0003172617853560682, "loss": 2.8276, "theoretical_loss": 3.5800640222986315, "tokens_seen": 1227301888 }, { "epoch": 3.06, "learning_rate": 0.0003172517552657974, "loss": 3.0245, "theoretical_loss": 3.5800465121421112, "tokens_seen": 1227367424 }, { "epoch": 3.06, "learning_rate": 0.00031724172517552657, "loss": 2.9095, "theoretical_loss": 3.5800290031823043, "tokens_seen": 1227432960 }, { "epoch": 3.06, "learning_rate": 0.0003172316950852558, "loss": 2.8987, "theoretical_loss": 3.5800114954190665, "tokens_seen": 1227498496 }, { "epoch": 3.06, "learning_rate": 0.00031722166499498493, "loss": 2.8145, "theoretical_loss": 3.5799939888522516, "tokens_seen": 1227564032 }, { "epoch": 3.06, "learning_rate": 0.00031721163490471417, "loss": 2.8468, "theoretical_loss": 3.579976483481714, "tokens_seen": 1227629568 }, { "epoch": 3.06, "learning_rate": 0.00031720160481444335, "loss": 2.8434, "theoretical_loss": 3.5799589793073086, "tokens_seen": 1227695104 }, { "epoch": 3.06, "learning_rate": 0.00031719157472417253, "loss": 2.877, "theoretical_loss": 3.579941476328889, "tokens_seen": 1227760640 }, { "epoch": 3.06, "learning_rate": 0.0003171815446339017, "loss": 2.8823, "theoretical_loss": 3.57992397454631, "tokens_seen": 1227826176 }, { "epoch": 3.06, "learning_rate": 0.0003171715145436309, "loss": 2.8032, "theoretical_loss": 3.579906473959426, "tokens_seen": 1227891712 }, { "epoch": 3.06, "learning_rate": 0.00031716148445336007, "loss": 2.8707, "theoretical_loss": 3.5798889745680924, "tokens_seen": 1227957248 }, { "epoch": 3.06, "learning_rate": 0.0003171514543630893, "loss": 2.8465, "theoretical_loss": 3.579871476372162, "tokens_seen": 1228022784 }, { "epoch": 3.06, "learning_rate": 0.00031714142427281843, "loss": 2.9686, "theoretical_loss": 3.579853979371491, "tokens_seen": 1228088320 }, { "epoch": 3.06, "learning_rate": 0.00031713139418254767, "loss": 2.855, "theoretical_loss": 3.579836483565933, "tokens_seen": 1228153856 }, { "epoch": 3.06, "learning_rate": 0.0003171213640922768, "loss": 3.0105, "theoretical_loss": 3.579818988955343, "tokens_seen": 1228219392 }, { "epoch": 3.06, "learning_rate": 0.00031711133400200603, "loss": 2.602, "theoretical_loss": 3.5798014955395754, "tokens_seen": 1228284928 }, { "epoch": 3.06, "learning_rate": 0.0003171013039117352, "loss": 2.9792, "theoretical_loss": 3.579784003318485, "tokens_seen": 1228350464 }, { "epoch": 3.06, "learning_rate": 0.0003170912738214644, "loss": 2.9457, "theoretical_loss": 3.5797665122919264, "tokens_seen": 1228416000 }, { "epoch": 3.06, "learning_rate": 0.0003170812437311936, "loss": 2.9105, "theoretical_loss": 3.5797490224597546, "tokens_seen": 1228481536 }, { "epoch": 3.06, "learning_rate": 0.0003170712136409228, "loss": 2.7621, "theoretical_loss": 3.579731533821824, "tokens_seen": 1228547072 }, { "epoch": 3.06, "learning_rate": 0.00031706118355065194, "loss": 2.938, "theoretical_loss": 3.5797140463779895, "tokens_seen": 1228612608 }, { "epoch": 3.06, "learning_rate": 0.0003170511534603812, "loss": 2.8276, "theoretical_loss": 3.5796965601281054, "tokens_seen": 1228678144 }, { "epoch": 3.06, "learning_rate": 0.0003170411233701103, "loss": 2.9497, "theoretical_loss": 3.5796790750720273, "tokens_seen": 1228743680 }, { "epoch": 3.06, "objective/train/docs_used": 1968139, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.950263261795044, "objective/train/theoretical_loss": 3.579674703994523, "objective/train/tokens_used": 1249220064, "theoretical_loss": 3.579674703994523, "tokens_seen": 1228760064 }, { "epoch": 3.06, "learning_rate": 0.00031703109327983954, "loss": 2.8042, "theoretical_loss": 3.57966159120961, "tokens_seen": 1228809216 }, { "epoch": 3.06, "learning_rate": 0.0003170210631895687, "loss": 2.8785, "theoretical_loss": 3.579644108540707, "tokens_seen": 1228874752 }, { "epoch": 3.06, "learning_rate": 0.0003170110330992979, "loss": 2.92, "theoretical_loss": 3.579626627065175, "tokens_seen": 1228940288 }, { "epoch": 3.06, "learning_rate": 0.0003170010030090271, "loss": 2.8584, "theoretical_loss": 3.579609146782868, "tokens_seen": 1229005824 }, { "epoch": 3.06, "learning_rate": 0.00031699097291875626, "loss": 2.9389, "theoretical_loss": 3.5795916676936415, "tokens_seen": 1229071360 }, { "epoch": 3.06, "learning_rate": 0.00031698094282848544, "loss": 2.7969, "theoretical_loss": 3.579574189797349, "tokens_seen": 1229136896 }, { "epoch": 3.06, "learning_rate": 0.0003169709127382147, "loss": 2.8965, "theoretical_loss": 3.5795567130938473, "tokens_seen": 1229202432 }, { "epoch": 3.06, "learning_rate": 0.0003169608826479438, "loss": 2.8161, "theoretical_loss": 3.57953923758299, "tokens_seen": 1229267968 }, { "epoch": 3.06, "learning_rate": 0.00031695085255767304, "loss": 2.8098, "theoretical_loss": 3.579521763264633, "tokens_seen": 1229333504 }, { "epoch": 3.06, "learning_rate": 0.00031694082246740217, "loss": 2.7964, "theoretical_loss": 3.579504290138631, "tokens_seen": 1229399040 }, { "epoch": 3.06, "learning_rate": 0.0003169307923771314, "loss": 2.935, "theoretical_loss": 3.579486818204839, "tokens_seen": 1229464576 }, { "epoch": 3.06, "learning_rate": 0.0003169207622868606, "loss": 2.9249, "theoretical_loss": 3.579469347463113, "tokens_seen": 1229530112 }, { "epoch": 3.06, "learning_rate": 0.00031691073219658976, "loss": 2.9568, "theoretical_loss": 3.579451877913307, "tokens_seen": 1229595648 }, { "epoch": 3.06, "learning_rate": 0.00031690070210631894, "loss": 2.7556, "theoretical_loss": 3.5794344095552764, "tokens_seen": 1229661184 }, { "epoch": 3.06, "learning_rate": 0.0003168906720160482, "loss": 2.8433, "theoretical_loss": 3.5794169423888764, "tokens_seen": 1229726720 }, { "epoch": 3.06, "learning_rate": 0.0003168806419257773, "loss": 2.7794, "theoretical_loss": 3.579399476413963, "tokens_seen": 1229792256 }, { "epoch": 3.06, "learning_rate": 0.00031687061183550654, "loss": 2.7279, "theoretical_loss": 3.5793820116303903, "tokens_seen": 1229857792 }, { "epoch": 3.06, "learning_rate": 0.00031686058174523567, "loss": 2.8522, "theoretical_loss": 3.579364548038014, "tokens_seen": 1229923328 }, { "epoch": 3.06, "learning_rate": 0.0003168505516549649, "loss": 2.8151, "theoretical_loss": 3.57934708563669, "tokens_seen": 1229988864 }, { "epoch": 3.06, "learning_rate": 0.0003168405215646941, "loss": 2.8626, "theoretical_loss": 3.579329624426273, "tokens_seen": 1230054400 }, { "epoch": 3.06, "learning_rate": 0.00031683049147442327, "loss": 2.8384, "theoretical_loss": 3.579312164406618, "tokens_seen": 1230119936 }, { "epoch": 3.06, "learning_rate": 0.00031682046138415245, "loss": 2.8063, "theoretical_loss": 3.5792947055775812, "tokens_seen": 1230185472 }, { "epoch": 3.06, "learning_rate": 0.00031681043129388163, "loss": 2.7796, "theoretical_loss": 3.579277247939017, "tokens_seen": 1230251008 }, { "epoch": 3.06, "learning_rate": 0.00031680040120361086, "loss": 2.9153, "theoretical_loss": 3.5792597914907818, "tokens_seen": 1230316544 }, { "epoch": 3.06, "learning_rate": 0.00031679037111334004, "loss": 2.8296, "theoretical_loss": 3.5792423362327304, "tokens_seen": 1230382080 }, { "epoch": 3.06, "objective/train/docs_used": 1971045, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6510016918182373, "objective/train/theoretical_loss": 3.579237972604167, "objective/train/tokens_used": 1250858464, "theoretical_loss": 3.579237972604167, "tokens_seen": 1230398464 }, { "epoch": 3.06, "learning_rate": 0.0003167803410230692, "loss": 2.7958, "theoretical_loss": 3.579224882164719, "tokens_seen": 1230447616 }, { "epoch": 3.06, "learning_rate": 0.0003167703109327984, "loss": 2.8945, "theoretical_loss": 3.579207429286602, "tokens_seen": 1230513152 }, { "epoch": 3.06, "learning_rate": 0.0003167602808425276, "loss": 3.0516, "theoretical_loss": 3.5791899775982357, "tokens_seen": 1230578688 }, { "epoch": 3.06, "learning_rate": 0.00031675025075225677, "loss": 2.9118, "theoretical_loss": 3.579172527099476, "tokens_seen": 1230644224 }, { "epoch": 3.06, "learning_rate": 0.000316740220661986, "loss": 2.8778, "theoretical_loss": 3.579155077790177, "tokens_seen": 1230709760 }, { "epoch": 3.06, "learning_rate": 0.00031673019057171513, "loss": 2.8266, "theoretical_loss": 3.5791376296701953, "tokens_seen": 1230775296 }, { "epoch": 3.06, "learning_rate": 0.00031672016048144437, "loss": 2.9415, "theoretical_loss": 3.579120182739387, "tokens_seen": 1230840832 }, { "epoch": 3.06, "learning_rate": 0.00031671013039117355, "loss": 2.9804, "theoretical_loss": 3.5791027369976067, "tokens_seen": 1230906368 }, { "epoch": 3.06, "learning_rate": 0.00031670010030090273, "loss": 2.7334, "theoretical_loss": 3.5790852924447103, "tokens_seen": 1230971904 }, { "epoch": 3.06, "learning_rate": 0.0003166900702106319, "loss": 2.7705, "theoretical_loss": 3.5790678490805545, "tokens_seen": 1231037440 }, { "epoch": 3.06, "learning_rate": 0.0003166800401203611, "loss": 2.6072, "theoretical_loss": 3.5790504069049938, "tokens_seen": 1231102976 }, { "epoch": 3.06, "learning_rate": 0.00031667001003009027, "loss": 2.9098, "theoretical_loss": 3.579032965917884, "tokens_seen": 1231168512 }, { "epoch": 3.06, "learning_rate": 0.0003166599799398195, "loss": 2.9328, "theoretical_loss": 3.5790155261190817, "tokens_seen": 1231234048 }, { "epoch": 3.06, "learning_rate": 0.00031664994984954863, "loss": 2.6992, "theoretical_loss": 3.5789980875084417, "tokens_seen": 1231299584 }, { "epoch": 3.06, "learning_rate": 0.00031663991975927787, "loss": 2.9453, "theoretical_loss": 3.5789806500858212, "tokens_seen": 1231365120 }, { "epoch": 3.06, "learning_rate": 0.000316629889669007, "loss": 2.8615, "theoretical_loss": 3.5789632138510745, "tokens_seen": 1231430656 }, { "epoch": 3.06, "learning_rate": 0.00031661985957873623, "loss": 2.9191, "theoretical_loss": 3.578945778804058, "tokens_seen": 1231496192 }, { "epoch": 3.06, "learning_rate": 0.0003166098294884654, "loss": 2.7748, "theoretical_loss": 3.5789283449446283, "tokens_seen": 1231561728 }, { "epoch": 3.06, "learning_rate": 0.0003165997993981946, "loss": 2.9085, "theoretical_loss": 3.5789109122726406, "tokens_seen": 1231627264 }, { "epoch": 3.06, "learning_rate": 0.0003165897693079238, "loss": 2.7978, "theoretical_loss": 3.578893480787951, "tokens_seen": 1231692800 }, { "epoch": 3.06, "learning_rate": 0.000316579739217653, "loss": 2.8625, "theoretical_loss": 3.5788760504904156, "tokens_seen": 1231758336 }, { "epoch": 3.06, "learning_rate": 0.00031656970912738214, "loss": 2.9817, "theoretical_loss": 3.57885862137989, "tokens_seen": 1231823872 }, { "epoch": 3.06, "learning_rate": 0.0003165596790371114, "loss": 2.8094, "theoretical_loss": 3.5788411934562303, "tokens_seen": 1231889408 }, { "epoch": 3.06, "learning_rate": 0.0003165496489468405, "loss": 2.7854, "theoretical_loss": 3.5788237667192933, "tokens_seen": 1231954944 }, { "epoch": 3.06, "learning_rate": 0.00031653961885656974, "loss": 2.8997, "theoretical_loss": 3.578806341168934, "tokens_seen": 1232020480 }, { "epoch": 3.06, "objective/train/docs_used": 1973510, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.633111000061035, "objective/train/theoretical_loss": 3.5788019849667303, "objective/train/tokens_used": 1252496864, "theoretical_loss": 3.5788019849667303, "tokens_seen": 1232036864 }, { "epoch": 3.06, "learning_rate": 0.0003165295887662989, "loss": 2.8538, "theoretical_loss": 3.578788916805009, "tokens_seen": 1232086016 }, { "epoch": 3.06, "learning_rate": 0.0003165195586760281, "loss": 2.9178, "theoretical_loss": 3.578771493627375, "tokens_seen": 1232151552 }, { "epoch": 3.06, "learning_rate": 0.0003165095285857573, "loss": 2.9328, "theoretical_loss": 3.5787540716358874, "tokens_seen": 1232217088 }, { "epoch": 3.06, "learning_rate": 0.00031649949849548646, "loss": 2.7474, "theoretical_loss": 3.578736650830402, "tokens_seen": 1232282624 }, { "epoch": 3.06, "learning_rate": 0.00031648946840521564, "loss": 2.937, "theoretical_loss": 3.5787192312107763, "tokens_seen": 1232348160 }, { "epoch": 3.06, "learning_rate": 0.0003164794383149449, "loss": 2.7979, "theoretical_loss": 3.578701812776865, "tokens_seen": 1232413696 }, { "epoch": 3.06, "learning_rate": 0.000316469408224674, "loss": 2.9193, "theoretical_loss": 3.5786843955285255, "tokens_seen": 1232479232 }, { "epoch": 3.06, "learning_rate": 0.00031645937813440324, "loss": 2.8668, "theoretical_loss": 3.5786669794656136, "tokens_seen": 1232544768 }, { "epoch": 3.06, "learning_rate": 0.00031644934804413237, "loss": 2.7144, "theoretical_loss": 3.578649564587985, "tokens_seen": 1232610304 }, { "epoch": 3.06, "learning_rate": 0.0003164393179538616, "loss": 2.8372, "theoretical_loss": 3.5786321508954977, "tokens_seen": 1232675840 }, { "epoch": 3.06, "learning_rate": 0.0003164292878635908, "loss": 2.6874, "theoretical_loss": 3.5786147383880067, "tokens_seen": 1232741376 }, { "epoch": 3.06, "learning_rate": 0.00031641925777331996, "loss": 2.8464, "theoretical_loss": 3.578597327065368, "tokens_seen": 1232806912 }, { "epoch": 3.06, "learning_rate": 0.00031640922768304914, "loss": 2.9638, "theoretical_loss": 3.5785799169274393, "tokens_seen": 1232872448 }, { "epoch": 3.06, "learning_rate": 0.0003163991975927784, "loss": 2.7808, "theoretical_loss": 3.578562507974077, "tokens_seen": 1232937984 }, { "epoch": 3.06, "learning_rate": 0.0003163891675025075, "loss": 2.6359, "theoretical_loss": 3.578545100205136, "tokens_seen": 1233003520 }, { "epoch": 3.06, "learning_rate": 0.00031637913741223674, "loss": 2.8381, "theoretical_loss": 3.5785276936204737, "tokens_seen": 1233069056 }, { "epoch": 3.06, "learning_rate": 0.00031636910732196587, "loss": 3.0087, "theoretical_loss": 3.578510288219947, "tokens_seen": 1233134592 }, { "epoch": 3.06, "learning_rate": 0.0003163590772316951, "loss": 2.9467, "theoretical_loss": 3.5784928840034116, "tokens_seen": 1233200128 }, { "epoch": 3.06, "learning_rate": 0.0003163490471414243, "loss": 2.9307, "theoretical_loss": 3.578475480970725, "tokens_seen": 1233265664 }, { "epoch": 3.06, "learning_rate": 0.00031633901705115347, "loss": 2.8047, "theoretical_loss": 3.5784580791217433, "tokens_seen": 1233331200 }, { "epoch": 3.06, "learning_rate": 0.00031632898696088265, "loss": 2.7793, "theoretical_loss": 3.5784406784563227, "tokens_seen": 1233396736 }, { "epoch": 3.06, "learning_rate": 0.00031631895687061183, "loss": 2.8513, "theoretical_loss": 3.57842327897432, "tokens_seen": 1233462272 }, { "epoch": 3.06, "learning_rate": 0.000316308926780341, "loss": 2.7008, "theoretical_loss": 3.5784058806755925, "tokens_seen": 1233527808 }, { "epoch": 3.06, "learning_rate": 0.00031629889669007024, "loss": 2.8127, "theoretical_loss": 3.578388483559996, "tokens_seen": 1233593344 }, { "epoch": 3.06, "learning_rate": 0.00031628886659979937, "loss": 2.9065, "theoretical_loss": 3.5783710876273878, "tokens_seen": 1233658880 }, { "epoch": 3.06, "objective/train/docs_used": 1974912, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.2034895420074463, "objective/train/theoretical_loss": 3.578366738829061, "objective/train/tokens_used": 1254135264, "theoretical_loss": 3.578366738829061, "tokens_seen": 1233675264 }, { "epoch": 3.06, "learning_rate": 0.0003162788365095286, "loss": 3.0663, "theoretical_loss": 3.5783536928776245, "tokens_seen": 1233724416 }, { "epoch": 3.06, "learning_rate": 0.00031626880641925773, "loss": 2.7121, "theoretical_loss": 3.5783362993105623, "tokens_seen": 1233789952 }, { "epoch": 3.06, "learning_rate": 0.00031625877632898697, "loss": 2.9428, "theoretical_loss": 3.578318906926059, "tokens_seen": 1233855488 }, { "epoch": 3.06, "learning_rate": 0.00031624874623871615, "loss": 2.7236, "theoretical_loss": 3.57830151572397, "tokens_seen": 1233921024 }, { "epoch": 3.06, "learning_rate": 0.00031623871614844533, "loss": 2.9115, "theoretical_loss": 3.5782841257041538, "tokens_seen": 1233986560 }, { "epoch": 3.06, "learning_rate": 0.0003162286860581745, "loss": 2.7548, "theoretical_loss": 3.578266736866466, "tokens_seen": 1234052096 }, { "epoch": 3.06, "learning_rate": 0.00031621865596790375, "loss": 2.9511, "theoretical_loss": 3.578249349210764, "tokens_seen": 1234117632 }, { "epoch": 3.06, "learning_rate": 0.0003162086258776329, "loss": 2.7303, "theoretical_loss": 3.578231962736904, "tokens_seen": 1234183168 }, { "epoch": 3.06, "learning_rate": 0.0003161985957873621, "loss": 2.8022, "theoretical_loss": 3.5782145774447436, "tokens_seen": 1234248704 }, { "epoch": 3.06, "learning_rate": 0.00031618856569709124, "loss": 2.8688, "theoretical_loss": 3.57819719333414, "tokens_seen": 1234314240 }, { "epoch": 3.06, "learning_rate": 0.00031617853560682047, "loss": 2.7294, "theoretical_loss": 3.5781798104049494, "tokens_seen": 1234379776 }, { "epoch": 3.06, "learning_rate": 0.00031616850551654965, "loss": 2.7527, "theoretical_loss": 3.5781624286570297, "tokens_seen": 1234445312 }, { "epoch": 3.06, "learning_rate": 0.00031615847542627883, "loss": 2.8331, "theoretical_loss": 3.578145048090237, "tokens_seen": 1234510848 }, { "epoch": 3.06, "learning_rate": 0.000316148445336008, "loss": 2.728, "theoretical_loss": 3.578127668704428, "tokens_seen": 1234576384 }, { "epoch": 3.06, "learning_rate": 0.0003161384152457372, "loss": 2.8534, "theoretical_loss": 3.5781102904994615, "tokens_seen": 1234641920 }, { "epoch": 3.06, "learning_rate": 0.0003161283851554664, "loss": 2.8578, "theoretical_loss": 3.5780929134751935, "tokens_seen": 1234707456 }, { "epoch": 3.06, "learning_rate": 0.0003161183550651956, "loss": 2.8101, "theoretical_loss": 3.578075537631481, "tokens_seen": 1234772992 }, { "epoch": 3.06, "learning_rate": 0.00031610832497492474, "loss": 2.7729, "theoretical_loss": 3.578058162968181, "tokens_seen": 1234838528 }, { "epoch": 3.06, "learning_rate": 0.000316098294884654, "loss": 2.8004, "theoretical_loss": 3.5780407894851516, "tokens_seen": 1234904064 }, { "epoch": 3.06, "learning_rate": 0.0003160882647943831, "loss": 2.7124, "theoretical_loss": 3.578023417182249, "tokens_seen": 1234969600 }, { "epoch": 3.06, "learning_rate": 0.00031607823470411234, "loss": 2.6899, "theoretical_loss": 3.578006046059331, "tokens_seen": 1235035136 }, { "epoch": 3.06, "learning_rate": 0.0003160682046138415, "loss": 2.8979, "theoretical_loss": 3.577988676116255, "tokens_seen": 1235100672 }, { "epoch": 3.06, "learning_rate": 0.0003160581745235707, "loss": 2.6963, "theoretical_loss": 3.5779713073528776, "tokens_seen": 1235166208 }, { "epoch": 3.06, "learning_rate": 0.00031604814443329994, "loss": 3.0189, "theoretical_loss": 3.5779539397690563, "tokens_seen": 1235231744 }, { "epoch": 3.06, "learning_rate": 0.0003160381143430291, "loss": 2.9403, "theoretical_loss": 3.577936573364649, "tokens_seen": 1235297280 }, { "epoch": 3.06, "objective/train/docs_used": 1977647, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5737202167510986, "objective/train/theoretical_loss": 3.5779322319478135, "objective/train/tokens_used": 1255773664, "theoretical_loss": 3.5779322319478135, "tokens_seen": 1235313664 }, { "epoch": 3.06, "learning_rate": 0.0003160280842527583, "loss": 2.7503, "theoretical_loss": 3.577919208139512, "tokens_seen": 1235362816 }, { "epoch": 3.06, "learning_rate": 0.0003160180541624875, "loss": 2.823, "theoretical_loss": 3.5779018440935033, "tokens_seen": 1235428352 }, { "epoch": 3.06, "learning_rate": 0.00031600802407221666, "loss": 2.8912, "theoretical_loss": 3.577884481226481, "tokens_seen": 1235493888 }, { "epoch": 3.06, "learning_rate": 0.00031599799398194584, "loss": 2.8182, "theoretical_loss": 3.5778671195383014, "tokens_seen": 1235559424 }, { "epoch": 3.06, "learning_rate": 0.0003159879638916751, "loss": 2.6693, "theoretical_loss": 3.5778497590288216, "tokens_seen": 1235624960 }, { "epoch": 3.06, "learning_rate": 0.0003159779338014042, "loss": 2.8311, "theoretical_loss": 3.5778323996979005, "tokens_seen": 1235690496 }, { "epoch": 3.06, "learning_rate": 0.00031596790371113344, "loss": 2.8377, "theoretical_loss": 3.577815041545395, "tokens_seen": 1235756032 }, { "epoch": 3.06, "learning_rate": 0.00031595787362086257, "loss": 2.9571, "theoretical_loss": 3.577797684571162, "tokens_seen": 1235821568 }, { "epoch": 3.06, "learning_rate": 0.0003159478435305918, "loss": 3.0239, "theoretical_loss": 3.57778032877506, "tokens_seen": 1235887104 }, { "epoch": 3.06, "learning_rate": 0.000315937813440321, "loss": 2.7359, "theoretical_loss": 3.5777629741569457, "tokens_seen": 1235952640 }, { "epoch": 3.06, "learning_rate": 0.00031592778335005016, "loss": 2.9028, "theoretical_loss": 3.5777456207166773, "tokens_seen": 1236018176 }, { "epoch": 3.06, "learning_rate": 0.00031591775325977934, "loss": 2.906, "theoretical_loss": 3.577728268454112, "tokens_seen": 1236083712 }, { "epoch": 3.06, "learning_rate": 0.0003159077231695086, "loss": 2.7657, "theoretical_loss": 3.577710917369108, "tokens_seen": 1236149248 }, { "epoch": 3.06, "learning_rate": 0.0003158976930792377, "loss": 2.8702, "theoretical_loss": 3.5776935674615222, "tokens_seen": 1236214784 }, { "epoch": 3.06, "learning_rate": 0.00031588766298896694, "loss": 2.6374, "theoretical_loss": 3.577676218731213, "tokens_seen": 1236280320 }, { "epoch": 3.06, "learning_rate": 0.00031587763289869607, "loss": 2.7712, "theoretical_loss": 3.5776588711780377, "tokens_seen": 1236345856 }, { "epoch": 3.06, "learning_rate": 0.0003158676028084253, "loss": 2.8402, "theoretical_loss": 3.5776415248018543, "tokens_seen": 1236411392 }, { "epoch": 3.06, "learning_rate": 0.0003158575727181545, "loss": 2.8077, "theoretical_loss": 3.57762417960252, "tokens_seen": 1236476928 }, { "epoch": 3.06, "learning_rate": 0.00031584754262788367, "loss": 2.8393, "theoretical_loss": 3.5776068355798936, "tokens_seen": 1236542464 }, { "epoch": 3.06, "learning_rate": 0.00031583751253761285, "loss": 2.8415, "theoretical_loss": 3.5775894927338316, "tokens_seen": 1236608000 }, { "epoch": 3.06, "learning_rate": 0.00031582748244734203, "loss": 2.8796, "theoretical_loss": 3.577572151064193, "tokens_seen": 1236673536 }, { "epoch": 3.06, "learning_rate": 0.0003158174523570712, "loss": 2.8198, "theoretical_loss": 3.577554810570835, "tokens_seen": 1236739072 }, { "epoch": 3.06, "learning_rate": 0.00031580742226680044, "loss": 2.8687, "theoretical_loss": 3.577537471253616, "tokens_seen": 1236804608 }, { "epoch": 3.06, "learning_rate": 0.00031579739217652957, "loss": 2.7652, "theoretical_loss": 3.577520133112393, "tokens_seen": 1236870144 }, { "epoch": 3.06, "learning_rate": 0.0003157873620862588, "loss": 2.8591, "theoretical_loss": 3.577502796147025, "tokens_seen": 1236935680 }, { "epoch": 3.06, "objective/train/docs_used": 1980617, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.883366346359253, "objective/train/theoretical_loss": 3.5774984620893937, "objective/train/tokens_used": 1257412064, "theoretical_loss": 3.5774984620893937, "tokens_seen": 1236952064 }, { "epoch": 3.06, "learning_rate": 0.00031577733199598793, "loss": 2.727, "theoretical_loss": 3.5774854603573694, "tokens_seen": 1237001216 }, { "epoch": 3.06, "learning_rate": 0.00031576730190571717, "loss": 2.9409, "theoretical_loss": 3.5774681257432848, "tokens_seen": 1237066752 }, { "epoch": 3.06, "learning_rate": 0.00031575727181544635, "loss": 2.8513, "theoretical_loss": 3.577450792304628, "tokens_seen": 1237132288 }, { "epoch": 3.06, "learning_rate": 0.00031574724172517553, "loss": 3.066, "theoretical_loss": 3.577433460041258, "tokens_seen": 1237197824 }, { "epoch": 3.06, "learning_rate": 0.0003157372116349047, "loss": 2.8754, "theoretical_loss": 3.5774161289530326, "tokens_seen": 1237263360 }, { "epoch": 3.06, "learning_rate": 0.00031572718154463395, "loss": 2.7729, "theoretical_loss": 3.57739879903981, "tokens_seen": 1237328896 }, { "epoch": 3.06, "learning_rate": 0.0003157171514543631, "loss": 2.9309, "theoretical_loss": 3.577381470301448, "tokens_seen": 1237394432 }, { "epoch": 3.06, "learning_rate": 0.0003157071213640923, "loss": 3.0337, "theoretical_loss": 3.577364142737805, "tokens_seen": 1237459968 }, { "epoch": 3.06, "learning_rate": 0.00031569709127382144, "loss": 2.5707, "theoretical_loss": 3.577346816348739, "tokens_seen": 1237525504 }, { "epoch": 3.06, "learning_rate": 0.00031568706118355067, "loss": 2.887, "theoretical_loss": 3.5773294911341083, "tokens_seen": 1237591040 }, { "epoch": 3.06, "learning_rate": 0.00031567703109327985, "loss": 2.7722, "theoretical_loss": 3.5773121670937713, "tokens_seen": 1237656576 }, { "epoch": 3.06, "learning_rate": 0.00031566700100300903, "loss": 2.985, "theoretical_loss": 3.577294844227586, "tokens_seen": 1237722112 }, { "epoch": 3.06, "learning_rate": 0.0003156569709127382, "loss": 2.9072, "theoretical_loss": 3.57727752253541, "tokens_seen": 1237787648 }, { "epoch": 3.06, "learning_rate": 0.0003156469408224674, "loss": 2.8135, "theoretical_loss": 3.5772602020171034, "tokens_seen": 1237853184 }, { "epoch": 3.06, "learning_rate": 0.0003156369107321966, "loss": 2.875, "theoretical_loss": 3.5772428826725227, "tokens_seen": 1237918720 }, { "epoch": 3.06, "learning_rate": 0.0003156268806419258, "loss": 2.8896, "theoretical_loss": 3.5772255645015267, "tokens_seen": 1237984256 }, { "epoch": 3.06, "learning_rate": 0.00031561685055165494, "loss": 2.8637, "theoretical_loss": 3.5772082475039744, "tokens_seen": 1238049792 }, { "epoch": 3.06, "learning_rate": 0.0003156068204613842, "loss": 2.9069, "theoretical_loss": 3.5771909316797235, "tokens_seen": 1238115328 }, { "epoch": 3.06, "learning_rate": 0.0003155967903711133, "loss": 2.7983, "theoretical_loss": 3.5771736170286323, "tokens_seen": 1238180864 }, { "epoch": 3.06, "learning_rate": 0.00031558676028084254, "loss": 2.9454, "theoretical_loss": 3.5771563035505602, "tokens_seen": 1238246400 }, { "epoch": 3.06, "learning_rate": 0.0003155767301905717, "loss": 2.9519, "theoretical_loss": 3.5771389912453646, "tokens_seen": 1238311936 }, { "epoch": 3.06, "learning_rate": 0.0003155667001003009, "loss": 2.7629, "theoretical_loss": 3.5771216801129047, "tokens_seen": 1238377472 }, { "epoch": 3.06, "learning_rate": 0.0003155566700100301, "loss": 2.8314, "theoretical_loss": 3.5771043701530383, "tokens_seen": 1238443008 }, { "epoch": 3.06, "learning_rate": 0.0003155466399197593, "loss": 2.9609, "theoretical_loss": 3.5770870613656243, "tokens_seen": 1238508544 }, { "epoch": 3.06, "learning_rate": 0.00031553660982948844, "loss": 2.8863, "theoretical_loss": 3.5770697537505214, "tokens_seen": 1238574080 }, { "epoch": 3.06, "objective/train/docs_used": 1983470, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.642559289932251, "objective/train/theoretical_loss": 3.577065427029903, "objective/train/tokens_used": 1259050464, "theoretical_loss": 3.577065427029903, "tokens_seen": 1238590464 }, { "epoch": 3.06, "learning_rate": 0.0003155265797392177, "loss": 2.789, "theoretical_loss": 3.5770524473075884, "tokens_seen": 1238639616 }, { "epoch": 3.06, "learning_rate": 0.0003155165496489468, "loss": 2.9481, "theoretical_loss": 3.5770351420366833, "tokens_seen": 1238705152 }, { "epoch": 3.06, "learning_rate": 0.00031550651955867604, "loss": 2.8047, "theoretical_loss": 3.5770178379376647, "tokens_seen": 1238770688 }, { "epoch": 3.06, "learning_rate": 0.0003154964894684052, "loss": 2.8068, "theoretical_loss": 3.5770005350103915, "tokens_seen": 1238836224 }, { "epoch": 3.06, "learning_rate": 0.0003154864593781344, "loss": 2.7103, "theoretical_loss": 3.576983233254723, "tokens_seen": 1238901760 }, { "epoch": 3.06, "learning_rate": 0.0003154764292878636, "loss": 2.7783, "theoretical_loss": 3.576965932670517, "tokens_seen": 1238967296 }, { "epoch": 3.06, "learning_rate": 0.00031546639919759277, "loss": 2.9113, "theoretical_loss": 3.576948633257633, "tokens_seen": 1239032832 }, { "epoch": 3.06, "learning_rate": 0.00031545636910732195, "loss": 2.8903, "theoretical_loss": 3.576931335015928, "tokens_seen": 1239098368 }, { "epoch": 3.06, "learning_rate": 0.0003154463390170512, "loss": 2.7887, "theoretical_loss": 3.5769140379452633, "tokens_seen": 1239163904 }, { "epoch": 3.06, "learning_rate": 0.0003154363089267803, "loss": 2.9392, "theoretical_loss": 3.576896742045496, "tokens_seen": 1239229440 }, { "epoch": 3.06, "learning_rate": 0.00031542627883650954, "loss": 2.8858, "theoretical_loss": 3.576879447316485, "tokens_seen": 1239294976 }, { "epoch": 3.06, "learning_rate": 0.0003154162487462387, "loss": 2.9571, "theoretical_loss": 3.57686215375809, "tokens_seen": 1239360512 }, { "epoch": 3.06, "learning_rate": 0.0003154062186559679, "loss": 2.8497, "theoretical_loss": 3.5768448613701693, "tokens_seen": 1239426048 }, { "epoch": 3.06, "learning_rate": 0.0003153961885656971, "loss": 2.8779, "theoretical_loss": 3.576827570152582, "tokens_seen": 1239491584 }, { "epoch": 3.06, "learning_rate": 0.00031538615847542627, "loss": 2.8442, "theoretical_loss": 3.576810280105186, "tokens_seen": 1239557120 }, { "epoch": 3.06, "learning_rate": 0.00031537612838515545, "loss": 2.7781, "theoretical_loss": 3.5767929912278422, "tokens_seen": 1239622656 }, { "epoch": 3.07, "learning_rate": 0.0003153660982948847, "loss": 2.8634, "theoretical_loss": 3.5767757035204077, "tokens_seen": 1239688192 }, { "epoch": 3.07, "learning_rate": 0.0003153560682046138, "loss": 2.7736, "theoretical_loss": 3.5767584169827433, "tokens_seen": 1239753728 }, { "epoch": 3.07, "learning_rate": 0.00031534603811434305, "loss": 2.9149, "theoretical_loss": 3.5767411316147064, "tokens_seen": 1239819264 }, { "epoch": 3.07, "learning_rate": 0.0003153360080240722, "loss": 2.9372, "theoretical_loss": 3.5767238474161567, "tokens_seen": 1239884800 }, { "epoch": 3.07, "learning_rate": 0.0003153259779338014, "loss": 2.8707, "theoretical_loss": 3.576706564386953, "tokens_seen": 1239950336 }, { "epoch": 3.07, "learning_rate": 0.0003153159478435306, "loss": 2.7632, "theoretical_loss": 3.576689282526955, "tokens_seen": 1240015872 }, { "epoch": 3.07, "learning_rate": 0.00031530591775325977, "loss": 2.6672, "theoretical_loss": 3.5766720018360214, "tokens_seen": 1240081408 }, { "epoch": 3.07, "learning_rate": 0.000315295887662989, "loss": 2.8436, "theoretical_loss": 3.5766547223140113, "tokens_seen": 1240146944 }, { "epoch": 3.07, "learning_rate": 0.00031528585757271813, "loss": 2.9577, "theoretical_loss": 3.576637443960784, "tokens_seen": 1240212480 }, { "epoch": 3.07, "objective/train/docs_used": 1985980, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.783776044845581, "objective/train/theoretical_loss": 3.576633124555083, "objective/train/tokens_used": 1260688864, "theoretical_loss": 3.576633124555083, "tokens_seen": 1240228864 }, { "epoch": 3.07, "learning_rate": 0.00031527582748244737, "loss": 2.8161, "theoretical_loss": 3.5766201667761983, "tokens_seen": 1240278016 }, { "epoch": 3.07, "learning_rate": 0.00031526579739217655, "loss": 3.037, "theoretical_loss": 3.5766028907601144, "tokens_seen": 1240343552 }, { "epoch": 3.07, "learning_rate": 0.00031525576730190573, "loss": 2.9044, "theoretical_loss": 3.57658561591239, "tokens_seen": 1240409088 }, { "epoch": 3.07, "learning_rate": 0.0003152457372116349, "loss": 2.9046, "theoretical_loss": 3.576568342232886, "tokens_seen": 1240474624 }, { "epoch": 3.07, "learning_rate": 0.00031523570712136415, "loss": 2.9153, "theoretical_loss": 3.576551069721461, "tokens_seen": 1240540160 }, { "epoch": 3.07, "learning_rate": 0.0003152256770310933, "loss": 2.7524, "theoretical_loss": 3.576533798377974, "tokens_seen": 1240605696 }, { "epoch": 3.07, "learning_rate": 0.0003152156469408225, "loss": 2.8402, "theoretical_loss": 3.5765165282022844, "tokens_seen": 1240671232 }, { "epoch": 3.07, "learning_rate": 0.00031520561685055164, "loss": 2.8179, "theoretical_loss": 3.5764992591942515, "tokens_seen": 1240736768 }, { "epoch": 3.07, "learning_rate": 0.00031519558676028087, "loss": 2.8846, "theoretical_loss": 3.5764819913537353, "tokens_seen": 1240802304 }, { "epoch": 3.07, "learning_rate": 0.00031518555667001005, "loss": 2.9915, "theoretical_loss": 3.5764647246805947, "tokens_seen": 1240867840 }, { "epoch": 3.07, "learning_rate": 0.00031517552657973924, "loss": 2.7217, "theoretical_loss": 3.5764474591746893, "tokens_seen": 1240933376 }, { "epoch": 3.07, "learning_rate": 0.0003151654964894684, "loss": 3.0452, "theoretical_loss": 3.5764301948358783, "tokens_seen": 1240998912 }, { "epoch": 3.07, "learning_rate": 0.0003151554663991976, "loss": 2.9431, "theoretical_loss": 3.576412931664022, "tokens_seen": 1241064448 }, { "epoch": 3.07, "learning_rate": 0.0003151454363089268, "loss": 2.9651, "theoretical_loss": 3.5763956696589787, "tokens_seen": 1241129984 }, { "epoch": 3.07, "learning_rate": 0.000315135406218656, "loss": 2.8194, "theoretical_loss": 3.5763784088206085, "tokens_seen": 1241195520 }, { "epoch": 3.07, "learning_rate": 0.00031512537612838514, "loss": 2.8722, "theoretical_loss": 3.5763611491487715, "tokens_seen": 1241261056 }, { "epoch": 3.07, "learning_rate": 0.0003151153460381144, "loss": 2.8265, "theoretical_loss": 3.5763438906433267, "tokens_seen": 1241326592 }, { "epoch": 3.07, "learning_rate": 0.0003151053159478435, "loss": 2.7733, "theoretical_loss": 3.5763266333041335, "tokens_seen": 1241392128 }, { "epoch": 3.07, "learning_rate": 0.00031509528585757274, "loss": 2.8036, "theoretical_loss": 3.576309377131052, "tokens_seen": 1241457664 }, { "epoch": 3.07, "learning_rate": 0.0003150852557673019, "loss": 2.968, "theoretical_loss": 3.576292122123941, "tokens_seen": 1241523200 }, { "epoch": 3.07, "learning_rate": 0.0003150752256770311, "loss": 2.7498, "theoretical_loss": 3.5762748682826615, "tokens_seen": 1241588736 }, { "epoch": 3.07, "learning_rate": 0.0003150651955867603, "loss": 2.9419, "theoretical_loss": 3.5762576156070724, "tokens_seen": 1241654272 }, { "epoch": 3.07, "learning_rate": 0.0003150551654964895, "loss": 2.8658, "theoretical_loss": 3.5762403640970333, "tokens_seen": 1241719808 }, { "epoch": 3.07, "learning_rate": 0.00031504513540621864, "loss": 2.9512, "theoretical_loss": 3.5762231137524045, "tokens_seen": 1241785344 }, { "epoch": 3.07, "learning_rate": 0.0003150351053159479, "loss": 2.6658, "theoretical_loss": 3.5762058645730455, "tokens_seen": 1241850880 }, { "epoch": 3.07, "objective/train/docs_used": 1988886, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.148139476776123, "objective/train/theoretical_loss": 3.5762015524602626, "objective/train/tokens_used": 1262327264, "theoretical_loss": 3.5762015524602626, "tokens_seen": 1241867264 }, { "epoch": 3.07, "learning_rate": 0.000315025075225677, "loss": 2.8167, "theoretical_loss": 3.5761886165588157, "tokens_seen": 1241916416 }, { "epoch": 3.07, "learning_rate": 0.00031501504513540624, "loss": 2.8194, "theoretical_loss": 3.5761713697095754, "tokens_seen": 1241981952 }, { "epoch": 3.07, "learning_rate": 0.0003150050150451354, "loss": 3.0176, "theoretical_loss": 3.5761541240251846, "tokens_seen": 1242047488 }, { "epoch": 3.07, "learning_rate": 0.0003149949849548646, "loss": 2.8393, "theoretical_loss": 3.5761368795055026, "tokens_seen": 1242113024 }, { "epoch": 3.07, "learning_rate": 0.0003149849548645938, "loss": 2.9821, "theoretical_loss": 3.5761196361503895, "tokens_seen": 1242178560 }, { "epoch": 3.07, "learning_rate": 0.00031497492477432297, "loss": 2.9186, "theoretical_loss": 3.576102393959706, "tokens_seen": 1242244096 }, { "epoch": 3.07, "learning_rate": 0.00031496489468405215, "loss": 2.9379, "theoretical_loss": 3.5760851529333104, "tokens_seen": 1242309632 }, { "epoch": 3.07, "learning_rate": 0.0003149548645937814, "loss": 2.958, "theoretical_loss": 3.576067913071064, "tokens_seen": 1242375168 }, { "epoch": 3.07, "learning_rate": 0.0003149448345035105, "loss": 2.8413, "theoretical_loss": 3.5760506743728264, "tokens_seen": 1242440704 }, { "epoch": 3.07, "learning_rate": 0.00031493480441323974, "loss": 2.7821, "theoretical_loss": 3.576033436838458, "tokens_seen": 1242506240 }, { "epoch": 3.07, "learning_rate": 0.0003149247743229689, "loss": 2.8703, "theoretical_loss": 3.5760162004678184, "tokens_seen": 1242571776 }, { "epoch": 3.07, "learning_rate": 0.0003149147442326981, "loss": 3.0114, "theoretical_loss": 3.5759989652607675, "tokens_seen": 1242637312 }, { "epoch": 3.07, "learning_rate": 0.0003149047141424273, "loss": 2.8374, "theoretical_loss": 3.5759817312171656, "tokens_seen": 1242702848 }, { "epoch": 3.07, "learning_rate": 0.00031489468405215647, "loss": 2.8676, "theoretical_loss": 3.5759644983368726, "tokens_seen": 1242768384 }, { "epoch": 3.07, "learning_rate": 0.00031488465396188565, "loss": 3.0306, "theoretical_loss": 3.575947266619749, "tokens_seen": 1242833920 }, { "epoch": 3.07, "learning_rate": 0.0003148746238716149, "loss": 2.8708, "theoretical_loss": 3.575930036065655, "tokens_seen": 1242899456 }, { "epoch": 3.07, "learning_rate": 0.000314864593781344, "loss": 2.9128, "theoretical_loss": 3.5759128066744506, "tokens_seen": 1242964992 }, { "epoch": 3.07, "learning_rate": 0.00031485456369107325, "loss": 2.866, "theoretical_loss": 3.575895578445996, "tokens_seen": 1243030528 }, { "epoch": 3.07, "learning_rate": 0.0003148445336008024, "loss": 2.8477, "theoretical_loss": 3.5758783513801515, "tokens_seen": 1243096064 }, { "epoch": 3.07, "learning_rate": 0.0003148345035105316, "loss": 2.8639, "theoretical_loss": 3.575861125476777, "tokens_seen": 1243161600 }, { "epoch": 3.07, "learning_rate": 0.0003148244734202608, "loss": 2.9773, "theoretical_loss": 3.5758439007357334, "tokens_seen": 1243227136 }, { "epoch": 3.07, "learning_rate": 0.00031481444332998997, "loss": 2.849, "theoretical_loss": 3.575826677156881, "tokens_seen": 1243292672 }, { "epoch": 3.07, "learning_rate": 0.00031480441323971915, "loss": 2.8587, "theoretical_loss": 3.575809454740079, "tokens_seen": 1243358208 }, { "epoch": 3.07, "learning_rate": 0.00031479438314944833, "loss": 2.9804, "theoretical_loss": 3.5757922334851893, "tokens_seen": 1243423744 }, { "epoch": 3.07, "learning_rate": 0.0003147843530591775, "loss": 2.8095, "theoretical_loss": 3.5757750133920716, "tokens_seen": 1243489280 }, { "epoch": 3.07, "objective/train/docs_used": 1991679, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8168485164642334, "objective/train/theoretical_loss": 3.5757707085503023, "objective/train/tokens_used": 1263965664, "theoretical_loss": 3.5757707085503023, "tokens_seen": 1243505664 }, { "epoch": 3.07, "learning_rate": 0.00031477432296890675, "loss": 2.8764, "theoretical_loss": 3.5757577944605856, "tokens_seen": 1243554816 }, { "epoch": 3.07, "learning_rate": 0.0003147642928786359, "loss": 2.9622, "theoretical_loss": 3.5757405766905928, "tokens_seen": 1243620352 }, { "epoch": 3.07, "learning_rate": 0.0003147542627883651, "loss": 2.8231, "theoretical_loss": 3.5757233600819527, "tokens_seen": 1243685888 }, { "epoch": 3.07, "learning_rate": 0.0003147442326980943, "loss": 2.9308, "theoretical_loss": 3.575706144634527, "tokens_seen": 1243751424 }, { "epoch": 3.07, "learning_rate": 0.0003147342026078235, "loss": 2.7103, "theoretical_loss": 3.575688930348176, "tokens_seen": 1243816960 }, { "epoch": 3.07, "learning_rate": 0.00031472417251755266, "loss": 2.804, "theoretical_loss": 3.5756717172227583, "tokens_seen": 1243882496 }, { "epoch": 3.07, "learning_rate": 0.00031471414242728184, "loss": 2.8179, "theoretical_loss": 3.5756545052581368, "tokens_seen": 1243948032 }, { "epoch": 3.07, "learning_rate": 0.000314704112337011, "loss": 2.9509, "theoretical_loss": 3.575637294454171, "tokens_seen": 1244013568 }, { "epoch": 3.07, "learning_rate": 0.00031469408224674025, "loss": 2.7463, "theoretical_loss": 3.575620084810722, "tokens_seen": 1244079104 }, { "epoch": 3.07, "learning_rate": 0.0003146840521564694, "loss": 2.8069, "theoretical_loss": 3.575602876327649, "tokens_seen": 1244144640 }, { "epoch": 3.07, "learning_rate": 0.0003146740220661986, "loss": 2.9272, "theoretical_loss": 3.5755856690048144, "tokens_seen": 1244210176 }, { "epoch": 3.07, "learning_rate": 0.00031466399197592774, "loss": 2.8064, "theoretical_loss": 3.575568462842078, "tokens_seen": 1244275712 }, { "epoch": 3.07, "learning_rate": 0.000314653961885657, "loss": 2.7417, "theoretical_loss": 3.575551257839301, "tokens_seen": 1244341248 }, { "epoch": 3.07, "learning_rate": 0.00031464393179538616, "loss": 2.7637, "theoretical_loss": 3.5755340539963436, "tokens_seen": 1244406784 }, { "epoch": 3.07, "learning_rate": 0.00031463390170511534, "loss": 2.9198, "theoretical_loss": 3.5755168513130666, "tokens_seen": 1244472320 }, { "epoch": 3.07, "learning_rate": 0.0003146238716148445, "loss": 2.7808, "theoretical_loss": 3.5754996497893314, "tokens_seen": 1244537856 }, { "epoch": 3.07, "learning_rate": 0.0003146138415245737, "loss": 2.8582, "theoretical_loss": 3.5754824494249977, "tokens_seen": 1244603392 }, { "epoch": 3.07, "learning_rate": 0.0003146038114343029, "loss": 2.7589, "theoretical_loss": 3.575465250219927, "tokens_seen": 1244668928 }, { "epoch": 3.07, "learning_rate": 0.0003145937813440321, "loss": 2.9078, "theoretical_loss": 3.57544805217398, "tokens_seen": 1244734464 }, { "epoch": 3.07, "learning_rate": 0.00031458375125376125, "loss": 2.7874, "theoretical_loss": 3.575430855287018, "tokens_seen": 1244800000 }, { "epoch": 3.07, "learning_rate": 0.0003145737211634905, "loss": 2.9798, "theoretical_loss": 3.575413659558901, "tokens_seen": 1244865536 }, { "epoch": 3.07, "learning_rate": 0.0003145636910732197, "loss": 2.865, "theoretical_loss": 3.57539646498949, "tokens_seen": 1244931072 }, { "epoch": 3.07, "learning_rate": 0.00031455366098294884, "loss": 2.8181, "theoretical_loss": 3.575379271578647, "tokens_seen": 1244996608 }, { "epoch": 3.07, "learning_rate": 0.0003145436308926781, "loss": 2.8957, "theoretical_loss": 3.575362079326232, "tokens_seen": 1245062144 }, { "epoch": 3.07, "learning_rate": 0.0003145336008024072, "loss": 2.9374, "theoretical_loss": 3.5753448882321064, "tokens_seen": 1245127680 }, { "epoch": 3.07, "objective/train/docs_used": 1994510, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.918125629425049, "objective/train/theoretical_loss": 3.5753405906395415, "objective/train/tokens_used": 1265604064, "theoretical_loss": 3.5753405906395415, "tokens_seen": 1245144064 }, { "epoch": 3.07, "learning_rate": 0.00031452357071213644, "loss": 2.9672, "theoretical_loss": 3.575327698296131, "tokens_seen": 1245193216 }, { "epoch": 3.07, "learning_rate": 0.0003145135406218656, "loss": 2.7885, "theoretical_loss": 3.5753105095181668, "tokens_seen": 1245258752 }, { "epoch": 3.07, "learning_rate": 0.0003145035105315948, "loss": 2.9218, "theoretical_loss": 3.5752933218980747, "tokens_seen": 1245324288 }, { "epoch": 3.07, "learning_rate": 0.000314493480441324, "loss": 2.8807, "theoretical_loss": 3.5752761354357165, "tokens_seen": 1245389824 }, { "epoch": 3.07, "learning_rate": 0.00031448345035105317, "loss": 2.9223, "theoretical_loss": 3.5752589501309524, "tokens_seen": 1245455360 }, { "epoch": 3.07, "learning_rate": 0.00031447342026078235, "loss": 2.9136, "theoretical_loss": 3.5752417659836437, "tokens_seen": 1245520896 }, { "epoch": 3.07, "learning_rate": 0.0003144633901705116, "loss": 2.8496, "theoretical_loss": 3.575224582993652, "tokens_seen": 1245586432 }, { "epoch": 3.07, "learning_rate": 0.0003144533600802407, "loss": 2.9633, "theoretical_loss": 3.5752074011608386, "tokens_seen": 1245651968 }, { "epoch": 3.07, "learning_rate": 0.00031444332998996994, "loss": 2.8904, "theoretical_loss": 3.575190220485064, "tokens_seen": 1245717504 }, { "epoch": 3.07, "learning_rate": 0.0003144332998996991, "loss": 2.8597, "theoretical_loss": 3.57517304096619, "tokens_seen": 1245783040 }, { "epoch": 3.07, "learning_rate": 0.0003144232698094283, "loss": 2.9155, "theoretical_loss": 3.575155862604077, "tokens_seen": 1245848576 }, { "epoch": 3.07, "learning_rate": 0.0003144132397191575, "loss": 2.9134, "theoretical_loss": 3.5751386853985876, "tokens_seen": 1245914112 }, { "epoch": 3.07, "learning_rate": 0.00031440320962888667, "loss": 2.9595, "theoretical_loss": 3.575121509349582, "tokens_seen": 1245979648 }, { "epoch": 3.07, "learning_rate": 0.00031439317953861585, "loss": 2.7562, "theoretical_loss": 3.5751043344569218, "tokens_seen": 1246045184 }, { "epoch": 3.07, "learning_rate": 0.0003143831494483451, "loss": 2.9086, "theoretical_loss": 3.5750871607204684, "tokens_seen": 1246110720 }, { "epoch": 3.07, "learning_rate": 0.0003143731193580742, "loss": 3.0031, "theoretical_loss": 3.575069988140083, "tokens_seen": 1246176256 }, { "epoch": 3.07, "learning_rate": 0.00031436308926780345, "loss": 2.8215, "theoretical_loss": 3.5750528167156275, "tokens_seen": 1246241792 }, { "epoch": 3.07, "learning_rate": 0.0003143530591775326, "loss": 2.9635, "theoretical_loss": 3.5750356464469633, "tokens_seen": 1246307328 }, { "epoch": 3.07, "learning_rate": 0.0003143430290872618, "loss": 2.702, "theoretical_loss": 3.575018477333951, "tokens_seen": 1246372864 }, { "epoch": 3.07, "learning_rate": 0.000314332998996991, "loss": 2.9286, "theoretical_loss": 3.5750013093764523, "tokens_seen": 1246438400 }, { "epoch": 3.07, "learning_rate": 0.00031432296890672017, "loss": 2.826, "theoretical_loss": 3.5749841425743294, "tokens_seen": 1246503936 }, { "epoch": 3.07, "learning_rate": 0.00031431293881644935, "loss": 2.7062, "theoretical_loss": 3.5749669769274433, "tokens_seen": 1246569472 }, { "epoch": 3.07, "learning_rate": 0.00031430290872617853, "loss": 2.7217, "theoretical_loss": 3.5749498124356553, "tokens_seen": 1246635008 }, { "epoch": 3.07, "learning_rate": 0.0003142928786359077, "loss": 2.8439, "theoretical_loss": 3.5749326490988276, "tokens_seen": 1246700544 }, { "epoch": 3.07, "learning_rate": 0.00031428284854563695, "loss": 2.8985, "theoretical_loss": 3.5749154869168214, "tokens_seen": 1246766080 }, { "epoch": 3.07, "objective/train/docs_used": 1995821, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7515900135040283, "objective/train/theoretical_loss": 3.5749111965517444, "objective/train/tokens_used": 1267242464, "theoretical_loss": 3.5749111965517444, "tokens_seen": 1246782464 }, { "epoch": 3.07, "learning_rate": 0.0003142728184553661, "loss": 2.9104, "theoretical_loss": 3.574898325889498, "tokens_seen": 1246831616 }, { "epoch": 3.07, "learning_rate": 0.0003142627883650953, "loss": 2.9511, "theoretical_loss": 3.57488116601672, "tokens_seen": 1246897152 }, { "epoch": 3.07, "learning_rate": 0.0003142527582748245, "loss": 2.9041, "theoretical_loss": 3.5748640072983475, "tokens_seen": 1246962688 }, { "epoch": 3.07, "learning_rate": 0.0003142427281845537, "loss": 2.995, "theoretical_loss": 3.5748468497342434, "tokens_seen": 1247028224 }, { "epoch": 3.07, "learning_rate": 0.00031423269809428286, "loss": 2.8609, "theoretical_loss": 3.5748296933242694, "tokens_seen": 1247093760 }, { "epoch": 3.07, "learning_rate": 0.00031422266800401204, "loss": 2.9878, "theoretical_loss": 3.5748125380682865, "tokens_seen": 1247159296 }, { "epoch": 3.07, "learning_rate": 0.0003142126379137412, "loss": 2.7992, "theoretical_loss": 3.5747953839661566, "tokens_seen": 1247224832 }, { "epoch": 3.07, "learning_rate": 0.00031420260782347045, "loss": 2.8955, "theoretical_loss": 3.5747782310177425, "tokens_seen": 1247290368 }, { "epoch": 3.07, "learning_rate": 0.0003141925777331996, "loss": 2.8493, "theoretical_loss": 3.5747610792229048, "tokens_seen": 1247355904 }, { "epoch": 3.07, "learning_rate": 0.0003141825476429288, "loss": 2.8846, "theoretical_loss": 3.5747439285815057, "tokens_seen": 1247421440 }, { "epoch": 3.07, "learning_rate": 0.00031417251755265794, "loss": 2.9837, "theoretical_loss": 3.5747267790934067, "tokens_seen": 1247486976 }, { "epoch": 3.07, "learning_rate": 0.0003141624874623872, "loss": 2.8831, "theoretical_loss": 3.5747096307584707, "tokens_seen": 1247552512 }, { "epoch": 3.07, "learning_rate": 0.00031415245737211636, "loss": 2.8976, "theoretical_loss": 3.574692483576558, "tokens_seen": 1247618048 }, { "epoch": 3.07, "learning_rate": 0.00031414242728184554, "loss": 2.793, "theoretical_loss": 3.574675337547532, "tokens_seen": 1247683584 }, { "epoch": 3.07, "learning_rate": 0.0003141323971915747, "loss": 2.7482, "theoretical_loss": 3.5746581926712544, "tokens_seen": 1247749120 }, { "epoch": 3.07, "learning_rate": 0.0003141223671013039, "loss": 2.9763, "theoretical_loss": 3.574641048947586, "tokens_seen": 1247814656 }, { "epoch": 3.07, "learning_rate": 0.0003141123370110331, "loss": 2.7259, "theoretical_loss": 3.57462390637639, "tokens_seen": 1247880192 }, { "epoch": 3.07, "learning_rate": 0.0003141023069207623, "loss": 2.8166, "theoretical_loss": 3.574606764957528, "tokens_seen": 1247945728 }, { "epoch": 3.07, "learning_rate": 0.00031409227683049145, "loss": 2.6718, "theoretical_loss": 3.574589624690862, "tokens_seen": 1248011264 }, { "epoch": 3.07, "learning_rate": 0.0003140822467402207, "loss": 2.7532, "theoretical_loss": 3.574572485576254, "tokens_seen": 1248076800 }, { "epoch": 3.07, "learning_rate": 0.00031407221664994986, "loss": 2.9114, "theoretical_loss": 3.5745553476135665, "tokens_seen": 1248142336 }, { "epoch": 3.07, "learning_rate": 0.00031406218655967904, "loss": 3.0023, "theoretical_loss": 3.574538210802661, "tokens_seen": 1248207872 }, { "epoch": 3.07, "learning_rate": 0.0003140521564694082, "loss": 2.7877, "theoretical_loss": 3.5745210751434, "tokens_seen": 1248273408 }, { "epoch": 3.07, "learning_rate": 0.0003140421263791374, "loss": 2.9151, "theoretical_loss": 3.574503940635645, "tokens_seen": 1248338944 }, { "epoch": 3.07, "learning_rate": 0.0003140320962888666, "loss": 2.7233, "theoretical_loss": 3.5744868072792593, "tokens_seen": 1248404480 }, { "epoch": 3.07, "objective/train/docs_used": 1998704, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7028555870056152, "objective/train/theoretical_loss": 3.574482524120048, "objective/train/tokens_used": 1268880864, "theoretical_loss": 3.574482524120048, "tokens_seen": 1248420864 }, { "epoch": 3.07, "learning_rate": 0.0003140220661985958, "loss": 2.7508, "theoretical_loss": 3.5744696750741043, "tokens_seen": 1248470016 }, { "epoch": 3.07, "learning_rate": 0.00031401203610832495, "loss": 2.9336, "theoretical_loss": 3.5744525440200423, "tokens_seen": 1248535552 }, { "epoch": 3.07, "learning_rate": 0.0003140020060180542, "loss": 2.8525, "theoretical_loss": 3.5744354141169357, "tokens_seen": 1248601088 }, { "epoch": 3.07, "learning_rate": 0.0003139919759277833, "loss": 2.85, "theoretical_loss": 3.574418285364647, "tokens_seen": 1248666624 }, { "epoch": 3.07, "learning_rate": 0.00031398194583751255, "loss": 2.8908, "theoretical_loss": 3.574401157763038, "tokens_seen": 1248732160 }, { "epoch": 3.07, "learning_rate": 0.00031397191574724173, "loss": 2.9609, "theoretical_loss": 3.5743840313119715, "tokens_seen": 1248797696 }, { "epoch": 3.07, "learning_rate": 0.0003139618856569709, "loss": 2.882, "theoretical_loss": 3.5743669060113095, "tokens_seen": 1248863232 }, { "epoch": 3.07, "learning_rate": 0.0003139518555667001, "loss": 2.8988, "theoretical_loss": 3.5743497818609145, "tokens_seen": 1248928768 }, { "epoch": 3.07, "learning_rate": 0.0003139418254764293, "loss": 2.7592, "theoretical_loss": 3.5743326588606483, "tokens_seen": 1248994304 }, { "epoch": 3.07, "learning_rate": 0.00031393179538615845, "loss": 2.9293, "theoretical_loss": 3.5743155370103743, "tokens_seen": 1249059840 }, { "epoch": 3.07, "learning_rate": 0.0003139217652958877, "loss": 2.6491, "theoretical_loss": 3.5742984163099543, "tokens_seen": 1249125376 }, { "epoch": 3.07, "learning_rate": 0.0003139117352056168, "loss": 2.8408, "theoretical_loss": 3.574281296759251, "tokens_seen": 1249190912 }, { "epoch": 3.07, "learning_rate": 0.00031390170511534605, "loss": 2.9397, "theoretical_loss": 3.574264178358127, "tokens_seen": 1249256448 }, { "epoch": 3.07, "learning_rate": 0.00031389167502507523, "loss": 2.6634, "theoretical_loss": 3.5742470611064445, "tokens_seen": 1249321984 }, { "epoch": 3.07, "learning_rate": 0.0003138816449348044, "loss": 2.7968, "theoretical_loss": 3.574229945004066, "tokens_seen": 1249387520 }, { "epoch": 3.07, "learning_rate": 0.0003138716148445336, "loss": 2.755, "theoretical_loss": 3.5742128300508544, "tokens_seen": 1249453056 }, { "epoch": 3.07, "learning_rate": 0.0003138615847542628, "loss": 2.8052, "theoretical_loss": 3.574195716246672, "tokens_seen": 1249518592 }, { "epoch": 3.07, "learning_rate": 0.00031385155466399196, "loss": 2.886, "theoretical_loss": 3.574178603591381, "tokens_seen": 1249584128 }, { "epoch": 3.07, "learning_rate": 0.0003138415245737212, "loss": 2.8421, "theoretical_loss": 3.5741614920848455, "tokens_seen": 1249649664 }, { "epoch": 3.07, "learning_rate": 0.0003138314944834503, "loss": 2.7112, "theoretical_loss": 3.5741443817269265, "tokens_seen": 1249715200 }, { "epoch": 3.07, "learning_rate": 0.00031382146439317955, "loss": 2.8898, "theoretical_loss": 3.5741272725174875, "tokens_seen": 1249780736 }, { "epoch": 3.07, "learning_rate": 0.00031381143430290873, "loss": 2.8122, "theoretical_loss": 3.574110164456391, "tokens_seen": 1249846272 }, { "epoch": 3.07, "learning_rate": 0.0003138014042126379, "loss": 2.8158, "theoretical_loss": 3.5740930575434997, "tokens_seen": 1249911808 }, { "epoch": 3.07, "learning_rate": 0.00031379137412236715, "loss": 2.9306, "theoretical_loss": 3.574075951778676, "tokens_seen": 1249977344 }, { "epoch": 3.07, "learning_rate": 0.0003137813440320963, "loss": 2.8694, "theoretical_loss": 3.5740588471617833, "tokens_seen": 1250042880 }, { "epoch": 3.07, "objective/train/docs_used": 2001293, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0441510677337646, "objective/train/theoretical_loss": 3.5740545711869087, "objective/train/tokens_used": 1270519264, "theoretical_loss": 3.5740545711869087, "tokens_seen": 1250059264 }, { "epoch": 3.07, "learning_rate": 0.0003137713139418255, "loss": 2.8483, "theoretical_loss": 3.5740417436926846, "tokens_seen": 1250108416 }, { "epoch": 3.07, "learning_rate": 0.0003137612838515547, "loss": 2.6297, "theoretical_loss": 3.574024641371242, "tokens_seen": 1250173952 }, { "epoch": 3.07, "learning_rate": 0.0003137512537612839, "loss": 2.7773, "theoretical_loss": 3.574007540197318, "tokens_seen": 1250239488 }, { "epoch": 3.07, "learning_rate": 0.00031374122367101306, "loss": 2.7073, "theoretical_loss": 3.5739904401707765, "tokens_seen": 1250305024 }, { "epoch": 3.07, "learning_rate": 0.00031373119358074224, "loss": 2.9116, "theoretical_loss": 3.57397334129148, "tokens_seen": 1250370560 }, { "epoch": 3.07, "learning_rate": 0.0003137211634904714, "loss": 2.7802, "theoretical_loss": 3.573956243559291, "tokens_seen": 1250436096 }, { "epoch": 3.07, "learning_rate": 0.00031371113340020065, "loss": 2.8496, "theoretical_loss": 3.573939146974073, "tokens_seen": 1250501632 }, { "epoch": 3.07, "learning_rate": 0.0003137011033099298, "loss": 2.7789, "theoretical_loss": 3.5739220515356886, "tokens_seen": 1250567168 }, { "epoch": 3.07, "learning_rate": 0.000313691073219659, "loss": 2.8826, "theoretical_loss": 3.573904957244001, "tokens_seen": 1250632704 }, { "epoch": 3.07, "learning_rate": 0.00031368104312938814, "loss": 2.9033, "theoretical_loss": 3.573887864098873, "tokens_seen": 1250698240 }, { "epoch": 3.07, "learning_rate": 0.0003136710130391174, "loss": 2.9239, "theoretical_loss": 3.5738707721001672, "tokens_seen": 1250763776 }, { "epoch": 3.07, "learning_rate": 0.00031366098294884656, "loss": 2.9296, "theoretical_loss": 3.5738536812477477, "tokens_seen": 1250829312 }, { "epoch": 3.07, "learning_rate": 0.00031365095285857574, "loss": 2.8291, "theoretical_loss": 3.573836591541477, "tokens_seen": 1250894848 }, { "epoch": 3.07, "learning_rate": 0.0003136409227683049, "loss": 2.7819, "theoretical_loss": 3.5738195029812183, "tokens_seen": 1250960384 }, { "epoch": 3.07, "learning_rate": 0.0003136308926780341, "loss": 2.8419, "theoretical_loss": 3.5738024155668344, "tokens_seen": 1251025920 }, { "epoch": 3.07, "learning_rate": 0.0003136208625877633, "loss": 2.8687, "theoretical_loss": 3.5737853292981887, "tokens_seen": 1251091456 }, { "epoch": 3.07, "learning_rate": 0.0003136108324974925, "loss": 2.6981, "theoretical_loss": 3.5737682441751444, "tokens_seen": 1251156992 }, { "epoch": 3.07, "learning_rate": 0.00031360080240722165, "loss": 2.7567, "theoretical_loss": 3.573751160197565, "tokens_seen": 1251222528 }, { "epoch": 3.07, "learning_rate": 0.0003135907723169509, "loss": 2.6105, "theoretical_loss": 3.573734077365313, "tokens_seen": 1251288064 }, { "epoch": 3.07, "learning_rate": 0.00031358074222668006, "loss": 2.8767, "theoretical_loss": 3.573716995678252, "tokens_seen": 1251353600 }, { "epoch": 3.07, "learning_rate": 0.00031357071213640924, "loss": 2.9389, "theoretical_loss": 3.573699915136245, "tokens_seen": 1251419136 }, { "epoch": 3.07, "learning_rate": 0.0003135606820461384, "loss": 2.7442, "theoretical_loss": 3.573682835739156, "tokens_seen": 1251484672 }, { "epoch": 3.07, "learning_rate": 0.0003135506519558676, "loss": 2.7864, "theoretical_loss": 3.573665757486847, "tokens_seen": 1251550208 }, { "epoch": 3.07, "learning_rate": 0.0003135406218655968, "loss": 2.8265, "theoretical_loss": 3.5736486803791827, "tokens_seen": 1251615744 }, { "epoch": 3.07, "learning_rate": 0.000313530591775326, "loss": 2.8999, "theoretical_loss": 3.573631604416026, "tokens_seen": 1251681280 }, { "epoch": 3.07, "objective/train/docs_used": 2004074, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.847318410873413, "objective/train/theoretical_loss": 3.5736273356040504, "objective/train/tokens_used": 1272157664, "theoretical_loss": 3.5736273356040504, "tokens_seen": 1251697664 }, { "epoch": 3.07, "learning_rate": 0.00031352056168505515, "loss": 2.8363, "theoretical_loss": 3.57361452959724, "tokens_seen": 1251746816 }, { "epoch": 3.07, "learning_rate": 0.0003135105315947844, "loss": 2.7191, "theoretical_loss": 3.5735974559226884, "tokens_seen": 1251812352 }, { "epoch": 3.07, "learning_rate": 0.0003135005015045135, "loss": 2.8301, "theoretical_loss": 3.5735803833922346, "tokens_seen": 1251877888 }, { "epoch": 3.07, "learning_rate": 0.00031349047141424275, "loss": 2.7822, "theoretical_loss": 3.5735633120057417, "tokens_seen": 1251943424 }, { "epoch": 3.07, "learning_rate": 0.00031348044132397193, "loss": 2.883, "theoretical_loss": 3.5735462417630734, "tokens_seen": 1252008960 }, { "epoch": 3.07, "learning_rate": 0.0003134704112337011, "loss": 2.8905, "theoretical_loss": 3.5735291726640934, "tokens_seen": 1252074496 }, { "epoch": 3.07, "learning_rate": 0.0003134603811434303, "loss": 2.6823, "theoretical_loss": 3.5735121047086653, "tokens_seen": 1252140032 }, { "epoch": 3.07, "learning_rate": 0.0003134503510531595, "loss": 2.8481, "theoretical_loss": 3.573495037896652, "tokens_seen": 1252205568 }, { "epoch": 3.07, "learning_rate": 0.00031344032096288865, "loss": 2.7616, "theoretical_loss": 3.5734779722279173, "tokens_seen": 1252271104 }, { "epoch": 3.07, "learning_rate": 0.0003134302908726179, "loss": 2.9908, "theoretical_loss": 3.5734609077023256, "tokens_seen": 1252336640 }, { "epoch": 3.07, "learning_rate": 0.000313420260782347, "loss": 2.8463, "theoretical_loss": 3.573443844319739, "tokens_seen": 1252402176 }, { "epoch": 3.07, "learning_rate": 0.00031341023069207625, "loss": 2.6616, "theoretical_loss": 3.5734267820800225, "tokens_seen": 1252467712 }, { "epoch": 3.07, "learning_rate": 0.00031340020060180543, "loss": 2.8389, "theoretical_loss": 3.5734097209830393, "tokens_seen": 1252533248 }, { "epoch": 3.07, "learning_rate": 0.0003133901705115346, "loss": 2.8367, "theoretical_loss": 3.5733926610286524, "tokens_seen": 1252598784 }, { "epoch": 3.07, "learning_rate": 0.0003133801404212638, "loss": 2.8583, "theoretical_loss": 3.5733756022167267, "tokens_seen": 1252664320 }, { "epoch": 3.07, "learning_rate": 0.000313370110330993, "loss": 2.8597, "theoretical_loss": 3.5733585445471254, "tokens_seen": 1252729856 }, { "epoch": 3.07, "learning_rate": 0.00031336008024072216, "loss": 2.929, "theoretical_loss": 3.573341488019712, "tokens_seen": 1252795392 }, { "epoch": 3.07, "learning_rate": 0.0003133500501504514, "loss": 2.9511, "theoretical_loss": 3.57332443263435, "tokens_seen": 1252860928 }, { "epoch": 3.07, "learning_rate": 0.0003133400200601805, "loss": 2.7525, "theoretical_loss": 3.573307378390904, "tokens_seen": 1252926464 }, { "epoch": 3.07, "learning_rate": 0.00031332998996990975, "loss": 3.0105, "theoretical_loss": 3.573290325289238, "tokens_seen": 1252992000 }, { "epoch": 3.07, "learning_rate": 0.0003133199598796389, "loss": 2.7568, "theoretical_loss": 3.5732732733292147, "tokens_seen": 1253057536 }, { "epoch": 3.07, "learning_rate": 0.0003133099297893681, "loss": 2.9528, "theoretical_loss": 3.573256222510699, "tokens_seen": 1253123072 }, { "epoch": 3.07, "learning_rate": 0.0003132998996990973, "loss": 2.8006, "theoretical_loss": 3.573239172833554, "tokens_seen": 1253188608 }, { "epoch": 3.07, "learning_rate": 0.0003132898696088265, "loss": 2.8056, "theoretical_loss": 3.5732221242976445, "tokens_seen": 1253254144 }, { "epoch": 3.07, "learning_rate": 0.00031327983951855566, "loss": 2.8522, "theoretical_loss": 3.5732050769028336, "tokens_seen": 1253319680 }, { "epoch": 3.07, "objective/train/docs_used": 2006919, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0917487144470215, "objective/train/theoretical_loss": 3.5732008152324117, "objective/train/tokens_used": 1273796064, "theoretical_loss": 3.5732008152324117, "tokens_seen": 1253336064 }, { "epoch": 3.07, "learning_rate": 0.0003132698094282849, "loss": 2.7706, "theoretical_loss": 3.5731880306489856, "tokens_seen": 1253385216 }, { "epoch": 3.07, "learning_rate": 0.000313259779338014, "loss": 2.8445, "theoretical_loss": 3.573170985535965, "tokens_seen": 1253450752 }, { "epoch": 3.07, "learning_rate": 0.00031324974924774326, "loss": 2.8154, "theoretical_loss": 3.5731539415636346, "tokens_seen": 1253516288 }, { "epoch": 3.07, "learning_rate": 0.0003132397191574724, "loss": 2.8053, "theoretical_loss": 3.5731368987318595, "tokens_seen": 1253581824 }, { "epoch": 3.07, "learning_rate": 0.0003132296890672016, "loss": 2.7193, "theoretical_loss": 3.5731198570405036, "tokens_seen": 1253647360 }, { "epoch": 3.07, "learning_rate": 0.0003132196589769308, "loss": 2.8143, "theoretical_loss": 3.5731028164894303, "tokens_seen": 1253712896 }, { "epoch": 3.07, "learning_rate": 0.00031320962888666, "loss": 2.7373, "theoretical_loss": 3.5730857770785045, "tokens_seen": 1253778432 }, { "epoch": 3.07, "learning_rate": 0.00031319959879638916, "loss": 2.9174, "theoretical_loss": 3.5730687388075895, "tokens_seen": 1253843968 }, { "epoch": 3.07, "learning_rate": 0.00031318956870611834, "loss": 2.815, "theoretical_loss": 3.5730517016765506, "tokens_seen": 1253909504 }, { "epoch": 3.07, "learning_rate": 0.0003131795386158475, "loss": 2.8592, "theoretical_loss": 3.5730346656852507, "tokens_seen": 1253975040 }, { "epoch": 3.07, "learning_rate": 0.00031316950852557676, "loss": 2.6932, "theoretical_loss": 3.5730176308335553, "tokens_seen": 1254040576 }, { "epoch": 3.07, "learning_rate": 0.0003131594784353059, "loss": 2.7757, "theoretical_loss": 3.5730005971213274, "tokens_seen": 1254106112 }, { "epoch": 3.07, "learning_rate": 0.0003131494483450351, "loss": 2.9265, "theoretical_loss": 3.572983564548432, "tokens_seen": 1254171648 }, { "epoch": 3.07, "learning_rate": 0.00031313941825476425, "loss": 2.7846, "theoretical_loss": 3.572966533114733, "tokens_seen": 1254237184 }, { "epoch": 3.07, "learning_rate": 0.0003131293881644935, "loss": 3.0343, "theoretical_loss": 3.572949502820095, "tokens_seen": 1254302720 }, { "epoch": 3.07, "learning_rate": 0.00031311935807422267, "loss": 2.8444, "theoretical_loss": 3.572932473664382, "tokens_seen": 1254368256 }, { "epoch": 3.07, "learning_rate": 0.00031310932798395185, "loss": 2.8066, "theoretical_loss": 3.5729154456474586, "tokens_seen": 1254433792 }, { "epoch": 3.07, "learning_rate": 0.00031309929789368103, "loss": 2.7687, "theoretical_loss": 3.5728984187691886, "tokens_seen": 1254499328 }, { "epoch": 3.07, "learning_rate": 0.00031308926780341026, "loss": 2.5698, "theoretical_loss": 3.572881393029437, "tokens_seen": 1254564864 }, { "epoch": 3.07, "learning_rate": 0.0003130792377131394, "loss": 2.8781, "theoretical_loss": 3.5728643684280685, "tokens_seen": 1254630400 }, { "epoch": 3.07, "learning_rate": 0.0003130692076228686, "loss": 2.8201, "theoretical_loss": 3.5728473449649467, "tokens_seen": 1254695936 }, { "epoch": 3.07, "learning_rate": 0.0003130591775325978, "loss": 2.7605, "theoretical_loss": 3.572830322639936, "tokens_seen": 1254761472 }, { "epoch": 3.07, "learning_rate": 0.000313049147442327, "loss": 2.8794, "theoretical_loss": 3.5728133014529018, "tokens_seen": 1254827008 }, { "epoch": 3.07, "learning_rate": 0.0003130391173520562, "loss": 2.8823, "theoretical_loss": 3.5727962814037078, "tokens_seen": 1254892544 }, { "epoch": 3.07, "learning_rate": 0.00031302908726178535, "loss": 2.8815, "theoretical_loss": 3.5727792624922188, "tokens_seen": 1254958080 }, { "epoch": 3.07, "objective/train/docs_used": 2009646, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.783987283706665, "objective/train/theoretical_loss": 3.572775007942097, "objective/train/tokens_used": 1275434464, "theoretical_loss": 3.572775007942097, "tokens_seen": 1254974464 }, { "epoch": 3.07, "learning_rate": 0.0003130190571715146, "loss": 2.8313, "theoretical_loss": 3.572762244718299, "tokens_seen": 1255023616 }, { "epoch": 3.07, "learning_rate": 0.0003130090270812437, "loss": 3.0174, "theoretical_loss": 3.572745228081814, "tokens_seen": 1255089152 }, { "epoch": 3.07, "learning_rate": 0.00031299899699097295, "loss": 2.9959, "theoretical_loss": 3.572728212582627, "tokens_seen": 1255154688 }, { "epoch": 3.07, "learning_rate": 0.00031298896690070213, "loss": 2.7313, "theoretical_loss": 3.5727111982206035, "tokens_seen": 1255220224 }, { "epoch": 3.07, "learning_rate": 0.0003129789368104313, "loss": 2.8875, "theoretical_loss": 3.5726941849956084, "tokens_seen": 1255285760 }, { "epoch": 3.07, "learning_rate": 0.0003129689067201605, "loss": 2.9145, "theoretical_loss": 3.5726771729075053, "tokens_seen": 1255351296 }, { "epoch": 3.07, "learning_rate": 0.0003129588766298897, "loss": 2.76, "theoretical_loss": 3.5726601619561595, "tokens_seen": 1255416832 }, { "epoch": 3.07, "learning_rate": 0.00031294884653961885, "loss": 2.8809, "theoretical_loss": 3.5726431521414357, "tokens_seen": 1255482368 }, { "epoch": 3.07, "learning_rate": 0.0003129388164493481, "loss": 2.8813, "theoretical_loss": 3.572626143463199, "tokens_seen": 1255547904 }, { "epoch": 3.07, "learning_rate": 0.0003129287863590772, "loss": 2.802, "theoretical_loss": 3.5726091359213132, "tokens_seen": 1255613440 }, { "epoch": 3.07, "learning_rate": 0.00031291875626880645, "loss": 2.8191, "theoretical_loss": 3.572592129515644, "tokens_seen": 1255678976 }, { "epoch": 3.07, "learning_rate": 0.00031290872617853563, "loss": 2.8817, "theoretical_loss": 3.5725751242460557, "tokens_seen": 1255744512 }, { "epoch": 3.07, "learning_rate": 0.0003128986960882648, "loss": 2.9207, "theoretical_loss": 3.5725581201124132, "tokens_seen": 1255810048 }, { "epoch": 3.07, "learning_rate": 0.000312888665997994, "loss": 2.8412, "theoretical_loss": 3.5725411171145813, "tokens_seen": 1255875584 }, { "epoch": 3.07, "learning_rate": 0.0003128786359077232, "loss": 3.0198, "theoretical_loss": 3.5725241152524254, "tokens_seen": 1255941120 }, { "epoch": 3.07, "learning_rate": 0.00031286860581745236, "loss": 2.741, "theoretical_loss": 3.5725071145258096, "tokens_seen": 1256006656 }, { "epoch": 3.07, "learning_rate": 0.0003128585757271816, "loss": 2.7891, "theoretical_loss": 3.572490114934599, "tokens_seen": 1256072192 }, { "epoch": 3.07, "learning_rate": 0.0003128485456369107, "loss": 2.8026, "theoretical_loss": 3.572473116478659, "tokens_seen": 1256137728 }, { "epoch": 3.07, "learning_rate": 0.00031283851554663995, "loss": 2.8607, "theoretical_loss": 3.572456119157854, "tokens_seen": 1256203264 }, { "epoch": 3.07, "learning_rate": 0.0003128284854563691, "loss": 2.8637, "theoretical_loss": 3.5724391229720496, "tokens_seen": 1256268800 }, { "epoch": 3.07, "learning_rate": 0.0003128184553660983, "loss": 2.7992, "theoretical_loss": 3.5724221279211106, "tokens_seen": 1256334336 }, { "epoch": 3.07, "learning_rate": 0.0003128084252758275, "loss": 2.9458, "theoretical_loss": 3.5724051340049012, "tokens_seen": 1256399872 }, { "epoch": 3.07, "learning_rate": 0.0003127983951855567, "loss": 2.706, "theoretical_loss": 3.5723881412232874, "tokens_seen": 1256465408 }, { "epoch": 3.07, "learning_rate": 0.00031278836509528586, "loss": 2.735, "theoretical_loss": 3.572371149576134, "tokens_seen": 1256530944 }, { "epoch": 3.07, "learning_rate": 0.0003127783350050151, "loss": 2.9048, "theoretical_loss": 3.5723541590633063, "tokens_seen": 1256596480 }, { "epoch": 3.07, "objective/train/docs_used": 2011066, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.829768657684326, "objective/train/theoretical_loss": 3.5723499116123216, "objective/train/tokens_used": 1277072864, "theoretical_loss": 3.5723499116123216, "tokens_seen": 1256612864 }, { "epoch": 3.07, "learning_rate": 0.0003127683049147442, "loss": 2.8532, "theoretical_loss": 3.572337169684669, "tokens_seen": 1256662016 }, { "epoch": 3.07, "learning_rate": 0.00031275827482447346, "loss": 2.943, "theoretical_loss": 3.572320181440088, "tokens_seen": 1256727552 }, { "epoch": 3.07, "learning_rate": 0.0003127482447342026, "loss": 2.9141, "theoretical_loss": 3.572303194329427, "tokens_seen": 1256793088 }, { "epoch": 3.07, "learning_rate": 0.0003127382146439318, "loss": 2.7185, "theoretical_loss": 3.572286208352553, "tokens_seen": 1256858624 }, { "epoch": 3.07, "learning_rate": 0.000312728184553661, "loss": 2.8316, "theoretical_loss": 3.5722692235093296, "tokens_seen": 1256924160 }, { "epoch": 3.07, "learning_rate": 0.0003127181544633902, "loss": 2.9641, "theoretical_loss": 3.5722522397996235, "tokens_seen": 1256989696 }, { "epoch": 3.07, "learning_rate": 0.00031270812437311936, "loss": 2.8946, "theoretical_loss": 3.5722352572232987, "tokens_seen": 1257055232 }, { "epoch": 3.07, "learning_rate": 0.00031269809428284854, "loss": 3.0819, "theoretical_loss": 3.572218275780221, "tokens_seen": 1257120768 }, { "epoch": 3.07, "learning_rate": 0.0003126880641925777, "loss": 2.795, "theoretical_loss": 3.572201295470256, "tokens_seen": 1257186304 }, { "epoch": 3.07, "learning_rate": 0.00031267803410230696, "loss": 2.699, "theoretical_loss": 3.572184316293269, "tokens_seen": 1257251840 }, { "epoch": 3.07, "learning_rate": 0.0003126680040120361, "loss": 2.785, "theoretical_loss": 3.572167338249125, "tokens_seen": 1257317376 }, { "epoch": 3.07, "learning_rate": 0.0003126579739217653, "loss": 2.6602, "theoretical_loss": 3.5721503613376893, "tokens_seen": 1257382912 }, { "epoch": 3.07, "learning_rate": 0.00031264794383149445, "loss": 2.8313, "theoretical_loss": 3.5721333855588275, "tokens_seen": 1257448448 }, { "epoch": 3.07, "learning_rate": 0.0003126379137412237, "loss": 2.9098, "theoretical_loss": 3.572116410912405, "tokens_seen": 1257513984 }, { "epoch": 3.07, "learning_rate": 0.00031262788365095287, "loss": 2.8392, "theoretical_loss": 3.572099437398287, "tokens_seen": 1257579520 }, { "epoch": 3.07, "learning_rate": 0.00031261785356068205, "loss": 2.7237, "theoretical_loss": 3.572082465016339, "tokens_seen": 1257645056 }, { "epoch": 3.07, "learning_rate": 0.00031260782347041123, "loss": 2.8912, "theoretical_loss": 3.5720654937664276, "tokens_seen": 1257710592 }, { "epoch": 3.07, "learning_rate": 0.00031259779338014046, "loss": 3.0839, "theoretical_loss": 3.572048523648417, "tokens_seen": 1257776128 }, { "epoch": 3.07, "learning_rate": 0.0003125877632898696, "loss": 2.9246, "theoretical_loss": 3.572031554662173, "tokens_seen": 1257841664 }, { "epoch": 3.07, "learning_rate": 0.0003125777331995988, "loss": 2.7763, "theoretical_loss": 3.572014586807562, "tokens_seen": 1257907200 }, { "epoch": 3.07, "learning_rate": 0.00031256770310932795, "loss": 2.7756, "theoretical_loss": 3.5719976200844483, "tokens_seen": 1257972736 }, { "epoch": 3.07, "learning_rate": 0.0003125576730190572, "loss": 2.6491, "theoretical_loss": 3.5719806544926977, "tokens_seen": 1258038272 }, { "epoch": 3.07, "learning_rate": 0.00031254764292878637, "loss": 2.7145, "theoretical_loss": 3.5719636900321765, "tokens_seen": 1258103808 }, { "epoch": 3.07, "learning_rate": 0.00031253761283851555, "loss": 2.8584, "theoretical_loss": 3.571946726702751, "tokens_seen": 1258169344 }, { "epoch": 3.07, "learning_rate": 0.00031252758274824473, "loss": 2.85, "theoretical_loss": 3.5719297645042847, "tokens_seen": 1258234880 }, { "epoch": 3.07, "objective/train/docs_used": 2014802, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7352161407470703, "objective/train/theoretical_loss": 3.571925524131365, "objective/train/tokens_used": 1278711264, "theoretical_loss": 3.571925524131365, "tokens_seen": 1258251264 }, { "epoch": 3.07, "learning_rate": 0.0003125175526579739, "loss": 2.6915, "theoretical_loss": 3.5719128034366454, "tokens_seen": 1258300416 }, { "epoch": 3.07, "learning_rate": 0.0003125075225677031, "loss": 2.9116, "theoretical_loss": 3.5718958434996972, "tokens_seen": 1258365952 }, { "epoch": 3.07, "learning_rate": 0.00031249749247743233, "loss": 2.7641, "theoretical_loss": 3.571878884693307, "tokens_seen": 1258431488 }, { "epoch": 3.07, "learning_rate": 0.00031248746238716146, "loss": 2.9037, "theoretical_loss": 3.5718619270173404, "tokens_seen": 1258497024 }, { "epoch": 3.07, "learning_rate": 0.0003124774322968907, "loss": 2.744, "theoretical_loss": 3.5718449704716626, "tokens_seen": 1258562560 }, { "epoch": 3.07, "learning_rate": 0.0003124674022066198, "loss": 2.8577, "theoretical_loss": 3.5718280150561403, "tokens_seen": 1258628096 }, { "epoch": 3.07, "learning_rate": 0.00031245737211634905, "loss": 2.4797, "theoretical_loss": 3.5718110607706386, "tokens_seen": 1258693632 }, { "epoch": 3.07, "learning_rate": 0.00031244734202607823, "loss": 2.7919, "theoretical_loss": 3.5717941076150233, "tokens_seen": 1258759168 }, { "epoch": 3.07, "learning_rate": 0.0003124373119358074, "loss": 2.7424, "theoretical_loss": 3.571777155589161, "tokens_seen": 1258824704 }, { "epoch": 3.07, "learning_rate": 0.0003124272818455366, "loss": 2.9125, "theoretical_loss": 3.5717602046929167, "tokens_seen": 1258890240 }, { "epoch": 3.07, "learning_rate": 0.00031241725175526583, "loss": 2.7915, "theoretical_loss": 3.571743254926157, "tokens_seen": 1258955776 }, { "epoch": 3.07, "learning_rate": 0.00031240722166499496, "loss": 2.7801, "theoretical_loss": 3.5717263062887477, "tokens_seen": 1259021312 }, { "epoch": 3.07, "learning_rate": 0.0003123971915747242, "loss": 2.8987, "theoretical_loss": 3.5717093587805544, "tokens_seen": 1259086848 }, { "epoch": 3.07, "learning_rate": 0.0003123871614844533, "loss": 2.8675, "theoretical_loss": 3.5716924124014433, "tokens_seen": 1259152384 }, { "epoch": 3.07, "learning_rate": 0.00031237713139418256, "loss": 2.9327, "theoretical_loss": 3.571675467151281, "tokens_seen": 1259217920 }, { "epoch": 3.07, "learning_rate": 0.00031236710130391174, "loss": 2.7791, "theoretical_loss": 3.571658523029933, "tokens_seen": 1259283456 }, { "epoch": 3.07, "learning_rate": 0.0003123570712136409, "loss": 2.8166, "theoretical_loss": 3.5716415800372654, "tokens_seen": 1259348992 }, { "epoch": 3.07, "learning_rate": 0.0003123470411233701, "loss": 2.7871, "theoretical_loss": 3.571624638173144, "tokens_seen": 1259414528 }, { "epoch": 3.07, "learning_rate": 0.0003123370110330993, "loss": 2.8736, "theoretical_loss": 3.5716076974374356, "tokens_seen": 1259480064 }, { "epoch": 3.07, "learning_rate": 0.00031232698094282846, "loss": 2.7818, "theoretical_loss": 3.5715907578300055, "tokens_seen": 1259545600 }, { "epoch": 3.07, "learning_rate": 0.0003123169508525577, "loss": 2.9027, "theoretical_loss": 3.5715738193507205, "tokens_seen": 1259611136 }, { "epoch": 3.07, "learning_rate": 0.0003123069207622869, "loss": 2.7996, "theoretical_loss": 3.571556881999447, "tokens_seen": 1259676672 }, { "epoch": 3.07, "learning_rate": 0.00031229689067201606, "loss": 2.822, "theoretical_loss": 3.57153994577605, "tokens_seen": 1259742208 }, { "epoch": 3.07, "learning_rate": 0.0003122868605817453, "loss": 2.9231, "theoretical_loss": 3.571523010680397, "tokens_seen": 1259807744 }, { "epoch": 3.07, "learning_rate": 0.0003122768304914744, "loss": 2.673, "theoretical_loss": 3.571506076712354, "tokens_seen": 1259873280 }, { "epoch": 3.07, "objective/train/docs_used": 2016318, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7590839862823486, "objective/train/theoretical_loss": 3.571501843396516, "objective/train/tokens_used": 1280349664, "theoretical_loss": 3.571501843396516, "tokens_seen": 1259889664 }, { "epoch": 3.07, "learning_rate": 0.00031226680040120366, "loss": 2.8414, "theoretical_loss": 3.571489143871786, "tokens_seen": 1259938816 }, { "epoch": 3.07, "learning_rate": 0.0003122567703109328, "loss": 2.8762, "theoretical_loss": 3.571472212158562, "tokens_seen": 1260004352 }, { "epoch": 3.07, "learning_rate": 0.000312246740220662, "loss": 2.8433, "theoretical_loss": 3.571455281572545, "tokens_seen": 1260069888 }, { "epoch": 3.07, "learning_rate": 0.0003122367101303912, "loss": 2.8957, "theoretical_loss": 3.5714383521136037, "tokens_seen": 1260135424 }, { "epoch": 3.07, "learning_rate": 0.0003122266800401204, "loss": 2.8287, "theoretical_loss": 3.5714214237816035, "tokens_seen": 1260200960 }, { "epoch": 3.07, "learning_rate": 0.00031221664994984956, "loss": 2.8884, "theoretical_loss": 3.5714044965764113, "tokens_seen": 1260266496 }, { "epoch": 3.07, "learning_rate": 0.00031220661985957874, "loss": 2.8224, "theoretical_loss": 3.571387570497893, "tokens_seen": 1260332032 }, { "epoch": 3.07, "learning_rate": 0.0003121965897693079, "loss": 2.9612, "theoretical_loss": 3.5713706455459153, "tokens_seen": 1260397568 }, { "epoch": 3.07, "learning_rate": 0.00031218655967903716, "loss": 2.8039, "theoretical_loss": 3.571353721720344, "tokens_seen": 1260463104 }, { "epoch": 3.07, "learning_rate": 0.0003121765295887663, "loss": 2.8594, "theoretical_loss": 3.571336799021047, "tokens_seen": 1260528640 }, { "epoch": 3.07, "learning_rate": 0.0003121664994984955, "loss": 2.9559, "theoretical_loss": 3.5713198774478894, "tokens_seen": 1260594176 }, { "epoch": 3.07, "learning_rate": 0.00031215646940822465, "loss": 2.8971, "theoretical_loss": 3.571302957000739, "tokens_seen": 1260659712 }, { "epoch": 3.07, "learning_rate": 0.0003121464393179539, "loss": 2.9338, "theoretical_loss": 3.571286037679461, "tokens_seen": 1260725248 }, { "epoch": 3.07, "learning_rate": 0.00031213640922768307, "loss": 2.6515, "theoretical_loss": 3.5712691194839232, "tokens_seen": 1260790784 }, { "epoch": 3.07, "learning_rate": 0.00031212637913741225, "loss": 2.7945, "theoretical_loss": 3.5712522024139908, "tokens_seen": 1260856320 }, { "epoch": 3.07, "learning_rate": 0.00031211634904714143, "loss": 2.9621, "theoretical_loss": 3.5712352864695314, "tokens_seen": 1260921856 }, { "epoch": 3.07, "learning_rate": 0.00031210631895687066, "loss": 3.0532, "theoretical_loss": 3.571218371650412, "tokens_seen": 1260987392 }, { "epoch": 3.07, "learning_rate": 0.0003120962888665998, "loss": 2.8615, "theoretical_loss": 3.571201457956498, "tokens_seen": 1261052928 }, { "epoch": 3.07, "learning_rate": 0.000312086258776329, "loss": 2.891, "theoretical_loss": 3.5711845453876565, "tokens_seen": 1261118464 }, { "epoch": 3.07, "learning_rate": 0.00031207622868605815, "loss": 2.8495, "theoretical_loss": 3.5711676339437552, "tokens_seen": 1261184000 }, { "epoch": 3.07, "learning_rate": 0.0003120661985957874, "loss": 2.8053, "theoretical_loss": 3.57115072362466, "tokens_seen": 1261249536 }, { "epoch": 3.07, "learning_rate": 0.00031205616850551657, "loss": 2.728, "theoretical_loss": 3.5711338144302376, "tokens_seen": 1261315072 }, { "epoch": 3.07, "learning_rate": 0.00031204613841524575, "loss": 2.8362, "theoretical_loss": 3.5711169063603547, "tokens_seen": 1261380608 }, { "epoch": 3.07, "learning_rate": 0.00031203610832497493, "loss": 2.9221, "theoretical_loss": 3.571099999414878, "tokens_seen": 1261446144 }, { "epoch": 3.07, "learning_rate": 0.0003120260782347041, "loss": 2.829, "theoretical_loss": 3.5710830935936757, "tokens_seen": 1261511680 }, { "epoch": 3.07, "objective/train/docs_used": 2018834, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9820408821105957, "objective/train/theoretical_loss": 3.571078867314027, "objective/train/tokens_used": 1281988064, "theoretical_loss": 3.571078867314027, "tokens_seen": 1261528064 }, { "epoch": 3.07, "learning_rate": 0.0003120160481444333, "loss": 2.9405, "theoretical_loss": 3.5710661888966126, "tokens_seen": 1261577216 }, { "epoch": 3.07, "learning_rate": 0.00031200601805416253, "loss": 2.7483, "theoretical_loss": 3.571049285323557, "tokens_seen": 1261642752 }, { "epoch": 3.07, "learning_rate": 0.00031199598796389166, "loss": 2.8681, "theoretical_loss": 3.5710323828743746, "tokens_seen": 1261708288 }, { "epoch": 3.07, "learning_rate": 0.0003119859578736209, "loss": 2.8236, "theoretical_loss": 3.571015481548934, "tokens_seen": 1261773824 }, { "epoch": 3.07, "learning_rate": 0.00031197592778335, "loss": 2.93, "theoretical_loss": 3.5709985813471006, "tokens_seen": 1261839360 }, { "epoch": 3.07, "learning_rate": 0.00031196589769307925, "loss": 2.875, "theoretical_loss": 3.5709816822687417, "tokens_seen": 1261904896 }, { "epoch": 3.07, "learning_rate": 0.00031195586760280843, "loss": 2.8302, "theoretical_loss": 3.5709647843137247, "tokens_seen": 1261970432 }, { "epoch": 3.07, "learning_rate": 0.0003119458375125376, "loss": 2.8289, "theoretical_loss": 3.570947887481916, "tokens_seen": 1262035968 }, { "epoch": 3.07, "learning_rate": 0.0003119358074222668, "loss": 2.9849, "theoretical_loss": 3.5709309917731833, "tokens_seen": 1262101504 }, { "epoch": 3.07, "learning_rate": 0.00031192577733199603, "loss": 2.9647, "theoretical_loss": 3.5709140971873934, "tokens_seen": 1262167040 }, { "epoch": 3.07, "learning_rate": 0.00031191574724172516, "loss": 2.9086, "theoretical_loss": 3.570897203724413, "tokens_seen": 1262232576 }, { "epoch": 3.07, "learning_rate": 0.0003119057171514544, "loss": 2.7185, "theoretical_loss": 3.5708803113841094, "tokens_seen": 1262298112 }, { "epoch": 3.07, "learning_rate": 0.0003118956870611835, "loss": 2.8929, "theoretical_loss": 3.57086342016635, "tokens_seen": 1262363648 }, { "epoch": 3.07, "learning_rate": 0.00031188565697091276, "loss": 2.83, "theoretical_loss": 3.5708465300710017, "tokens_seen": 1262429184 }, { "epoch": 3.07, "learning_rate": 0.00031187562688064194, "loss": 2.5619, "theoretical_loss": 3.5708296410979314, "tokens_seen": 1262494720 }, { "epoch": 3.07, "learning_rate": 0.0003118655967903711, "loss": 2.8012, "theoretical_loss": 3.5708127532470066, "tokens_seen": 1262560256 }, { "epoch": 3.07, "learning_rate": 0.0003118555667001003, "loss": 2.822, "theoretical_loss": 3.5707958665180946, "tokens_seen": 1262625792 }, { "epoch": 3.07, "learning_rate": 0.0003118455366098295, "loss": 2.5665, "theoretical_loss": 3.5707789809110624, "tokens_seen": 1262691328 }, { "epoch": 3.07, "learning_rate": 0.00031183550651955866, "loss": 2.7314, "theoretical_loss": 3.570762096425777, "tokens_seen": 1262756864 }, { "epoch": 3.07, "learning_rate": 0.0003118254764292879, "loss": 2.9226, "theoretical_loss": 3.5707452130621062, "tokens_seen": 1262822400 }, { "epoch": 3.07, "learning_rate": 0.000311815446339017, "loss": 2.9681, "theoretical_loss": 3.570728330819917, "tokens_seen": 1262887936 }, { "epoch": 3.07, "learning_rate": 0.00031180541624874626, "loss": 2.8989, "theoretical_loss": 3.5707114496990773, "tokens_seen": 1262953472 }, { "epoch": 3.07, "learning_rate": 0.0003117953861584754, "loss": 2.8593, "theoretical_loss": 3.5706945696994534, "tokens_seen": 1263019008 }, { "epoch": 3.07, "learning_rate": 0.0003117853560682046, "loss": 2.7803, "theoretical_loss": 3.5706776908209132, "tokens_seen": 1263084544 }, { "epoch": 3.07, "learning_rate": 0.0003117753259779338, "loss": 2.8653, "theoretical_loss": 3.570660813063324, "tokens_seen": 1263150080 }, { "epoch": 3.07, "objective/train/docs_used": 2021599, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8017051219940186, "objective/train/theoretical_loss": 3.5706565937990598, "objective/train/tokens_used": 1283626464, "theoretical_loss": 3.5706565937990598, "tokens_seen": 1263166464 }, { "epoch": 3.07, "learning_rate": 0.000311765295887663, "loss": 2.8767, "theoretical_loss": 3.570643936426553, "tokens_seen": 1263215616 }, { "epoch": 3.07, "learning_rate": 0.00031175526579739217, "loss": 2.6813, "theoretical_loss": 3.5706270609104687, "tokens_seen": 1263281152 }, { "epoch": 3.07, "learning_rate": 0.0003117452357071214, "loss": 2.8417, "theoretical_loss": 3.570610186514937, "tokens_seen": 1263346688 }, { "epoch": 3.07, "learning_rate": 0.00031173520561685053, "loss": 2.9401, "theoretical_loss": 3.5705933132398266, "tokens_seen": 1263412224 }, { "epoch": 3.07, "learning_rate": 0.00031172517552657976, "loss": 2.8829, "theoretical_loss": 3.570576441085004, "tokens_seen": 1263477760 }, { "epoch": 3.07, "learning_rate": 0.0003117151454363089, "loss": 2.8901, "theoretical_loss": 3.5705595700503374, "tokens_seen": 1263543296 }, { "epoch": 3.07, "learning_rate": 0.0003117051153460381, "loss": 2.884, "theoretical_loss": 3.570542700135694, "tokens_seen": 1263608832 }, { "epoch": 3.07, "learning_rate": 0.0003116950852557673, "loss": 2.6627, "theoretical_loss": 3.570525831340942, "tokens_seen": 1263674368 }, { "epoch": 3.07, "learning_rate": 0.0003116850551654965, "loss": 2.7634, "theoretical_loss": 3.570508963665948, "tokens_seen": 1263739904 }, { "epoch": 3.07, "learning_rate": 0.00031167502507522567, "loss": 2.8296, "theoretical_loss": 3.5704920971105802, "tokens_seen": 1263805440 }, { "epoch": 3.07, "learning_rate": 0.00031166499498495485, "loss": 2.7401, "theoretical_loss": 3.570475231674706, "tokens_seen": 1263870976 }, { "epoch": 3.07, "learning_rate": 0.00031165496489468403, "loss": 2.9474, "theoretical_loss": 3.570458367358193, "tokens_seen": 1263936512 }, { "epoch": 3.07, "learning_rate": 0.00031164493480441327, "loss": 2.8062, "theoretical_loss": 3.57044150416091, "tokens_seen": 1264002048 }, { "epoch": 3.07, "learning_rate": 0.0003116349047141424, "loss": 2.9163, "theoretical_loss": 3.570424642082723, "tokens_seen": 1264067584 }, { "epoch": 3.07, "learning_rate": 0.00031162487462387163, "loss": 2.74, "theoretical_loss": 3.5704077811235004, "tokens_seen": 1264133120 }, { "epoch": 3.07, "learning_rate": 0.0003116148445336008, "loss": 2.9102, "theoretical_loss": 3.5703909212831104, "tokens_seen": 1264198656 }, { "epoch": 3.07, "learning_rate": 0.00031160481444333, "loss": 2.8633, "theoretical_loss": 3.57037406256142, "tokens_seen": 1264264192 }, { "epoch": 3.07, "learning_rate": 0.00031159478435305917, "loss": 2.9727, "theoretical_loss": 3.570357204958298, "tokens_seen": 1264329728 }, { "epoch": 3.07, "learning_rate": 0.00031158475426278835, "loss": 2.8942, "theoretical_loss": 3.5703403484736107, "tokens_seen": 1264395264 }, { "epoch": 3.07, "learning_rate": 0.00031157472417251753, "loss": 2.7604, "theoretical_loss": 3.570323493107227, "tokens_seen": 1264460800 }, { "epoch": 3.07, "learning_rate": 0.00031156469408224677, "loss": 2.8878, "theoretical_loss": 3.570306638859015, "tokens_seen": 1264526336 }, { "epoch": 3.07, "learning_rate": 0.00031155466399197595, "loss": 2.7201, "theoretical_loss": 3.570289785728842, "tokens_seen": 1264591872 }, { "epoch": 3.07, "learning_rate": 0.00031154463390170513, "loss": 2.7315, "theoretical_loss": 3.5702729337165757, "tokens_seen": 1264657408 }, { "epoch": 3.07, "learning_rate": 0.0003115346038114343, "loss": 2.7757, "theoretical_loss": 3.570256082822085, "tokens_seen": 1264722944 }, { "epoch": 3.07, "learning_rate": 0.0003115245737211635, "loss": 2.7742, "theoretical_loss": 3.5702392330452364, "tokens_seen": 1264788480 }, { "epoch": 3.07, "objective/train/docs_used": 2024598, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.2765471935272217, "objective/train/theoretical_loss": 3.57023502077564, "objective/train/tokens_used": 1285264864, "theoretical_loss": 3.57023502077564, "tokens_seen": 1264804864 }, { "epoch": 3.07, "learning_rate": 0.00031151454363089273, "loss": 2.4965, "theoretical_loss": 3.5702223843858993, "tokens_seen": 1264854016 }, { "epoch": 3.07, "learning_rate": 0.00031150451354062186, "loss": 2.8098, "theoretical_loss": 3.5702055368439405, "tokens_seen": 1264919552 }, { "epoch": 3.07, "learning_rate": 0.0003114944834503511, "loss": 2.8203, "theoretical_loss": 3.5701886904192284, "tokens_seen": 1264985088 }, { "epoch": 3.07, "learning_rate": 0.0003114844533600802, "loss": 2.8388, "theoretical_loss": 3.570171845111632, "tokens_seen": 1265050624 }, { "epoch": 3.07, "learning_rate": 0.00031147442326980945, "loss": 2.8465, "theoretical_loss": 3.570155000921018, "tokens_seen": 1265116160 }, { "epoch": 3.07, "learning_rate": 0.00031146439317953863, "loss": 2.7918, "theoretical_loss": 3.5701381578472553, "tokens_seen": 1265181696 }, { "epoch": 3.07, "learning_rate": 0.0003114543630892678, "loss": 2.8023, "theoretical_loss": 3.570121315890211, "tokens_seen": 1265247232 }, { "epoch": 3.07, "learning_rate": 0.000311444332998997, "loss": 2.7872, "theoretical_loss": 3.570104475049755, "tokens_seen": 1265312768 }, { "epoch": 3.07, "learning_rate": 0.00031143430290872623, "loss": 2.9357, "theoretical_loss": 3.570087635325754, "tokens_seen": 1265378304 }, { "epoch": 3.07, "learning_rate": 0.00031142427281845536, "loss": 2.947, "theoretical_loss": 3.5700707967180763, "tokens_seen": 1265443840 }, { "epoch": 3.07, "learning_rate": 0.0003114142427281846, "loss": 2.7663, "theoretical_loss": 3.5700539592265903, "tokens_seen": 1265509376 }, { "epoch": 3.07, "learning_rate": 0.0003114042126379137, "loss": 2.7821, "theoretical_loss": 3.570037122851165, "tokens_seen": 1265574912 }, { "epoch": 3.07, "learning_rate": 0.00031139418254764296, "loss": 3.0352, "theoretical_loss": 3.5700202875916673, "tokens_seen": 1265640448 }, { "epoch": 3.07, "learning_rate": 0.00031138415245737214, "loss": 2.8755, "theoretical_loss": 3.570003453447966, "tokens_seen": 1265705984 }, { "epoch": 3.07, "learning_rate": 0.0003113741223671013, "loss": 2.7463, "theoretical_loss": 3.5699866204199298, "tokens_seen": 1265771520 }, { "epoch": 3.07, "learning_rate": 0.0003113640922768305, "loss": 2.7906, "theoretical_loss": 3.569969788507426, "tokens_seen": 1265837056 }, { "epoch": 3.07, "learning_rate": 0.0003113540621865597, "loss": 2.8549, "theoretical_loss": 3.569952957710324, "tokens_seen": 1265902592 }, { "epoch": 3.07, "learning_rate": 0.00031134403209628886, "loss": 2.8209, "theoretical_loss": 3.569936128028492, "tokens_seen": 1265968128 }, { "epoch": 3.07, "learning_rate": 0.0003113340020060181, "loss": 2.862, "theoretical_loss": 3.569919299461797, "tokens_seen": 1266033664 }, { "epoch": 3.07, "learning_rate": 0.0003113239719157472, "loss": 2.7523, "theoretical_loss": 3.56990247201011, "tokens_seen": 1266099200 }, { "epoch": 3.07, "learning_rate": 0.00031131394182547646, "loss": 2.7737, "theoretical_loss": 3.569885645673297, "tokens_seen": 1266164736 }, { "epoch": 3.07, "learning_rate": 0.0003113039117352056, "loss": 2.7517, "theoretical_loss": 3.5698688204512274, "tokens_seen": 1266230272 }, { "epoch": 3.07, "learning_rate": 0.0003112938816449348, "loss": 2.9829, "theoretical_loss": 3.5698519963437696, "tokens_seen": 1266295808 }, { "epoch": 3.07, "learning_rate": 0.000311283851554664, "loss": 2.7301, "theoretical_loss": 3.569835173350792, "tokens_seen": 1266361344 }, { "epoch": 3.07, "learning_rate": 0.0003112738214643932, "loss": 2.7871, "theoretical_loss": 3.569818351472163, "tokens_seen": 1266426880 }, { "epoch": 3.07, "objective/train/docs_used": 2027075, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9360647201538086, "objective/train/theoretical_loss": 3.5698141461766077, "objective/train/tokens_used": 1286903264, "theoretical_loss": 3.5698141461766077, "tokens_seen": 1266443264 }, { "epoch": 3.07, "learning_rate": 0.00031126379137412237, "loss": 2.8944, "theoretical_loss": 3.5698015307077515, "tokens_seen": 1266492416 }, { "epoch": 3.07, "learning_rate": 0.0003112537612838516, "loss": 2.7495, "theoretical_loss": 3.569784711057426, "tokens_seen": 1266557952 }, { "epoch": 3.07, "learning_rate": 0.00031124373119358073, "loss": 2.7069, "theoretical_loss": 3.569767892521055, "tokens_seen": 1266623488 }, { "epoch": 3.07, "learning_rate": 0.00031123370110330996, "loss": 2.8902, "theoretical_loss": 3.5697510750985066, "tokens_seen": 1266689024 }, { "epoch": 3.07, "learning_rate": 0.0003112236710130391, "loss": 2.7359, "theoretical_loss": 3.5697342587896497, "tokens_seen": 1266754560 }, { "epoch": 3.07, "learning_rate": 0.0003112136409227683, "loss": 2.8858, "theoretical_loss": 3.569717443594354, "tokens_seen": 1266820096 }, { "epoch": 3.07, "learning_rate": 0.0003112036108324975, "loss": 2.9426, "theoretical_loss": 3.569700629512486, "tokens_seen": 1266885632 }, { "epoch": 3.07, "learning_rate": 0.0003111935807422267, "loss": 3.0471, "theoretical_loss": 3.5696838165439164, "tokens_seen": 1266951168 }, { "epoch": 3.07, "learning_rate": 0.00031118355065195587, "loss": 3.0293, "theoretical_loss": 3.5696670046885126, "tokens_seen": 1267016704 }, { "epoch": 3.07, "learning_rate": 0.00031117352056168505, "loss": 2.7651, "theoretical_loss": 3.569650193946144, "tokens_seen": 1267082240 }, { "epoch": 3.07, "learning_rate": 0.00031116349047141423, "loss": 2.7726, "theoretical_loss": 3.5696333843166794, "tokens_seen": 1267147776 }, { "epoch": 3.07, "learning_rate": 0.00031115346038114347, "loss": 2.725, "theoretical_loss": 3.569616575799987, "tokens_seen": 1267213312 }, { "epoch": 3.07, "learning_rate": 0.0003111434302908726, "loss": 2.8315, "theoretical_loss": 3.5695997683959364, "tokens_seen": 1267278848 }, { "epoch": 3.07, "learning_rate": 0.00031113340020060183, "loss": 3.0297, "theoretical_loss": 3.5695829621043953, "tokens_seen": 1267344384 }, { "epoch": 3.07, "learning_rate": 0.000311123370110331, "loss": 2.7618, "theoretical_loss": 3.5695661569252337, "tokens_seen": 1267409920 }, { "epoch": 3.07, "learning_rate": 0.0003111133400200602, "loss": 2.8591, "theoretical_loss": 3.5695493528583198, "tokens_seen": 1267475456 }, { "epoch": 3.07, "learning_rate": 0.00031110330992978937, "loss": 2.9068, "theoretical_loss": 3.5695325499035224, "tokens_seen": 1267540992 }, { "epoch": 3.07, "learning_rate": 0.00031109327983951855, "loss": 2.7394, "theoretical_loss": 3.569515748060711, "tokens_seen": 1267606528 }, { "epoch": 3.07, "learning_rate": 0.00031108324974924773, "loss": 2.7335, "theoretical_loss": 3.5694989473297536, "tokens_seen": 1267672064 }, { "epoch": 3.07, "learning_rate": 0.00031107321965897697, "loss": 2.9251, "theoretical_loss": 3.56948214771052, "tokens_seen": 1267737600 }, { "epoch": 3.07, "learning_rate": 0.0003110631895687061, "loss": 2.7161, "theoretical_loss": 3.569465349202879, "tokens_seen": 1267803136 }, { "epoch": 3.07, "learning_rate": 0.00031105315947843533, "loss": 2.9742, "theoretical_loss": 3.5694485518066994, "tokens_seen": 1267868672 }, { "epoch": 3.07, "learning_rate": 0.00031104312938816446, "loss": 2.7322, "theoretical_loss": 3.5694317555218507, "tokens_seen": 1267934208 }, { "epoch": 3.07, "learning_rate": 0.0003110330992978937, "loss": 2.7705, "theoretical_loss": 3.569414960348201, "tokens_seen": 1267999744 }, { "epoch": 3.07, "learning_rate": 0.0003110230692076229, "loss": 2.9635, "theoretical_loss": 3.56939816628562, "tokens_seen": 1268065280 }, { "epoch": 3.07, "objective/train/docs_used": 2028563, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.978868007659912, "objective/train/theoretical_loss": 3.569393967943564, "objective/train/tokens_used": 1288541664, "theoretical_loss": 3.569393967943564, "tokens_seen": 1268081664 }, { "epoch": 3.07, "learning_rate": 0.00031101303911735206, "loss": 2.7914, "theoretical_loss": 3.569381373333977, "tokens_seen": 1268130816 }, { "epoch": 3.07, "learning_rate": 0.00031100300902708124, "loss": 2.8198, "theoretical_loss": 3.56936458149314, "tokens_seen": 1268196352 }, { "epoch": 3.07, "learning_rate": 0.0003109929789368104, "loss": 2.6989, "theoretical_loss": 3.5693477907629796, "tokens_seen": 1268261888 }, { "epoch": 3.07, "learning_rate": 0.0003109829488465396, "loss": 2.8288, "theoretical_loss": 3.5693310011433637, "tokens_seen": 1268327424 }, { "epoch": 3.07, "learning_rate": 0.00031097291875626883, "loss": 2.8351, "theoretical_loss": 3.5693142126341626, "tokens_seen": 1268392960 }, { "epoch": 3.07, "learning_rate": 0.00031096288866599796, "loss": 2.856, "theoretical_loss": 3.5692974252352445, "tokens_seen": 1268458496 }, { "epoch": 3.07, "learning_rate": 0.0003109528585757272, "loss": 2.7485, "theoretical_loss": 3.5692806389464793, "tokens_seen": 1268524032 }, { "epoch": 3.07, "learning_rate": 0.0003109428284854564, "loss": 2.6703, "theoretical_loss": 3.569263853767736, "tokens_seen": 1268589568 }, { "epoch": 3.07, "learning_rate": 0.00031093279839518556, "loss": 2.931, "theoretical_loss": 3.569247069698883, "tokens_seen": 1268655104 }, { "epoch": 3.07, "learning_rate": 0.00031092276830491474, "loss": 2.8337, "theoretical_loss": 3.569230286739791, "tokens_seen": 1268720640 }, { "epoch": 3.07, "learning_rate": 0.0003109127382146439, "loss": 2.7436, "theoretical_loss": 3.569213504890329, "tokens_seen": 1268786176 }, { "epoch": 3.07, "learning_rate": 0.0003109027081243731, "loss": 2.7819, "theoretical_loss": 3.569196724150366, "tokens_seen": 1268851712 }, { "epoch": 3.07, "learning_rate": 0.00031089267803410234, "loss": 2.9098, "theoretical_loss": 3.569179944519771, "tokens_seen": 1268917248 }, { "epoch": 3.07, "learning_rate": 0.00031088264794383147, "loss": 2.7764, "theoretical_loss": 3.5691631659984138, "tokens_seen": 1268982784 }, { "epoch": 3.07, "learning_rate": 0.0003108726178535607, "loss": 2.968, "theoretical_loss": 3.569146388586164, "tokens_seen": 1269048320 }, { "epoch": 3.07, "learning_rate": 0.00031086258776328983, "loss": 2.8239, "theoretical_loss": 3.5691296122828904, "tokens_seen": 1269113856 }, { "epoch": 3.07, "learning_rate": 0.00031085255767301906, "loss": 2.6878, "theoretical_loss": 3.569112837088463, "tokens_seen": 1269179392 }, { "epoch": 3.07, "learning_rate": 0.00031084252758274824, "loss": 2.8738, "theoretical_loss": 3.569096063002751, "tokens_seen": 1269244928 }, { "epoch": 3.07, "learning_rate": 0.0003108324974924774, "loss": 2.865, "theoretical_loss": 3.5690792900256243, "tokens_seen": 1269310464 }, { "epoch": 3.07, "learning_rate": 0.0003108224674022066, "loss": 2.8876, "theoretical_loss": 3.5690625181569513, "tokens_seen": 1269376000 }, { "epoch": 3.07, "learning_rate": 0.0003108124373119358, "loss": 2.7268, "theoretical_loss": 3.5690457473966024, "tokens_seen": 1269441536 }, { "epoch": 3.07, "learning_rate": 0.000310802407221665, "loss": 2.9122, "theoretical_loss": 3.5690289777444475, "tokens_seen": 1269507072 }, { "epoch": 3.07, "learning_rate": 0.0003107923771313942, "loss": 2.9057, "theoretical_loss": 3.5690122092003556, "tokens_seen": 1269572608 }, { "epoch": 3.07, "learning_rate": 0.0003107823470411234, "loss": 2.8608, "theoretical_loss": 3.568995441764196, "tokens_seen": 1269638144 }, { "epoch": 3.07, "learning_rate": 0.00031077231695085257, "loss": 2.7307, "theoretical_loss": 3.568978675435839, "tokens_seen": 1269703680 }, { "epoch": 3.07, "objective/train/docs_used": 2031593, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4318389892578125, "objective/train/theoretical_loss": 3.5689744840268283, "objective/train/tokens_used": 1290180064, "theoretical_loss": 3.5689744840268283, "tokens_seen": 1269720064 }, { "epoch": 3.07, "learning_rate": 0.0003107622868605818, "loss": 2.7329, "theoretical_loss": 3.5689619102151537, "tokens_seen": 1269769216 }, { "epoch": 3.07, "learning_rate": 0.00031075225677031093, "loss": 2.7917, "theoretical_loss": 3.56894514610201, "tokens_seen": 1269834752 }, { "epoch": 3.07, "learning_rate": 0.00031074222668004016, "loss": 2.8096, "theoretical_loss": 3.5689283830962775, "tokens_seen": 1269900288 }, { "epoch": 3.07, "learning_rate": 0.0003107321965897693, "loss": 2.7673, "theoretical_loss": 3.568911621197826, "tokens_seen": 1269965824 }, { "epoch": 3.07, "learning_rate": 0.0003107221664994985, "loss": 2.8997, "theoretical_loss": 3.5688948604065254, "tokens_seen": 1270031360 }, { "epoch": 3.07, "learning_rate": 0.0003107121364092277, "loss": 2.9329, "theoretical_loss": 3.568878100722245, "tokens_seen": 1270096896 }, { "epoch": 3.07, "learning_rate": 0.0003107021063189569, "loss": 2.8849, "theoretical_loss": 3.5688613421448547, "tokens_seen": 1270162432 }, { "epoch": 3.07, "learning_rate": 0.00031069207622868607, "loss": 2.8314, "theoretical_loss": 3.5688445846742245, "tokens_seen": 1270227968 }, { "epoch": 3.07, "learning_rate": 0.00031068204613841525, "loss": 2.827, "theoretical_loss": 3.568827828310224, "tokens_seen": 1270293504 }, { "epoch": 3.07, "learning_rate": 0.00031067201604814443, "loss": 2.9081, "theoretical_loss": 3.568811073052723, "tokens_seen": 1270359040 }, { "epoch": 3.07, "learning_rate": 0.00031066198595787367, "loss": 2.8777, "theoretical_loss": 3.568794318901592, "tokens_seen": 1270424576 }, { "epoch": 3.07, "learning_rate": 0.0003106519558676028, "loss": 2.815, "theoretical_loss": 3.5687775658567, "tokens_seen": 1270490112 }, { "epoch": 3.07, "learning_rate": 0.00031064192577733203, "loss": 2.7558, "theoretical_loss": 3.568760813917917, "tokens_seen": 1270555648 }, { "epoch": 3.07, "learning_rate": 0.0003106318956870612, "loss": 2.7231, "theoretical_loss": 3.5687440630851137, "tokens_seen": 1270621184 }, { "epoch": 3.07, "learning_rate": 0.0003106218655967904, "loss": 2.8486, "theoretical_loss": 3.568727313358159, "tokens_seen": 1270686720 }, { "epoch": 3.07, "learning_rate": 0.00031061183550651957, "loss": 2.8382, "theoretical_loss": 3.568710564736924, "tokens_seen": 1270752256 }, { "epoch": 3.07, "learning_rate": 0.00031060180541624875, "loss": 2.7819, "theoretical_loss": 3.568693817221278, "tokens_seen": 1270817792 }, { "epoch": 3.07, "learning_rate": 0.00031059177532597793, "loss": 2.8563, "theoretical_loss": 3.5686770708110904, "tokens_seen": 1270883328 }, { "epoch": 3.07, "learning_rate": 0.00031058174523570717, "loss": 2.791, "theoretical_loss": 3.568660325506232, "tokens_seen": 1270948864 }, { "epoch": 3.07, "learning_rate": 0.0003105717151454363, "loss": 2.9038, "theoretical_loss": 3.5686435813065733, "tokens_seen": 1271014400 }, { "epoch": 3.07, "learning_rate": 0.00031056168505516553, "loss": 2.6734, "theoretical_loss": 3.568626838211984, "tokens_seen": 1271079936 }, { "epoch": 3.07, "learning_rate": 0.00031055165496489466, "loss": 2.7217, "theoretical_loss": 3.5686100962223337, "tokens_seen": 1271145472 }, { "epoch": 3.07, "learning_rate": 0.0003105416248746239, "loss": 2.8985, "theoretical_loss": 3.5685933553374927, "tokens_seen": 1271211008 }, { "epoch": 3.07, "learning_rate": 0.0003105315947843531, "loss": 2.8316, "theoretical_loss": 3.5685766155573315, "tokens_seen": 1271276544 }, { "epoch": 3.07, "learning_rate": 0.00031052156469408226, "loss": 2.804, "theoretical_loss": 3.56855987688172, "tokens_seen": 1271342080 }, { "epoch": 3.07, "objective/train/docs_used": 2034445, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6874160766601562, "objective/train/theoretical_loss": 3.568555692385388, "objective/train/tokens_used": 1291818464, "theoretical_loss": 3.568555692385388, "tokens_seen": 1271358464 }, { "epoch": 3.07, "learning_rate": 0.00031051153460381144, "loss": 2.6484, "theoretical_loss": 3.5685431393105285, "tokens_seen": 1271407616 }, { "epoch": 3.07, "learning_rate": 0.0003105015045135406, "loss": 2.7512, "theoretical_loss": 3.568526402843627, "tokens_seen": 1271473152 }, { "epoch": 3.07, "learning_rate": 0.0003104914744232698, "loss": 2.7451, "theoretical_loss": 3.5685096674808863, "tokens_seen": 1271538688 }, { "epoch": 3.07, "learning_rate": 0.00031048144433299904, "loss": 2.7512, "theoretical_loss": 3.568492933222176, "tokens_seen": 1271604224 }, { "epoch": 3.07, "learning_rate": 0.00031047141424272816, "loss": 2.7715, "theoretical_loss": 3.5684762000673667, "tokens_seen": 1271669760 }, { "epoch": 3.07, "learning_rate": 0.0003104613841524574, "loss": 2.9023, "theoretical_loss": 3.5684594680163286, "tokens_seen": 1271735296 }, { "epoch": 3.07, "learning_rate": 0.0003104513540621866, "loss": 2.8469, "theoretical_loss": 3.5684427370689313, "tokens_seen": 1271800832 }, { "epoch": 3.07, "learning_rate": 0.00031044132397191576, "loss": 2.9312, "theoretical_loss": 3.568426007225047, "tokens_seen": 1271866368 }, { "epoch": 3.07, "learning_rate": 0.00031043129388164494, "loss": 2.8436, "theoretical_loss": 3.5684092784845443, "tokens_seen": 1271931904 }, { "epoch": 3.07, "learning_rate": 0.0003104212637913741, "loss": 2.7894, "theoretical_loss": 3.5683925508472942, "tokens_seen": 1271997440 }, { "epoch": 3.07, "learning_rate": 0.0003104112337011033, "loss": 2.7187, "theoretical_loss": 3.568375824313167, "tokens_seen": 1272062976 }, { "epoch": 3.07, "learning_rate": 0.00031040120361083254, "loss": 2.9318, "theoretical_loss": 3.5683590988820333, "tokens_seen": 1272128512 }, { "epoch": 3.07, "learning_rate": 0.00031039117352056167, "loss": 2.6369, "theoretical_loss": 3.568342374553764, "tokens_seen": 1272194048 }, { "epoch": 3.07, "learning_rate": 0.0003103811434302909, "loss": 2.7708, "theoretical_loss": 3.5683256513282284, "tokens_seen": 1272259584 }, { "epoch": 3.07, "learning_rate": 0.00031037111334002003, "loss": 2.879, "theoretical_loss": 3.5683089292052985, "tokens_seen": 1272325120 }, { "epoch": 3.07, "learning_rate": 0.00031036108324974926, "loss": 2.6508, "theoretical_loss": 3.568292208184843, "tokens_seen": 1272390656 }, { "epoch": 3.07, "learning_rate": 0.00031035105315947844, "loss": 2.6774, "theoretical_loss": 3.5682754882667345, "tokens_seen": 1272456192 }, { "epoch": 3.07, "learning_rate": 0.0003103410230692076, "loss": 2.8611, "theoretical_loss": 3.5682587694508414, "tokens_seen": 1272521728 }, { "epoch": 3.07, "learning_rate": 0.0003103309929789368, "loss": 2.9046, "theoretical_loss": 3.5682420517370357, "tokens_seen": 1272587264 }, { "epoch": 3.08, "learning_rate": 0.000310320962888666, "loss": 2.7873, "theoretical_loss": 3.568225335125188, "tokens_seen": 1272652800 }, { "epoch": 3.08, "learning_rate": 0.00031031093279839517, "loss": 2.856, "theoretical_loss": 3.568208619615168, "tokens_seen": 1272718336 }, { "epoch": 3.08, "learning_rate": 0.0003103009027081244, "loss": 2.798, "theoretical_loss": 3.568191905206847, "tokens_seen": 1272783872 }, { "epoch": 3.08, "learning_rate": 0.00031029087261785353, "loss": 2.7634, "theoretical_loss": 3.568175191900096, "tokens_seen": 1272849408 }, { "epoch": 3.08, "learning_rate": 0.00031028084252758277, "loss": 2.6817, "theoretical_loss": 3.5681584796947847, "tokens_seen": 1272914944 }, { "epoch": 3.08, "learning_rate": 0.00031027081243731195, "loss": 2.9343, "theoretical_loss": 3.5681417685907846, "tokens_seen": 1272980480 }, { "epoch": 3.08, "objective/train/docs_used": 2037271, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.992802619934082, "objective/train/theoretical_loss": 3.5681375909868494, "objective/train/tokens_used": 1293456864, "theoretical_loss": 3.5681375909868494, "tokens_seen": 1272996864 }, { "epoch": 3.08, "learning_rate": 0.00031026078234704113, "loss": 2.873, "theoretical_loss": 3.5681250585879667, "tokens_seen": 1273046016 }, { "epoch": 3.08, "learning_rate": 0.0003102507522567703, "loss": 2.6626, "theoretical_loss": 3.5681083496862005, "tokens_seen": 1273111552 }, { "epoch": 3.08, "learning_rate": 0.0003102407221664995, "loss": 3.0131, "theoretical_loss": 3.568091641885358, "tokens_seen": 1273177088 }, { "epoch": 3.08, "learning_rate": 0.00031023069207622867, "loss": 2.9352, "theoretical_loss": 3.5680749351853094, "tokens_seen": 1273242624 }, { "epoch": 3.08, "learning_rate": 0.0003102206619859579, "loss": 2.5904, "theoretical_loss": 3.5680582295859256, "tokens_seen": 1273308160 }, { "epoch": 3.08, "learning_rate": 0.00031021063189568703, "loss": 2.9928, "theoretical_loss": 3.5680415250870774, "tokens_seen": 1273373696 }, { "epoch": 3.08, "learning_rate": 0.00031020060180541627, "loss": 2.8002, "theoretical_loss": 3.568024821688636, "tokens_seen": 1273439232 }, { "epoch": 3.08, "learning_rate": 0.0003101905717151454, "loss": 2.9039, "theoretical_loss": 3.5680081193904725, "tokens_seen": 1273504768 }, { "epoch": 3.08, "learning_rate": 0.00031018054162487463, "loss": 2.8028, "theoretical_loss": 3.5679914181924564, "tokens_seen": 1273570304 }, { "epoch": 3.08, "learning_rate": 0.0003101705115346038, "loss": 2.8076, "theoretical_loss": 3.56797471809446, "tokens_seen": 1273635840 }, { "epoch": 3.08, "learning_rate": 0.000310160481444333, "loss": 2.8597, "theoretical_loss": 3.567958019096354, "tokens_seen": 1273701376 }, { "epoch": 3.08, "learning_rate": 0.0003101504513540622, "loss": 2.6731, "theoretical_loss": 3.567941321198009, "tokens_seen": 1273766912 }, { "epoch": 3.08, "learning_rate": 0.0003101404212637914, "loss": 2.8598, "theoretical_loss": 3.5679246243992964, "tokens_seen": 1273832448 }, { "epoch": 3.08, "learning_rate": 0.00031013039117352054, "loss": 2.9682, "theoretical_loss": 3.5679079287000866, "tokens_seen": 1273897984 }, { "epoch": 3.08, "learning_rate": 0.00031012036108324977, "loss": 2.8626, "theoretical_loss": 3.5678912341002516, "tokens_seen": 1273963520 }, { "epoch": 3.08, "learning_rate": 0.0003101103309929789, "loss": 2.9677, "theoretical_loss": 3.5678745405996617, "tokens_seen": 1274029056 }, { "epoch": 3.08, "learning_rate": 0.00031010030090270813, "loss": 3.001, "theoretical_loss": 3.567857848198188, "tokens_seen": 1274094592 }, { "epoch": 3.08, "learning_rate": 0.0003100902708124373, "loss": 2.8511, "theoretical_loss": 3.5678411568957022, "tokens_seen": 1274160128 }, { "epoch": 3.08, "learning_rate": 0.0003100802407221665, "loss": 2.957, "theoretical_loss": 3.567824466692074, "tokens_seen": 1274225664 }, { "epoch": 3.08, "learning_rate": 0.0003100702106318957, "loss": 2.7461, "theoretical_loss": 3.567807777587177, "tokens_seen": 1274291200 }, { "epoch": 3.08, "learning_rate": 0.00031006018054162486, "loss": 2.7136, "theoretical_loss": 3.56779108958088, "tokens_seen": 1274356736 }, { "epoch": 3.08, "learning_rate": 0.0003100501504513541, "loss": 2.7836, "theoretical_loss": 3.567774402673056, "tokens_seen": 1274422272 }, { "epoch": 3.08, "learning_rate": 0.0003100401203610833, "loss": 2.7507, "theoretical_loss": 3.567757716863574, "tokens_seen": 1274487808 }, { "epoch": 3.08, "learning_rate": 0.00031003009027081246, "loss": 2.7953, "theoretical_loss": 3.5677410321523078, "tokens_seen": 1274553344 }, { "epoch": 3.08, "learning_rate": 0.00031002006018054164, "loss": 2.7721, "theoretical_loss": 3.567724348539127, "tokens_seen": 1274618880 }, { "epoch": 3.08, "objective/train/docs_used": 2040189, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.507341146469116, "objective/train/theoretical_loss": 3.5677201778073924, "objective/train/tokens_used": 1295095264, "theoretical_loss": 3.5677201778073924, "tokens_seen": 1274635264 }, { "epoch": 3.08, "learning_rate": 0.0003100100300902708, "loss": 2.6806, "theoretical_loss": 3.567707666023903, "tokens_seen": 1274684416 }, { "epoch": 3.08, "learning_rate": 0.00031, "loss": 2.4803, "theoretical_loss": 3.567690984606508, "tokens_seen": 1274749952 }, { "epoch": 3.08, "learning_rate": 0.00030998996990972924, "loss": 2.8835, "theoretical_loss": 3.567674304286812, "tokens_seen": 1274815488 }, { "epoch": 3.08, "learning_rate": 0.00030997993981945836, "loss": 2.987, "theoretical_loss": 3.5676576250646876, "tokens_seen": 1274881024 }, { "epoch": 3.08, "learning_rate": 0.0003099699097291876, "loss": 2.751, "theoretical_loss": 3.5676409469400054, "tokens_seen": 1274946560 }, { "epoch": 3.08, "learning_rate": 0.0003099598796389168, "loss": 2.8225, "theoretical_loss": 3.5676242699126375, "tokens_seen": 1275012096 }, { "epoch": 3.08, "learning_rate": 0.00030994984954864596, "loss": 2.9898, "theoretical_loss": 3.567607593982454, "tokens_seen": 1275077632 }, { "epoch": 3.08, "learning_rate": 0.00030993981945837514, "loss": 2.9676, "theoretical_loss": 3.5675909191493282, "tokens_seen": 1275143168 }, { "epoch": 3.08, "learning_rate": 0.0003099297893681043, "loss": 2.7476, "theoretical_loss": 3.56757424541313, "tokens_seen": 1275208704 }, { "epoch": 3.08, "learning_rate": 0.0003099197592778335, "loss": 2.7606, "theoretical_loss": 3.567557572773731, "tokens_seen": 1275274240 }, { "epoch": 3.08, "learning_rate": 0.00030990972918756274, "loss": 2.6549, "theoretical_loss": 3.5675409012310038, "tokens_seen": 1275339776 }, { "epoch": 3.08, "learning_rate": 0.00030989969909729187, "loss": 2.8866, "theoretical_loss": 3.5675242307848185, "tokens_seen": 1275405312 }, { "epoch": 3.08, "learning_rate": 0.0003098896690070211, "loss": 2.8909, "theoretical_loss": 3.5675075614350478, "tokens_seen": 1275470848 }, { "epoch": 3.08, "learning_rate": 0.00030987963891675023, "loss": 2.98, "theoretical_loss": 3.567490893181562, "tokens_seen": 1275536384 }, { "epoch": 3.08, "learning_rate": 0.00030986960882647946, "loss": 2.8949, "theoretical_loss": 3.567474226024234, "tokens_seen": 1275601920 }, { "epoch": 3.08, "learning_rate": 0.00030985957873620864, "loss": 2.9954, "theoretical_loss": 3.567457559962935, "tokens_seen": 1275667456 }, { "epoch": 3.08, "learning_rate": 0.0003098495486459378, "loss": 2.8288, "theoretical_loss": 3.567440894997536, "tokens_seen": 1275732992 }, { "epoch": 3.08, "learning_rate": 0.000309839518555667, "loss": 2.8295, "theoretical_loss": 3.56742423112791, "tokens_seen": 1275798528 }, { "epoch": 3.08, "learning_rate": 0.0003098294884653962, "loss": 2.7641, "theoretical_loss": 3.5674075683539272, "tokens_seen": 1275864064 }, { "epoch": 3.08, "learning_rate": 0.00030981945837512537, "loss": 2.5984, "theoretical_loss": 3.56739090667546, "tokens_seen": 1275929600 }, { "epoch": 3.08, "learning_rate": 0.0003098094282848546, "loss": 2.8139, "theoretical_loss": 3.5673742460923803, "tokens_seen": 1275995136 }, { "epoch": 3.08, "learning_rate": 0.00030979939819458373, "loss": 2.9395, "theoretical_loss": 3.567357586604559, "tokens_seen": 1276060672 }, { "epoch": 3.08, "learning_rate": 0.00030978936810431297, "loss": 2.742, "theoretical_loss": 3.5673409282118684, "tokens_seen": 1276126208 }, { "epoch": 3.08, "learning_rate": 0.00030977933801404215, "loss": 2.9412, "theoretical_loss": 3.56732427091418, "tokens_seen": 1276191744 }, { "epoch": 3.08, "learning_rate": 0.00030976930792377133, "loss": 2.8299, "theoretical_loss": 3.5673076147113667, "tokens_seen": 1276257280 }, { "epoch": 3.08, "objective/train/docs_used": 2042188, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6980106830596924, "objective/train/theoretical_loss": 3.5673034508317225, "objective/train/tokens_used": 1296733664, "theoretical_loss": 3.5673034508317225, "tokens_seen": 1276273664 }, { "epoch": 3.08, "learning_rate": 0.0003097592778335005, "loss": 2.7173, "theoretical_loss": 3.567290959603299, "tokens_seen": 1276322816 }, { "epoch": 3.08, "learning_rate": 0.0003097492477432297, "loss": 2.8035, "theoretical_loss": 3.5672743055898497, "tokens_seen": 1276388352 }, { "epoch": 3.08, "learning_rate": 0.00030973921765295887, "loss": 2.5309, "theoretical_loss": 3.5672576526708895, "tokens_seen": 1276453888 }, { "epoch": 3.08, "learning_rate": 0.0003097291875626881, "loss": 2.8228, "theoretical_loss": 3.567241000846291, "tokens_seen": 1276519424 }, { "epoch": 3.08, "learning_rate": 0.00030971915747241723, "loss": 2.6796, "theoretical_loss": 3.567224350115927, "tokens_seen": 1276584960 }, { "epoch": 3.08, "learning_rate": 0.00030970912738214647, "loss": 2.8799, "theoretical_loss": 3.5672077004796674, "tokens_seen": 1276650496 }, { "epoch": 3.08, "learning_rate": 0.0003096990972918756, "loss": 2.8746, "theoretical_loss": 3.5671910519373853, "tokens_seen": 1276716032 }, { "epoch": 3.08, "learning_rate": 0.00030968906720160483, "loss": 2.7933, "theoretical_loss": 3.567174404488953, "tokens_seen": 1276781568 }, { "epoch": 3.08, "learning_rate": 0.000309679037111334, "loss": 3.0213, "theoretical_loss": 3.5671577581342424, "tokens_seen": 1276847104 }, { "epoch": 3.08, "learning_rate": 0.0003096690070210632, "loss": 2.796, "theoretical_loss": 3.5671411128731245, "tokens_seen": 1276912640 }, { "epoch": 3.08, "learning_rate": 0.0003096589769307924, "loss": 2.7598, "theoretical_loss": 3.5671244687054724, "tokens_seen": 1276978176 }, { "epoch": 3.08, "learning_rate": 0.0003096489468405216, "loss": 2.7918, "theoretical_loss": 3.5671078256311577, "tokens_seen": 1277043712 }, { "epoch": 3.08, "learning_rate": 0.00030963891675025074, "loss": 2.6643, "theoretical_loss": 3.567091183650053, "tokens_seen": 1277109248 }, { "epoch": 3.08, "learning_rate": 0.00030962888665997997, "loss": 2.8198, "theoretical_loss": 3.5670745427620294, "tokens_seen": 1277174784 }, { "epoch": 3.08, "learning_rate": 0.0003096188565697091, "loss": 2.7618, "theoretical_loss": 3.56705790296696, "tokens_seen": 1277240320 }, { "epoch": 3.08, "learning_rate": 0.00030960882647943833, "loss": 2.7712, "theoretical_loss": 3.567041264264716, "tokens_seen": 1277305856 }, { "epoch": 3.08, "learning_rate": 0.0003095987963891675, "loss": 2.7885, "theoretical_loss": 3.5670246266551704, "tokens_seen": 1277371392 }, { "epoch": 3.08, "learning_rate": 0.0003095887662988967, "loss": 2.8414, "theoretical_loss": 3.567007990138195, "tokens_seen": 1277436928 }, { "epoch": 3.08, "learning_rate": 0.0003095787362086259, "loss": 2.9225, "theoretical_loss": 3.5669913547136627, "tokens_seen": 1277502464 }, { "epoch": 3.08, "learning_rate": 0.00030956870611835506, "loss": 2.9169, "theoretical_loss": 3.5669747203814444, "tokens_seen": 1277568000 }, { "epoch": 3.08, "learning_rate": 0.00030955867602808424, "loss": 2.922, "theoretical_loss": 3.5669580871414133, "tokens_seen": 1277633536 }, { "epoch": 3.08, "learning_rate": 0.0003095486459378135, "loss": 2.8263, "theoretical_loss": 3.566941454993441, "tokens_seen": 1277699072 }, { "epoch": 3.08, "learning_rate": 0.0003095386158475426, "loss": 2.8211, "theoretical_loss": 3.5669248239374007, "tokens_seen": 1277764608 }, { "epoch": 3.08, "learning_rate": 0.00030952858575727184, "loss": 2.78, "theoretical_loss": 3.5669081939731644, "tokens_seen": 1277830144 }, { "epoch": 3.08, "learning_rate": 0.00030951855566700096, "loss": 2.8287, "theoretical_loss": 3.5668915651006037, "tokens_seen": 1277895680 }, { "epoch": 3.08, "objective/train/docs_used": 2045095, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.755298614501953, "objective/train/theoretical_loss": 3.5668874080530233, "objective/train/tokens_used": 1298372064, "theoretical_loss": 3.5668874080530233, "tokens_seen": 1277912064 }, { "epoch": 3.08, "learning_rate": 0.0003095085255767302, "loss": 2.818, "theoretical_loss": 3.566874937319592, "tokens_seen": 1277961216 }, { "epoch": 3.08, "learning_rate": 0.0003094984954864594, "loss": 2.857, "theoretical_loss": 3.566858310630001, "tokens_seen": 1278026752 }, { "epoch": 3.08, "learning_rate": 0.00030948846539618856, "loss": 2.8905, "theoretical_loss": 3.5668416850317035, "tokens_seen": 1278092288 }, { "epoch": 3.08, "learning_rate": 0.00030947843530591774, "loss": 2.9456, "theoretical_loss": 3.5668250605245717, "tokens_seen": 1278157824 }, { "epoch": 3.08, "learning_rate": 0.000309468405215647, "loss": 2.6788, "theoretical_loss": 3.566808437108478, "tokens_seen": 1278223360 }, { "epoch": 3.08, "learning_rate": 0.0003094583751253761, "loss": 2.8242, "theoretical_loss": 3.566791814783295, "tokens_seen": 1278288896 }, { "epoch": 3.08, "learning_rate": 0.00030944834503510534, "loss": 2.9028, "theoretical_loss": 3.566775193548895, "tokens_seen": 1278354432 }, { "epoch": 3.08, "learning_rate": 0.00030943831494483447, "loss": 2.9333, "theoretical_loss": 3.5667585734051506, "tokens_seen": 1278419968 }, { "epoch": 3.08, "learning_rate": 0.0003094282848545637, "loss": 2.7586, "theoretical_loss": 3.5667419543519343, "tokens_seen": 1278485504 }, { "epoch": 3.08, "learning_rate": 0.0003094182547642929, "loss": 2.7254, "theoretical_loss": 3.566725336389119, "tokens_seen": 1278551040 }, { "epoch": 3.08, "learning_rate": 0.00030940822467402207, "loss": 2.852, "theoretical_loss": 3.5667087195165763, "tokens_seen": 1278616576 }, { "epoch": 3.08, "learning_rate": 0.00030939819458375125, "loss": 2.8413, "theoretical_loss": 3.56669210373418, "tokens_seen": 1278682112 }, { "epoch": 3.08, "learning_rate": 0.00030938816449348043, "loss": 2.8396, "theoretical_loss": 3.5666754890418026, "tokens_seen": 1278747648 }, { "epoch": 3.08, "learning_rate": 0.0003093781344032096, "loss": 2.8166, "theoretical_loss": 3.566658875439316, "tokens_seen": 1278813184 }, { "epoch": 3.08, "learning_rate": 0.00030936810431293884, "loss": 2.8419, "theoretical_loss": 3.566642262926593, "tokens_seen": 1278878720 }, { "epoch": 3.08, "learning_rate": 0.00030935807422266797, "loss": 2.7846, "theoretical_loss": 3.5666256515035064, "tokens_seen": 1278944256 }, { "epoch": 3.08, "learning_rate": 0.0003093480441323972, "loss": 2.8627, "theoretical_loss": 3.566609041169929, "tokens_seen": 1279009792 }, { "epoch": 3.08, "learning_rate": 0.00030933801404212633, "loss": 2.7645, "theoretical_loss": 3.5665924319257334, "tokens_seen": 1279075328 }, { "epoch": 3.08, "learning_rate": 0.00030932798395185557, "loss": 2.8159, "theoretical_loss": 3.5665758237707923, "tokens_seen": 1279140864 }, { "epoch": 3.08, "learning_rate": 0.00030931795386158475, "loss": 2.8342, "theoretical_loss": 3.566559216704979, "tokens_seen": 1279206400 }, { "epoch": 3.08, "learning_rate": 0.00030930792377131393, "loss": 2.9264, "theoretical_loss": 3.566542610728166, "tokens_seen": 1279271936 }, { "epoch": 3.08, "learning_rate": 0.00030929789368104317, "loss": 2.7658, "theoretical_loss": 3.5665260058402253, "tokens_seen": 1279337472 }, { "epoch": 3.08, "learning_rate": 0.00030928786359077235, "loss": 2.9363, "theoretical_loss": 3.5665094020410306, "tokens_seen": 1279403008 }, { "epoch": 3.08, "learning_rate": 0.00030927783350050153, "loss": 2.7611, "theoretical_loss": 3.5664927993304545, "tokens_seen": 1279468544 }, { "epoch": 3.08, "learning_rate": 0.0003092678034102307, "loss": 2.827, "theoretical_loss": 3.5664761977083703, "tokens_seen": 1279534080 }, { "epoch": 3.08, "objective/train/docs_used": 2047951, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7561490535736084, "objective/train/theoretical_loss": 3.566472047472911, "objective/train/tokens_used": 1300010464, "theoretical_loss": 3.566472047472911, "tokens_seen": 1279550464 }, { "epoch": 3.08, "learning_rate": 0.0003092577733199599, "loss": 2.8293, "theoretical_loss": 3.56645959717465, "tokens_seen": 1279599616 }, { "epoch": 3.08, "learning_rate": 0.00030924774322968907, "loss": 2.6304, "theoretical_loss": 3.5664429977291676, "tokens_seen": 1279665152 }, { "epoch": 3.08, "learning_rate": 0.0003092377131394183, "loss": 2.9215, "theoretical_loss": 3.566426399371795, "tokens_seen": 1279730688 }, { "epoch": 3.08, "learning_rate": 0.00030922768304914743, "loss": 2.6503, "theoretical_loss": 3.566409802102406, "tokens_seen": 1279796224 }, { "epoch": 3.08, "learning_rate": 0.00030921765295887667, "loss": 2.9219, "theoretical_loss": 3.566393205920873, "tokens_seen": 1279861760 }, { "epoch": 3.08, "learning_rate": 0.0003092076228686058, "loss": 2.7555, "theoretical_loss": 3.566376610827069, "tokens_seen": 1279927296 }, { "epoch": 3.08, "learning_rate": 0.00030919759277833503, "loss": 2.9189, "theoretical_loss": 3.5663600168208673, "tokens_seen": 1279992832 }, { "epoch": 3.08, "learning_rate": 0.0003091875626880642, "loss": 2.8933, "theoretical_loss": 3.5663434239021408, "tokens_seen": 1280058368 }, { "epoch": 3.08, "learning_rate": 0.0003091775325977934, "loss": 2.8728, "theoretical_loss": 3.5663268320707626, "tokens_seen": 1280123904 }, { "epoch": 3.08, "learning_rate": 0.0003091675025075226, "loss": 2.9835, "theoretical_loss": 3.5663102413266063, "tokens_seen": 1280189440 }, { "epoch": 3.08, "learning_rate": 0.0003091574724172518, "loss": 2.8153, "theoretical_loss": 3.566293651669544, "tokens_seen": 1280254976 }, { "epoch": 3.08, "learning_rate": 0.00030914744232698094, "loss": 2.7222, "theoretical_loss": 3.5662770630994496, "tokens_seen": 1280320512 }, { "epoch": 3.08, "learning_rate": 0.00030913741223671017, "loss": 2.6039, "theoretical_loss": 3.5662604756161955, "tokens_seen": 1280386048 }, { "epoch": 3.08, "learning_rate": 0.0003091273821464393, "loss": 2.8839, "theoretical_loss": 3.5662438892196553, "tokens_seen": 1280451584 }, { "epoch": 3.08, "learning_rate": 0.00030911735205616853, "loss": 2.7741, "theoretical_loss": 3.5662273039097028, "tokens_seen": 1280517120 }, { "epoch": 3.08, "learning_rate": 0.0003091073219658977, "loss": 2.6301, "theoretical_loss": 3.56621071968621, "tokens_seen": 1280582656 }, { "epoch": 3.08, "learning_rate": 0.0003090972918756269, "loss": 2.9443, "theoretical_loss": 3.5661941365490515, "tokens_seen": 1280648192 }, { "epoch": 3.08, "learning_rate": 0.0003090872617853561, "loss": 2.9055, "theoretical_loss": 3.5661775544980996, "tokens_seen": 1280713728 }, { "epoch": 3.08, "learning_rate": 0.00030907723169508526, "loss": 2.7646, "theoretical_loss": 3.5661609735332274, "tokens_seen": 1280779264 }, { "epoch": 3.08, "learning_rate": 0.00030906720160481444, "loss": 2.8202, "theoretical_loss": 3.566144393654309, "tokens_seen": 1280844800 }, { "epoch": 3.08, "learning_rate": 0.0003090571715145437, "loss": 2.8147, "theoretical_loss": 3.5661278148612174, "tokens_seen": 1280910336 }, { "epoch": 3.08, "learning_rate": 0.0003090471414242728, "loss": 2.8575, "theoretical_loss": 3.5661112371538257, "tokens_seen": 1280975872 }, { "epoch": 3.08, "learning_rate": 0.00030903711133400204, "loss": 3.0096, "theoretical_loss": 3.5660946605320074, "tokens_seen": 1281041408 }, { "epoch": 3.08, "learning_rate": 0.00030902708124373116, "loss": 2.8467, "theoretical_loss": 3.566078084995636, "tokens_seen": 1281106944 }, { "epoch": 3.08, "learning_rate": 0.0003090170511534604, "loss": 2.7888, "theoretical_loss": 3.5660615105445848, "tokens_seen": 1281172480 }, { "epoch": 3.08, "objective/train/docs_used": 2050720, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0580687522888184, "objective/train/theoretical_loss": 3.566057367101388, "objective/train/tokens_used": 1301648864, "theoretical_loss": 3.566057367101388, "tokens_seen": 1281188864 }, { "epoch": 3.08, "learning_rate": 0.0003090070210631896, "loss": 2.8433, "theoretical_loss": 3.566044937178727, "tokens_seen": 1281238016 }, { "epoch": 3.08, "learning_rate": 0.00030899699097291876, "loss": 2.8327, "theoretical_loss": 3.566028364897936, "tokens_seen": 1281303552 }, { "epoch": 3.08, "learning_rate": 0.00030898696088264794, "loss": 2.9192, "theoretical_loss": 3.5660117937020863, "tokens_seen": 1281369088 }, { "epoch": 3.08, "learning_rate": 0.0003089769307923772, "loss": 2.8039, "theoretical_loss": 3.5659952235910506, "tokens_seen": 1281434624 }, { "epoch": 3.08, "learning_rate": 0.0003089669007021063, "loss": 2.8716, "theoretical_loss": 3.565978654564702, "tokens_seen": 1281500160 }, { "epoch": 3.08, "learning_rate": 0.00030895687061183554, "loss": 2.8519, "theoretical_loss": 3.565962086622914, "tokens_seen": 1281565696 }, { "epoch": 3.08, "learning_rate": 0.00030894684052156467, "loss": 2.7467, "theoretical_loss": 3.5659455197655614, "tokens_seen": 1281631232 }, { "epoch": 3.08, "learning_rate": 0.0003089368104312939, "loss": 2.8168, "theoretical_loss": 3.565928953992517, "tokens_seen": 1281696768 }, { "epoch": 3.08, "learning_rate": 0.0003089267803410231, "loss": 2.8619, "theoretical_loss": 3.565912389303654, "tokens_seen": 1281762304 }, { "epoch": 3.08, "learning_rate": 0.00030891675025075227, "loss": 2.7081, "theoretical_loss": 3.5658958256988464, "tokens_seen": 1281827840 }, { "epoch": 3.08, "learning_rate": 0.00030890672016048145, "loss": 2.8142, "theoretical_loss": 3.565879263177968, "tokens_seen": 1281893376 }, { "epoch": 3.08, "learning_rate": 0.00030889669007021063, "loss": 2.7799, "theoretical_loss": 3.5658627017408926, "tokens_seen": 1281958912 }, { "epoch": 3.08, "learning_rate": 0.0003088866599799398, "loss": 2.7755, "theoretical_loss": 3.5658461413874933, "tokens_seen": 1282024448 }, { "epoch": 3.08, "learning_rate": 0.00030887662988966904, "loss": 2.9732, "theoretical_loss": 3.565829582117644, "tokens_seen": 1282089984 }, { "epoch": 3.08, "learning_rate": 0.00030886659979939817, "loss": 2.8037, "theoretical_loss": 3.5658130239312182, "tokens_seen": 1282155520 }, { "epoch": 3.08, "learning_rate": 0.0003088565697091274, "loss": 2.9296, "theoretical_loss": 3.5657964668280906, "tokens_seen": 1282221056 }, { "epoch": 3.08, "learning_rate": 0.00030884653961885653, "loss": 2.8349, "theoretical_loss": 3.565779910808134, "tokens_seen": 1282286592 }, { "epoch": 3.08, "learning_rate": 0.00030883650952858577, "loss": 2.7321, "theoretical_loss": 3.5657633558712227, "tokens_seen": 1282352128 }, { "epoch": 3.08, "learning_rate": 0.00030882647943831495, "loss": 2.7959, "theoretical_loss": 3.5657468020172294, "tokens_seen": 1282417664 }, { "epoch": 3.08, "learning_rate": 0.00030881644934804413, "loss": 2.6268, "theoretical_loss": 3.56573024924603, "tokens_seen": 1282483200 }, { "epoch": 3.08, "learning_rate": 0.0003088064192577733, "loss": 2.8317, "theoretical_loss": 3.5657136975574963, "tokens_seen": 1282548736 }, { "epoch": 3.08, "learning_rate": 0.00030879638916750255, "loss": 2.9188, "theoretical_loss": 3.5656971469515035, "tokens_seen": 1282614272 }, { "epoch": 3.08, "learning_rate": 0.0003087863590772317, "loss": 2.8718, "theoretical_loss": 3.565680597427925, "tokens_seen": 1282679808 }, { "epoch": 3.08, "learning_rate": 0.0003087763289869609, "loss": 2.72, "theoretical_loss": 3.5656640489866342, "tokens_seen": 1282745344 }, { "epoch": 3.08, "learning_rate": 0.00030876629889669004, "loss": 2.7444, "theoretical_loss": 3.565647501627506, "tokens_seen": 1282810880 }, { "epoch": 3.08, "objective/train/docs_used": 2053595, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.671159029006958, "objective/train/theoretical_loss": 3.5656433649567973, "objective/train/tokens_used": 1303287264, "theoretical_loss": 3.5656433649567973, "tokens_seen": 1282827264 }, { "epoch": 3.08, "learning_rate": 0.00030875626880641927, "loss": 2.7079, "theoretical_loss": 3.5656309553504144, "tokens_seen": 1282876416 }, { "epoch": 3.08, "learning_rate": 0.00030874623871614845, "loss": 2.7463, "theoretical_loss": 3.5656144101552325, "tokens_seen": 1282941952 }, { "epoch": 3.08, "learning_rate": 0.00030873620862587763, "loss": 2.7667, "theoretical_loss": 3.565597866041834, "tokens_seen": 1283007488 }, { "epoch": 3.08, "learning_rate": 0.0003087261785356068, "loss": 2.8892, "theoretical_loss": 3.5655813230100946, "tokens_seen": 1283073024 }, { "epoch": 3.08, "learning_rate": 0.000308716148445336, "loss": 3.0491, "theoretical_loss": 3.5655647810598867, "tokens_seen": 1283138560 }, { "epoch": 3.08, "learning_rate": 0.0003087061183550652, "loss": 2.6954, "theoretical_loss": 3.5655482401910854, "tokens_seen": 1283204096 }, { "epoch": 3.08, "learning_rate": 0.0003086960882647944, "loss": 2.8254, "theoretical_loss": 3.565531700403564, "tokens_seen": 1283269632 }, { "epoch": 3.08, "learning_rate": 0.00030868605817452354, "loss": 2.816, "theoretical_loss": 3.5655151616971974, "tokens_seen": 1283335168 }, { "epoch": 3.08, "learning_rate": 0.0003086760280842528, "loss": 2.7198, "theoretical_loss": 3.565498624071859, "tokens_seen": 1283400704 }, { "epoch": 3.08, "learning_rate": 0.0003086659979939819, "loss": 2.8836, "theoretical_loss": 3.5654820875274233, "tokens_seen": 1283466240 }, { "epoch": 3.08, "learning_rate": 0.00030865596790371114, "loss": 2.9465, "theoretical_loss": 3.565465552063765, "tokens_seen": 1283531776 }, { "epoch": 3.08, "learning_rate": 0.0003086459378134403, "loss": 3.03, "theoretical_loss": 3.565449017680757, "tokens_seen": 1283597312 }, { "epoch": 3.08, "learning_rate": 0.0003086359077231695, "loss": 2.8489, "theoretical_loss": 3.5654324843782743, "tokens_seen": 1283662848 }, { "epoch": 3.08, "learning_rate": 0.0003086258776328987, "loss": 2.7073, "theoretical_loss": 3.565415952156191, "tokens_seen": 1283728384 }, { "epoch": 3.08, "learning_rate": 0.0003086158475426279, "loss": 2.7077, "theoretical_loss": 3.565399421014382, "tokens_seen": 1283793920 }, { "epoch": 3.08, "learning_rate": 0.00030860581745235704, "loss": 2.841, "theoretical_loss": 3.56538289095272, "tokens_seen": 1283859456 }, { "epoch": 3.08, "learning_rate": 0.0003085957873620863, "loss": 2.9124, "theoretical_loss": 3.5653663619710807, "tokens_seen": 1283924992 }, { "epoch": 3.08, "learning_rate": 0.0003085857572718154, "loss": 2.8044, "theoretical_loss": 3.5653498340693384, "tokens_seen": 1283990528 }, { "epoch": 3.08, "learning_rate": 0.00030857572718154464, "loss": 2.7368, "theoretical_loss": 3.5653333072473665, "tokens_seen": 1284056064 }, { "epoch": 3.08, "learning_rate": 0.0003085656970912738, "loss": 2.7881, "theoretical_loss": 3.56531678150504, "tokens_seen": 1284121600 }, { "epoch": 3.08, "learning_rate": 0.000308555667001003, "loss": 2.9394, "theoretical_loss": 3.565300256842233, "tokens_seen": 1284187136 }, { "epoch": 3.08, "learning_rate": 0.00030854563691073224, "loss": 2.8206, "theoretical_loss": 3.56528373325882, "tokens_seen": 1284252672 }, { "epoch": 3.08, "learning_rate": 0.00030853560682046137, "loss": 2.9449, "theoretical_loss": 3.5652672107546755, "tokens_seen": 1284318208 }, { "epoch": 3.08, "learning_rate": 0.0003085255767301906, "loss": 2.7905, "theoretical_loss": 3.565250689329674, "tokens_seen": 1284383744 }, { "epoch": 3.08, "learning_rate": 0.0003085155466399198, "loss": 2.8243, "theoretical_loss": 3.5652341689836895, "tokens_seen": 1284449280 }, { "epoch": 3.08, "objective/train/docs_used": 2056231, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.272430181503296, "objective/train/theoretical_loss": 3.565230039065775, "objective/train/tokens_used": 1304925664, "theoretical_loss": 3.565230039065775, "tokens_seen": 1284465664 }, { "epoch": 3.08, "learning_rate": 0.00030850551654964896, "loss": 2.9441, "theoretical_loss": 3.5652176497165975, "tokens_seen": 1284514816 }, { "epoch": 3.08, "learning_rate": 0.00030849548645937814, "loss": 2.7882, "theoretical_loss": 3.565201131528271, "tokens_seen": 1284580352 }, { "epoch": 3.08, "learning_rate": 0.0003084854563691074, "loss": 2.7605, "theoretical_loss": 3.5651846144185857, "tokens_seen": 1284645888 }, { "epoch": 3.08, "learning_rate": 0.0003084754262788365, "loss": 2.8858, "theoretical_loss": 3.565168098387416, "tokens_seen": 1284711424 }, { "epoch": 3.08, "learning_rate": 0.00030846539618856574, "loss": 2.7759, "theoretical_loss": 3.565151583434636, "tokens_seen": 1284776960 }, { "epoch": 3.08, "learning_rate": 0.00030845536609829487, "loss": 2.9479, "theoretical_loss": 3.565135069560121, "tokens_seen": 1284842496 }, { "epoch": 3.08, "learning_rate": 0.0003084453360080241, "loss": 2.9092, "theoretical_loss": 3.5651185567637445, "tokens_seen": 1284908032 }, { "epoch": 3.08, "learning_rate": 0.0003084353059177533, "loss": 2.829, "theoretical_loss": 3.565102045045382, "tokens_seen": 1284973568 }, { "epoch": 3.08, "learning_rate": 0.00030842527582748247, "loss": 2.8063, "theoretical_loss": 3.5650855344049077, "tokens_seen": 1285039104 }, { "epoch": 3.08, "learning_rate": 0.00030841524573721165, "loss": 2.928, "theoretical_loss": 3.565069024842196, "tokens_seen": 1285104640 }, { "epoch": 3.08, "learning_rate": 0.00030840521564694083, "loss": 2.9136, "theoretical_loss": 3.565052516357123, "tokens_seen": 1285170176 }, { "epoch": 3.08, "learning_rate": 0.00030839518555667, "loss": 2.9731, "theoretical_loss": 3.565036008949562, "tokens_seen": 1285235712 }, { "epoch": 3.08, "learning_rate": 0.00030838515546639924, "loss": 2.9301, "theoretical_loss": 3.565019502619389, "tokens_seen": 1285301248 }, { "epoch": 3.08, "learning_rate": 0.00030837512537612837, "loss": 2.8052, "theoretical_loss": 3.565002997366477, "tokens_seen": 1285366784 }, { "epoch": 3.08, "learning_rate": 0.0003083650952858576, "loss": 2.8715, "theoretical_loss": 3.5649864931907023, "tokens_seen": 1285432320 }, { "epoch": 3.08, "learning_rate": 0.00030835506519558673, "loss": 2.8253, "theoretical_loss": 3.5649699900919387, "tokens_seen": 1285497856 }, { "epoch": 3.08, "learning_rate": 0.00030834503510531597, "loss": 2.6142, "theoretical_loss": 3.564953488070062, "tokens_seen": 1285563392 }, { "epoch": 3.08, "learning_rate": 0.00030833500501504515, "loss": 2.8056, "theoretical_loss": 3.564936987124946, "tokens_seen": 1285628928 }, { "epoch": 3.08, "learning_rate": 0.00030832497492477433, "loss": 2.6789, "theoretical_loss": 3.564920487256466, "tokens_seen": 1285694464 }, { "epoch": 3.08, "learning_rate": 0.0003083149448345035, "loss": 2.6607, "theoretical_loss": 3.564903988464498, "tokens_seen": 1285760000 }, { "epoch": 3.08, "learning_rate": 0.00030830491474423275, "loss": 2.8736, "theoretical_loss": 3.5648874907489145, "tokens_seen": 1285825536 }, { "epoch": 3.08, "learning_rate": 0.0003082948846539619, "loss": 2.8778, "theoretical_loss": 3.564870994109592, "tokens_seen": 1285891072 }, { "epoch": 3.08, "learning_rate": 0.0003082848545636911, "loss": 2.8991, "theoretical_loss": 3.5648544985464055, "tokens_seen": 1285956608 }, { "epoch": 3.08, "learning_rate": 0.00030827482447342024, "loss": 2.7046, "theoretical_loss": 3.56483800405923, "tokens_seen": 1286022144 }, { "epoch": 3.08, "learning_rate": 0.00030826479438314947, "loss": 2.8303, "theoretical_loss": 3.5648215106479393, "tokens_seen": 1286087680 }, { "epoch": 3.08, "objective/train/docs_used": 2057685, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9047701358795166, "objective/train/theoretical_loss": 3.5648173874632096, "objective/train/tokens_used": 1306564064, "theoretical_loss": 3.5648173874632096, "tokens_seen": 1286104064 }, { "epoch": 3.08, "learning_rate": 0.00030825476429287865, "loss": 2.8594, "theoretical_loss": 3.56480501831241, "tokens_seen": 1286153216 }, { "epoch": 3.08, "learning_rate": 0.00030824473420260783, "loss": 2.7933, "theoretical_loss": 3.564788527052516, "tokens_seen": 1286218752 }, { "epoch": 3.08, "learning_rate": 0.000308234704112337, "loss": 2.8361, "theoretical_loss": 3.5647720368681326, "tokens_seen": 1286284288 }, { "epoch": 3.08, "learning_rate": 0.0003082246740220662, "loss": 2.9337, "theoretical_loss": 3.564755547759135, "tokens_seen": 1286349824 }, { "epoch": 3.08, "learning_rate": 0.0003082146439317954, "loss": 2.8558, "theoretical_loss": 3.5647390597253983, "tokens_seen": 1286415360 }, { "epoch": 3.08, "learning_rate": 0.0003082046138415246, "loss": 2.6352, "theoretical_loss": 3.5647225727667973, "tokens_seen": 1286480896 }, { "epoch": 3.08, "learning_rate": 0.00030819458375125374, "loss": 2.8214, "theoretical_loss": 3.564706086883208, "tokens_seen": 1286546432 }, { "epoch": 3.08, "learning_rate": 0.000308184553660983, "loss": 2.7587, "theoretical_loss": 3.5646896020745045, "tokens_seen": 1286611968 }, { "epoch": 3.08, "learning_rate": 0.0003081745235707121, "loss": 2.8347, "theoretical_loss": 3.5646731183405627, "tokens_seen": 1286677504 }, { "epoch": 3.08, "learning_rate": 0.00030816449348044134, "loss": 2.9672, "theoretical_loss": 3.5646566356812572, "tokens_seen": 1286743040 }, { "epoch": 3.08, "learning_rate": 0.0003081544633901705, "loss": 2.8373, "theoretical_loss": 3.564640154096464, "tokens_seen": 1286808576 }, { "epoch": 3.08, "learning_rate": 0.0003081444332998997, "loss": 2.782, "theoretical_loss": 3.5646236735860573, "tokens_seen": 1286874112 }, { "epoch": 3.08, "learning_rate": 0.0003081344032096289, "loss": 2.8336, "theoretical_loss": 3.564607194149913, "tokens_seen": 1286939648 }, { "epoch": 3.08, "learning_rate": 0.0003081243731193581, "loss": 2.7307, "theoretical_loss": 3.564590715787907, "tokens_seen": 1287005184 }, { "epoch": 3.08, "learning_rate": 0.00030811434302908724, "loss": 2.7777, "theoretical_loss": 3.564574238499913, "tokens_seen": 1287070720 }, { "epoch": 3.08, "learning_rate": 0.0003081043129388165, "loss": 2.7292, "theoretical_loss": 3.5645577622858076, "tokens_seen": 1287136256 }, { "epoch": 3.08, "learning_rate": 0.0003080942828485456, "loss": 2.8924, "theoretical_loss": 3.564541287145466, "tokens_seen": 1287201792 }, { "epoch": 3.08, "learning_rate": 0.00030808425275827484, "loss": 2.8401, "theoretical_loss": 3.5645248130787626, "tokens_seen": 1287267328 }, { "epoch": 3.08, "learning_rate": 0.000308074222668004, "loss": 2.8797, "theoretical_loss": 3.564508340085574, "tokens_seen": 1287332864 }, { "epoch": 3.08, "learning_rate": 0.0003080641925777332, "loss": 2.814, "theoretical_loss": 3.5644918681657747, "tokens_seen": 1287398400 }, { "epoch": 3.08, "learning_rate": 0.0003080541624874624, "loss": 2.9278, "theoretical_loss": 3.5644753973192405, "tokens_seen": 1287463936 }, { "epoch": 3.08, "learning_rate": 0.00030804413239719157, "loss": 2.8492, "theoretical_loss": 3.5644589275458474, "tokens_seen": 1287529472 }, { "epoch": 3.08, "learning_rate": 0.00030803410230692075, "loss": 2.8715, "theoretical_loss": 3.5644424588454697, "tokens_seen": 1287595008 }, { "epoch": 3.08, "learning_rate": 0.00030802407221665, "loss": 2.8693, "theoretical_loss": 3.564425991217984, "tokens_seen": 1287660544 }, { "epoch": 3.08, "learning_rate": 0.0003080140421263791, "loss": 2.8238, "theoretical_loss": 3.5644095246632648, "tokens_seen": 1287726080 }, { "epoch": 3.08, "objective/train/docs_used": 2060564, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6895647048950195, "objective/train/theoretical_loss": 3.56440540819219, "objective/train/tokens_used": 1308202464, "theoretical_loss": 3.56440540819219, "tokens_seen": 1287742464 }, { "epoch": 3.08, "learning_rate": 0.00030800401203610834, "loss": 2.7429, "theoretical_loss": 3.5643930591811883, "tokens_seen": 1287791616 }, { "epoch": 3.08, "learning_rate": 0.0003079939819458375, "loss": 2.8074, "theoretical_loss": 3.56437659477163, "tokens_seen": 1287857152 }, { "epoch": 3.08, "learning_rate": 0.0003079839518555667, "loss": 2.8484, "theoretical_loss": 3.564360131434465, "tokens_seen": 1287922688 }, { "epoch": 3.08, "learning_rate": 0.0003079739217652959, "loss": 2.8502, "theoretical_loss": 3.5643436691695696, "tokens_seen": 1287988224 }, { "epoch": 3.08, "learning_rate": 0.00030796389167502507, "loss": 2.7296, "theoretical_loss": 3.5643272079768185, "tokens_seen": 1288053760 }, { "epoch": 3.08, "learning_rate": 0.00030795386158475425, "loss": 2.7973, "theoretical_loss": 3.5643107478560885, "tokens_seen": 1288119296 }, { "epoch": 3.08, "learning_rate": 0.0003079438314944835, "loss": 2.8589, "theoretical_loss": 3.564294288807254, "tokens_seen": 1288184832 }, { "epoch": 3.08, "learning_rate": 0.0003079338014042126, "loss": 2.8833, "theoretical_loss": 3.564277830830192, "tokens_seen": 1288250368 }, { "epoch": 3.08, "learning_rate": 0.00030792377131394185, "loss": 2.8744, "theoretical_loss": 3.5642613739247766, "tokens_seen": 1288315904 }, { "epoch": 3.08, "learning_rate": 0.000307913741223671, "loss": 2.873, "theoretical_loss": 3.5642449180908846, "tokens_seen": 1288381440 }, { "epoch": 3.08, "learning_rate": 0.0003079037111334002, "loss": 2.8117, "theoretical_loss": 3.564228463328392, "tokens_seen": 1288446976 }, { "epoch": 3.08, "learning_rate": 0.0003078936810431294, "loss": 2.7698, "theoretical_loss": 3.5642120096371737, "tokens_seen": 1288512512 }, { "epoch": 3.08, "learning_rate": 0.00030788365095285857, "loss": 2.7774, "theoretical_loss": 3.564195557017106, "tokens_seen": 1288578048 }, { "epoch": 3.08, "learning_rate": 0.00030787362086258775, "loss": 2.7229, "theoretical_loss": 3.5641791054680643, "tokens_seen": 1288643584 }, { "epoch": 3.08, "learning_rate": 0.00030786359077231693, "loss": 2.8985, "theoretical_loss": 3.5641626549899246, "tokens_seen": 1288709120 }, { "epoch": 3.08, "learning_rate": 0.0003078535606820461, "loss": 2.6499, "theoretical_loss": 3.564146205582563, "tokens_seen": 1288774656 }, { "epoch": 3.08, "learning_rate": 0.00030784353059177535, "loss": 2.8404, "theoretical_loss": 3.564129757245855, "tokens_seen": 1288840192 }, { "epoch": 3.08, "learning_rate": 0.0003078335005015045, "loss": 2.7768, "theoretical_loss": 3.5641133099796765, "tokens_seen": 1288905728 }, { "epoch": 3.08, "learning_rate": 0.0003078234704112337, "loss": 3.0153, "theoretical_loss": 3.5640968637839037, "tokens_seen": 1288971264 }, { "epoch": 3.08, "learning_rate": 0.0003078134403209629, "loss": 2.8298, "theoretical_loss": 3.5640804186584125, "tokens_seen": 1289036800 }, { "epoch": 3.08, "learning_rate": 0.0003078034102306921, "loss": 2.8477, "theoretical_loss": 3.564063974603078, "tokens_seen": 1289102336 }, { "epoch": 3.08, "learning_rate": 0.0003077933801404213, "loss": 2.7826, "theoretical_loss": 3.5640475316177778, "tokens_seen": 1289167872 }, { "epoch": 3.08, "learning_rate": 0.00030778335005015044, "loss": 2.8496, "theoretical_loss": 3.564031089702386, "tokens_seen": 1289233408 }, { "epoch": 3.08, "learning_rate": 0.00030777331995987967, "loss": 2.8117, "theoretical_loss": 3.56401464885678, "tokens_seen": 1289298944 }, { "epoch": 3.08, "learning_rate": 0.00030776328986960885, "loss": 2.9735, "theoretical_loss": 3.563998209080835, "tokens_seen": 1289364480 }, { "epoch": 3.08, "objective/train/docs_used": 2063230, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.94197154045105, "objective/train/theoretical_loss": 3.563994099303969, "objective/train/tokens_used": 1309840864, "theoretical_loss": 3.563994099303969, "tokens_seen": 1289380864 }, { "epoch": 3.08, "learning_rate": 0.00030775325977933803, "loss": 2.9038, "theoretical_loss": 3.563981770374428, "tokens_seen": 1289430016 }, { "epoch": 3.08, "learning_rate": 0.0003077432296890672, "loss": 2.7779, "theoretical_loss": 3.5639653327374337, "tokens_seen": 1289495552 }, { "epoch": 3.08, "learning_rate": 0.0003077331995987964, "loss": 2.8222, "theoretical_loss": 3.5639488961697294, "tokens_seen": 1289561088 }, { "epoch": 3.08, "learning_rate": 0.0003077231695085256, "loss": 2.8521, "theoretical_loss": 3.5639324606711904, "tokens_seen": 1289626624 }, { "epoch": 3.08, "learning_rate": 0.0003077131394182548, "loss": 2.7309, "theoretical_loss": 3.5639160262416936, "tokens_seen": 1289692160 }, { "epoch": 3.08, "learning_rate": 0.00030770310932798394, "loss": 2.8944, "theoretical_loss": 3.563899592881114, "tokens_seen": 1289757696 }, { "epoch": 3.08, "learning_rate": 0.0003076930792377132, "loss": 2.8254, "theoretical_loss": 3.563883160589329, "tokens_seen": 1289823232 }, { "epoch": 3.08, "learning_rate": 0.0003076830491474423, "loss": 2.8558, "theoretical_loss": 3.5638667293662145, "tokens_seen": 1289888768 }, { "epoch": 3.08, "learning_rate": 0.00030767301905717154, "loss": 2.8639, "theoretical_loss": 3.563850299211646, "tokens_seen": 1289954304 }, { "epoch": 3.08, "learning_rate": 0.0003076629889669007, "loss": 2.7628, "theoretical_loss": 3.5638338701255003, "tokens_seen": 1290019840 }, { "epoch": 3.08, "learning_rate": 0.0003076529588766299, "loss": 2.8572, "theoretical_loss": 3.563817442107654, "tokens_seen": 1290085376 }, { "epoch": 3.08, "learning_rate": 0.0003076429287863591, "loss": 2.9019, "theoretical_loss": 3.563801015157982, "tokens_seen": 1290150912 }, { "epoch": 3.08, "learning_rate": 0.0003076328986960883, "loss": 2.7502, "theoretical_loss": 3.563784589276362, "tokens_seen": 1290216448 }, { "epoch": 3.08, "learning_rate": 0.00030762286860581744, "loss": 2.7712, "theoretical_loss": 3.5637681644626698, "tokens_seen": 1290281984 }, { "epoch": 3.08, "learning_rate": 0.0003076128385155467, "loss": 2.8537, "theoretical_loss": 3.563751740716782, "tokens_seen": 1290347520 }, { "epoch": 3.08, "learning_rate": 0.0003076028084252758, "loss": 2.7836, "theoretical_loss": 3.5637353180385745, "tokens_seen": 1290413056 }, { "epoch": 3.08, "learning_rate": 0.00030759277833500504, "loss": 2.7941, "theoretical_loss": 3.5637188964279236, "tokens_seen": 1290478592 }, { "epoch": 3.08, "learning_rate": 0.0003075827482447342, "loss": 2.5539, "theoretical_loss": 3.563702475884706, "tokens_seen": 1290544128 }, { "epoch": 3.08, "learning_rate": 0.0003075727181544634, "loss": 2.7128, "theoretical_loss": 3.563686056408798, "tokens_seen": 1290609664 }, { "epoch": 3.08, "learning_rate": 0.0003075626880641926, "loss": 2.9419, "theoretical_loss": 3.5636696380000767, "tokens_seen": 1290675200 }, { "epoch": 3.08, "learning_rate": 0.00030755265797392177, "loss": 2.8378, "theoretical_loss": 3.5636532206584173, "tokens_seen": 1290740736 }, { "epoch": 3.08, "learning_rate": 0.00030754262788365095, "loss": 2.8606, "theoretical_loss": 3.5636368043836972, "tokens_seen": 1290806272 }, { "epoch": 3.08, "learning_rate": 0.0003075325977933802, "loss": 2.8728, "theoretical_loss": 3.5636203891757927, "tokens_seen": 1290871808 }, { "epoch": 3.08, "learning_rate": 0.0003075225677031093, "loss": 2.8559, "theoretical_loss": 3.5636039750345807, "tokens_seen": 1290937344 }, { "epoch": 3.08, "learning_rate": 0.00030751253761283854, "loss": 2.7561, "theoretical_loss": 3.5635875619599364, "tokens_seen": 1291002880 }, { "epoch": 3.08, "objective/train/docs_used": 2066144, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.896148920059204, "objective/train/theoretical_loss": 3.5635834588579125, "objective/train/tokens_used": 1311479264, "theoretical_loss": 3.5635834588579125, "tokens_seen": 1291019264 }, { "epoch": 3.08, "learning_rate": 0.0003075025075225677, "loss": 3.0019, "theoretical_loss": 3.5635711499517377, "tokens_seen": 1291068416 }, { "epoch": 3.08, "learning_rate": 0.0003074924774322969, "loss": 2.7729, "theoretical_loss": 3.5635547390098608, "tokens_seen": 1291133952 }, { "epoch": 3.08, "learning_rate": 0.0003074824473420261, "loss": 2.6531, "theoretical_loss": 3.5635383291341824, "tokens_seen": 1291199488 }, { "epoch": 3.08, "learning_rate": 0.00030747241725175527, "loss": 2.8833, "theoretical_loss": 3.5635219203245785, "tokens_seen": 1291265024 }, { "epoch": 3.08, "learning_rate": 0.00030746238716148445, "loss": 2.735, "theoretical_loss": 3.5635055125809263, "tokens_seen": 1291330560 }, { "epoch": 3.08, "learning_rate": 0.0003074523570712137, "loss": 2.9572, "theoretical_loss": 3.5634891059031024, "tokens_seen": 1291396096 }, { "epoch": 3.08, "learning_rate": 0.0003074423269809428, "loss": 2.853, "theoretical_loss": 3.5634727002909834, "tokens_seen": 1291461632 }, { "epoch": 3.08, "learning_rate": 0.00030743229689067205, "loss": 2.8459, "theoretical_loss": 3.5634562957444462, "tokens_seen": 1291527168 }, { "epoch": 3.08, "learning_rate": 0.0003074222668004012, "loss": 2.7542, "theoretical_loss": 3.563439892263367, "tokens_seen": 1291592704 }, { "epoch": 3.08, "learning_rate": 0.0003074122367101304, "loss": 2.8185, "theoretical_loss": 3.5634234898476236, "tokens_seen": 1291658240 }, { "epoch": 3.08, "learning_rate": 0.0003074022066198596, "loss": 2.8744, "theoretical_loss": 3.5634070884970916, "tokens_seen": 1291723776 }, { "epoch": 3.08, "learning_rate": 0.00030739217652958877, "loss": 2.6677, "theoretical_loss": 3.563390688211648, "tokens_seen": 1291789312 }, { "epoch": 3.08, "learning_rate": 0.00030738214643931795, "loss": 2.8331, "theoretical_loss": 3.56337428899117, "tokens_seen": 1291854848 }, { "epoch": 3.08, "learning_rate": 0.00030737211634904713, "loss": 2.7535, "theoretical_loss": 3.563357890835535, "tokens_seen": 1291920384 }, { "epoch": 3.08, "learning_rate": 0.0003073620862587763, "loss": 2.7739, "theoretical_loss": 3.563341493744618, "tokens_seen": 1291985920 }, { "epoch": 3.08, "learning_rate": 0.00030735205616850555, "loss": 2.9396, "theoretical_loss": 3.5633250977182978, "tokens_seen": 1292051456 }, { "epoch": 3.08, "learning_rate": 0.0003073420260782347, "loss": 2.9535, "theoretical_loss": 3.5633087027564505, "tokens_seen": 1292116992 }, { "epoch": 3.08, "learning_rate": 0.0003073319959879639, "loss": 2.7878, "theoretical_loss": 3.5632923088589523, "tokens_seen": 1292182528 }, { "epoch": 3.08, "learning_rate": 0.0003073219658976931, "loss": 2.8176, "theoretical_loss": 3.563275916025681, "tokens_seen": 1292248064 }, { "epoch": 3.08, "learning_rate": 0.0003073119358074223, "loss": 2.866, "theoretical_loss": 3.5632595242565137, "tokens_seen": 1292313600 }, { "epoch": 3.08, "learning_rate": 0.00030730190571715146, "loss": 2.8714, "theoretical_loss": 3.563243133551327, "tokens_seen": 1292379136 }, { "epoch": 3.08, "learning_rate": 0.00030729187562688064, "loss": 2.7196, "theoretical_loss": 3.563226743909998, "tokens_seen": 1292444672 }, { "epoch": 3.08, "learning_rate": 0.0003072818455366098, "loss": 2.7525, "theoretical_loss": 3.5632103553324033, "tokens_seen": 1292510208 }, { "epoch": 3.08, "learning_rate": 0.00030727181544633905, "loss": 2.771, "theoretical_loss": 3.56319396781842, "tokens_seen": 1292575744 }, { "epoch": 3.08, "learning_rate": 0.0003072617853560682, "loss": 2.8291, "theoretical_loss": 3.563177581367926, "tokens_seen": 1292641280 }, { "epoch": 3.08, "objective/train/docs_used": 2068589, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.926602363586426, "objective/train/theoretical_loss": 3.5631734849214585, "objective/train/tokens_used": 1313117664, "theoretical_loss": 3.5631734849214585, "tokens_seen": 1292657664 }, { "epoch": 3.08, "learning_rate": 0.0003072517552657974, "loss": 2.7905, "theoretical_loss": 3.563161195980798, "tokens_seen": 1292706816 }, { "epoch": 3.08, "learning_rate": 0.00030724172517552654, "loss": 2.8511, "theoretical_loss": 3.5631448116569118, "tokens_seen": 1292772352 }, { "epoch": 3.08, "learning_rate": 0.0003072316950852558, "loss": 2.9001, "theoretical_loss": 3.563128428396146, "tokens_seen": 1292837888 }, { "epoch": 3.08, "learning_rate": 0.00030722166499498496, "loss": 2.9564, "theoretical_loss": 3.563112046198378, "tokens_seen": 1292903424 }, { "epoch": 3.08, "learning_rate": 0.00030721163490471414, "loss": 2.7404, "theoretical_loss": 3.5630956650634835, "tokens_seen": 1292968960 }, { "epoch": 3.08, "learning_rate": 0.0003072016048144433, "loss": 2.7867, "theoretical_loss": 3.563079284991341, "tokens_seen": 1293034496 }, { "epoch": 3.08, "learning_rate": 0.0003071915747241725, "loss": 2.9799, "theoretical_loss": 3.563062905981827, "tokens_seen": 1293100032 }, { "epoch": 3.08, "learning_rate": 0.0003071815446339017, "loss": 2.8573, "theoretical_loss": 3.5630465280348185, "tokens_seen": 1293165568 }, { "epoch": 3.08, "learning_rate": 0.0003071715145436309, "loss": 2.8362, "theoretical_loss": 3.563030151150193, "tokens_seen": 1293231104 }, { "epoch": 3.08, "learning_rate": 0.00030716148445336005, "loss": 2.912, "theoretical_loss": 3.5630137753278284, "tokens_seen": 1293296640 }, { "epoch": 3.08, "learning_rate": 0.0003071514543630893, "loss": 2.7502, "theoretical_loss": 3.5629974005676006, "tokens_seen": 1293362176 }, { "epoch": 3.08, "learning_rate": 0.00030714142427281846, "loss": 2.7029, "theoretical_loss": 3.5629810268693882, "tokens_seen": 1293427712 }, { "epoch": 3.08, "learning_rate": 0.00030713139418254764, "loss": 2.7338, "theoretical_loss": 3.562964654233068, "tokens_seen": 1293493248 }, { "epoch": 3.08, "learning_rate": 0.0003071213640922768, "loss": 2.8494, "theoretical_loss": 3.5629482826585175, "tokens_seen": 1293558784 }, { "epoch": 3.08, "learning_rate": 0.000307111334002006, "loss": 2.8369, "theoretical_loss": 3.562931912145614, "tokens_seen": 1293624320 }, { "epoch": 3.08, "learning_rate": 0.0003071013039117352, "loss": 2.8778, "theoretical_loss": 3.562915542694234, "tokens_seen": 1293689856 }, { "epoch": 3.08, "learning_rate": 0.0003070912738214644, "loss": 2.8795, "theoretical_loss": 3.562899174304256, "tokens_seen": 1293755392 }, { "epoch": 3.08, "learning_rate": 0.00030708124373119355, "loss": 2.7773, "theoretical_loss": 3.5628828069755576, "tokens_seen": 1293820928 }, { "epoch": 3.08, "learning_rate": 0.0003070712136409228, "loss": 2.9088, "theoretical_loss": 3.5628664407080155, "tokens_seen": 1293886464 }, { "epoch": 3.08, "learning_rate": 0.0003070611835506519, "loss": 2.766, "theoretical_loss": 3.5628500755015073, "tokens_seen": 1293952000 }, { "epoch": 3.08, "learning_rate": 0.00030705115346038115, "loss": 2.7332, "theoretical_loss": 3.5628337113559105, "tokens_seen": 1294017536 }, { "epoch": 3.08, "learning_rate": 0.0003070411233701104, "loss": 2.8046, "theoretical_loss": 3.562817348271103, "tokens_seen": 1294083072 }, { "epoch": 3.08, "learning_rate": 0.0003070310932798395, "loss": 2.9611, "theoretical_loss": 3.562800986246962, "tokens_seen": 1294148608 }, { "epoch": 3.08, "learning_rate": 0.00030702106318956874, "loss": 2.7184, "theoretical_loss": 3.562784625283365, "tokens_seen": 1294214144 }, { "epoch": 3.08, "learning_rate": 0.0003070110330992979, "loss": 2.7914, "theoretical_loss": 3.562768265380189, "tokens_seen": 1294279680 }, { "epoch": 3.08, "objective/train/docs_used": 2071519, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9766716957092285, "objective/train/theoretical_loss": 3.562764175570072, "objective/train/tokens_used": 1314756064, "theoretical_loss": 3.562764175570072, "tokens_seen": 1294296064 }, { "epoch": 3.08, "learning_rate": 0.0003070010030090271, "loss": 2.8257, "theoretical_loss": 3.5627519065373128, "tokens_seen": 1294345216 }, { "epoch": 3.08, "learning_rate": 0.0003069909729187563, "loss": 2.7773, "theoretical_loss": 3.5627355487546133, "tokens_seen": 1294410752 }, { "epoch": 3.08, "learning_rate": 0.00030698094282848547, "loss": 2.9655, "theoretical_loss": 3.562719192031968, "tokens_seen": 1294476288 }, { "epoch": 3.08, "learning_rate": 0.00030697091273821465, "loss": 2.8812, "theoretical_loss": 3.5627028363692546, "tokens_seen": 1294541824 }, { "epoch": 3.08, "learning_rate": 0.0003069608826479439, "loss": 2.7393, "theoretical_loss": 3.5626864817663515, "tokens_seen": 1294607360 }, { "epoch": 3.08, "learning_rate": 0.000306950852557673, "loss": 2.7738, "theoretical_loss": 3.562670128223135, "tokens_seen": 1294672896 }, { "epoch": 3.08, "learning_rate": 0.00030694082246740225, "loss": 2.8539, "theoretical_loss": 3.5626537757394843, "tokens_seen": 1294738432 }, { "epoch": 3.08, "learning_rate": 0.0003069307923771314, "loss": 2.8844, "theoretical_loss": 3.562637424315276, "tokens_seen": 1294803968 }, { "epoch": 3.08, "learning_rate": 0.0003069207622868606, "loss": 2.8087, "theoretical_loss": 3.562621073950388, "tokens_seen": 1294869504 }, { "epoch": 3.08, "learning_rate": 0.0003069107321965898, "loss": 2.7098, "theoretical_loss": 3.562604724644699, "tokens_seen": 1294935040 }, { "epoch": 3.08, "learning_rate": 0.00030690070210631897, "loss": 2.7998, "theoretical_loss": 3.5625883763980855, "tokens_seen": 1295000576 }, { "epoch": 3.08, "learning_rate": 0.00030689067201604815, "loss": 2.7148, "theoretical_loss": 3.5625720292104264, "tokens_seen": 1295066112 }, { "epoch": 3.08, "learning_rate": 0.00030688064192577733, "loss": 2.8172, "theoretical_loss": 3.5625556830815985, "tokens_seen": 1295131648 }, { "epoch": 3.08, "learning_rate": 0.0003068706118355065, "loss": 2.7691, "theoretical_loss": 3.5625393380114803, "tokens_seen": 1295197184 }, { "epoch": 3.08, "learning_rate": 0.00030686058174523575, "loss": 2.7821, "theoretical_loss": 3.56252299399995, "tokens_seen": 1295262720 }, { "epoch": 3.08, "learning_rate": 0.0003068505516549649, "loss": 2.8479, "theoretical_loss": 3.5625066510468844, "tokens_seen": 1295328256 }, { "epoch": 3.08, "learning_rate": 0.0003068405215646941, "loss": 2.7416, "theoretical_loss": 3.5624903091521625, "tokens_seen": 1295393792 }, { "epoch": 3.08, "learning_rate": 0.0003068304914744233, "loss": 2.6438, "theoretical_loss": 3.562473968315661, "tokens_seen": 1295459328 }, { "epoch": 3.08, "learning_rate": 0.0003068204613841525, "loss": 2.7384, "theoretical_loss": 3.56245762853726, "tokens_seen": 1295524864 }, { "epoch": 3.08, "learning_rate": 0.00030681043129388166, "loss": 2.8675, "theoretical_loss": 3.5624412898168347, "tokens_seen": 1295590400 }, { "epoch": 3.08, "learning_rate": 0.00030680040120361084, "loss": 2.8992, "theoretical_loss": 3.562424952154265, "tokens_seen": 1295655936 }, { "epoch": 3.08, "learning_rate": 0.00030679037111334, "loss": 2.8271, "theoretical_loss": 3.562408615549428, "tokens_seen": 1295721472 }, { "epoch": 3.08, "learning_rate": 0.00030678034102306925, "loss": 2.8593, "theoretical_loss": 3.5623922800022028, "tokens_seen": 1295787008 }, { "epoch": 3.08, "learning_rate": 0.0003067703109327984, "loss": 2.5556, "theoretical_loss": 3.562375945512466, "tokens_seen": 1295852544 }, { "epoch": 3.08, "learning_rate": 0.0003067602808425276, "loss": 2.8395, "theoretical_loss": 3.5623596120800967, "tokens_seen": 1295918080 }, { "epoch": 3.08, "objective/train/docs_used": 2072917, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7524263858795166, "objective/train/theoretical_loss": 3.5623555288872035, "objective/train/tokens_used": 1316394464, "theoretical_loss": 3.5623555288872035, "tokens_seen": 1295934464 }, { "epoch": 3.08, "learning_rate": 0.00030675025075225674, "loss": 2.7662, "theoretical_loss": 3.5623432797049723, "tokens_seen": 1295983616 }, { "epoch": 3.08, "learning_rate": 0.000306740220661986, "loss": 2.805, "theoretical_loss": 3.562326948386972, "tokens_seen": 1296049152 }, { "epoch": 3.08, "learning_rate": 0.00030673019057171516, "loss": 2.851, "theoretical_loss": 3.5623106181259727, "tokens_seen": 1296114688 }, { "epoch": 3.08, "learning_rate": 0.00030672016048144434, "loss": 2.9163, "theoretical_loss": 3.562294288921853, "tokens_seen": 1296180224 }, { "epoch": 3.08, "learning_rate": 0.0003067101303911735, "loss": 2.8924, "theoretical_loss": 3.5622779607744914, "tokens_seen": 1296245760 }, { "epoch": 3.08, "learning_rate": 0.0003067001003009027, "loss": 2.7037, "theoretical_loss": 3.562261633683766, "tokens_seen": 1296311296 }, { "epoch": 3.08, "learning_rate": 0.0003066900702106319, "loss": 2.9213, "theoretical_loss": 3.5622453076495546, "tokens_seen": 1296376832 }, { "epoch": 3.08, "learning_rate": 0.0003066800401203611, "loss": 2.9023, "theoretical_loss": 3.5622289826717353, "tokens_seen": 1296442368 }, { "epoch": 3.08, "learning_rate": 0.00030667001003009025, "loss": 2.7144, "theoretical_loss": 3.5622126587501874, "tokens_seen": 1296507904 }, { "epoch": 3.08, "learning_rate": 0.0003066599799398195, "loss": 2.8865, "theoretical_loss": 3.5621963358847877, "tokens_seen": 1296573440 }, { "epoch": 3.08, "learning_rate": 0.00030664994984954866, "loss": 2.7256, "theoretical_loss": 3.562180014075416, "tokens_seen": 1296638976 }, { "epoch": 3.08, "learning_rate": 0.00030663991975927784, "loss": 2.7899, "theoretical_loss": 3.562163693321949, "tokens_seen": 1296704512 }, { "epoch": 3.08, "learning_rate": 0.000306629889669007, "loss": 2.8193, "theoretical_loss": 3.562147373624267, "tokens_seen": 1296770048 }, { "epoch": 3.08, "learning_rate": 0.0003066198595787362, "loss": 2.814, "theoretical_loss": 3.5621310549822462, "tokens_seen": 1296835584 }, { "epoch": 3.08, "learning_rate": 0.0003066098294884654, "loss": 2.7878, "theoretical_loss": 3.562114737395767, "tokens_seen": 1296901120 }, { "epoch": 3.08, "learning_rate": 0.0003065997993981946, "loss": 2.6979, "theoretical_loss": 3.5620984208647064, "tokens_seen": 1296966656 }, { "epoch": 3.08, "learning_rate": 0.00030658976930792375, "loss": 2.7431, "theoretical_loss": 3.562082105388943, "tokens_seen": 1297032192 }, { "epoch": 3.08, "learning_rate": 0.000306579739217653, "loss": 2.7368, "theoretical_loss": 3.5620657909683553, "tokens_seen": 1297097728 }, { "epoch": 3.08, "learning_rate": 0.0003065697091273821, "loss": 2.6833, "theoretical_loss": 3.562049477602822, "tokens_seen": 1297163264 }, { "epoch": 3.08, "learning_rate": 0.00030655967903711135, "loss": 2.8553, "theoretical_loss": 3.562033165292222, "tokens_seen": 1297228800 }, { "epoch": 3.08, "learning_rate": 0.00030654964894684053, "loss": 2.7307, "theoretical_loss": 3.562016854036433, "tokens_seen": 1297294336 }, { "epoch": 3.08, "learning_rate": 0.0003065396188565697, "loss": 2.8942, "theoretical_loss": 3.5620005438353335, "tokens_seen": 1297359872 }, { "epoch": 3.08, "learning_rate": 0.0003065295887662989, "loss": 2.7982, "theoretical_loss": 3.561984234688803, "tokens_seen": 1297425408 }, { "epoch": 3.08, "learning_rate": 0.0003065195586760281, "loss": 2.8977, "theoretical_loss": 3.5619679265967186, "tokens_seen": 1297490944 }, { "epoch": 3.08, "learning_rate": 0.00030650952858575725, "loss": 2.7068, "theoretical_loss": 3.5619516195589602, "tokens_seen": 1297556480 }, { "epoch": 3.08, "objective/train/docs_used": 2075639, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.735356569290161, "objective/train/theoretical_loss": 3.561947542964244, "objective/train/tokens_used": 1318032864, "theoretical_loss": 3.561947542964244, "tokens_seen": 1297572864 }, { "epoch": 3.08, "learning_rate": 0.0003064994984954865, "loss": 2.9051, "theoretical_loss": 3.5619353135754053, "tokens_seen": 1297622016 }, { "epoch": 3.08, "learning_rate": 0.0003064894684052156, "loss": 2.7495, "theoretical_loss": 3.5619190086459334, "tokens_seen": 1297687552 }, { "epoch": 3.08, "learning_rate": 0.00030647943831494485, "loss": 2.8358, "theoretical_loss": 3.5619027047704224, "tokens_seen": 1297753088 }, { "epoch": 3.08, "learning_rate": 0.00030646940822467403, "loss": 2.8858, "theoretical_loss": 3.561886401948752, "tokens_seen": 1297818624 }, { "epoch": 3.08, "learning_rate": 0.0003064593781344032, "loss": 3.0167, "theoretical_loss": 3.5618701001807995, "tokens_seen": 1297884160 }, { "epoch": 3.08, "learning_rate": 0.0003064493480441324, "loss": 2.9028, "theoretical_loss": 3.5618537994664443, "tokens_seen": 1297949696 }, { "epoch": 3.08, "learning_rate": 0.0003064393179538616, "loss": 2.8397, "theoretical_loss": 3.5618374998055655, "tokens_seen": 1298015232 }, { "epoch": 3.08, "learning_rate": 0.00030642928786359076, "loss": 2.7016, "theoretical_loss": 3.561821201198041, "tokens_seen": 1298080768 }, { "epoch": 3.08, "learning_rate": 0.00030641925777332, "loss": 2.9269, "theoretical_loss": 3.5618049036437505, "tokens_seen": 1298146304 }, { "epoch": 3.08, "learning_rate": 0.0003064092276830491, "loss": 2.7679, "theoretical_loss": 3.5617886071425717, "tokens_seen": 1298211840 }, { "epoch": 3.08, "learning_rate": 0.00030639919759277835, "loss": 2.9695, "theoretical_loss": 3.5617723116943845, "tokens_seen": 1298277376 }, { "epoch": 3.08, "learning_rate": 0.0003063891675025075, "loss": 2.8432, "theoretical_loss": 3.561756017299066, "tokens_seen": 1298342912 }, { "epoch": 3.08, "learning_rate": 0.0003063791374122367, "loss": 2.7302, "theoretical_loss": 3.5617397239564976, "tokens_seen": 1298408448 }, { "epoch": 3.08, "learning_rate": 0.0003063691073219659, "loss": 2.8772, "theoretical_loss": 3.561723431666556, "tokens_seen": 1298473984 }, { "epoch": 3.08, "learning_rate": 0.0003063590772316951, "loss": 2.8507, "theoretical_loss": 3.561707140429121, "tokens_seen": 1298539520 }, { "epoch": 3.08, "learning_rate": 0.00030634904714142426, "loss": 2.7423, "theoretical_loss": 3.561690850244071, "tokens_seen": 1298605056 }, { "epoch": 3.08, "learning_rate": 0.0003063390170511535, "loss": 2.8166, "theoretical_loss": 3.561674561111286, "tokens_seen": 1298670592 }, { "epoch": 3.08, "learning_rate": 0.0003063289869608826, "loss": 2.8376, "theoretical_loss": 3.5616582730306434, "tokens_seen": 1298736128 }, { "epoch": 3.08, "learning_rate": 0.00030631895687061186, "loss": 2.8221, "theoretical_loss": 3.561641986002023, "tokens_seen": 1298801664 }, { "epoch": 3.08, "learning_rate": 0.000306308926780341, "loss": 2.898, "theoretical_loss": 3.5616257000253038, "tokens_seen": 1298867200 }, { "epoch": 3.08, "learning_rate": 0.0003062988966900702, "loss": 2.8395, "theoretical_loss": 3.5616094151003646, "tokens_seen": 1298932736 }, { "epoch": 3.08, "learning_rate": 0.00030628886659979945, "loss": 2.7567, "theoretical_loss": 3.5615931312270845, "tokens_seen": 1298998272 }, { "epoch": 3.08, "learning_rate": 0.0003062788365095286, "loss": 2.8432, "theoretical_loss": 3.561576848405343, "tokens_seen": 1299063808 }, { "epoch": 3.08, "learning_rate": 0.0003062688064192578, "loss": 2.7232, "theoretical_loss": 3.561560566635018, "tokens_seen": 1299129344 }, { "epoch": 3.08, "learning_rate": 0.00030625877632898694, "loss": 2.8342, "theoretical_loss": 3.5615442859159896, "tokens_seen": 1299194880 }, { "epoch": 3.08, "objective/train/docs_used": 2078608, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8635072708129883, "objective/train/theoretical_loss": 3.561540215900483, "objective/train/tokens_used": 1319671264, "theoretical_loss": 3.561540215900483, "tokens_seen": 1299211264 }, { "epoch": 3.08, "learning_rate": 0.0003062487462387162, "loss": 2.7995, "theoretical_loss": 3.5615280062481363, "tokens_seen": 1299260416 }, { "epoch": 3.08, "learning_rate": 0.00030623871614844536, "loss": 2.7681, "theoretical_loss": 3.5615117276313377, "tokens_seen": 1299325952 }, { "epoch": 3.08, "learning_rate": 0.00030622868605817454, "loss": 2.7028, "theoretical_loss": 3.5614954500654727, "tokens_seen": 1299391488 }, { "epoch": 3.08, "learning_rate": 0.0003062186559679037, "loss": 2.5921, "theoretical_loss": 3.56147917355042, "tokens_seen": 1299457024 }, { "epoch": 3.08, "learning_rate": 0.0003062086258776329, "loss": 2.8382, "theoretical_loss": 3.5614628980860594, "tokens_seen": 1299522560 }, { "epoch": 3.08, "learning_rate": 0.0003061985957873621, "loss": 2.7246, "theoretical_loss": 3.5614466236722704, "tokens_seen": 1299588096 }, { "epoch": 3.08, "learning_rate": 0.0003061885656970913, "loss": 2.8859, "theoretical_loss": 3.561430350308931, "tokens_seen": 1299653632 }, { "epoch": 3.08, "learning_rate": 0.00030617853560682045, "loss": 2.8366, "theoretical_loss": 3.561414077995922, "tokens_seen": 1299719168 }, { "epoch": 3.08, "learning_rate": 0.0003061685055165497, "loss": 2.8476, "theoretical_loss": 3.5613978067331207, "tokens_seen": 1299784704 }, { "epoch": 3.08, "learning_rate": 0.00030615847542627886, "loss": 2.7546, "theoretical_loss": 3.561381536520408, "tokens_seen": 1299850240 }, { "epoch": 3.08, "learning_rate": 0.00030614844533600804, "loss": 2.9657, "theoretical_loss": 3.5613652673576626, "tokens_seen": 1299915776 }, { "epoch": 3.08, "learning_rate": 0.0003061384152457372, "loss": 2.7871, "theoretical_loss": 3.561348999244764, "tokens_seen": 1299981312 }, { "epoch": 3.08, "learning_rate": 0.0003061283851554664, "loss": 2.907, "theoretical_loss": 3.561332732181591, "tokens_seen": 1300046848 }, { "epoch": 3.08, "learning_rate": 0.0003061183550651956, "loss": 2.7236, "theoretical_loss": 3.561316466168024, "tokens_seen": 1300112384 }, { "epoch": 3.08, "learning_rate": 0.0003061083249749248, "loss": 2.7547, "theoretical_loss": 3.5613002012039416, "tokens_seen": 1300177920 }, { "epoch": 3.08, "learning_rate": 0.00030609829488465395, "loss": 2.8775, "theoretical_loss": 3.561283937289223, "tokens_seen": 1300243456 }, { "epoch": 3.08, "learning_rate": 0.0003060882647943832, "loss": 2.742, "theoretical_loss": 3.561267674423748, "tokens_seen": 1300308992 }, { "epoch": 3.08, "learning_rate": 0.0003060782347041123, "loss": 2.7848, "theoretical_loss": 3.561251412607396, "tokens_seen": 1300374528 }, { "epoch": 3.08, "learning_rate": 0.00030606820461384155, "loss": 2.8747, "theoretical_loss": 3.5612351518400467, "tokens_seen": 1300440064 }, { "epoch": 3.08, "learning_rate": 0.00030605817452357073, "loss": 2.7519, "theoretical_loss": 3.5612188921215786, "tokens_seen": 1300505600 }, { "epoch": 3.08, "learning_rate": 0.0003060481444332999, "loss": 2.8274, "theoretical_loss": 3.561202633451873, "tokens_seen": 1300571136 }, { "epoch": 3.08, "learning_rate": 0.0003060381143430291, "loss": 2.8002, "theoretical_loss": 3.5611863758308075, "tokens_seen": 1300636672 }, { "epoch": 3.08, "learning_rate": 0.0003060280842527583, "loss": 2.8903, "theoretical_loss": 3.5611701192582625, "tokens_seen": 1300702208 }, { "epoch": 3.08, "learning_rate": 0.00030601805416248745, "loss": 3.0564, "theoretical_loss": 3.5611538637341176, "tokens_seen": 1300767744 }, { "epoch": 3.08, "learning_rate": 0.0003060080240722167, "loss": 2.6549, "theoretical_loss": 3.5611376092582523, "tokens_seen": 1300833280 }, { "epoch": 3.08, "objective/train/docs_used": 2081298, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.674614429473877, "objective/train/theoretical_loss": 3.5611335458030653, "objective/train/tokens_used": 1321309664, "theoretical_loss": 3.5611335458030653, "tokens_seen": 1300849664 }, { "epoch": 3.08, "learning_rate": 0.0003059979939819458, "loss": 2.7628, "theoretical_loss": 3.5611213558305463, "tokens_seen": 1300898816 }, { "epoch": 3.08, "learning_rate": 0.00030598796389167505, "loss": 2.9318, "theoretical_loss": 3.5611051034508785, "tokens_seen": 1300964352 }, { "epoch": 3.08, "learning_rate": 0.00030597793380140423, "loss": 2.8147, "theoretical_loss": 3.5610888521191297, "tokens_seen": 1301029888 }, { "epoch": 3.08, "learning_rate": 0.0003059679037111334, "loss": 2.7811, "theoretical_loss": 3.5610726018351784, "tokens_seen": 1301095424 }, { "epoch": 3.08, "learning_rate": 0.0003059578736208626, "loss": 2.4773, "theoretical_loss": 3.5610563525989054, "tokens_seen": 1301160960 }, { "epoch": 3.08, "learning_rate": 0.0003059478435305918, "loss": 2.7579, "theoretical_loss": 3.561040104410189, "tokens_seen": 1301226496 }, { "epoch": 3.08, "learning_rate": 0.00030593781344032096, "loss": 2.9412, "theoretical_loss": 3.5610238572689106, "tokens_seen": 1301292032 }, { "epoch": 3.08, "learning_rate": 0.0003059277833500502, "loss": 2.7683, "theoretical_loss": 3.561007611174948, "tokens_seen": 1301357568 }, { "epoch": 3.08, "learning_rate": 0.0003059177532597793, "loss": 2.8385, "theoretical_loss": 3.560991366128183, "tokens_seen": 1301423104 }, { "epoch": 3.08, "learning_rate": 0.00030590772316950855, "loss": 2.7746, "theoretical_loss": 3.560975122128494, "tokens_seen": 1301488640 }, { "epoch": 3.08, "learning_rate": 0.0003058976930792377, "loss": 2.7085, "theoretical_loss": 3.560958879175761, "tokens_seen": 1301554176 }, { "epoch": 3.08, "learning_rate": 0.0003058876629889669, "loss": 2.8334, "theoretical_loss": 3.5609426372698643, "tokens_seen": 1301619712 }, { "epoch": 3.08, "learning_rate": 0.0003058776328986961, "loss": 2.7888, "theoretical_loss": 3.560926396410683, "tokens_seen": 1301685248 }, { "epoch": 3.08, "learning_rate": 0.0003058676028084253, "loss": 3.0387, "theoretical_loss": 3.560910156598098, "tokens_seen": 1301750784 }, { "epoch": 3.08, "learning_rate": 0.00030585757271815446, "loss": 2.9202, "theoretical_loss": 3.5608939178319883, "tokens_seen": 1301816320 }, { "epoch": 3.08, "learning_rate": 0.0003058475426278837, "loss": 2.8029, "theoretical_loss": 3.5608776801122333, "tokens_seen": 1301881856 }, { "epoch": 3.08, "learning_rate": 0.0003058375125376128, "loss": 2.8992, "theoretical_loss": 3.5608614434387142, "tokens_seen": 1301947392 }, { "epoch": 3.08, "learning_rate": 0.00030582748244734206, "loss": 2.9309, "theoretical_loss": 3.5608452078113104, "tokens_seen": 1302012928 }, { "epoch": 3.08, "learning_rate": 0.0003058174523570712, "loss": 2.7889, "theoretical_loss": 3.5608289732299014, "tokens_seen": 1302078464 }, { "epoch": 3.08, "learning_rate": 0.0003058074222668004, "loss": 2.689, "theoretical_loss": 3.560812739694368, "tokens_seen": 1302144000 }, { "epoch": 3.08, "learning_rate": 0.0003057973921765296, "loss": 2.8939, "theoretical_loss": 3.5607965072045893, "tokens_seen": 1302209536 }, { "epoch": 3.08, "learning_rate": 0.0003057873620862588, "loss": 2.6919, "theoretical_loss": 3.5607802757604463, "tokens_seen": 1302275072 }, { "epoch": 3.08, "learning_rate": 0.00030577733199598796, "loss": 2.8195, "theoretical_loss": 3.560764045361818, "tokens_seen": 1302340608 }, { "epoch": 3.08, "learning_rate": 0.00030576730190571714, "loss": 2.7624, "theoretical_loss": 3.5607478160085853, "tokens_seen": 1302406144 }, { "epoch": 3.08, "learning_rate": 0.0003057572718154463, "loss": 2.8217, "theoretical_loss": 3.560731587700628, "tokens_seen": 1302471680 }, { "epoch": 3.08, "objective/train/docs_used": 2083714, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7642292976379395, "objective/train/theoretical_loss": 3.5607275307869486, "objective/train/tokens_used": 1322948064, "theoretical_loss": 3.5607275307869486, "tokens_seen": 1302488064 }, { "epoch": 3.08, "learning_rate": 0.00030574724172517556, "loss": 2.8265, "theoretical_loss": 3.5607153604378254, "tokens_seen": 1302537216 }, { "epoch": 3.08, "learning_rate": 0.0003057372116349047, "loss": 2.8194, "theoretical_loss": 3.560699134220059, "tokens_seen": 1302602752 }, { "epoch": 3.08, "learning_rate": 0.0003057271815446339, "loss": 2.8703, "theoretical_loss": 3.560682909047208, "tokens_seen": 1302668288 }, { "epoch": 3.08, "learning_rate": 0.00030571715145436305, "loss": 2.9596, "theoretical_loss": 3.560666684919153, "tokens_seen": 1302733824 }, { "epoch": 3.08, "learning_rate": 0.0003057071213640923, "loss": 2.7315, "theoretical_loss": 3.560650461835774, "tokens_seen": 1302799360 }, { "epoch": 3.08, "learning_rate": 0.00030569709127382147, "loss": 3.1129, "theoretical_loss": 3.5606342397969506, "tokens_seen": 1302864896 }, { "epoch": 3.08, "learning_rate": 0.00030568706118355065, "loss": 2.8945, "theoretical_loss": 3.560618018802564, "tokens_seen": 1302930432 }, { "epoch": 3.08, "learning_rate": 0.00030567703109327983, "loss": 2.9162, "theoretical_loss": 3.5606017988524936, "tokens_seen": 1302995968 }, { "epoch": 3.08, "learning_rate": 0.00030566700100300906, "loss": 2.7446, "theoretical_loss": 3.56058557994662, "tokens_seen": 1303061504 }, { "epoch": 3.08, "learning_rate": 0.0003056569709127382, "loss": 2.8664, "theoretical_loss": 3.5605693620848236, "tokens_seen": 1303127040 }, { "epoch": 3.08, "learning_rate": 0.0003056469408224674, "loss": 2.9412, "theoretical_loss": 3.5605531452669847, "tokens_seen": 1303192576 }, { "epoch": 3.08, "learning_rate": 0.00030563691073219655, "loss": 2.8503, "theoretical_loss": 3.560536929492983, "tokens_seen": 1303258112 }, { "epoch": 3.08, "learning_rate": 0.0003056268806419258, "loss": 2.7895, "theoretical_loss": 3.5605207147627, "tokens_seen": 1303323648 }, { "epoch": 3.08, "learning_rate": 0.00030561685055165497, "loss": 2.777, "theoretical_loss": 3.5605045010760152, "tokens_seen": 1303389184 }, { "epoch": 3.08, "learning_rate": 0.00030560682046138415, "loss": 2.9033, "theoretical_loss": 3.5604882884328086, "tokens_seen": 1303454720 }, { "epoch": 3.08, "learning_rate": 0.00030559679037111333, "loss": 2.8584, "theoretical_loss": 3.560472076832961, "tokens_seen": 1303520256 }, { "epoch": 3.08, "learning_rate": 0.0003055867602808425, "loss": 2.7187, "theoretical_loss": 3.560455866276354, "tokens_seen": 1303585792 }, { "epoch": 3.08, "learning_rate": 0.0003055767301905717, "loss": 2.8747, "theoretical_loss": 3.560439656762866, "tokens_seen": 1303651328 }, { "epoch": 3.08, "learning_rate": 0.00030556670010030093, "loss": 2.912, "theoretical_loss": 3.5604234482923784, "tokens_seen": 1303716864 }, { "epoch": 3.08, "learning_rate": 0.00030555667001003006, "loss": 2.9839, "theoretical_loss": 3.560407240864772, "tokens_seen": 1303782400 }, { "epoch": 3.08, "learning_rate": 0.0003055466399197593, "loss": 2.7364, "theoretical_loss": 3.5603910344799266, "tokens_seen": 1303847936 }, { "epoch": 3.08, "learning_rate": 0.0003055366098294885, "loss": 2.8229, "theoretical_loss": 3.560374829137723, "tokens_seen": 1303913472 }, { "epoch": 3.08, "learning_rate": 0.00030552657973921765, "loss": 2.8001, "theoretical_loss": 3.5603586248380417, "tokens_seen": 1303979008 }, { "epoch": 3.08, "learning_rate": 0.0003055165496489469, "loss": 2.6069, "theoretical_loss": 3.5603424215807635, "tokens_seen": 1304044544 }, { "epoch": 3.08, "learning_rate": 0.000305506519558676, "loss": 2.8047, "theoretical_loss": 3.5603262193657685, "tokens_seen": 1304110080 }, { "epoch": 3.08, "objective/train/docs_used": 2086380, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9482533931732178, "objective/train/theoretical_loss": 3.5603221689748628, "objective/train/tokens_used": 1324586464, "theoretical_loss": 3.5603221689748628, "tokens_seen": 1304126464 }, { "epoch": 3.08, "learning_rate": 0.00030549648946840525, "loss": 2.7718, "theoretical_loss": 3.5603100181929372, "tokens_seen": 1304175616 }, { "epoch": 3.08, "learning_rate": 0.00030548645937813443, "loss": 2.8159, "theoretical_loss": 3.560293818062151, "tokens_seen": 1304241152 }, { "epoch": 3.08, "learning_rate": 0.0003054764292878636, "loss": 2.8362, "theoretical_loss": 3.56027761897329, "tokens_seen": 1304306688 }, { "epoch": 3.08, "learning_rate": 0.0003054663991975928, "loss": 2.8967, "theoretical_loss": 3.560261420926235, "tokens_seen": 1304372224 }, { "epoch": 3.08, "learning_rate": 0.000305456369107322, "loss": 2.633, "theoretical_loss": 3.5602452239208664, "tokens_seen": 1304437760 }, { "epoch": 3.08, "learning_rate": 0.00030544633901705116, "loss": 3.0119, "theoretical_loss": 3.5602290279570648, "tokens_seen": 1304503296 }, { "epoch": 3.08, "learning_rate": 0.0003054363089267804, "loss": 2.8338, "theoretical_loss": 3.5602128330347114, "tokens_seen": 1304568832 }, { "epoch": 3.08, "learning_rate": 0.0003054262788365095, "loss": 2.7553, "theoretical_loss": 3.560196639153687, "tokens_seen": 1304634368 }, { "epoch": 3.08, "learning_rate": 0.00030541624874623875, "loss": 2.7132, "theoretical_loss": 3.560180446313871, "tokens_seen": 1304699904 }, { "epoch": 3.08, "learning_rate": 0.0003054062186559679, "loss": 2.7107, "theoretical_loss": 3.5601642545151453, "tokens_seen": 1304765440 }, { "epoch": 3.08, "learning_rate": 0.0003053961885656971, "loss": 2.9019, "theoretical_loss": 3.5601480637573912, "tokens_seen": 1304830976 }, { "epoch": 3.08, "learning_rate": 0.0003053861584754263, "loss": 2.876, "theoretical_loss": 3.560131874040488, "tokens_seen": 1304896512 }, { "epoch": 3.08, "learning_rate": 0.0003053761283851555, "loss": 2.6381, "theoretical_loss": 3.560115685364318, "tokens_seen": 1304962048 }, { "epoch": 3.08, "learning_rate": 0.00030536609829488466, "loss": 2.9158, "theoretical_loss": 3.560099497728761, "tokens_seen": 1305027584 }, { "epoch": 3.08, "learning_rate": 0.0003053560682046139, "loss": 2.8376, "theoretical_loss": 3.5600833111336985, "tokens_seen": 1305093120 }, { "epoch": 3.08, "learning_rate": 0.000305346038114343, "loss": 2.8408, "theoretical_loss": 3.560067125579011, "tokens_seen": 1305158656 }, { "epoch": 3.08, "learning_rate": 0.00030533600802407226, "loss": 2.8186, "theoretical_loss": 3.560050941064579, "tokens_seen": 1305224192 }, { "epoch": 3.08, "learning_rate": 0.0003053259779338014, "loss": 2.7829, "theoretical_loss": 3.5600347575902846, "tokens_seen": 1305289728 }, { "epoch": 3.08, "learning_rate": 0.0003053159478435306, "loss": 2.8828, "theoretical_loss": 3.5600185751560076, "tokens_seen": 1305355264 }, { "epoch": 3.08, "learning_rate": 0.0003053059177532598, "loss": 2.8743, "theoretical_loss": 3.5600023937616294, "tokens_seen": 1305420800 }, { "epoch": 3.08, "learning_rate": 0.000305295887662989, "loss": 2.7934, "theoretical_loss": 3.559986213407031, "tokens_seen": 1305486336 }, { "epoch": 3.08, "learning_rate": 0.00030528585757271816, "loss": 2.8089, "theoretical_loss": 3.5599700340920934, "tokens_seen": 1305551872 }, { "epoch": 3.08, "learning_rate": 0.00030527582748244734, "loss": 2.7533, "theoretical_loss": 3.5599538558166977, "tokens_seen": 1305617408 }, { "epoch": 3.09, "learning_rate": 0.0003052657973921765, "loss": 2.8994, "theoretical_loss": 3.5599376785807246, "tokens_seen": 1305682944 }, { "epoch": 3.09, "learning_rate": 0.00030525576730190576, "loss": 2.7856, "theoretical_loss": 3.559921502384056, "tokens_seen": 1305748480 }, { "epoch": 3.09, "objective/train/docs_used": 2089277, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5143356323242188, "objective/train/theoretical_loss": 3.5599174584972655, "objective/train/tokens_used": 1326224864, "theoretical_loss": 3.5599174584972655, "tokens_seen": 1305764864 }, { "epoch": 3.09, "learning_rate": 0.0003052457372116349, "loss": 2.7391, "theoretical_loss": 3.5599053272265717, "tokens_seen": 1305814016 }, { "epoch": 3.09, "learning_rate": 0.0003052357071213641, "loss": 2.7603, "theoretical_loss": 3.5598891531081533, "tokens_seen": 1305879552 }, { "epoch": 3.09, "learning_rate": 0.00030522567703109325, "loss": 2.9153, "theoretical_loss": 3.5598729800286826, "tokens_seen": 1305945088 }, { "epoch": 3.09, "learning_rate": 0.0003052156469408225, "loss": 2.7986, "theoretical_loss": 3.5598568079880395, "tokens_seen": 1306010624 }, { "epoch": 3.09, "learning_rate": 0.00030520561685055167, "loss": 2.9569, "theoretical_loss": 3.559840636986106, "tokens_seen": 1306076160 }, { "epoch": 3.09, "learning_rate": 0.00030519558676028085, "loss": 2.7954, "theoretical_loss": 3.5598244670227635, "tokens_seen": 1306141696 }, { "epoch": 3.09, "learning_rate": 0.00030518555667001003, "loss": 2.8342, "theoretical_loss": 3.5598082980978925, "tokens_seen": 1306207232 }, { "epoch": 3.09, "learning_rate": 0.00030517552657973926, "loss": 2.8244, "theoretical_loss": 3.559792130211374, "tokens_seen": 1306272768 }, { "epoch": 3.09, "learning_rate": 0.0003051654964894684, "loss": 2.6401, "theoretical_loss": 3.5597759633630903, "tokens_seen": 1306338304 }, { "epoch": 3.09, "learning_rate": 0.0003051554663991976, "loss": 3.0235, "theoretical_loss": 3.559759797552922, "tokens_seen": 1306403840 }, { "epoch": 3.09, "learning_rate": 0.00030514543630892675, "loss": 2.8356, "theoretical_loss": 3.5597436327807506, "tokens_seen": 1306469376 }, { "epoch": 3.09, "learning_rate": 0.000305135406218656, "loss": 2.8696, "theoretical_loss": 3.559727469046457, "tokens_seen": 1306534912 }, { "epoch": 3.09, "learning_rate": 0.00030512537612838517, "loss": 2.8493, "theoretical_loss": 3.5597113063499224, "tokens_seen": 1306600448 }, { "epoch": 3.09, "learning_rate": 0.00030511534603811435, "loss": 2.7493, "theoretical_loss": 3.559695144691029, "tokens_seen": 1306665984 }, { "epoch": 3.09, "learning_rate": 0.00030510531594784353, "loss": 2.5119, "theoretical_loss": 3.559678984069657, "tokens_seen": 1306731520 }, { "epoch": 3.09, "learning_rate": 0.0003050952858575727, "loss": 2.9074, "theoretical_loss": 3.559662824485689, "tokens_seen": 1306797056 }, { "epoch": 3.09, "learning_rate": 0.0003050852557673019, "loss": 2.725, "theoretical_loss": 3.559646665939005, "tokens_seen": 1306862592 }, { "epoch": 3.09, "learning_rate": 0.00030507522567703113, "loss": 2.8901, "theoretical_loss": 3.559630508429488, "tokens_seen": 1306928128 }, { "epoch": 3.09, "learning_rate": 0.00030506519558676026, "loss": 2.8245, "theoretical_loss": 3.559614351957018, "tokens_seen": 1306993664 }, { "epoch": 3.09, "learning_rate": 0.0003050551654964895, "loss": 2.9182, "theoretical_loss": 3.559598196521477, "tokens_seen": 1307059200 }, { "epoch": 3.09, "learning_rate": 0.0003050451354062186, "loss": 2.7472, "theoretical_loss": 3.559582042122747, "tokens_seen": 1307124736 }, { "epoch": 3.09, "learning_rate": 0.00030503510531594785, "loss": 2.8887, "theoretical_loss": 3.5595658887607087, "tokens_seen": 1307190272 }, { "epoch": 3.09, "learning_rate": 0.00030502507522567703, "loss": 2.8699, "theoretical_loss": 3.559549736435243, "tokens_seen": 1307255808 }, { "epoch": 3.09, "learning_rate": 0.0003050150451354062, "loss": 2.898, "theoretical_loss": 3.5595335851462337, "tokens_seen": 1307321344 }, { "epoch": 3.09, "learning_rate": 0.0003050050150451354, "loss": 2.806, "theoretical_loss": 3.55951743489356, "tokens_seen": 1307386880 }, { "epoch": 3.09, "objective/train/docs_used": 2090640, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.999148368835449, "objective/train/theoretical_loss": 3.5595133974923057, "objective/train/tokens_used": 1327863264, "theoretical_loss": 3.5595133974923057, "tokens_seen": 1307403264 }, { "epoch": 3.09, "learning_rate": 0.00030499498495486463, "loss": 2.8133, "theoretical_loss": 3.5595012856771047, "tokens_seen": 1307452416 }, { "epoch": 3.09, "learning_rate": 0.00030498495486459376, "loss": 2.8793, "theoretical_loss": 3.5594851374967487, "tokens_seen": 1307517952 }, { "epoch": 3.09, "learning_rate": 0.000304974924774323, "loss": 2.7884, "theoretical_loss": 3.5594689903523746, "tokens_seen": 1307583488 }, { "epoch": 3.09, "learning_rate": 0.0003049648946840521, "loss": 2.6217, "theoretical_loss": 3.5594528442438627, "tokens_seen": 1307649024 }, { "epoch": 3.09, "learning_rate": 0.00030495486459378136, "loss": 2.8363, "theoretical_loss": 3.559436699171096, "tokens_seen": 1307714560 }, { "epoch": 3.09, "learning_rate": 0.00030494483450351054, "loss": 2.9627, "theoretical_loss": 3.559420555133955, "tokens_seen": 1307780096 }, { "epoch": 3.09, "learning_rate": 0.0003049348044132397, "loss": 2.8401, "theoretical_loss": 3.559404412132322, "tokens_seen": 1307845632 }, { "epoch": 3.09, "learning_rate": 0.0003049247743229689, "loss": 2.9334, "theoretical_loss": 3.5593882701660786, "tokens_seen": 1307911168 }, { "epoch": 3.09, "learning_rate": 0.0003049147442326981, "loss": 2.7971, "theoretical_loss": 3.5593721292351064, "tokens_seen": 1307976704 }, { "epoch": 3.09, "learning_rate": 0.00030490471414242726, "loss": 2.7759, "theoretical_loss": 3.559355989339287, "tokens_seen": 1308042240 }, { "epoch": 3.09, "learning_rate": 0.0003048946840521565, "loss": 2.7285, "theoretical_loss": 3.559339850478503, "tokens_seen": 1308107776 }, { "epoch": 3.09, "learning_rate": 0.0003048846539618856, "loss": 2.8228, "theoretical_loss": 3.5593237126526347, "tokens_seen": 1308173312 }, { "epoch": 3.09, "learning_rate": 0.00030487462387161486, "loss": 2.8475, "theoretical_loss": 3.559307575861565, "tokens_seen": 1308238848 }, { "epoch": 3.09, "learning_rate": 0.000304864593781344, "loss": 2.6248, "theoretical_loss": 3.559291440105176, "tokens_seen": 1308304384 }, { "epoch": 3.09, "learning_rate": 0.0003048545636910732, "loss": 2.8156, "theoretical_loss": 3.5592753053833484, "tokens_seen": 1308369920 }, { "epoch": 3.09, "learning_rate": 0.0003048445336008024, "loss": 2.6592, "theoretical_loss": 3.559259171695965, "tokens_seen": 1308435456 }, { "epoch": 3.09, "learning_rate": 0.0003048345035105316, "loss": 2.8988, "theoretical_loss": 3.559243039042907, "tokens_seen": 1308500992 }, { "epoch": 3.09, "learning_rate": 0.00030482447342026076, "loss": 2.7358, "theoretical_loss": 3.5592269074240566, "tokens_seen": 1308566528 }, { "epoch": 3.09, "learning_rate": 0.00030481444332999, "loss": 2.7907, "theoretical_loss": 3.5592107768392953, "tokens_seen": 1308632064 }, { "epoch": 3.09, "learning_rate": 0.0003048044132397192, "loss": 2.79, "theoretical_loss": 3.559194647288506, "tokens_seen": 1308697600 }, { "epoch": 3.09, "learning_rate": 0.00030479438314944836, "loss": 2.8253, "theoretical_loss": 3.55917851877157, "tokens_seen": 1308763136 }, { "epoch": 3.09, "learning_rate": 0.00030478435305917754, "loss": 2.6265, "theoretical_loss": 3.5591623912883694, "tokens_seen": 1308828672 }, { "epoch": 3.09, "learning_rate": 0.0003047743229689067, "loss": 2.7536, "theoretical_loss": 3.5591462648387857, "tokens_seen": 1308894208 }, { "epoch": 3.09, "learning_rate": 0.00030476429287863596, "loss": 2.7287, "theoretical_loss": 3.5591301394227015, "tokens_seen": 1308959744 }, { "epoch": 3.09, "learning_rate": 0.0003047542627883651, "loss": 2.7678, "theoretical_loss": 3.5591140150399987, "tokens_seen": 1309025280 }, { "epoch": 3.09, "objective/train/docs_used": 2093346, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7634706497192383, "objective/train/theoretical_loss": 3.559109984105775, "objective/train/tokens_used": 1329501664, "theoretical_loss": 3.559109984105775, "tokens_seen": 1309041664 }, { "epoch": 3.09, "learning_rate": 0.0003047442326980943, "loss": 2.8367, "theoretical_loss": 3.5590978916905596, "tokens_seen": 1309090816 }, { "epoch": 3.09, "learning_rate": 0.00030473420260782345, "loss": 2.7915, "theoretical_loss": 3.5590817693742656, "tokens_seen": 1309156352 }, { "epoch": 3.09, "learning_rate": 0.0003047241725175527, "loss": 2.7647, "theoretical_loss": 3.5590656480909995, "tokens_seen": 1309221888 }, { "epoch": 3.09, "learning_rate": 0.00030471414242728187, "loss": 2.7149, "theoretical_loss": 3.559049527840643, "tokens_seen": 1309287424 }, { "epoch": 3.09, "learning_rate": 0.00030470411233701105, "loss": 2.8589, "theoretical_loss": 3.559033408623078, "tokens_seen": 1309352960 }, { "epoch": 3.09, "learning_rate": 0.00030469408224674023, "loss": 2.7888, "theoretical_loss": 3.559017290438187, "tokens_seen": 1309418496 }, { "epoch": 3.09, "learning_rate": 0.00030468405215646946, "loss": 2.7494, "theoretical_loss": 3.559001173285852, "tokens_seen": 1309484032 }, { "epoch": 3.09, "learning_rate": 0.0003046740220661986, "loss": 2.9589, "theoretical_loss": 3.5589850571659554, "tokens_seen": 1309549568 }, { "epoch": 3.09, "learning_rate": 0.0003046639919759278, "loss": 2.7354, "theoretical_loss": 3.5589689420783794, "tokens_seen": 1309615104 }, { "epoch": 3.09, "learning_rate": 0.00030465396188565695, "loss": 2.7653, "theoretical_loss": 3.5589528280230063, "tokens_seen": 1309680640 }, { "epoch": 3.09, "learning_rate": 0.0003046439317953862, "loss": 2.9207, "theoretical_loss": 3.5589367149997173, "tokens_seen": 1309746176 }, { "epoch": 3.09, "learning_rate": 0.00030463390170511537, "loss": 2.675, "theoretical_loss": 3.558920603008396, "tokens_seen": 1309811712 }, { "epoch": 3.09, "learning_rate": 0.00030462387161484455, "loss": 3.0485, "theoretical_loss": 3.5589044920489243, "tokens_seen": 1309877248 }, { "epoch": 3.09, "learning_rate": 0.00030461384152457373, "loss": 2.9751, "theoretical_loss": 3.558888382121184, "tokens_seen": 1309942784 }, { "epoch": 3.09, "learning_rate": 0.0003046038114343029, "loss": 2.8446, "theoretical_loss": 3.558872273225058, "tokens_seen": 1310008320 }, { "epoch": 3.09, "learning_rate": 0.0003045937813440321, "loss": 2.6921, "theoretical_loss": 3.558856165360429, "tokens_seen": 1310073856 }, { "epoch": 3.09, "learning_rate": 0.00030458375125376133, "loss": 2.7802, "theoretical_loss": 3.5588400585271778, "tokens_seen": 1310139392 }, { "epoch": 3.09, "learning_rate": 0.00030457372116349046, "loss": 2.8692, "theoretical_loss": 3.558823952725188, "tokens_seen": 1310204928 }, { "epoch": 3.09, "learning_rate": 0.0003045636910732197, "loss": 2.7381, "theoretical_loss": 3.558807847954342, "tokens_seen": 1310270464 }, { "epoch": 3.09, "learning_rate": 0.0003045536609829488, "loss": 2.8416, "theoretical_loss": 3.558791744214522, "tokens_seen": 1310336000 }, { "epoch": 3.09, "learning_rate": 0.00030454363089267805, "loss": 2.9198, "theoretical_loss": 3.55877564150561, "tokens_seen": 1310401536 }, { "epoch": 3.09, "learning_rate": 0.00030453360080240723, "loss": 2.8356, "theoretical_loss": 3.558759539827489, "tokens_seen": 1310467072 }, { "epoch": 3.09, "learning_rate": 0.0003045235707121364, "loss": 2.8638, "theoretical_loss": 3.5587434391800414, "tokens_seen": 1310532608 }, { "epoch": 3.09, "learning_rate": 0.0003045135406218656, "loss": 2.9631, "theoretical_loss": 3.5587273395631493, "tokens_seen": 1310598144 }, { "epoch": 3.09, "learning_rate": 0.00030450351053159483, "loss": 2.8373, "theoretical_loss": 3.558711240976696, "tokens_seen": 1310663680 }, { "debugging/Self-BLEU-5": 0.6489003202557142, "debugging/distinct-1-grams": 0.7661570929999267, "debugging/distinct-2-grams": 0.959194855638566, "debugging/entropy-1-grams": 6.494201505311883, "debugging/entropy-2-grams": 7.767432162897712, "debugging/length": 536.8823529411765, "debugging/num_segments": 34, "epoch": 3.09, "objective/train/docs_used": 2096158, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8112263679504395, "objective/train/theoretical_loss": 3.558707216491075, "objective/train/tokens_used": 1331140064, "theoretical_loss": 3.558707216491075, "tokens_seen": 1310680064 }, { "epoch": 3.09, "learning_rate": 0.00030449348044132396, "loss": 2.7969, "theoretical_loss": 3.5586951434205636, "tokens_seen": 1310729216 }, { "epoch": 3.09, "learning_rate": 0.0003044834503510532, "loss": 2.6684, "theoretical_loss": 3.558679046894634, "tokens_seen": 1310794752 }, { "epoch": 3.09, "learning_rate": 0.0003044734202607823, "loss": 2.836, "theoretical_loss": 3.558662951398791, "tokens_seen": 1310860288 }, { "epoch": 3.09, "learning_rate": 0.00030446339017051156, "loss": 2.7494, "theoretical_loss": 3.558646856932916, "tokens_seen": 1310925824 }, { "epoch": 3.09, "learning_rate": 0.00030445336008024074, "loss": 2.9542, "theoretical_loss": 3.5586307634968923, "tokens_seen": 1310991360 }, { "epoch": 3.09, "learning_rate": 0.0003044433299899699, "loss": 2.8307, "theoretical_loss": 3.558614671090602, "tokens_seen": 1311056896 }, { "epoch": 3.09, "learning_rate": 0.0003044332998996991, "loss": 2.8339, "theoretical_loss": 3.558598579713929, "tokens_seen": 1311122432 }, { "epoch": 3.09, "learning_rate": 0.0003044232698094283, "loss": 2.9674, "theoretical_loss": 3.5585824893667546, "tokens_seen": 1311187968 }, { "epoch": 3.09, "learning_rate": 0.00030441323971915746, "loss": 2.8048, "theoretical_loss": 3.5585664000489614, "tokens_seen": 1311253504 }, { "epoch": 3.09, "learning_rate": 0.0003044032096288867, "loss": 2.852, "theoretical_loss": 3.5585503117604333, "tokens_seen": 1311319040 }, { "epoch": 3.09, "learning_rate": 0.0003043931795386158, "loss": 2.7097, "theoretical_loss": 3.5585342245010523, "tokens_seen": 1311384576 }, { "epoch": 3.09, "learning_rate": 0.00030438314944834506, "loss": 2.7453, "theoretical_loss": 3.558518138270701, "tokens_seen": 1311450112 }, { "epoch": 3.09, "learning_rate": 0.0003043731193580742, "loss": 2.8929, "theoretical_loss": 3.5585020530692626, "tokens_seen": 1311515648 }, { "epoch": 3.09, "learning_rate": 0.0003043630892678034, "loss": 2.8378, "theoretical_loss": 3.55848596889662, "tokens_seen": 1311581184 }, { "epoch": 3.09, "learning_rate": 0.0003043530591775326, "loss": 2.7303, "theoretical_loss": 3.558469885752655, "tokens_seen": 1311646720 }, { "epoch": 3.09, "learning_rate": 0.0003043430290872618, "loss": 2.9012, "theoretical_loss": 3.558453803637251, "tokens_seen": 1311712256 }, { "epoch": 3.09, "learning_rate": 0.00030433299899699097, "loss": 2.8084, "theoretical_loss": 3.5584377225502912, "tokens_seen": 1311777792 }, { "epoch": 3.09, "learning_rate": 0.0003043229689067202, "loss": 2.9317, "theoretical_loss": 3.5584216424916586, "tokens_seen": 1311843328 }, { "epoch": 3.09, "learning_rate": 0.00030431293881644933, "loss": 2.7864, "theoretical_loss": 3.5584055634612355, "tokens_seen": 1311908864 }, { "epoch": 3.09, "learning_rate": 0.00030430290872617856, "loss": 2.7695, "theoretical_loss": 3.558389485458904, "tokens_seen": 1311974400 }, { "epoch": 3.09, "learning_rate": 0.0003042928786359077, "loss": 2.8728, "theoretical_loss": 3.558373408484549, "tokens_seen": 1312039936 }, { "epoch": 3.09, "learning_rate": 0.0003042828485456369, "loss": 2.8468, "theoretical_loss": 3.558357332538052, "tokens_seen": 1312105472 }, { "epoch": 3.09, "learning_rate": 0.0003042728184553661, "loss": 2.9408, "theoretical_loss": 3.5583412576192965, "tokens_seen": 1312171008 }, { "epoch": 3.09, "learning_rate": 0.0003042627883650953, "loss": 2.7611, "theoretical_loss": 3.558325183728165, "tokens_seen": 1312236544 }, { "epoch": 3.09, "learning_rate": 0.00030425275827482447, "loss": 2.7173, "theoretical_loss": 3.558309110864541, "tokens_seen": 1312302080 }, { "epoch": 3.09, "objective/train/docs_used": 2098971, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.384787082672119, "objective/train/theoretical_loss": 3.558305092809169, "objective/train/tokens_used": 1332778464, "theoretical_loss": 3.558305092809169, "tokens_seen": 1312318464 }, { "epoch": 3.09, "learning_rate": 0.00030424272818455365, "loss": 2.6613, "theoretical_loss": 3.558293039028307, "tokens_seen": 1312367616 }, { "epoch": 3.09, "learning_rate": 0.00030423269809428283, "loss": 2.8626, "theoretical_loss": 3.5582769682193467, "tokens_seen": 1312433152 }, { "epoch": 3.09, "learning_rate": 0.00030422266800401207, "loss": 2.9497, "theoretical_loss": 3.558260898437543, "tokens_seen": 1312498688 }, { "epoch": 3.09, "learning_rate": 0.0003042126379137412, "loss": 2.8294, "theoretical_loss": 3.5582448296827787, "tokens_seen": 1312564224 }, { "epoch": 3.09, "learning_rate": 0.00030420260782347043, "loss": 2.8969, "theoretical_loss": 3.558228761954936, "tokens_seen": 1312629760 }, { "epoch": 3.09, "learning_rate": 0.0003041925777331996, "loss": 2.8604, "theoretical_loss": 3.5582126952539, "tokens_seen": 1312695296 }, { "epoch": 3.09, "learning_rate": 0.0003041825476429288, "loss": 2.7601, "theoretical_loss": 3.558196629579552, "tokens_seen": 1312760832 }, { "epoch": 3.09, "learning_rate": 0.00030417251755265797, "loss": 2.7575, "theoretical_loss": 3.558180564931776, "tokens_seen": 1312826368 }, { "epoch": 3.09, "learning_rate": 0.00030416248746238715, "loss": 2.8332, "theoretical_loss": 3.558164501310456, "tokens_seen": 1312891904 }, { "epoch": 3.09, "learning_rate": 0.00030415245737211633, "loss": 2.7017, "theoretical_loss": 3.558148438715474, "tokens_seen": 1312957440 }, { "epoch": 3.09, "learning_rate": 0.00030414242728184557, "loss": 2.8553, "theoretical_loss": 3.5581323771467126, "tokens_seen": 1313022976 }, { "epoch": 3.09, "learning_rate": 0.0003041323971915747, "loss": 2.7947, "theoretical_loss": 3.558116316604056, "tokens_seen": 1313088512 }, { "epoch": 3.09, "learning_rate": 0.00030412236710130393, "loss": 2.8779, "theoretical_loss": 3.558100257087388, "tokens_seen": 1313154048 }, { "epoch": 3.09, "learning_rate": 0.00030411233701103306, "loss": 2.9033, "theoretical_loss": 3.5580841985965908, "tokens_seen": 1313219584 }, { "epoch": 3.09, "learning_rate": 0.0003041023069207623, "loss": 2.7711, "theoretical_loss": 3.558068141131548, "tokens_seen": 1313285120 }, { "epoch": 3.09, "learning_rate": 0.0003040922768304915, "loss": 2.9112, "theoretical_loss": 3.5580520846921426, "tokens_seen": 1313350656 }, { "epoch": 3.09, "learning_rate": 0.00030408224674022066, "loss": 2.8918, "theoretical_loss": 3.5580360292782585, "tokens_seen": 1313416192 }, { "epoch": 3.09, "learning_rate": 0.00030407221664994984, "loss": 2.5728, "theoretical_loss": 3.558019974889779, "tokens_seen": 1313481728 }, { "epoch": 3.09, "learning_rate": 0.000304062186559679, "loss": 2.8287, "theoretical_loss": 3.558003921526587, "tokens_seen": 1313547264 }, { "epoch": 3.09, "learning_rate": 0.00030405215646940825, "loss": 2.7883, "theoretical_loss": 3.5579878691885662, "tokens_seen": 1313612800 }, { "epoch": 3.09, "learning_rate": 0.00030404212637913743, "loss": 2.885, "theoretical_loss": 3.5579718178756004, "tokens_seen": 1313678336 }, { "epoch": 3.09, "learning_rate": 0.0003040320962888666, "loss": 2.8998, "theoretical_loss": 3.557955767587572, "tokens_seen": 1313743872 }, { "epoch": 3.09, "learning_rate": 0.0003040220661985958, "loss": 2.8544, "theoretical_loss": 3.5579397183243646, "tokens_seen": 1313809408 }, { "epoch": 3.09, "learning_rate": 0.00030401203610832503, "loss": 2.699, "theoretical_loss": 3.5579236700858625, "tokens_seen": 1313874944 }, { "epoch": 3.09, "learning_rate": 0.00030400200601805416, "loss": 2.7657, "theoretical_loss": 3.557907622871949, "tokens_seen": 1313940480 }, { "epoch": 3.09, "objective/train/docs_used": 2101853, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.626044273376465, "objective/train/theoretical_loss": 3.5579036112285483, "objective/train/tokens_used": 1334416864, "theoretical_loss": 3.5579036112285483, "tokens_seen": 1313956864 }, { "epoch": 3.09, "learning_rate": 0.0003039919759277834, "loss": 2.8234, "theoretical_loss": 3.5578915766825063, "tokens_seen": 1314006016 }, { "epoch": 3.09, "learning_rate": 0.0003039819458375125, "loss": 2.7423, "theoretical_loss": 3.5578755315174195, "tokens_seen": 1314071552 }, { "epoch": 3.09, "learning_rate": 0.00030397191574724176, "loss": 2.8227, "theoretical_loss": 3.557859487376571, "tokens_seen": 1314137088 }, { "epoch": 3.09, "learning_rate": 0.00030396188565697094, "loss": 3.0099, "theoretical_loss": 3.5578434442598454, "tokens_seen": 1314202624 }, { "epoch": 3.09, "learning_rate": 0.0003039518555667001, "loss": 2.8151, "theoretical_loss": 3.5578274021671255, "tokens_seen": 1314268160 }, { "epoch": 3.09, "learning_rate": 0.0003039418254764293, "loss": 2.9784, "theoretical_loss": 3.557811361098295, "tokens_seen": 1314333696 }, { "epoch": 3.09, "learning_rate": 0.0003039317953861585, "loss": 2.7788, "theoretical_loss": 3.557795321053238, "tokens_seen": 1314399232 }, { "epoch": 3.09, "learning_rate": 0.00030392176529588766, "loss": 2.9562, "theoretical_loss": 3.557779282031837, "tokens_seen": 1314464768 }, { "epoch": 3.09, "learning_rate": 0.0003039117352056169, "loss": 2.867, "theoretical_loss": 3.5577632440339766, "tokens_seen": 1314530304 }, { "epoch": 3.09, "learning_rate": 0.000303901705115346, "loss": 2.7499, "theoretical_loss": 3.55774720705954, "tokens_seen": 1314595840 }, { "epoch": 3.09, "learning_rate": 0.00030389167502507526, "loss": 2.743, "theoretical_loss": 3.5577311711084114, "tokens_seen": 1314661376 }, { "epoch": 3.09, "learning_rate": 0.0003038816449348044, "loss": 2.9899, "theoretical_loss": 3.557715136180474, "tokens_seen": 1314726912 }, { "epoch": 3.09, "learning_rate": 0.0003038716148445336, "loss": 2.6862, "theoretical_loss": 3.5576991022756115, "tokens_seen": 1314792448 }, { "epoch": 3.09, "learning_rate": 0.0003038615847542628, "loss": 2.84, "theoretical_loss": 3.557683069393708, "tokens_seen": 1314857984 }, { "epoch": 3.09, "learning_rate": 0.000303851554663992, "loss": 2.7648, "theoretical_loss": 3.557667037534647, "tokens_seen": 1314923520 }, { "epoch": 3.09, "learning_rate": 0.00030384152457372117, "loss": 2.7499, "theoretical_loss": 3.5576510066983125, "tokens_seen": 1314989056 }, { "epoch": 3.09, "learning_rate": 0.0003038314944834504, "loss": 2.964, "theoretical_loss": 3.5576349768845885, "tokens_seen": 1315054592 }, { "epoch": 3.09, "learning_rate": 0.00030382146439317953, "loss": 2.7535, "theoretical_loss": 3.5576189480933573, "tokens_seen": 1315120128 }, { "epoch": 3.09, "learning_rate": 0.00030381143430290876, "loss": 2.7481, "theoretical_loss": 3.5576029203245048, "tokens_seen": 1315185664 }, { "epoch": 3.09, "learning_rate": 0.0003038014042126379, "loss": 2.9079, "theoretical_loss": 3.5575868935779136, "tokens_seen": 1315251200 }, { "epoch": 3.09, "learning_rate": 0.0003037913741223671, "loss": 2.7954, "theoretical_loss": 3.5575708678534683, "tokens_seen": 1315316736 }, { "epoch": 3.09, "learning_rate": 0.0003037813440320963, "loss": 2.9198, "theoretical_loss": 3.5575548431510517, "tokens_seen": 1315382272 }, { "epoch": 3.09, "learning_rate": 0.0003037713139418255, "loss": 2.8589, "theoretical_loss": 3.557538819470549, "tokens_seen": 1315447808 }, { "epoch": 3.09, "learning_rate": 0.00030376128385155467, "loss": 2.6925, "theoretical_loss": 3.557522796811843, "tokens_seen": 1315513344 }, { "epoch": 3.09, "learning_rate": 0.00030375125376128385, "loss": 2.8056, "theoretical_loss": 3.5575067751748186, "tokens_seen": 1315578880 }, { "epoch": 3.09, "objective/train/docs_used": 2104899, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9174537658691406, "objective/train/theoretical_loss": 3.557502769925186, "objective/train/tokens_used": 1336055264, "theoretical_loss": 3.557502769925186, "tokens_seen": 1315595264 }, { "epoch": 3.09, "learning_rate": 0.00030374122367101303, "loss": 2.8385, "theoretical_loss": 3.5574907545593586, "tokens_seen": 1315644416 }, { "epoch": 3.09, "learning_rate": 0.00030373119358074227, "loss": 2.7795, "theoretical_loss": 3.557474734965348, "tokens_seen": 1315709952 }, { "epoch": 3.09, "learning_rate": 0.0003037211634904714, "loss": 2.6623, "theoretical_loss": 3.557458716392671, "tokens_seen": 1315775488 }, { "epoch": 3.09, "learning_rate": 0.00030371113340020063, "loss": 2.6563, "theoretical_loss": 3.5574426988412107, "tokens_seen": 1315841024 }, { "epoch": 3.09, "learning_rate": 0.0003037011033099298, "loss": 2.6901, "theoretical_loss": 3.557426682310852, "tokens_seen": 1315906560 }, { "epoch": 3.09, "learning_rate": 0.000303691073219659, "loss": 2.8531, "theoretical_loss": 3.557410666801478, "tokens_seen": 1315972096 }, { "epoch": 3.09, "learning_rate": 0.00030368104312938817, "loss": 2.7961, "theoretical_loss": 3.557394652312973, "tokens_seen": 1316037632 }, { "epoch": 3.09, "learning_rate": 0.00030367101303911735, "loss": 2.843, "theoretical_loss": 3.557378638845222, "tokens_seen": 1316103168 }, { "epoch": 3.09, "learning_rate": 0.00030366098294884653, "loss": 2.7981, "theoretical_loss": 3.557362626398108, "tokens_seen": 1316168704 }, { "epoch": 3.09, "learning_rate": 0.00030365095285857577, "loss": 2.7578, "theoretical_loss": 3.557346614971516, "tokens_seen": 1316234240 }, { "epoch": 3.09, "learning_rate": 0.0003036409227683049, "loss": 2.8984, "theoretical_loss": 3.5573306045653297, "tokens_seen": 1316299776 }, { "epoch": 3.09, "learning_rate": 0.00030363089267803413, "loss": 2.7406, "theoretical_loss": 3.5573145951794327, "tokens_seen": 1316365312 }, { "epoch": 3.09, "learning_rate": 0.00030362086258776326, "loss": 2.7761, "theoretical_loss": 3.557298586813711, "tokens_seen": 1316430848 }, { "epoch": 3.09, "learning_rate": 0.0003036108324974925, "loss": 2.9402, "theoretical_loss": 3.5572825794680467, "tokens_seen": 1316496384 }, { "epoch": 3.09, "learning_rate": 0.0003036008024072217, "loss": 2.7459, "theoretical_loss": 3.557266573142325, "tokens_seen": 1316561920 }, { "epoch": 3.09, "learning_rate": 0.00030359077231695086, "loss": 2.7893, "theoretical_loss": 3.5572505678364306, "tokens_seen": 1316627456 }, { "epoch": 3.09, "learning_rate": 0.00030358074222668004, "loss": 2.7393, "theoretical_loss": 3.557234563550247, "tokens_seen": 1316692992 }, { "epoch": 3.09, "learning_rate": 0.0003035707121364092, "loss": 2.8942, "theoretical_loss": 3.557218560283659, "tokens_seen": 1316758528 }, { "epoch": 3.09, "learning_rate": 0.0003035606820461384, "loss": 2.7494, "theoretical_loss": 3.5572025580365505, "tokens_seen": 1316824064 }, { "epoch": 3.09, "learning_rate": 0.00030355065195586763, "loss": 2.8136, "theoretical_loss": 3.557186556808806, "tokens_seen": 1316889600 }, { "epoch": 3.09, "learning_rate": 0.00030354062186559676, "loss": 2.8795, "theoretical_loss": 3.55717055660031, "tokens_seen": 1316955136 }, { "epoch": 3.09, "learning_rate": 0.000303530591775326, "loss": 2.8558, "theoretical_loss": 3.5571545574109464, "tokens_seen": 1317020672 }, { "epoch": 3.09, "learning_rate": 0.0003035205616850552, "loss": 2.9565, "theoretical_loss": 3.5571385592405997, "tokens_seen": 1317086208 }, { "epoch": 3.09, "learning_rate": 0.00030351053159478436, "loss": 2.6521, "theoretical_loss": 3.5571225620891544, "tokens_seen": 1317151744 }, { "epoch": 3.09, "learning_rate": 0.00030350050150451354, "loss": 2.7759, "theoretical_loss": 3.557106565956496, "tokens_seen": 1317217280 }, { "epoch": 3.09, "objective/train/docs_used": 2106316, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.49198842048645, "objective/train/theoretical_loss": 3.5571025670825027, "objective/train/tokens_used": 1337693664, "theoretical_loss": 3.5571025670825027, "tokens_seen": 1317233664 }, { "epoch": 3.09, "learning_rate": 0.0003034904714142427, "loss": 2.8517, "theoretical_loss": 3.557090570842507, "tokens_seen": 1317282816 }, { "epoch": 3.09, "learning_rate": 0.0003034804413239719, "loss": 2.8768, "theoretical_loss": 3.557074576747073, "tokens_seen": 1317348352 }, { "epoch": 3.09, "learning_rate": 0.00030347041123370114, "loss": 2.8143, "theoretical_loss": 3.557058583670078, "tokens_seen": 1317413888 }, { "epoch": 3.09, "learning_rate": 0.00030346038114343026, "loss": 2.6861, "theoretical_loss": 3.557042591611407, "tokens_seen": 1317479424 }, { "epoch": 3.09, "learning_rate": 0.0003034503510531595, "loss": 2.9135, "theoretical_loss": 3.557026600570944, "tokens_seen": 1317544960 }, { "epoch": 3.09, "learning_rate": 0.0003034403209628886, "loss": 2.7433, "theoretical_loss": 3.5570106105485744, "tokens_seen": 1317610496 }, { "epoch": 3.09, "learning_rate": 0.00030343029087261786, "loss": 2.9305, "theoretical_loss": 3.5569946215441814, "tokens_seen": 1317676032 }, { "epoch": 3.09, "learning_rate": 0.00030342026078234704, "loss": 2.8251, "theoretical_loss": 3.5569786335576508, "tokens_seen": 1317741568 }, { "epoch": 3.09, "learning_rate": 0.0003034102306920762, "loss": 2.8213, "theoretical_loss": 3.5569626465888664, "tokens_seen": 1317807104 }, { "epoch": 3.09, "learning_rate": 0.0003034002006018054, "loss": 2.8218, "theoretical_loss": 3.556946660637713, "tokens_seen": 1317872640 }, { "epoch": 3.09, "learning_rate": 0.0003033901705115346, "loss": 2.9999, "theoretical_loss": 3.5569306757040753, "tokens_seen": 1317938176 }, { "epoch": 3.09, "learning_rate": 0.00030338014042126377, "loss": 2.9509, "theoretical_loss": 3.556914691787838, "tokens_seen": 1318003712 }, { "epoch": 3.09, "learning_rate": 0.000303370110330993, "loss": 2.8849, "theoretical_loss": 3.5568987088888857, "tokens_seen": 1318069248 }, { "epoch": 3.09, "learning_rate": 0.00030336008024072213, "loss": 2.7664, "theoretical_loss": 3.5568827270071033, "tokens_seen": 1318134784 }, { "epoch": 3.09, "learning_rate": 0.00030335005015045137, "loss": 2.8779, "theoretical_loss": 3.556866746142375, "tokens_seen": 1318200320 }, { "epoch": 3.09, "learning_rate": 0.00030334002006018055, "loss": 2.7681, "theoretical_loss": 3.556850766294586, "tokens_seen": 1318265856 }, { "epoch": 3.09, "learning_rate": 0.00030332998996990973, "loss": 2.784, "theoretical_loss": 3.556834787463621, "tokens_seen": 1318331392 }, { "epoch": 3.09, "learning_rate": 0.0003033199598796389, "loss": 2.8635, "theoretical_loss": 3.5568188096493643, "tokens_seen": 1318396928 }, { "epoch": 3.09, "learning_rate": 0.0003033099297893681, "loss": 2.8546, "theoretical_loss": 3.5568028328517007, "tokens_seen": 1318462464 }, { "epoch": 3.09, "learning_rate": 0.0003032998996990973, "loss": 2.7756, "theoretical_loss": 3.5567868570705157, "tokens_seen": 1318528000 }, { "epoch": 3.09, "learning_rate": 0.0003032898696088265, "loss": 2.724, "theoretical_loss": 3.5567708823056936, "tokens_seen": 1318593536 }, { "epoch": 3.09, "learning_rate": 0.0003032798395185557, "loss": 3.0038, "theoretical_loss": 3.556754908557119, "tokens_seen": 1318659072 }, { "epoch": 3.09, "learning_rate": 0.00030326980942828487, "loss": 2.7832, "theoretical_loss": 3.5567389358246775, "tokens_seen": 1318724608 }, { "epoch": 3.09, "learning_rate": 0.00030325977933801405, "loss": 2.7322, "theoretical_loss": 3.556722964108253, "tokens_seen": 1318790144 }, { "epoch": 3.09, "learning_rate": 0.00030324974924774323, "loss": 2.7273, "theoretical_loss": 3.5567069934077313, "tokens_seen": 1318855680 }, { "epoch": 3.09, "objective/train/docs_used": 2109209, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0971221923828125, "objective/train/theoretical_loss": 3.5567030008913223, "objective/train/tokens_used": 1339332064, "theoretical_loss": 3.5567030008913223, "tokens_seen": 1318872064 }, { "epoch": 3.09, "learning_rate": 0.00030323971915747247, "loss": 2.8617, "theoretical_loss": 3.5566910237229967, "tokens_seen": 1318921216 }, { "epoch": 3.09, "learning_rate": 0.0003032296890672016, "loss": 2.7373, "theoretical_loss": 3.556675055053934, "tokens_seen": 1318986752 }, { "epoch": 3.09, "learning_rate": 0.00030321965897693083, "loss": 2.8916, "theoretical_loss": 3.556659087400429, "tokens_seen": 1319052288 }, { "epoch": 3.09, "learning_rate": 0.00030320962888666, "loss": 2.9218, "theoretical_loss": 3.556643120762366, "tokens_seen": 1319117824 }, { "epoch": 3.09, "learning_rate": 0.0003031995987963892, "loss": 2.8247, "theoretical_loss": 3.5566271551396307, "tokens_seen": 1319183360 }, { "epoch": 3.09, "learning_rate": 0.00030318956870611837, "loss": 2.8354, "theoretical_loss": 3.5566111905321067, "tokens_seen": 1319248896 }, { "epoch": 3.09, "learning_rate": 0.00030317953861584755, "loss": 2.7973, "theoretical_loss": 3.55659522693968, "tokens_seen": 1319314432 }, { "epoch": 3.09, "learning_rate": 0.00030316950852557673, "loss": 2.8086, "theoretical_loss": 3.5565792643622354, "tokens_seen": 1319379968 }, { "epoch": 3.09, "learning_rate": 0.00030315947843530597, "loss": 2.6873, "theoretical_loss": 3.556563302799658, "tokens_seen": 1319445504 }, { "epoch": 3.09, "learning_rate": 0.0003031494483450351, "loss": 2.8877, "theoretical_loss": 3.556547342251833, "tokens_seen": 1319511040 }, { "epoch": 3.09, "learning_rate": 0.00030313941825476433, "loss": 2.858, "theoretical_loss": 3.5565313827186458, "tokens_seen": 1319576576 }, { "epoch": 3.09, "learning_rate": 0.00030312938816449346, "loss": 2.8382, "theoretical_loss": 3.5565154241999806, "tokens_seen": 1319642112 }, { "epoch": 3.09, "learning_rate": 0.0003031193580742227, "loss": 2.5413, "theoretical_loss": 3.5564994666957235, "tokens_seen": 1319707648 }, { "epoch": 3.09, "learning_rate": 0.0003031093279839519, "loss": 2.9398, "theoretical_loss": 3.5564835102057586, "tokens_seen": 1319773184 }, { "epoch": 3.09, "learning_rate": 0.00030309929789368106, "loss": 2.9423, "theoretical_loss": 3.556467554729972, "tokens_seen": 1319838720 }, { "epoch": 3.09, "learning_rate": 0.00030308926780341024, "loss": 2.7607, "theoretical_loss": 3.5564516002682485, "tokens_seen": 1319904256 }, { "epoch": 3.09, "learning_rate": 0.0003030792377131394, "loss": 2.813, "theoretical_loss": 3.556435646820473, "tokens_seen": 1319969792 }, { "epoch": 3.09, "learning_rate": 0.0003030692076228686, "loss": 2.864, "theoretical_loss": 3.5564196943865314, "tokens_seen": 1320035328 }, { "epoch": 3.09, "learning_rate": 0.00030305917753259783, "loss": 2.7441, "theoretical_loss": 3.5564037429663085, "tokens_seen": 1320100864 }, { "epoch": 3.09, "learning_rate": 0.00030304914744232696, "loss": 2.6914, "theoretical_loss": 3.5563877925596894, "tokens_seen": 1320166400 }, { "epoch": 3.09, "learning_rate": 0.0003030391173520562, "loss": 2.5937, "theoretical_loss": 3.55637184316656, "tokens_seen": 1320231936 }, { "epoch": 3.09, "learning_rate": 0.0003030290872617854, "loss": 2.8679, "theoretical_loss": 3.5563558947868055, "tokens_seen": 1320297472 }, { "epoch": 3.09, "learning_rate": 0.00030301905717151456, "loss": 2.7772, "theoretical_loss": 3.5563399474203106, "tokens_seen": 1320363008 }, { "epoch": 3.09, "learning_rate": 0.00030300902708124374, "loss": 2.8796, "theoretical_loss": 3.556324001066961, "tokens_seen": 1320428544 }, { "epoch": 3.09, "learning_rate": 0.0003029989969909729, "loss": 2.8421, "theoretical_loss": 3.5563080557266415, "tokens_seen": 1320494080 }, { "epoch": 3.09, "objective/train/docs_used": 2111515, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.476933717727661, "objective/train/theoretical_loss": 3.5563040695498347, "objective/train/tokens_used": 1340970464, "theoretical_loss": 3.5563040695498347, "tokens_seen": 1320510464 }, { "epoch": 3.09, "learning_rate": 0.0003029889669007021, "loss": 2.7385, "theoretical_loss": 3.556292111399239, "tokens_seen": 1320559616 }, { "epoch": 3.09, "learning_rate": 0.00030297893681043134, "loss": 2.7741, "theoretical_loss": 3.5562761680846373, "tokens_seen": 1320625152 }, { "epoch": 3.09, "learning_rate": 0.00030296890672016046, "loss": 2.7865, "theoretical_loss": 3.5562602257827227, "tokens_seen": 1320690688 }, { "epoch": 3.09, "learning_rate": 0.0003029588766298897, "loss": 2.8199, "theoretical_loss": 3.5562442844933804, "tokens_seen": 1320756224 }, { "epoch": 3.09, "learning_rate": 0.0003029488465396188, "loss": 2.8898, "theoretical_loss": 3.5562283442164957, "tokens_seen": 1320821760 }, { "epoch": 3.09, "learning_rate": 0.00030293881644934806, "loss": 2.8333, "theoretical_loss": 3.556212404951954, "tokens_seen": 1320887296 }, { "epoch": 3.09, "learning_rate": 0.00030292878635907724, "loss": 2.986, "theoretical_loss": 3.5561964666996415, "tokens_seen": 1320952832 }, { "epoch": 3.09, "learning_rate": 0.0003029187562688064, "loss": 2.7655, "theoretical_loss": 3.5561805294594424, "tokens_seen": 1321018368 }, { "epoch": 3.09, "learning_rate": 0.0003029087261785356, "loss": 2.8385, "theoretical_loss": 3.5561645932312436, "tokens_seen": 1321083904 }, { "epoch": 3.09, "learning_rate": 0.0003028986960882648, "loss": 2.8088, "theoretical_loss": 3.5561486580149295, "tokens_seen": 1321149440 }, { "epoch": 3.09, "learning_rate": 0.00030288866599799397, "loss": 2.6152, "theoretical_loss": 3.5561327238103866, "tokens_seen": 1321214976 }, { "epoch": 3.09, "learning_rate": 0.0003028786359077232, "loss": 2.7758, "theoretical_loss": 3.5561167906174997, "tokens_seen": 1321280512 }, { "epoch": 3.09, "learning_rate": 0.00030286860581745233, "loss": 2.9126, "theoretical_loss": 3.556100858436155, "tokens_seen": 1321346048 }, { "epoch": 3.09, "learning_rate": 0.00030285857572718157, "loss": 2.8759, "theoretical_loss": 3.5560849272662374, "tokens_seen": 1321411584 }, { "epoch": 3.09, "learning_rate": 0.00030284854563691075, "loss": 2.7867, "theoretical_loss": 3.5560689971076336, "tokens_seen": 1321477120 }, { "epoch": 3.09, "learning_rate": 0.00030283851554663993, "loss": 2.8258, "theoretical_loss": 3.556053067960228, "tokens_seen": 1321542656 }, { "epoch": 3.09, "learning_rate": 0.0003028284854563691, "loss": 2.9265, "theoretical_loss": 3.556037139823908, "tokens_seen": 1321608192 }, { "epoch": 3.09, "learning_rate": 0.0003028184553660983, "loss": 2.8235, "theoretical_loss": 3.5560212126985573, "tokens_seen": 1321673728 }, { "epoch": 3.09, "learning_rate": 0.00030280842527582747, "loss": 3.0348, "theoretical_loss": 3.556005286584063, "tokens_seen": 1321739264 }, { "epoch": 3.09, "learning_rate": 0.0003027983951855567, "loss": 2.8253, "theoretical_loss": 3.55598936148031, "tokens_seen": 1321804800 }, { "epoch": 3.09, "learning_rate": 0.00030278836509528583, "loss": 2.8164, "theoretical_loss": 3.5559734373871845, "tokens_seen": 1321870336 }, { "epoch": 3.09, "learning_rate": 0.00030277833500501507, "loss": 2.7397, "theoretical_loss": 3.555957514304572, "tokens_seen": 1321935872 }, { "epoch": 3.09, "learning_rate": 0.0003027683049147442, "loss": 2.9242, "theoretical_loss": 3.555941592232359, "tokens_seen": 1322001408 }, { "epoch": 3.09, "learning_rate": 0.00030275827482447343, "loss": 2.7824, "theoretical_loss": 3.5559256711704306, "tokens_seen": 1322066944 }, { "epoch": 3.09, "learning_rate": 0.0003027482447342026, "loss": 2.6462, "theoretical_loss": 3.555909751118672, "tokens_seen": 1322132480 }, { "epoch": 3.09, "objective/train/docs_used": 2114440, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5862877368927, "objective/train/theoretical_loss": 3.5559057712635584, "objective/train/tokens_used": 1342608864, "theoretical_loss": 3.5559057712635584, "tokens_seen": 1322148864 }, { "epoch": 3.09, "learning_rate": 0.0003027382146439318, "loss": 2.7732, "theoretical_loss": 3.55589383207697, "tokens_seen": 1322198016 }, { "epoch": 3.09, "learning_rate": 0.000302728184553661, "loss": 2.829, "theoretical_loss": 3.555877914045211, "tokens_seen": 1322263552 }, { "epoch": 3.09, "learning_rate": 0.0003027181544633902, "loss": 2.8233, "theoretical_loss": 3.5558619970232797, "tokens_seen": 1322329088 }, { "epoch": 3.09, "learning_rate": 0.00030270812437311934, "loss": 2.6943, "theoretical_loss": 3.5558460810110626, "tokens_seen": 1322394624 }, { "epoch": 3.09, "learning_rate": 0.00030269809428284857, "loss": 2.786, "theoretical_loss": 3.555830166008445, "tokens_seen": 1322460160 }, { "epoch": 3.09, "learning_rate": 0.0003026880641925777, "loss": 2.8047, "theoretical_loss": 3.5558142520153133, "tokens_seen": 1322525696 }, { "epoch": 3.09, "learning_rate": 0.00030267803410230693, "loss": 2.8554, "theoretical_loss": 3.555798339031554, "tokens_seen": 1322591232 }, { "epoch": 3.09, "learning_rate": 0.0003026680040120361, "loss": 2.5832, "theoretical_loss": 3.555782427057052, "tokens_seen": 1322656768 }, { "epoch": 3.09, "learning_rate": 0.0003026579739217653, "loss": 2.7228, "theoretical_loss": 3.555766516091694, "tokens_seen": 1322722304 }, { "epoch": 3.09, "learning_rate": 0.0003026479438314945, "loss": 2.7538, "theoretical_loss": 3.5557506061353656, "tokens_seen": 1322787840 }, { "epoch": 3.09, "learning_rate": 0.00030263791374122366, "loss": 2.945, "theoretical_loss": 3.5557346971879533, "tokens_seen": 1322853376 }, { "epoch": 3.09, "learning_rate": 0.00030262788365095284, "loss": 2.9189, "theoretical_loss": 3.555718789249343, "tokens_seen": 1322918912 }, { "epoch": 3.09, "learning_rate": 0.0003026178535606821, "loss": 2.8601, "theoretical_loss": 3.55570288231942, "tokens_seen": 1322984448 }, { "epoch": 3.09, "learning_rate": 0.0003026078234704112, "loss": 2.7621, "theoretical_loss": 3.5556869763980714, "tokens_seen": 1323049984 }, { "epoch": 3.09, "learning_rate": 0.00030259779338014044, "loss": 2.7076, "theoretical_loss": 3.5556710714851825, "tokens_seen": 1323115520 }, { "epoch": 3.09, "learning_rate": 0.00030258776328986956, "loss": 2.9698, "theoretical_loss": 3.55565516758064, "tokens_seen": 1323181056 }, { "epoch": 3.09, "learning_rate": 0.0003025777331995988, "loss": 2.759, "theoretical_loss": 3.55563926468433, "tokens_seen": 1323246592 }, { "epoch": 3.09, "learning_rate": 0.000302567703109328, "loss": 2.8224, "theoretical_loss": 3.5556233627961378, "tokens_seen": 1323312128 }, { "epoch": 3.09, "learning_rate": 0.00030255767301905716, "loss": 2.8851, "theoretical_loss": 3.555607461915951, "tokens_seen": 1323377664 }, { "epoch": 3.09, "learning_rate": 0.0003025476429287864, "loss": 2.9592, "theoretical_loss": 3.555591562043655, "tokens_seen": 1323443200 }, { "epoch": 3.09, "learning_rate": 0.0003025376128385156, "loss": 2.7859, "theoretical_loss": 3.5555756631791358, "tokens_seen": 1323508736 }, { "epoch": 3.09, "learning_rate": 0.00030252758274824476, "loss": 2.5824, "theoretical_loss": 3.55555976532228, "tokens_seen": 1323574272 }, { "epoch": 3.09, "learning_rate": 0.00030251755265797394, "loss": 2.7408, "theoretical_loss": 3.5555438684729737, "tokens_seen": 1323639808 }, { "epoch": 3.09, "learning_rate": 0.0003025075225677031, "loss": 2.8829, "theoretical_loss": 3.555527972631103, "tokens_seen": 1323705344 }, { "epoch": 3.09, "learning_rate": 0.0003024974924774323, "loss": 2.7944, "theoretical_loss": 3.5555120777965543, "tokens_seen": 1323770880 }, { "epoch": 3.09, "objective/train/docs_used": 2117216, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8153276443481445, "objective/train/theoretical_loss": 3.5555081042452983, "objective/train/tokens_used": 1344247264, "theoretical_loss": 3.5555081042452983, "tokens_seen": 1323787264 }, { "epoch": 3.09, "learning_rate": 0.00030248746238716154, "loss": 2.7586, "theoretical_loss": 3.5554961839692147, "tokens_seen": 1323836416 }, { "epoch": 3.09, "learning_rate": 0.00030247743229689066, "loss": 2.962, "theoretical_loss": 3.555480291148969, "tokens_seen": 1323901952 }, { "epoch": 3.09, "learning_rate": 0.0003024674022066199, "loss": 2.835, "theoretical_loss": 3.5554643993357047, "tokens_seen": 1323967488 }, { "epoch": 3.09, "learning_rate": 0.00030245737211634903, "loss": 2.9141, "theoretical_loss": 3.5554485085293077, "tokens_seen": 1324033024 }, { "epoch": 3.09, "learning_rate": 0.00030244734202607826, "loss": 2.826, "theoretical_loss": 3.5554326187296645, "tokens_seen": 1324098560 }, { "epoch": 3.09, "learning_rate": 0.00030243731193580744, "loss": 2.8175, "theoretical_loss": 3.5554167299366615, "tokens_seen": 1324164096 }, { "epoch": 3.09, "learning_rate": 0.0003024272818455366, "loss": 2.8486, "theoretical_loss": 3.5554008421501853, "tokens_seen": 1324229632 }, { "epoch": 3.09, "learning_rate": 0.0003024172517552658, "loss": 2.9523, "theoretical_loss": 3.555384955370122, "tokens_seen": 1324295168 }, { "epoch": 3.09, "learning_rate": 0.000302407221664995, "loss": 2.8665, "theoretical_loss": 3.555369069596358, "tokens_seen": 1324360704 }, { "epoch": 3.09, "learning_rate": 0.00030239719157472417, "loss": 2.7006, "theoretical_loss": 3.5553531848287796, "tokens_seen": 1324426240 }, { "epoch": 3.09, "learning_rate": 0.0003023871614844534, "loss": 2.7569, "theoretical_loss": 3.555337301067274, "tokens_seen": 1324491776 }, { "epoch": 3.09, "learning_rate": 0.00030237713139418253, "loss": 2.7688, "theoretical_loss": 3.5553214183117268, "tokens_seen": 1324557312 }, { "epoch": 3.09, "learning_rate": 0.00030236710130391177, "loss": 2.962, "theoretical_loss": 3.5553055365620256, "tokens_seen": 1324622848 }, { "epoch": 3.09, "learning_rate": 0.00030235707121364095, "loss": 2.7806, "theoretical_loss": 3.555289655818056, "tokens_seen": 1324688384 }, { "epoch": 3.09, "learning_rate": 0.00030234704112337013, "loss": 2.7003, "theoretical_loss": 3.5552737760797055, "tokens_seen": 1324753920 }, { "epoch": 3.09, "learning_rate": 0.0003023370110330993, "loss": 2.9527, "theoretical_loss": 3.5552578973468596, "tokens_seen": 1324819456 }, { "epoch": 3.09, "learning_rate": 0.0003023269809428285, "loss": 2.7308, "theoretical_loss": 3.5552420196194054, "tokens_seen": 1324884992 }, { "epoch": 3.09, "learning_rate": 0.00030231695085255767, "loss": 2.8919, "theoretical_loss": 3.5552261428972294, "tokens_seen": 1324950528 }, { "epoch": 3.09, "learning_rate": 0.0003023069207622869, "loss": 2.866, "theoretical_loss": 3.555210267180219, "tokens_seen": 1325016064 }, { "epoch": 3.09, "learning_rate": 0.00030229689067201603, "loss": 2.7277, "theoretical_loss": 3.55519439246826, "tokens_seen": 1325081600 }, { "epoch": 3.09, "learning_rate": 0.00030228686058174527, "loss": 2.6961, "theoretical_loss": 3.555178518761239, "tokens_seen": 1325147136 }, { "epoch": 3.09, "learning_rate": 0.0003022768304914744, "loss": 2.7793, "theoretical_loss": 3.555162646059043, "tokens_seen": 1325212672 }, { "epoch": 3.09, "learning_rate": 0.00030226680040120363, "loss": 2.8489, "theoretical_loss": 3.5551467743615586, "tokens_seen": 1325278208 }, { "epoch": 3.09, "learning_rate": 0.0003022567703109328, "loss": 2.7104, "theoretical_loss": 3.5551309036686725, "tokens_seen": 1325343744 }, { "epoch": 3.09, "learning_rate": 0.000302246740220662, "loss": 2.9999, "theoretical_loss": 3.5551150339802717, "tokens_seen": 1325409280 }, { "epoch": 3.09, "objective/train/docs_used": 2119959, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.088566541671753, "objective/train/theoretical_loss": 3.555111066715109, "objective/train/tokens_used": 1345885664, "theoretical_loss": 3.555111066715109, "tokens_seen": 1325425664 }, { "epoch": 3.09, "learning_rate": 0.0003022367101303912, "loss": 2.9146, "theoretical_loss": 3.5550991652962427, "tokens_seen": 1325474816 }, { "epoch": 3.09, "learning_rate": 0.0003022266800401204, "loss": 2.7792, "theoretical_loss": 3.5550832976164726, "tokens_seen": 1325540352 }, { "epoch": 3.09, "learning_rate": 0.00030221664994984954, "loss": 2.7859, "theoretical_loss": 3.5550674309408477, "tokens_seen": 1325605888 }, { "epoch": 3.09, "learning_rate": 0.00030220661985957877, "loss": 2.9589, "theoretical_loss": 3.555051565269255, "tokens_seen": 1325671424 }, { "epoch": 3.09, "learning_rate": 0.0003021965897693079, "loss": 2.8379, "theoretical_loss": 3.5550357006015822, "tokens_seen": 1325736960 }, { "epoch": 3.09, "learning_rate": 0.00030218655967903713, "loss": 2.7561, "theoretical_loss": 3.5550198369377144, "tokens_seen": 1325802496 }, { "epoch": 3.09, "learning_rate": 0.0003021765295887663, "loss": 2.7292, "theoretical_loss": 3.5550039742775397, "tokens_seen": 1325868032 }, { "epoch": 3.09, "learning_rate": 0.0003021664994984955, "loss": 2.8262, "theoretical_loss": 3.5549881126209453, "tokens_seen": 1325933568 }, { "epoch": 3.09, "learning_rate": 0.0003021564694082247, "loss": 2.8024, "theoretical_loss": 3.5549722519678166, "tokens_seen": 1325999104 }, { "epoch": 3.09, "learning_rate": 0.00030214643931795386, "loss": 2.9236, "theoretical_loss": 3.5549563923180423, "tokens_seen": 1326064640 }, { "epoch": 3.09, "learning_rate": 0.00030213640922768304, "loss": 2.7349, "theoretical_loss": 3.554940533671508, "tokens_seen": 1326130176 }, { "epoch": 3.09, "learning_rate": 0.0003021263791374123, "loss": 2.7674, "theoretical_loss": 3.5549246760281017, "tokens_seen": 1326195712 }, { "epoch": 3.09, "learning_rate": 0.0003021163490471414, "loss": 2.8102, "theoretical_loss": 3.5549088193877094, "tokens_seen": 1326261248 }, { "epoch": 3.09, "learning_rate": 0.00030210631895687064, "loss": 2.9048, "theoretical_loss": 3.554892963750219, "tokens_seen": 1326326784 }, { "epoch": 3.09, "learning_rate": 0.00030209628886659976, "loss": 2.8455, "theoretical_loss": 3.5548771091155165, "tokens_seen": 1326392320 }, { "epoch": 3.09, "learning_rate": 0.000302086258776329, "loss": 2.7576, "theoretical_loss": 3.5548612554834897, "tokens_seen": 1326457856 }, { "epoch": 3.09, "learning_rate": 0.0003020762286860582, "loss": 2.7038, "theoretical_loss": 3.5548454028540255, "tokens_seen": 1326523392 }, { "epoch": 3.09, "learning_rate": 0.00030206619859578736, "loss": 2.9629, "theoretical_loss": 3.554829551227011, "tokens_seen": 1326588928 }, { "epoch": 3.09, "learning_rate": 0.00030205616850551654, "loss": 2.6515, "theoretical_loss": 3.5548137006023333, "tokens_seen": 1326654464 }, { "epoch": 3.09, "learning_rate": 0.0003020461384152458, "loss": 3.0502, "theoretical_loss": 3.554797850979879, "tokens_seen": 1326720000 }, { "epoch": 3.09, "learning_rate": 0.0003020361083249749, "loss": 2.8046, "theoretical_loss": 3.5547820023595365, "tokens_seen": 1326785536 }, { "epoch": 3.09, "learning_rate": 0.00030202607823470414, "loss": 2.8787, "theoretical_loss": 3.5547661547411917, "tokens_seen": 1326851072 }, { "epoch": 3.09, "learning_rate": 0.00030201604814443327, "loss": 2.9834, "theoretical_loss": 3.5547503081247322, "tokens_seen": 1326916608 }, { "epoch": 3.09, "learning_rate": 0.0003020060180541625, "loss": 2.8021, "theoretical_loss": 3.5547344625100448, "tokens_seen": 1326982144 }, { "epoch": 3.09, "learning_rate": 0.0003019959879638917, "loss": 2.8156, "theoretical_loss": 3.5547186178970174, "tokens_seen": 1327047680 }, { "epoch": 3.09, "objective/train/docs_used": 2122481, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5520663261413574, "objective/train/theoretical_loss": 3.5547146569002566, "objective/train/tokens_used": 1347524064, "theoretical_loss": 3.5547146569002566, "tokens_seen": 1327064064 }, { "epoch": 3.09, "learning_rate": 0.00030198595787362087, "loss": 2.6954, "theoretical_loss": 3.554702774285537, "tokens_seen": 1327113216 }, { "epoch": 3.09, "learning_rate": 0.00030197592778335005, "loss": 2.8824, "theoretical_loss": 3.5546869316754903, "tokens_seen": 1327178752 }, { "epoch": 3.09, "learning_rate": 0.00030196589769307923, "loss": 2.7242, "theoretical_loss": 3.5546710900667655, "tokens_seen": 1327244288 }, { "epoch": 3.09, "learning_rate": 0.0003019558676028084, "loss": 2.5772, "theoretical_loss": 3.554655249459249, "tokens_seen": 1327309824 }, { "epoch": 3.09, "learning_rate": 0.00030194583751253764, "loss": 2.9047, "theoretical_loss": 3.5546394098528284, "tokens_seen": 1327375360 }, { "epoch": 3.09, "learning_rate": 0.00030193580742226677, "loss": 2.8214, "theoretical_loss": 3.5546235712473915, "tokens_seen": 1327440896 }, { "epoch": 3.09, "learning_rate": 0.000301925777331996, "loss": 2.8189, "theoretical_loss": 3.5546077336428246, "tokens_seen": 1327506432 }, { "epoch": 3.09, "learning_rate": 0.00030191574724172513, "loss": 2.6784, "theoretical_loss": 3.554591897039016, "tokens_seen": 1327571968 }, { "epoch": 3.09, "learning_rate": 0.00030190571715145437, "loss": 2.7436, "theoretical_loss": 3.554576061435853, "tokens_seen": 1327637504 }, { "epoch": 3.09, "learning_rate": 0.00030189568706118355, "loss": 2.885, "theoretical_loss": 3.554560226833222, "tokens_seen": 1327703040 }, { "epoch": 3.09, "learning_rate": 0.00030188565697091273, "loss": 2.9097, "theoretical_loss": 3.5545443932310112, "tokens_seen": 1327768576 }, { "epoch": 3.09, "learning_rate": 0.0003018756268806419, "loss": 2.875, "theoretical_loss": 3.554528560629108, "tokens_seen": 1327834112 }, { "epoch": 3.09, "learning_rate": 0.00030186559679037115, "loss": 2.8452, "theoretical_loss": 3.5545127290274, "tokens_seen": 1327899648 }, { "epoch": 3.09, "learning_rate": 0.0003018555667001003, "loss": 2.859, "theoretical_loss": 3.554496898425774, "tokens_seen": 1327965184 }, { "epoch": 3.09, "learning_rate": 0.0003018455366098295, "loss": 2.8346, "theoretical_loss": 3.554481068824118, "tokens_seen": 1328030720 }, { "epoch": 3.09, "learning_rate": 0.00030183550651955864, "loss": 2.7906, "theoretical_loss": 3.55446524022232, "tokens_seen": 1328096256 }, { "epoch": 3.09, "learning_rate": 0.00030182547642928787, "loss": 2.9036, "theoretical_loss": 3.554449412620266, "tokens_seen": 1328161792 }, { "epoch": 3.09, "learning_rate": 0.00030181544633901705, "loss": 2.8005, "theoretical_loss": 3.5544335860178444, "tokens_seen": 1328227328 }, { "epoch": 3.09, "learning_rate": 0.00030180541624874623, "loss": 2.8995, "theoretical_loss": 3.5544177604149434, "tokens_seen": 1328292864 }, { "epoch": 3.09, "learning_rate": 0.00030179538615847547, "loss": 2.8987, "theoretical_loss": 3.5544019358114496, "tokens_seen": 1328358400 }, { "epoch": 3.09, "learning_rate": 0.0003017853560682046, "loss": 2.88, "theoretical_loss": 3.5543861122072506, "tokens_seen": 1328423936 }, { "epoch": 3.09, "learning_rate": 0.00030177532597793383, "loss": 2.9079, "theoretical_loss": 3.5543702896022347, "tokens_seen": 1328489472 }, { "epoch": 3.09, "learning_rate": 0.000301765295887663, "loss": 2.8889, "theoretical_loss": 3.554354467996289, "tokens_seen": 1328555008 }, { "epoch": 3.09, "learning_rate": 0.0003017552657973922, "loss": 2.8562, "theoretical_loss": 3.5543386473893013, "tokens_seen": 1328620544 }, { "epoch": 3.09, "learning_rate": 0.0003017452357071214, "loss": 2.8271, "theoretical_loss": 3.554322827781159, "tokens_seen": 1328686080 }, { "epoch": 3.09, "objective/train/docs_used": 2123978, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7763121128082275, "objective/train/theoretical_loss": 3.5543188730351805, "objective/train/tokens_used": 1349162464, "theoretical_loss": 3.5543188730351805, "tokens_seen": 1328702464 }, { "epoch": 3.09, "learning_rate": 0.0003017352056168506, "loss": 2.8503, "theoretical_loss": 3.5543070091717506, "tokens_seen": 1328751616 }, { "epoch": 3.09, "learning_rate": 0.00030172517552657974, "loss": 2.8157, "theoretical_loss": 3.5542911915609627, "tokens_seen": 1328817152 }, { "epoch": 3.09, "learning_rate": 0.00030171514543630897, "loss": 2.8045, "theoretical_loss": 3.5542753749486833, "tokens_seen": 1328882688 }, { "epoch": 3.09, "learning_rate": 0.0003017051153460381, "loss": 2.9044, "theoretical_loss": 3.5542595593348008, "tokens_seen": 1328948224 }, { "epoch": 3.09, "learning_rate": 0.00030169508525576733, "loss": 2.7537, "theoretical_loss": 3.5542437447192023, "tokens_seen": 1329013760 }, { "epoch": 3.09, "learning_rate": 0.0003016850551654965, "loss": 2.8534, "theoretical_loss": 3.5542279311017757, "tokens_seen": 1329079296 }, { "epoch": 3.09, "learning_rate": 0.0003016750250752257, "loss": 2.6784, "theoretical_loss": 3.5542121184824085, "tokens_seen": 1329144832 }, { "epoch": 3.09, "learning_rate": 0.0003016649949849549, "loss": 2.8437, "theoretical_loss": 3.554196306860989, "tokens_seen": 1329210368 }, { "epoch": 3.09, "learning_rate": 0.00030165496489468406, "loss": 2.8423, "theoretical_loss": 3.5541804962374055, "tokens_seen": 1329275904 }, { "epoch": 3.09, "learning_rate": 0.00030164493480441324, "loss": 2.6973, "theoretical_loss": 3.5541646866115446, "tokens_seen": 1329341440 }, { "epoch": 3.09, "learning_rate": 0.0003016349047141425, "loss": 2.7925, "theoretical_loss": 3.5541488779832946, "tokens_seen": 1329406976 }, { "epoch": 3.09, "learning_rate": 0.0003016248746238716, "loss": 2.8339, "theoretical_loss": 3.554133070352544, "tokens_seen": 1329472512 }, { "epoch": 3.09, "learning_rate": 0.00030161484453360084, "loss": 2.7936, "theoretical_loss": 3.55411726371918, "tokens_seen": 1329538048 }, { "epoch": 3.09, "learning_rate": 0.00030160481444332996, "loss": 2.6524, "theoretical_loss": 3.5541014580830907, "tokens_seen": 1329603584 }, { "epoch": 3.09, "learning_rate": 0.0003015947843530592, "loss": 2.7867, "theoretical_loss": 3.554085653444164, "tokens_seen": 1329669120 }, { "epoch": 3.09, "learning_rate": 0.0003015847542627884, "loss": 2.8588, "theoretical_loss": 3.554069849802288, "tokens_seen": 1329734656 }, { "epoch": 3.09, "learning_rate": 0.00030157472417251756, "loss": 2.903, "theoretical_loss": 3.5540540471573507, "tokens_seen": 1329800192 }, { "epoch": 3.09, "learning_rate": 0.00030156469408224674, "loss": 2.7664, "theoretical_loss": 3.554038245509239, "tokens_seen": 1329865728 }, { "epoch": 3.09, "learning_rate": 0.000301554663991976, "loss": 2.6735, "theoretical_loss": 3.554022444857843, "tokens_seen": 1329931264 }, { "epoch": 3.09, "learning_rate": 0.0003015446339017051, "loss": 2.9409, "theoretical_loss": 3.5540066452030494, "tokens_seen": 1329996800 }, { "epoch": 3.09, "learning_rate": 0.00030153460381143434, "loss": 2.6595, "theoretical_loss": 3.553990846544746, "tokens_seen": 1330062336 }, { "epoch": 3.09, "learning_rate": 0.00030152457372116347, "loss": 2.88, "theoretical_loss": 3.5539750488828212, "tokens_seen": 1330127872 }, { "epoch": 3.09, "learning_rate": 0.0003015145436308927, "loss": 2.6798, "theoretical_loss": 3.5539592522171635, "tokens_seen": 1330193408 }, { "epoch": 3.09, "learning_rate": 0.0003015045135406219, "loss": 2.7921, "theoretical_loss": 3.5539434565476604, "tokens_seen": 1330258944 }, { "epoch": 3.09, "learning_rate": 0.00030149448345035107, "loss": 2.844, "theoretical_loss": 3.5539276618742, "tokens_seen": 1330324480 }, { "epoch": 3.09, "objective/train/docs_used": 2126663, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7675726413726807, "objective/train/theoretical_loss": 3.5539237133614536, "objective/train/tokens_used": 1350800864, "theoretical_loss": 3.5539237133614536, "tokens_seen": 1330340864 }, { "epoch": 3.09, "learning_rate": 0.00030148445336008025, "loss": 2.7634, "theoretical_loss": 3.5539118681966713, "tokens_seen": 1330390016 }, { "epoch": 3.09, "learning_rate": 0.00030147442326980943, "loss": 2.7899, "theoretical_loss": 3.5538960755149613, "tokens_seen": 1330455552 }, { "epoch": 3.09, "learning_rate": 0.0003014643931795386, "loss": 2.8391, "theoretical_loss": 3.5538802838289585, "tokens_seen": 1330521088 }, { "epoch": 3.09, "learning_rate": 0.00030145436308926784, "loss": 2.8962, "theoretical_loss": 3.5538644931385517, "tokens_seen": 1330586624 }, { "epoch": 3.09, "learning_rate": 0.00030144433299899697, "loss": 2.6915, "theoretical_loss": 3.5538487034436286, "tokens_seen": 1330652160 }, { "epoch": 3.09, "learning_rate": 0.0003014343029087262, "loss": 2.7046, "theoretical_loss": 3.5538329147440773, "tokens_seen": 1330717696 }, { "epoch": 3.09, "learning_rate": 0.00030142427281845533, "loss": 2.8123, "theoretical_loss": 3.5538171270397863, "tokens_seen": 1330783232 }, { "epoch": 3.09, "learning_rate": 0.00030141424272818457, "loss": 2.7488, "theoretical_loss": 3.5538013403306437, "tokens_seen": 1330848768 }, { "epoch": 3.09, "learning_rate": 0.00030140421263791375, "loss": 2.5635, "theoretical_loss": 3.553785554616538, "tokens_seen": 1330914304 }, { "epoch": 3.09, "learning_rate": 0.00030139418254764293, "loss": 2.6908, "theoretical_loss": 3.5537697698973574, "tokens_seen": 1330979840 }, { "epoch": 3.09, "learning_rate": 0.0003013841524573721, "loss": 2.8011, "theoretical_loss": 3.55375398617299, "tokens_seen": 1331045376 }, { "epoch": 3.09, "learning_rate": 0.00030137412236710135, "loss": 2.6846, "theoretical_loss": 3.5537382034433236, "tokens_seen": 1331110912 }, { "epoch": 3.09, "learning_rate": 0.0003013640922768305, "loss": 2.7895, "theoretical_loss": 3.553722421708248, "tokens_seen": 1331176448 }, { "epoch": 3.09, "learning_rate": 0.0003013540621865597, "loss": 2.7863, "theoretical_loss": 3.5537066409676505, "tokens_seen": 1331241984 }, { "epoch": 3.09, "learning_rate": 0.00030134403209628884, "loss": 2.7763, "theoretical_loss": 3.55369086122142, "tokens_seen": 1331307520 }, { "epoch": 3.09, "learning_rate": 0.00030133400200601807, "loss": 2.8215, "theoretical_loss": 3.5536750824694447, "tokens_seen": 1331373056 }, { "epoch": 3.09, "learning_rate": 0.00030132397191574725, "loss": 2.6146, "theoretical_loss": 3.553659304711612, "tokens_seen": 1331438592 }, { "epoch": 3.09, "learning_rate": 0.00030131394182547643, "loss": 2.7519, "theoretical_loss": 3.5536435279478127, "tokens_seen": 1331504128 }, { "epoch": 3.09, "learning_rate": 0.0003013039117352056, "loss": 2.7509, "theoretical_loss": 3.553627752177933, "tokens_seen": 1331569664 }, { "epoch": 3.09, "learning_rate": 0.0003012938816449348, "loss": 2.7113, "theoretical_loss": 3.5536119774018626, "tokens_seen": 1331635200 }, { "epoch": 3.09, "learning_rate": 0.000301283851554664, "loss": 2.8307, "theoretical_loss": 3.553596203619489, "tokens_seen": 1331700736 }, { "epoch": 3.09, "learning_rate": 0.0003012738214643932, "loss": 2.92, "theoretical_loss": 3.553580430830702, "tokens_seen": 1331766272 }, { "epoch": 3.09, "learning_rate": 0.00030126379137412234, "loss": 2.8241, "theoretical_loss": 3.553564659035389, "tokens_seen": 1331831808 }, { "epoch": 3.09, "learning_rate": 0.0003012537612838516, "loss": 2.793, "theoretical_loss": 3.553548888233439, "tokens_seen": 1331897344 }, { "epoch": 3.09, "learning_rate": 0.0003012437311935807, "loss": 2.9044, "theoretical_loss": 3.553533118424741, "tokens_seen": 1331962880 }, { "epoch": 3.09, "objective/train/docs_used": 2129395, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.844210147857666, "objective/train/theoretical_loss": 3.553529176127749, "objective/train/tokens_used": 1352439264, "theoretical_loss": 3.553529176127749, "tokens_seen": 1331979264 }, { "epoch": 3.09, "learning_rate": 0.00030123370110330994, "loss": 2.8386, "theoretical_loss": 3.5535173496091828, "tokens_seen": 1332028416 }, { "epoch": 3.09, "learning_rate": 0.0003012236710130391, "loss": 2.9755, "theoretical_loss": 3.5535015817866533, "tokens_seen": 1332093952 }, { "epoch": 3.09, "learning_rate": 0.0003012136409227683, "loss": 2.9535, "theoretical_loss": 3.5534858149570407, "tokens_seen": 1332159488 }, { "epoch": 3.09, "learning_rate": 0.0003012036108324975, "loss": 2.7141, "theoretical_loss": 3.553470049120235, "tokens_seen": 1332225024 }, { "epoch": 3.09, "learning_rate": 0.0003011935807422267, "loss": 2.9113, "theoretical_loss": 3.553454284276123, "tokens_seen": 1332290560 }, { "epoch": 3.09, "learning_rate": 0.00030118355065195584, "loss": 2.8341, "theoretical_loss": 3.553438520424595, "tokens_seen": 1332356096 }, { "epoch": 3.09, "learning_rate": 0.0003011735205616851, "loss": 2.7836, "theoretical_loss": 3.5534227575655386, "tokens_seen": 1332421632 }, { "epoch": 3.09, "learning_rate": 0.0003011634904714142, "loss": 2.7288, "theoretical_loss": 3.553406995698843, "tokens_seen": 1332487168 }, { "epoch": 3.09, "learning_rate": 0.00030115346038114344, "loss": 2.8618, "theoretical_loss": 3.553391234824397, "tokens_seen": 1332552704 }, { "epoch": 3.09, "learning_rate": 0.0003011434302908726, "loss": 2.8018, "theoretical_loss": 3.553375474942089, "tokens_seen": 1332618240 }, { "epoch": 3.09, "learning_rate": 0.0003011334002006018, "loss": 2.8868, "theoretical_loss": 3.5533597160518076, "tokens_seen": 1332683776 }, { "epoch": 3.09, "learning_rate": 0.000301123370110331, "loss": 2.8148, "theoretical_loss": 3.5533439581534423, "tokens_seen": 1332749312 }, { "epoch": 3.09, "learning_rate": 0.00030111334002006016, "loss": 2.7406, "theoretical_loss": 3.553328201246881, "tokens_seen": 1332814848 }, { "epoch": 3.09, "learning_rate": 0.00030110330992978935, "loss": 2.8482, "theoretical_loss": 3.5533124453320135, "tokens_seen": 1332880384 }, { "epoch": 3.09, "learning_rate": 0.0003010932798395186, "loss": 2.8337, "theoretical_loss": 3.553296690408728, "tokens_seen": 1332945920 }, { "epoch": 3.09, "learning_rate": 0.0003010832497492477, "loss": 2.8186, "theoretical_loss": 3.553280936476913, "tokens_seen": 1333011456 }, { "epoch": 3.09, "learning_rate": 0.00030107321965897694, "loss": 2.6856, "theoretical_loss": 3.5532651835364586, "tokens_seen": 1333076992 }, { "epoch": 3.09, "learning_rate": 0.00030106318956870607, "loss": 2.8617, "theoretical_loss": 3.5532494315872523, "tokens_seen": 1333142528 }, { "epoch": 3.09, "learning_rate": 0.0003010531594784353, "loss": 2.7739, "theoretical_loss": 3.5532336806291838, "tokens_seen": 1333208064 }, { "epoch": 3.09, "learning_rate": 0.00030104312938816454, "loss": 2.8734, "theoretical_loss": 3.5532179306621416, "tokens_seen": 1333273600 }, { "epoch": 3.09, "learning_rate": 0.00030103309929789367, "loss": 2.8929, "theoretical_loss": 3.5532021816860153, "tokens_seen": 1333339136 }, { "epoch": 3.09, "learning_rate": 0.0003010230692076229, "loss": 2.9658, "theoretical_loss": 3.5531864337006933, "tokens_seen": 1333404672 }, { "epoch": 3.09, "learning_rate": 0.0003010130391173521, "loss": 2.8579, "theoretical_loss": 3.553170686706064, "tokens_seen": 1333470208 }, { "epoch": 3.09, "learning_rate": 0.00030100300902708127, "loss": 2.8116, "theoretical_loss": 3.553154940702018, "tokens_seen": 1333535744 }, { "epoch": 3.09, "learning_rate": 0.00030099297893681045, "loss": 2.7941, "theoretical_loss": 3.553139195688443, "tokens_seen": 1333601280 }, { "epoch": 3.09, "objective/train/docs_used": 2132198, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8671371936798096, "objective/train/theoretical_loss": 3.553135259589798, "objective/train/tokens_used": 1354077664, "theoretical_loss": 3.553135259589798, "tokens_seen": 1333617664 }, { "epoch": 3.09, "learning_rate": 0.00030098294884653963, "loss": 2.8619, "theoretical_loss": 3.5531234516652286, "tokens_seen": 1333666816 }, { "epoch": 3.09, "learning_rate": 0.0003009729187562688, "loss": 2.8459, "theoretical_loss": 3.5531077086322638, "tokens_seen": 1333732352 }, { "epoch": 3.09, "learning_rate": 0.00030096288866599804, "loss": 2.9422, "theoretical_loss": 3.553091966589437, "tokens_seen": 1333797888 }, { "epoch": 3.09, "learning_rate": 0.00030095285857572717, "loss": 2.7433, "theoretical_loss": 3.553076225536638, "tokens_seen": 1333863424 }, { "epoch": 3.09, "learning_rate": 0.0003009428284854564, "loss": 2.8771, "theoretical_loss": 3.5530604854737557, "tokens_seen": 1333928960 }, { "epoch": 3.09, "learning_rate": 0.00030093279839518553, "loss": 2.8703, "theoretical_loss": 3.55304474640068, "tokens_seen": 1333994496 }, { "epoch": 3.09, "learning_rate": 0.00030092276830491477, "loss": 2.8596, "theoretical_loss": 3.5530290083172984, "tokens_seen": 1334060032 }, { "epoch": 3.09, "learning_rate": 0.00030091273821464395, "loss": 2.9615, "theoretical_loss": 3.5530132712235014, "tokens_seen": 1334125568 }, { "epoch": 3.09, "learning_rate": 0.00030090270812437313, "loss": 2.7833, "theoretical_loss": 3.552997535119177, "tokens_seen": 1334191104 }, { "epoch": 3.09, "learning_rate": 0.0003008926780341023, "loss": 2.6517, "theoretical_loss": 3.5529818000042157, "tokens_seen": 1334256640 }, { "epoch": 3.09, "learning_rate": 0.00030088264794383155, "loss": 2.9361, "theoretical_loss": 3.552966065878506, "tokens_seen": 1334322176 }, { "epoch": 3.09, "learning_rate": 0.0003008726178535607, "loss": 2.8218, "theoretical_loss": 3.552950332741937, "tokens_seen": 1334387712 }, { "epoch": 3.09, "learning_rate": 0.0003008625877632899, "loss": 2.7542, "theoretical_loss": 3.552934600594398, "tokens_seen": 1334453248 }, { "epoch": 3.09, "learning_rate": 0.00030085255767301904, "loss": 2.8248, "theoretical_loss": 3.5529188694357785, "tokens_seen": 1334518784 }, { "epoch": 3.09, "learning_rate": 0.00030084252758274827, "loss": 2.8888, "theoretical_loss": 3.5529031392659682, "tokens_seen": 1334584320 }, { "epoch": 3.09, "learning_rate": 0.00030083249749247745, "loss": 2.9331, "theoretical_loss": 3.552887410084855, "tokens_seen": 1334649856 }, { "epoch": 3.09, "learning_rate": 0.00030082246740220663, "loss": 3.0677, "theoretical_loss": 3.55287168189233, "tokens_seen": 1334715392 }, { "epoch": 3.09, "learning_rate": 0.0003008124373119358, "loss": 2.7805, "theoretical_loss": 3.552855954688281, "tokens_seen": 1334780928 }, { "epoch": 3.09, "learning_rate": 0.000300802407221665, "loss": 2.9817, "theoretical_loss": 3.552840228472598, "tokens_seen": 1334846464 }, { "epoch": 3.09, "learning_rate": 0.0003007923771313942, "loss": 2.8183, "theoretical_loss": 3.5528245032451706, "tokens_seen": 1334912000 }, { "epoch": 3.09, "learning_rate": 0.0003007823470411234, "loss": 2.7813, "theoretical_loss": 3.552808779005888, "tokens_seen": 1334977536 }, { "epoch": 3.09, "learning_rate": 0.00030077231695085254, "loss": 2.8165, "theoretical_loss": 3.5527930557546394, "tokens_seen": 1335043072 }, { "epoch": 3.09, "learning_rate": 0.0003007622868605818, "loss": 2.8804, "theoretical_loss": 3.552777333491314, "tokens_seen": 1335108608 }, { "epoch": 3.09, "learning_rate": 0.0003007522567703109, "loss": 2.8883, "theoretical_loss": 3.552761612215802, "tokens_seen": 1335174144 }, { "epoch": 3.09, "learning_rate": 0.00030074222668004014, "loss": 2.6271, "theoretical_loss": 3.552745891927992, "tokens_seen": 1335239680 }, { "epoch": 3.09, "objective/train/docs_used": 2134950, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.79323410987854, "objective/train/theoretical_loss": 3.552741962010355, "objective/train/tokens_used": 1355716064, "theoretical_loss": 3.552741962010355, "tokens_seen": 1335256064 }, { "epoch": 3.09, "learning_rate": 0.0003007321965897693, "loss": 2.7689, "theoretical_loss": 3.5527301726277747, "tokens_seen": 1335305216 }, { "epoch": 3.09, "learning_rate": 0.0003007221664994985, "loss": 2.5616, "theoretical_loss": 3.552714454315038, "tokens_seen": 1335370752 }, { "epoch": 3.09, "learning_rate": 0.0003007121364092277, "loss": 2.7614, "theoretical_loss": 3.5526987369896723, "tokens_seen": 1335436288 }, { "epoch": 3.09, "learning_rate": 0.0003007021063189569, "loss": 2.7525, "theoretical_loss": 3.5526830206515676, "tokens_seen": 1335501824 }, { "epoch": 3.09, "learning_rate": 0.00030069207622868604, "loss": 2.6861, "theoretical_loss": 3.552667305300612, "tokens_seen": 1335567360 }, { "epoch": 3.09, "learning_rate": 0.0003006820461384153, "loss": 2.9151, "theoretical_loss": 3.5526515909366965, "tokens_seen": 1335632896 }, { "epoch": 3.09, "learning_rate": 0.0003006720160481444, "loss": 2.866, "theoretical_loss": 3.5526358775597098, "tokens_seen": 1335698432 }, { "epoch": 3.09, "learning_rate": 0.00030066198595787364, "loss": 2.903, "theoretical_loss": 3.552620165169542, "tokens_seen": 1335763968 }, { "epoch": 3.09, "learning_rate": 0.0003006519558676028, "loss": 2.6279, "theoretical_loss": 3.552604453766083, "tokens_seen": 1335829504 }, { "epoch": 3.09, "learning_rate": 0.000300641925777332, "loss": 2.8138, "theoretical_loss": 3.5525887433492214, "tokens_seen": 1335895040 }, { "epoch": 3.09, "learning_rate": 0.0003006318956870612, "loss": 2.8528, "theoretical_loss": 3.552573033918848, "tokens_seen": 1335960576 }, { "epoch": 3.09, "learning_rate": 0.00030062186559679036, "loss": 2.8955, "theoretical_loss": 3.552557325474851, "tokens_seen": 1336026112 }, { "epoch": 3.09, "learning_rate": 0.00030061183550651955, "loss": 2.708, "theoretical_loss": 3.5525416180171216, "tokens_seen": 1336091648 }, { "epoch": 3.09, "learning_rate": 0.0003006018054162488, "loss": 2.7153, "theoretical_loss": 3.5525259115455485, "tokens_seen": 1336157184 }, { "epoch": 3.09, "learning_rate": 0.0003005917753259779, "loss": 2.8222, "theoretical_loss": 3.552510206060022, "tokens_seen": 1336222720 }, { "epoch": 3.09, "learning_rate": 0.00030058174523570714, "loss": 2.8671, "theoretical_loss": 3.5524945015604317, "tokens_seen": 1336288256 }, { "epoch": 3.09, "learning_rate": 0.00030057171514543627, "loss": 2.8856, "theoretical_loss": 3.5524787980466668, "tokens_seen": 1336353792 }, { "epoch": 3.09, "learning_rate": 0.0003005616850551655, "loss": 2.6661, "theoretical_loss": 3.5524630955186183, "tokens_seen": 1336419328 }, { "epoch": 3.09, "learning_rate": 0.0003005516549648947, "loss": 2.846, "theoretical_loss": 3.552447393976175, "tokens_seen": 1336484864 }, { "epoch": 3.09, "learning_rate": 0.00030054162487462387, "loss": 3.0256, "theoretical_loss": 3.5524316934192264, "tokens_seen": 1336550400 }, { "epoch": 3.09, "learning_rate": 0.00030053159478435305, "loss": 2.8816, "theoretical_loss": 3.5524159938476636, "tokens_seen": 1336615936 }, { "epoch": 3.09, "learning_rate": 0.0003005215646940823, "loss": 2.8233, "theoretical_loss": 3.552400295261376, "tokens_seen": 1336681472 }, { "epoch": 3.09, "learning_rate": 0.0003005115346038114, "loss": 2.6107, "theoretical_loss": 3.5523845976602524, "tokens_seen": 1336747008 }, { "epoch": 3.09, "learning_rate": 0.00030050150451354065, "loss": 2.974, "theoretical_loss": 3.5523689010441837, "tokens_seen": 1336812544 }, { "epoch": 3.09, "learning_rate": 0.0003004914744232698, "loss": 2.8759, "theoretical_loss": 3.55235320541306, "tokens_seen": 1336878080 }, { "epoch": 3.09, "objective/train/docs_used": 2137671, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.155066967010498, "objective/train/theoretical_loss": 3.5523492816591635, "objective/train/tokens_used": 1357354464, "theoretical_loss": 3.5523492816591635, "tokens_seen": 1336894464 }, { "epoch": 3.09, "learning_rate": 0.000300481444332999, "loss": 2.8671, "theoretical_loss": 3.5523375107667703, "tokens_seen": 1336943616 }, { "epoch": 3.09, "learning_rate": 0.0003004714142427282, "loss": 2.7675, "theoretical_loss": 3.5523218171052053, "tokens_seen": 1337009152 }, { "epoch": 3.09, "learning_rate": 0.00030046138415245737, "loss": 2.7375, "theoretical_loss": 3.552306124428255, "tokens_seen": 1337074688 }, { "epoch": 3.09, "learning_rate": 0.00030045135406218655, "loss": 2.9342, "theoretical_loss": 3.5522904327358082, "tokens_seen": 1337140224 }, { "epoch": 3.09, "learning_rate": 0.00030044132397191573, "loss": 2.8057, "theoretical_loss": 3.5522747420277563, "tokens_seen": 1337205760 }, { "epoch": 3.09, "learning_rate": 0.0003004312938816449, "loss": 2.9044, "theoretical_loss": 3.552259052303989, "tokens_seen": 1337271296 }, { "epoch": 3.09, "learning_rate": 0.00030042126379137415, "loss": 2.8658, "theoretical_loss": 3.5522433635643953, "tokens_seen": 1337336832 }, { "epoch": 3.09, "learning_rate": 0.0003004112337011033, "loss": 2.9531, "theoretical_loss": 3.552227675808867, "tokens_seen": 1337402368 }, { "epoch": 3.09, "learning_rate": 0.0003004012036108325, "loss": 2.8693, "theoretical_loss": 3.5522119890372927, "tokens_seen": 1337467904 }, { "epoch": 3.09, "learning_rate": 0.0003003911735205617, "loss": 2.8501, "theoretical_loss": 3.5521963032495627, "tokens_seen": 1337533440 }, { "epoch": 3.09, "learning_rate": 0.0003003811434302909, "loss": 2.8426, "theoretical_loss": 3.5521806184455675, "tokens_seen": 1337598976 }, { "epoch": 3.09, "learning_rate": 0.00030037111334002006, "loss": 2.8716, "theoretical_loss": 3.552164934625197, "tokens_seen": 1337664512 }, { "epoch": 3.09, "learning_rate": 0.00030036108324974924, "loss": 2.8157, "theoretical_loss": 3.5521492517883417, "tokens_seen": 1337730048 }, { "epoch": 3.09, "learning_rate": 0.0003003510531594784, "loss": 2.7831, "theoretical_loss": 3.5521335699348913, "tokens_seen": 1337795584 }, { "epoch": 3.09, "learning_rate": 0.00030034102306920765, "loss": 2.6367, "theoretical_loss": 3.552117889064736, "tokens_seen": 1337861120 }, { "epoch": 3.09, "learning_rate": 0.0003003309929789368, "loss": 2.7646, "theoretical_loss": 3.5521022091777663, "tokens_seen": 1337926656 }, { "epoch": 3.09, "learning_rate": 0.000300320962888666, "loss": 2.8923, "theoretical_loss": 3.552086530273872, "tokens_seen": 1337992192 }, { "epoch": 3.09, "learning_rate": 0.00030031093279839514, "loss": 2.7756, "theoretical_loss": 3.5520708523529434, "tokens_seen": 1338057728 }, { "epoch": 3.09, "learning_rate": 0.0003003009027081244, "loss": 2.9127, "theoretical_loss": 3.552055175414871, "tokens_seen": 1338123264 }, { "epoch": 3.09, "learning_rate": 0.0003002908726178536, "loss": 2.806, "theoretical_loss": 3.5520394994595446, "tokens_seen": 1338188800 }, { "epoch": 3.09, "learning_rate": 0.00030028084252758274, "loss": 2.726, "theoretical_loss": 3.552023824486855, "tokens_seen": 1338254336 }, { "epoch": 3.09, "learning_rate": 0.000300270812437312, "loss": 2.7466, "theoretical_loss": 3.5520081504966923, "tokens_seen": 1338319872 }, { "epoch": 3.09, "learning_rate": 0.0003002607823470411, "loss": 2.8623, "theoretical_loss": 3.5519924774889464, "tokens_seen": 1338385408 }, { "epoch": 3.09, "learning_rate": 0.00030025075225677034, "loss": 2.8265, "theoretical_loss": 3.551976805463508, "tokens_seen": 1338450944 }, { "epoch": 3.09, "learning_rate": 0.0003002407221664995, "loss": 2.6787, "theoretical_loss": 3.551961134420268, "tokens_seen": 1338516480 }, { "epoch": 3.09, "objective/train/docs_used": 2139095, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.734720230102539, "objective/train/theoretical_loss": 3.5519572168129137, "objective/train/tokens_used": 1358992864, "theoretical_loss": 3.5519572168129137, "tokens_seen": 1338532864 }, { "epoch": 3.09, "learning_rate": 0.0003002306920762287, "loss": 2.8998, "theoretical_loss": 3.551945464359116, "tokens_seen": 1338582016 }, { "epoch": 3.1, "learning_rate": 0.0003002206619859579, "loss": 2.9106, "theoretical_loss": 3.551929795279942, "tokens_seen": 1338647552 }, { "epoch": 3.1, "learning_rate": 0.0003002106318956871, "loss": 2.7888, "theoretical_loss": 3.5519141271826378, "tokens_seen": 1338713088 }, { "epoch": 3.1, "learning_rate": 0.00030020060180541624, "loss": 2.9392, "theoretical_loss": 3.551898460067092, "tokens_seen": 1338778624 }, { "epoch": 3.1, "learning_rate": 0.0003001905717151455, "loss": 2.8999, "theoretical_loss": 3.5518827939331965, "tokens_seen": 1338844160 }, { "epoch": 3.1, "learning_rate": 0.0003001805416248746, "loss": 2.8143, "theoretical_loss": 3.5518671287808408, "tokens_seen": 1338909696 }, { "epoch": 3.1, "learning_rate": 0.00030017051153460384, "loss": 2.781, "theoretical_loss": 3.551851464609916, "tokens_seen": 1338975232 }, { "epoch": 3.1, "learning_rate": 0.000300160481444333, "loss": 2.8631, "theoretical_loss": 3.551835801420313, "tokens_seen": 1339040768 }, { "epoch": 3.1, "learning_rate": 0.0003001504513540622, "loss": 2.8628, "theoretical_loss": 3.5518201392119213, "tokens_seen": 1339106304 }, { "epoch": 3.1, "learning_rate": 0.0003001404212637914, "loss": 2.8371, "theoretical_loss": 3.551804477984631, "tokens_seen": 1339171840 }, { "epoch": 3.1, "learning_rate": 0.00030013039117352056, "loss": 2.9554, "theoretical_loss": 3.551788817738334, "tokens_seen": 1339237376 }, { "epoch": 3.1, "learning_rate": 0.00030012036108324975, "loss": 2.7242, "theoretical_loss": 3.5517731584729204, "tokens_seen": 1339302912 }, { "epoch": 3.1, "learning_rate": 0.000300110330992979, "loss": 2.7708, "theoretical_loss": 3.5517575001882804, "tokens_seen": 1339368448 }, { "epoch": 3.1, "learning_rate": 0.0003001003009027081, "loss": 2.9175, "theoretical_loss": 3.5517418428843053, "tokens_seen": 1339433984 }, { "epoch": 3.1, "learning_rate": 0.00030009027081243734, "loss": 2.918, "theoretical_loss": 3.551726186560885, "tokens_seen": 1339499520 }, { "epoch": 3.1, "learning_rate": 0.00030008024072216647, "loss": 2.8502, "theoretical_loss": 3.55171053121791, "tokens_seen": 1339565056 }, { "epoch": 3.1, "learning_rate": 0.0003000702106318957, "loss": 2.9521, "theoretical_loss": 3.5516948768552714, "tokens_seen": 1339630592 }, { "epoch": 3.1, "learning_rate": 0.0003000601805416249, "loss": 3.026, "theoretical_loss": 3.55167922347286, "tokens_seen": 1339696128 }, { "epoch": 3.1, "learning_rate": 0.00030005015045135407, "loss": 2.7233, "theoretical_loss": 3.5516635710705664, "tokens_seen": 1339761664 }, { "epoch": 3.1, "learning_rate": 0.00030004012036108325, "loss": 2.6532, "theoretical_loss": 3.55164791964828, "tokens_seen": 1339827200 }, { "epoch": 3.1, "learning_rate": 0.0003000300902708125, "loss": 2.9783, "theoretical_loss": 3.5516322692058937, "tokens_seen": 1339892736 }, { "epoch": 3.1, "learning_rate": 0.0003000200601805416, "loss": 2.8577, "theoretical_loss": 3.551616619743297, "tokens_seen": 1339958272 }, { "epoch": 3.1, "learning_rate": 0.00030001003009027085, "loss": 2.8204, "theoretical_loss": 3.5516009712603807, "tokens_seen": 1340023808 }, { "epoch": 3.1, "learning_rate": 0.0003, "loss": 2.5941, "theoretical_loss": 3.5515853237570356, "tokens_seen": 1340089344 }, { "epoch": 3.1, "learning_rate": 0.0002999899699097292, "loss": 2.6979, "theoretical_loss": 3.551569677233153, "tokens_seen": 1340154880 }, { "epoch": 3.1, "objective/train/docs_used": 2142005, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.78869891166687, "objective/train/theoretical_loss": 3.55156576575521, "objective/train/tokens_used": 1360631264, "theoretical_loss": 3.55156576575521, "tokens_seen": 1340171264 }, { "epoch": 3.1, "learning_rate": 0.0002999799398194584, "loss": 2.8519, "theoretical_loss": 3.5515540316886223, "tokens_seen": 1340220416 }, { "epoch": 3.1, "learning_rate": 0.00029996990972918757, "loss": 2.866, "theoretical_loss": 3.551538387123336, "tokens_seen": 1340285952 }, { "epoch": 3.1, "learning_rate": 0.00029995987963891675, "loss": 2.8241, "theoretical_loss": 3.5515227435371837, "tokens_seen": 1340351488 }, { "epoch": 3.1, "learning_rate": 0.00029994984954864593, "loss": 2.7209, "theoretical_loss": 3.551507100930057, "tokens_seen": 1340417024 }, { "epoch": 3.1, "learning_rate": 0.0002999398194583751, "loss": 3.0166, "theoretical_loss": 3.5514914593018463, "tokens_seen": 1340482560 }, { "epoch": 3.1, "learning_rate": 0.00029992978936810435, "loss": 2.8899, "theoretical_loss": 3.5514758186524427, "tokens_seen": 1340548096 }, { "epoch": 3.1, "learning_rate": 0.0002999197592778335, "loss": 2.8499, "theoretical_loss": 3.5514601789817375, "tokens_seen": 1340613632 }, { "epoch": 3.1, "learning_rate": 0.0002999097291875627, "loss": 3.0046, "theoretical_loss": 3.5514445402896206, "tokens_seen": 1340679168 }, { "epoch": 3.1, "learning_rate": 0.0002998996990972919, "loss": 2.6589, "theoretical_loss": 3.551428902575984, "tokens_seen": 1340744704 }, { "epoch": 3.1, "learning_rate": 0.0002998896690070211, "loss": 2.7258, "theoretical_loss": 3.551413265840718, "tokens_seen": 1340810240 }, { "epoch": 3.1, "learning_rate": 0.00029987963891675026, "loss": 2.8264, "theoretical_loss": 3.551397630083714, "tokens_seen": 1340875776 }, { "epoch": 3.1, "learning_rate": 0.00029986960882647944, "loss": 2.8062, "theoretical_loss": 3.5513819953048626, "tokens_seen": 1340941312 }, { "epoch": 3.1, "learning_rate": 0.0002998595787362086, "loss": 2.8237, "theoretical_loss": 3.551366361504055, "tokens_seen": 1341006848 }, { "epoch": 3.1, "learning_rate": 0.00029984954864593785, "loss": 2.9777, "theoretical_loss": 3.551350728681182, "tokens_seen": 1341072384 }, { "epoch": 3.1, "learning_rate": 0.000299839518555667, "loss": 2.7279, "theoretical_loss": 3.5513350968361346, "tokens_seen": 1341137920 }, { "epoch": 3.1, "learning_rate": 0.0002998294884653962, "loss": 2.8291, "theoretical_loss": 3.5513194659688043, "tokens_seen": 1341203456 }, { "epoch": 3.1, "learning_rate": 0.00029981945837512534, "loss": 2.769, "theoretical_loss": 3.5513038360790823, "tokens_seen": 1341268992 }, { "epoch": 3.1, "learning_rate": 0.0002998094282848546, "loss": 2.7457, "theoretical_loss": 3.5512882071668592, "tokens_seen": 1341334528 }, { "epoch": 3.1, "learning_rate": 0.00029979939819458376, "loss": 2.8406, "theoretical_loss": 3.551272579232026, "tokens_seen": 1341400064 }, { "epoch": 3.1, "learning_rate": 0.00029978936810431294, "loss": 2.8456, "theoretical_loss": 3.551256952274475, "tokens_seen": 1341465600 }, { "epoch": 3.1, "learning_rate": 0.0002997793380140421, "loss": 2.7981, "theoretical_loss": 3.5512413262940954, "tokens_seen": 1341531136 }, { "epoch": 3.1, "learning_rate": 0.0002997693079237713, "loss": 2.8989, "theoretical_loss": 3.5512257012907797, "tokens_seen": 1341596672 }, { "epoch": 3.1, "learning_rate": 0.0002997592778335005, "loss": 2.8715, "theoretical_loss": 3.551210077264419, "tokens_seen": 1341662208 }, { "epoch": 3.1, "learning_rate": 0.0002997492477432297, "loss": 2.9951, "theoretical_loss": 3.551194454214904, "tokens_seen": 1341727744 }, { "epoch": 3.1, "learning_rate": 0.00029973921765295885, "loss": 2.808, "theoretical_loss": 3.5511788321421265, "tokens_seen": 1341793280 }, { "epoch": 3.1, "objective/train/docs_used": 2144837, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.060481071472168, "objective/train/theoretical_loss": 3.5511749267765342, "objective/train/tokens_used": 1362269664, "theoretical_loss": 3.5511749267765342, "tokens_seen": 1341809664 }, { "epoch": 3.1, "learning_rate": 0.0002997291875626881, "loss": 2.9247, "theoretical_loss": 3.5511632110459774, "tokens_seen": 1341858816 }, { "epoch": 3.1, "learning_rate": 0.00029971915747241726, "loss": 2.8817, "theoretical_loss": 3.551147590926348, "tokens_seen": 1341924352 }, { "epoch": 3.1, "learning_rate": 0.00029970912738214644, "loss": 2.8206, "theoretical_loss": 3.5511319717831293, "tokens_seen": 1341989888 }, { "epoch": 3.1, "learning_rate": 0.0002996990972918756, "loss": 2.8605, "theoretical_loss": 3.551116353616213, "tokens_seen": 1342055424 }, { "epoch": 3.1, "learning_rate": 0.0002996890672016048, "loss": 2.8562, "theoretical_loss": 3.5511007364254903, "tokens_seen": 1342120960 }, { "epoch": 3.1, "learning_rate": 0.000299679037111334, "loss": 2.8277, "theoretical_loss": 3.5510851202108524, "tokens_seen": 1342186496 }, { "epoch": 3.1, "learning_rate": 0.0002996690070210632, "loss": 2.784, "theoretical_loss": 3.551069504972191, "tokens_seen": 1342252032 }, { "epoch": 3.1, "learning_rate": 0.00029965897693079235, "loss": 2.7567, "theoretical_loss": 3.5510538907093974, "tokens_seen": 1342317568 }, { "epoch": 3.1, "learning_rate": 0.0002996489468405216, "loss": 2.9042, "theoretical_loss": 3.5510382774223617, "tokens_seen": 1342383104 }, { "epoch": 3.1, "learning_rate": 0.0002996389167502507, "loss": 2.8616, "theoretical_loss": 3.5510226651109775, "tokens_seen": 1342448640 }, { "epoch": 3.1, "learning_rate": 0.00029962888665997995, "loss": 2.7439, "theoretical_loss": 3.551007053775134, "tokens_seen": 1342514176 }, { "epoch": 3.1, "learning_rate": 0.00029961885656970913, "loss": 2.8864, "theoretical_loss": 3.5509914434147243, "tokens_seen": 1342579712 }, { "epoch": 3.1, "learning_rate": 0.0002996088264794383, "loss": 2.8935, "theoretical_loss": 3.5509758340296393, "tokens_seen": 1342645248 }, { "epoch": 3.1, "learning_rate": 0.0002995987963891675, "loss": 2.9261, "theoretical_loss": 3.55096022561977, "tokens_seen": 1342710784 }, { "epoch": 3.1, "learning_rate": 0.00029958876629889667, "loss": 2.9403, "theoretical_loss": 3.5509446181850084, "tokens_seen": 1342776320 }, { "epoch": 3.1, "learning_rate": 0.00029957873620862585, "loss": 2.7914, "theoretical_loss": 3.5509290117252457, "tokens_seen": 1342841856 }, { "epoch": 3.1, "learning_rate": 0.0002995687061183551, "loss": 2.9902, "theoretical_loss": 3.5509134062403738, "tokens_seen": 1342907392 }, { "epoch": 3.1, "learning_rate": 0.0002995586760280842, "loss": 2.7698, "theoretical_loss": 3.550897801730284, "tokens_seen": 1342972928 }, { "epoch": 3.1, "learning_rate": 0.00029954864593781345, "loss": 2.8066, "theoretical_loss": 3.5508821981948673, "tokens_seen": 1343038464 }, { "epoch": 3.1, "learning_rate": 0.0002995386158475427, "loss": 2.7211, "theoretical_loss": 3.5508665956340164, "tokens_seen": 1343104000 }, { "epoch": 3.1, "learning_rate": 0.0002995285857572718, "loss": 2.7963, "theoretical_loss": 3.550850994047622, "tokens_seen": 1343169536 }, { "epoch": 3.1, "learning_rate": 0.00029951855566700105, "loss": 2.8172, "theoretical_loss": 3.550835393435576, "tokens_seen": 1343235072 }, { "epoch": 3.1, "learning_rate": 0.0002995085255767302, "loss": 2.943, "theoretical_loss": 3.5508197937977704, "tokens_seen": 1343300608 }, { "epoch": 3.1, "learning_rate": 0.0002994984954864594, "loss": 2.8592, "theoretical_loss": 3.550804195134096, "tokens_seen": 1343366144 }, { "epoch": 3.1, "learning_rate": 0.0002994884653961886, "loss": 2.7556, "theoretical_loss": 3.550788597444445, "tokens_seen": 1343431680 }, { "epoch": 3.1, "objective/train/docs_used": 2147756, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9597442150115967, "objective/train/theoretical_loss": 3.55078469817421, "objective/train/tokens_used": 1363908064, "theoretical_loss": 3.55078469817421, "tokens_seen": 1343448064 }, { "epoch": 3.1, "learning_rate": 0.00029947843530591777, "loss": 2.8712, "theoretical_loss": 3.5507730007287086, "tokens_seen": 1343497216 }, { "epoch": 3.1, "learning_rate": 0.00029946840521564695, "loss": 2.9022, "theoretical_loss": 3.550757404986779, "tokens_seen": 1343562752 }, { "epoch": 3.1, "learning_rate": 0.00029945837512537613, "loss": 2.6905, "theoretical_loss": 3.5507418102185477, "tokens_seen": 1343628288 }, { "epoch": 3.1, "learning_rate": 0.0002994483450351053, "loss": 2.8072, "theoretical_loss": 3.5507262164239073, "tokens_seen": 1343693824 }, { "epoch": 3.1, "learning_rate": 0.00029943831494483455, "loss": 2.728, "theoretical_loss": 3.5507106236027477, "tokens_seen": 1343759360 }, { "epoch": 3.1, "learning_rate": 0.0002994282848545637, "loss": 2.7838, "theoretical_loss": 3.550695031754962, "tokens_seen": 1343824896 }, { "epoch": 3.1, "learning_rate": 0.0002994182547642929, "loss": 2.7808, "theoretical_loss": 3.550679440880441, "tokens_seen": 1343890432 }, { "epoch": 3.1, "learning_rate": 0.0002994082246740221, "loss": 2.7747, "theoretical_loss": 3.5506638509790784, "tokens_seen": 1343955968 }, { "epoch": 3.1, "learning_rate": 0.0002993981945837513, "loss": 2.6379, "theoretical_loss": 3.550648262050764, "tokens_seen": 1344021504 }, { "epoch": 3.1, "learning_rate": 0.00029938816449348046, "loss": 2.7894, "theoretical_loss": 3.5506326740953904, "tokens_seen": 1344087040 }, { "epoch": 3.1, "learning_rate": 0.00029937813440320964, "loss": 2.9122, "theoretical_loss": 3.5506170871128493, "tokens_seen": 1344152576 }, { "epoch": 3.1, "learning_rate": 0.0002993681043129388, "loss": 2.6766, "theoretical_loss": 3.5506015011030327, "tokens_seen": 1344218112 }, { "epoch": 3.1, "learning_rate": 0.00029935807422266805, "loss": 2.8017, "theoretical_loss": 3.550585916065832, "tokens_seen": 1344283648 }, { "epoch": 3.1, "learning_rate": 0.0002993480441323972, "loss": 2.7793, "theoretical_loss": 3.55057033200114, "tokens_seen": 1344349184 }, { "epoch": 3.1, "learning_rate": 0.0002993380140421264, "loss": 2.8635, "theoretical_loss": 3.5505547489088483, "tokens_seen": 1344414720 }, { "epoch": 3.1, "learning_rate": 0.00029932798395185554, "loss": 2.8669, "theoretical_loss": 3.550539166788848, "tokens_seen": 1344480256 }, { "epoch": 3.1, "learning_rate": 0.0002993179538615848, "loss": 2.7747, "theoretical_loss": 3.5505235856410327, "tokens_seen": 1344545792 }, { "epoch": 3.1, "learning_rate": 0.00029930792377131396, "loss": 2.7465, "theoretical_loss": 3.550508005465293, "tokens_seen": 1344611328 }, { "epoch": 3.1, "learning_rate": 0.00029929789368104314, "loss": 2.9185, "theoretical_loss": 3.5504924262615205, "tokens_seen": 1344676864 }, { "epoch": 3.1, "learning_rate": 0.0002992878635907723, "loss": 2.933, "theoretical_loss": 3.5504768480296085, "tokens_seen": 1344742400 }, { "epoch": 3.1, "learning_rate": 0.0002992778335005015, "loss": 2.7145, "theoretical_loss": 3.550461270769448, "tokens_seen": 1344807936 }, { "epoch": 3.1, "learning_rate": 0.0002992678034102307, "loss": 2.6854, "theoretical_loss": 3.550445694480932, "tokens_seen": 1344873472 }, { "epoch": 3.1, "learning_rate": 0.0002992577733199599, "loss": 2.8711, "theoretical_loss": 3.550430119163952, "tokens_seen": 1344939008 }, { "epoch": 3.1, "learning_rate": 0.00029924774322968905, "loss": 2.7336, "theoretical_loss": 3.5504145448184, "tokens_seen": 1345004544 }, { "epoch": 3.1, "learning_rate": 0.0002992377131394183, "loss": 2.7726, "theoretical_loss": 3.550398971444168, "tokens_seen": 1345070080 }, { "epoch": 3.1, "objective/train/docs_used": 2150607, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.865643262863159, "objective/train/theoretical_loss": 3.5503950782523663, "objective/train/tokens_used": 1365546464, "theoretical_loss": 3.5503950782523663, "tokens_seen": 1345086464 }, { "epoch": 3.1, "learning_rate": 0.00029922768304914746, "loss": 2.7612, "theoretical_loss": 3.5503833990411486, "tokens_seen": 1345135616 }, { "epoch": 3.1, "learning_rate": 0.00029921765295887664, "loss": 2.874, "theoretical_loss": 3.5503678276092336, "tokens_seen": 1345201152 }, { "epoch": 3.1, "learning_rate": 0.0002992076228686058, "loss": 2.7243, "theoretical_loss": 3.5503522571483153, "tokens_seen": 1345266688 }, { "epoch": 3.1, "learning_rate": 0.000299197592778335, "loss": 2.8299, "theoretical_loss": 3.5503366876582847, "tokens_seen": 1345332224 }, { "epoch": 3.1, "learning_rate": 0.0002991875626880642, "loss": 2.8199, "theoretical_loss": 3.550321119139036, "tokens_seen": 1345397760 }, { "epoch": 3.1, "learning_rate": 0.0002991775325977934, "loss": 2.8445, "theoretical_loss": 3.55030555159046, "tokens_seen": 1345463296 }, { "epoch": 3.1, "learning_rate": 0.00029916750250752255, "loss": 2.8021, "theoretical_loss": 3.550289985012449, "tokens_seen": 1345528832 }, { "epoch": 3.1, "learning_rate": 0.0002991574724172518, "loss": 2.8564, "theoretical_loss": 3.5502744194048956, "tokens_seen": 1345594368 }, { "epoch": 3.1, "learning_rate": 0.0002991474423269809, "loss": 2.7614, "theoretical_loss": 3.550258854767692, "tokens_seen": 1345659904 }, { "epoch": 3.1, "learning_rate": 0.00029913741223671015, "loss": 2.8417, "theoretical_loss": 3.5502432911007302, "tokens_seen": 1345725440 }, { "epoch": 3.1, "learning_rate": 0.00029912738214643933, "loss": 2.8924, "theoretical_loss": 3.550227728403903, "tokens_seen": 1345790976 }, { "epoch": 3.1, "learning_rate": 0.0002991173520561685, "loss": 2.8024, "theoretical_loss": 3.5502121666771025, "tokens_seen": 1345856512 }, { "epoch": 3.1, "learning_rate": 0.0002991073219658977, "loss": 2.8421, "theoretical_loss": 3.5501966059202203, "tokens_seen": 1345922048 }, { "epoch": 3.1, "learning_rate": 0.00029909729187562687, "loss": 2.7616, "theoretical_loss": 3.550181046133149, "tokens_seen": 1345987584 }, { "epoch": 3.1, "learning_rate": 0.00029908726178535605, "loss": 2.822, "theoretical_loss": 3.550165487315782, "tokens_seen": 1346053120 }, { "epoch": 3.1, "learning_rate": 0.0002990772316950853, "loss": 3.0186, "theoretical_loss": 3.5501499294680103, "tokens_seen": 1346118656 }, { "epoch": 3.1, "learning_rate": 0.0002990672016048144, "loss": 2.8524, "theoretical_loss": 3.550134372589727, "tokens_seen": 1346184192 }, { "epoch": 3.1, "learning_rate": 0.00029905717151454365, "loss": 2.8694, "theoretical_loss": 3.5501188166808246, "tokens_seen": 1346249728 }, { "epoch": 3.1, "learning_rate": 0.00029904714142427283, "loss": 2.7438, "theoretical_loss": 3.550103261741195, "tokens_seen": 1346315264 }, { "epoch": 3.1, "learning_rate": 0.000299037111334002, "loss": 3.014, "theoretical_loss": 3.5500877077707305, "tokens_seen": 1346380800 }, { "epoch": 3.1, "learning_rate": 0.0002990270812437312, "loss": 2.6882, "theoretical_loss": 3.5500721547693246, "tokens_seen": 1346446336 }, { "epoch": 3.1, "learning_rate": 0.0002990170511534604, "loss": 2.8061, "theoretical_loss": 3.550056602736869, "tokens_seen": 1346511872 }, { "epoch": 3.1, "learning_rate": 0.00029900702106318956, "loss": 2.927, "theoretical_loss": 3.5500410516732552, "tokens_seen": 1346577408 }, { "epoch": 3.1, "learning_rate": 0.0002989969909729188, "loss": 2.8458, "theoretical_loss": 3.5500255015783777, "tokens_seen": 1346642944 }, { "epoch": 3.1, "learning_rate": 0.0002989869608826479, "loss": 2.831, "theoretical_loss": 3.550009952452128, "tokens_seen": 1346708480 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0435328483581543, "objective/train/theoretical_loss": 3.550006065321901, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.550006065321901, "tokens_seen": 1346724864 }, { "epoch": 3.1, "learning_rate": 0.00029897693079237715, "loss": 2.7252, "theoretical_loss": 3.5499944042943987, "tokens_seen": 1346774016 }, { "epoch": 3.1, "learning_rate": 0.0002989669007021063, "loss": 2.7017, "theoretical_loss": 3.5499788571050823, "tokens_seen": 1346839552 }, { "epoch": 3.1, "learning_rate": 0.0002989568706118355, "loss": 2.8245, "theoretical_loss": 3.5499633108840714, "tokens_seen": 1346905088 }, { "epoch": 3.1, "learning_rate": 0.0002989468405215647, "loss": 2.7476, "theoretical_loss": 3.5499477656312584, "tokens_seen": 1346970624 }, { "epoch": 3.1, "learning_rate": 0.0002989368104312939, "loss": 2.814, "theoretical_loss": 3.549932221346536, "tokens_seen": 1347036160 }, { "epoch": 3.1, "learning_rate": 0.00029892678034102306, "loss": 2.8135, "theoretical_loss": 3.5499166780297973, "tokens_seen": 1347101696 }, { "epoch": 3.1, "learning_rate": 0.0002989167502507523, "loss": 2.8398, "theoretical_loss": 3.5499011356809342, "tokens_seen": 1347167232 }, { "epoch": 3.1, "learning_rate": 0.0002989067201604814, "loss": 2.8073, "theoretical_loss": 3.54988559429984, "tokens_seen": 1347232768 }, { "epoch": 3.1, "learning_rate": 0.00029889669007021066, "loss": 2.8524, "theoretical_loss": 3.5498700538864068, "tokens_seen": 1347298304 }, { "epoch": 3.1, "learning_rate": 0.0002988866599799398, "loss": 2.7804, "theoretical_loss": 3.5498545144405274, "tokens_seen": 1347363840 }, { "epoch": 3.1, "learning_rate": 0.000298876629889669, "loss": 2.7443, "theoretical_loss": 3.5498389759620954, "tokens_seen": 1347429376 }, { "epoch": 3.1, "learning_rate": 0.0002988665997993982, "loss": 2.9238, "theoretical_loss": 3.5498234384510017, "tokens_seen": 1347494912 }, { "epoch": 3.1, "learning_rate": 0.0002988565697091274, "loss": 2.8437, "theoretical_loss": 3.549807901907141, "tokens_seen": 1347560448 }, { "epoch": 3.1, "learning_rate": 0.00029884653961885656, "loss": 2.9102, "theoretical_loss": 3.5497923663304043, "tokens_seen": 1347625984 }, { "epoch": 3.1, "learning_rate": 0.00029883650952858574, "loss": 2.8204, "theoretical_loss": 3.5497768317206857, "tokens_seen": 1347691520 }, { "epoch": 3.1, "learning_rate": 0.0002988264794383149, "loss": 2.7333, "theoretical_loss": 3.549761298077878, "tokens_seen": 1347757056 }, { "epoch": 3.1, "learning_rate": 0.00029881644934804416, "loss": 2.8687, "theoretical_loss": 3.549745765401873, "tokens_seen": 1347822592 }, { "epoch": 3.1, "learning_rate": 0.0002988064192577733, "loss": 2.7814, "theoretical_loss": 3.5497302336925642, "tokens_seen": 1347888128 }, { "epoch": 3.1, "learning_rate": 0.0002987963891675025, "loss": 2.8577, "theoretical_loss": 3.549714702949844, "tokens_seen": 1347953664 }, { "epoch": 3.1, "learning_rate": 0.0002987863590772317, "loss": 2.7821, "theoretical_loss": 3.5496991731736056, "tokens_seen": 1348019200 }, { "epoch": 3.1, "learning_rate": 0.0002987763289869609, "loss": 2.6675, "theoretical_loss": 3.549683644363742, "tokens_seen": 1348084736 }, { "epoch": 3.1, "learning_rate": 0.0002987662988966901, "loss": 2.7325, "theoretical_loss": 3.5496681165201456, "tokens_seen": 1348150272 }, { "epoch": 3.1, "learning_rate": 0.00029875626880641925, "loss": 2.9473, "theoretical_loss": 3.54965258964271, "tokens_seen": 1348215808 }, { "epoch": 3.1, "learning_rate": 0.0002987462387161485, "loss": 2.9169, "theoretical_loss": 3.5496370637313275, "tokens_seen": 1348281344 }, { "epoch": 3.1, "learning_rate": 0.00029873620862587766, "loss": 2.9859, "theoretical_loss": 3.549621538785891, "tokens_seen": 1348346880 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.748163938522339, "objective/train/theoretical_loss": 3.5496176577004483, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.5496176577004483, "tokens_seen": 1348363264 }, { "epoch": 3.1, "learning_rate": 0.00029872617853560684, "loss": 2.899, "theoretical_loss": 3.549606014806294, "tokens_seen": 1348412416 }, { "epoch": 3.1, "learning_rate": 0.000298716148445336, "loss": 2.8535, "theoretical_loss": 3.5495904917924292, "tokens_seen": 1348477952 }, { "epoch": 3.1, "learning_rate": 0.0002987061183550652, "loss": 2.762, "theoretical_loss": 3.5495749697441896, "tokens_seen": 1348543488 }, { "epoch": 3.1, "learning_rate": 0.0002986960882647944, "loss": 2.7787, "theoretical_loss": 3.5495594486614683, "tokens_seen": 1348609024 }, { "epoch": 3.1, "learning_rate": 0.0002986860581745236, "loss": 2.8848, "theoretical_loss": 3.549543928544158, "tokens_seen": 1348674560 }, { "epoch": 3.1, "learning_rate": 0.00029867602808425275, "loss": 2.8567, "theoretical_loss": 3.5495284093921518, "tokens_seen": 1348740096 }, { "epoch": 3.1, "learning_rate": 0.000298665997993982, "loss": 2.7005, "theoretical_loss": 3.5495128912053433, "tokens_seen": 1348805632 }, { "epoch": 3.1, "learning_rate": 0.0002986559679037111, "loss": 2.9023, "theoretical_loss": 3.549497373983625, "tokens_seen": 1348871168 }, { "epoch": 3.1, "learning_rate": 0.00029864593781344035, "loss": 2.7807, "theoretical_loss": 3.5494818577268905, "tokens_seen": 1348936704 }, { "epoch": 3.1, "learning_rate": 0.00029863590772316953, "loss": 2.7119, "theoretical_loss": 3.5494663424350326, "tokens_seen": 1349002240 }, { "epoch": 3.1, "learning_rate": 0.0002986258776328987, "loss": 2.6856, "theoretical_loss": 3.549450828107944, "tokens_seen": 1349067776 }, { "epoch": 3.1, "learning_rate": 0.0002986158475426279, "loss": 2.809, "theoretical_loss": 3.549435314745519, "tokens_seen": 1349133312 }, { "epoch": 3.1, "learning_rate": 0.00029860581745235707, "loss": 2.8312, "theoretical_loss": 3.5494198023476495, "tokens_seen": 1349198848 }, { "epoch": 3.1, "learning_rate": 0.00029859578736208625, "loss": 2.8254, "theoretical_loss": 3.549404290914229, "tokens_seen": 1349264384 }, { "epoch": 3.1, "learning_rate": 0.0002985857572718155, "loss": 2.7673, "theoretical_loss": 3.5493887804451516, "tokens_seen": 1349329920 }, { "epoch": 3.1, "learning_rate": 0.0002985757271815446, "loss": 2.6683, "theoretical_loss": 3.5493732709403094, "tokens_seen": 1349395456 }, { "epoch": 3.1, "learning_rate": 0.00029856569709127385, "loss": 2.803, "theoretical_loss": 3.5493577623995964, "tokens_seen": 1349460992 }, { "epoch": 3.1, "learning_rate": 0.00029855566700100303, "loss": 2.7603, "theoretical_loss": 3.549342254822905, "tokens_seen": 1349526528 }, { "epoch": 3.1, "learning_rate": 0.0002985456369107322, "loss": 2.8633, "theoretical_loss": 3.5493267482101296, "tokens_seen": 1349592064 }, { "epoch": 3.1, "learning_rate": 0.0002985356068204614, "loss": 2.8352, "theoretical_loss": 3.5493112425611626, "tokens_seen": 1349657600 }, { "epoch": 3.1, "learning_rate": 0.0002985255767301906, "loss": 2.8781, "theoretical_loss": 3.5492957378758976, "tokens_seen": 1349723136 }, { "epoch": 3.1, "learning_rate": 0.00029851554663991976, "loss": 2.7041, "theoretical_loss": 3.549280234154228, "tokens_seen": 1349788672 }, { "epoch": 3.1, "learning_rate": 0.000298505516549649, "loss": 2.8332, "theoretical_loss": 3.549264731396047, "tokens_seen": 1349854208 }, { "epoch": 3.1, "learning_rate": 0.0002984954864593781, "loss": 2.8912, "theoretical_loss": 3.549249229601248, "tokens_seen": 1349919744 }, { "epoch": 3.1, "learning_rate": 0.00029848545636910735, "loss": 2.9041, "theoretical_loss": 3.5492337287697238, "tokens_seen": 1349985280 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6640119552612305, "objective/train/theoretical_loss": 3.549229853712342, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.549229853712342, "tokens_seen": 1350001664 }, { "epoch": 3.1, "learning_rate": 0.0002984754262788365, "loss": 2.7542, "theoretical_loss": 3.5492182289013687, "tokens_seen": 1350050816 }, { "epoch": 3.1, "learning_rate": 0.0002984653961885657, "loss": 2.9117, "theoretical_loss": 3.5492027299960762, "tokens_seen": 1350116352 }, { "epoch": 3.1, "learning_rate": 0.0002984553660982949, "loss": 2.6688, "theoretical_loss": 3.5491872320537388, "tokens_seen": 1350181888 }, { "epoch": 3.1, "learning_rate": 0.0002984453360080241, "loss": 2.7283, "theoretical_loss": 3.5491717350742498, "tokens_seen": 1350247424 }, { "epoch": 3.1, "learning_rate": 0.00029843530591775326, "loss": 2.9467, "theoretical_loss": 3.549156239057504, "tokens_seen": 1350312960 }, { "epoch": 3.1, "learning_rate": 0.0002984252758274825, "loss": 2.9394, "theoretical_loss": 3.5491407440033935, "tokens_seen": 1350378496 }, { "epoch": 3.1, "learning_rate": 0.0002984152457372116, "loss": 2.6979, "theoretical_loss": 3.549125249911813, "tokens_seen": 1350444032 }, { "epoch": 3.1, "learning_rate": 0.00029840521564694086, "loss": 2.9595, "theoretical_loss": 3.5491097567826553, "tokens_seen": 1350509568 }, { "epoch": 3.1, "learning_rate": 0.00029839518555667, "loss": 2.7874, "theoretical_loss": 3.549094264615814, "tokens_seen": 1350575104 }, { "epoch": 3.1, "learning_rate": 0.0002983851554663992, "loss": 2.7998, "theoretical_loss": 3.549078773411182, "tokens_seen": 1350640640 }, { "epoch": 3.1, "learning_rate": 0.0002983751253761284, "loss": 2.6598, "theoretical_loss": 3.549063283168654, "tokens_seen": 1350706176 }, { "epoch": 3.1, "learning_rate": 0.0002983650952858576, "loss": 2.8429, "theoretical_loss": 3.549047793888123, "tokens_seen": 1350771712 }, { "epoch": 3.1, "learning_rate": 0.00029835506519558676, "loss": 2.8279, "theoretical_loss": 3.549032305569483, "tokens_seen": 1350837248 }, { "epoch": 3.1, "learning_rate": 0.00029834503510531594, "loss": 2.7834, "theoretical_loss": 3.549016818212627, "tokens_seen": 1350902784 }, { "epoch": 3.1, "learning_rate": 0.0002983350050150451, "loss": 2.7759, "theoretical_loss": 3.5490013318174487, "tokens_seen": 1350968320 }, { "epoch": 3.1, "learning_rate": 0.00029832497492477436, "loss": 2.7973, "theoretical_loss": 3.548985846383842, "tokens_seen": 1351033856 }, { "epoch": 3.1, "learning_rate": 0.0002983149448345035, "loss": 2.8139, "theoretical_loss": 3.5489703619117003, "tokens_seen": 1351099392 }, { "epoch": 3.1, "learning_rate": 0.0002983049147442327, "loss": 2.666, "theoretical_loss": 3.548954878400918, "tokens_seen": 1351164928 }, { "epoch": 3.1, "learning_rate": 0.00029829488465396185, "loss": 2.8017, "theoretical_loss": 3.548939395851388, "tokens_seen": 1351230464 }, { "epoch": 3.1, "learning_rate": 0.0002982848545636911, "loss": 3.0463, "theoretical_loss": 3.5489239142630042, "tokens_seen": 1351296000 }, { "epoch": 3.1, "learning_rate": 0.00029827482447342026, "loss": 2.9969, "theoretical_loss": 3.54890843363566, "tokens_seen": 1351361536 }, { "epoch": 3.1, "learning_rate": 0.00029826479438314945, "loss": 2.7144, "theoretical_loss": 3.5488929539692498, "tokens_seen": 1351427072 }, { "epoch": 3.1, "learning_rate": 0.0002982547642928786, "loss": 2.7592, "theoretical_loss": 3.548877475263667, "tokens_seen": 1351492608 }, { "epoch": 3.1, "learning_rate": 0.00029824473420260786, "loss": 2.974, "theoretical_loss": 3.548861997518806, "tokens_seen": 1351558144 }, { "epoch": 3.1, "learning_rate": 0.000298234704112337, "loss": 2.7784, "theoretical_loss": 3.5488465207345588, "tokens_seen": 1351623680 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6397829055786133, "objective/train/theoretical_loss": 3.548842651688581, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.548842651688581, "tokens_seen": 1351640064 }, { "epoch": 3.1, "learning_rate": 0.0002982246740220662, "loss": 2.7147, "theoretical_loss": 3.5488310449108216, "tokens_seen": 1351689216 }, { "epoch": 3.1, "learning_rate": 0.00029821464393179535, "loss": 2.942, "theoretical_loss": 3.5488155700474864, "tokens_seen": 1351754752 }, { "epoch": 3.1, "learning_rate": 0.0002982046138415246, "loss": 2.8884, "theoretical_loss": 3.548800096144448, "tokens_seen": 1351820288 }, { "epoch": 3.1, "learning_rate": 0.00029819458375125377, "loss": 2.8302, "theoretical_loss": 3.5487846232015996, "tokens_seen": 1351885824 }, { "epoch": 3.1, "learning_rate": 0.00029818455366098295, "loss": 2.783, "theoretical_loss": 3.5487691512188357, "tokens_seen": 1351951360 }, { "epoch": 3.1, "learning_rate": 0.00029817452357071213, "loss": 2.8087, "theoretical_loss": 3.5487536801960498, "tokens_seen": 1352016896 }, { "epoch": 3.1, "learning_rate": 0.0002981644934804413, "loss": 2.9094, "theoretical_loss": 3.548738210133136, "tokens_seen": 1352082432 }, { "epoch": 3.1, "learning_rate": 0.0002981544633901705, "loss": 2.9394, "theoretical_loss": 3.548722741029988, "tokens_seen": 1352147968 }, { "epoch": 3.1, "learning_rate": 0.00029814443329989973, "loss": 2.6448, "theoretical_loss": 3.5487072728865003, "tokens_seen": 1352213504 }, { "epoch": 3.1, "learning_rate": 0.00029813440320962885, "loss": 2.7436, "theoretical_loss": 3.548691805702566, "tokens_seen": 1352279040 }, { "epoch": 3.1, "learning_rate": 0.0002981243731193581, "loss": 2.8237, "theoretical_loss": 3.54867633947808, "tokens_seen": 1352344576 }, { "epoch": 3.1, "learning_rate": 0.0002981143430290872, "loss": 2.9387, "theoretical_loss": 3.548660874212935, "tokens_seen": 1352410112 }, { "epoch": 3.1, "learning_rate": 0.00029810431293881645, "loss": 2.868, "theoretical_loss": 3.5486454099070266, "tokens_seen": 1352475648 }, { "epoch": 3.1, "learning_rate": 0.00029809428284854563, "loss": 2.8328, "theoretical_loss": 3.548629946560248, "tokens_seen": 1352541184 }, { "epoch": 3.1, "learning_rate": 0.0002980842527582748, "loss": 2.9111, "theoretical_loss": 3.548614484172493, "tokens_seen": 1352606720 }, { "epoch": 3.1, "learning_rate": 0.000298074222668004, "loss": 2.6892, "theoretical_loss": 3.548599022743656, "tokens_seen": 1352672256 }, { "epoch": 3.1, "learning_rate": 0.00029806419257773323, "loss": 2.8243, "theoretical_loss": 3.548583562273631, "tokens_seen": 1352737792 }, { "epoch": 3.1, "learning_rate": 0.00029805416248746236, "loss": 2.7874, "theoretical_loss": 3.548568102762312, "tokens_seen": 1352803328 }, { "epoch": 3.1, "learning_rate": 0.0002980441323971916, "loss": 2.797, "theoretical_loss": 3.5485526442095936, "tokens_seen": 1352868864 }, { "epoch": 3.1, "learning_rate": 0.0002980341023069208, "loss": 2.9374, "theoretical_loss": 3.5485371866153694, "tokens_seen": 1352934400 }, { "epoch": 3.1, "learning_rate": 0.00029802407221664996, "loss": 2.9502, "theoretical_loss": 3.5485217299795337, "tokens_seen": 1352999936 }, { "epoch": 3.1, "learning_rate": 0.0002980140421263792, "loss": 2.6168, "theoretical_loss": 3.548506274301981, "tokens_seen": 1353065472 }, { "epoch": 3.1, "learning_rate": 0.0002980040120361083, "loss": 2.7458, "theoretical_loss": 3.5484908195826046, "tokens_seen": 1353131008 }, { "epoch": 3.1, "learning_rate": 0.00029799398194583755, "loss": 2.7814, "theoretical_loss": 3.5484753658212997, "tokens_seen": 1353196544 }, { "epoch": 3.1, "learning_rate": 0.0002979839518555667, "loss": 2.7385, "theoretical_loss": 3.54845991301796, "tokens_seen": 1353262080 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7294182777404785, "objective/train/theoretical_loss": 3.548456049966795, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.548456049966795, "tokens_seen": 1353278464 }, { "epoch": 3.1, "learning_rate": 0.0002979739217652959, "loss": 2.7737, "theoretical_loss": 3.5484444611724797, "tokens_seen": 1353327616 }, { "epoch": 3.1, "learning_rate": 0.0002979638916750251, "loss": 2.775, "theoretical_loss": 3.548429010284753, "tokens_seen": 1353393152 }, { "epoch": 3.1, "learning_rate": 0.0002979538615847543, "loss": 2.8603, "theoretical_loss": 3.5484135603546743, "tokens_seen": 1353458688 }, { "epoch": 3.1, "learning_rate": 0.00029794383149448346, "loss": 2.911, "theoretical_loss": 3.548398111382138, "tokens_seen": 1353524224 }, { "epoch": 3.1, "learning_rate": 0.0002979338014042127, "loss": 2.6968, "theoretical_loss": 3.5483826633670383, "tokens_seen": 1353589760 }, { "epoch": 3.1, "learning_rate": 0.0002979237713139418, "loss": 2.9316, "theoretical_loss": 3.5483672163092694, "tokens_seen": 1353655296 }, { "epoch": 3.1, "learning_rate": 0.00029791374122367106, "loss": 2.7763, "theoretical_loss": 3.5483517702087255, "tokens_seen": 1353720832 }, { "epoch": 3.1, "learning_rate": 0.0002979037111334002, "loss": 2.9308, "theoretical_loss": 3.5483363250653017, "tokens_seen": 1353786368 }, { "epoch": 3.1, "learning_rate": 0.0002978936810431294, "loss": 2.8418, "theoretical_loss": 3.5483208808788915, "tokens_seen": 1353851904 }, { "epoch": 3.1, "learning_rate": 0.0002978836509528586, "loss": 2.8325, "theoretical_loss": 3.5483054376493897, "tokens_seen": 1353917440 }, { "epoch": 3.1, "learning_rate": 0.0002978736208625878, "loss": 2.7095, "theoretical_loss": 3.5482899953766904, "tokens_seen": 1353982976 }, { "epoch": 3.1, "learning_rate": 0.00029786359077231696, "loss": 2.9345, "theoretical_loss": 3.548274554060688, "tokens_seen": 1354048512 }, { "epoch": 3.1, "learning_rate": 0.00029785356068204614, "loss": 2.8487, "theoretical_loss": 3.5482591137012776, "tokens_seen": 1354114048 }, { "epoch": 3.1, "learning_rate": 0.0002978435305917753, "loss": 3.0087, "theoretical_loss": 3.5482436742983534, "tokens_seen": 1354179584 }, { "epoch": 3.1, "learning_rate": 0.00029783350050150456, "loss": 2.7364, "theoretical_loss": 3.548228235851809, "tokens_seen": 1354245120 }, { "epoch": 3.1, "learning_rate": 0.0002978234704112337, "loss": 2.8221, "theoretical_loss": 3.54821279836154, "tokens_seen": 1354310656 }, { "epoch": 3.1, "learning_rate": 0.0002978134403209629, "loss": 2.8939, "theoretical_loss": 3.54819736182744, "tokens_seen": 1354376192 }, { "epoch": 3.1, "learning_rate": 0.00029780341023069205, "loss": 3.0521, "theoretical_loss": 3.548181926249404, "tokens_seen": 1354441728 }, { "epoch": 3.1, "learning_rate": 0.0002977933801404213, "loss": 2.7701, "theoretical_loss": 3.5481664916273266, "tokens_seen": 1354507264 }, { "epoch": 3.1, "learning_rate": 0.00029778335005015046, "loss": 2.8296, "theoretical_loss": 3.5481510579611024, "tokens_seen": 1354572800 }, { "epoch": 3.1, "learning_rate": 0.00029777331995987965, "loss": 2.8239, "theoretical_loss": 3.5481356252506258, "tokens_seen": 1354638336 }, { "epoch": 3.1, "learning_rate": 0.00029776328986960883, "loss": 2.8171, "theoretical_loss": 3.548120193495791, "tokens_seen": 1354703872 }, { "epoch": 3.1, "learning_rate": 0.00029775325977933806, "loss": 2.8405, "theoretical_loss": 3.548104762696493, "tokens_seen": 1354769408 }, { "epoch": 3.1, "learning_rate": 0.0002977432296890672, "loss": 2.8577, "theoretical_loss": 3.5480893328526264, "tokens_seen": 1354834944 }, { "epoch": 3.1, "learning_rate": 0.0002977331995987964, "loss": 2.8484, "theoretical_loss": 3.5480739039640863, "tokens_seen": 1354900480 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6355433464050293, "objective/train/theoretical_loss": 3.5480700468912083, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.5480700468912083, "tokens_seen": 1354916864 }, { "epoch": 3.1, "learning_rate": 0.00029772316950852555, "loss": 2.7732, "theoretical_loss": 3.548058476030766, "tokens_seen": 1354966016 }, { "epoch": 3.1, "learning_rate": 0.0002977131394182548, "loss": 2.8011, "theoretical_loss": 3.548043049052562, "tokens_seen": 1355031552 }, { "epoch": 3.1, "learning_rate": 0.00029770310932798397, "loss": 2.8166, "theoretical_loss": 3.548027623029367, "tokens_seen": 1355097088 }, { "epoch": 3.1, "learning_rate": 0.00029769307923771315, "loss": 2.9169, "theoretical_loss": 3.5480121979610777, "tokens_seen": 1355162624 }, { "epoch": 3.1, "learning_rate": 0.00029768304914744233, "loss": 2.9736, "theoretical_loss": 3.5479967738475873, "tokens_seen": 1355228160 }, { "epoch": 3.1, "learning_rate": 0.0002976730190571715, "loss": 2.8487, "theoretical_loss": 3.547981350688791, "tokens_seen": 1355293696 }, { "epoch": 3.1, "learning_rate": 0.0002976629889669007, "loss": 2.9127, "theoretical_loss": 3.547965928484583, "tokens_seen": 1355359232 }, { "epoch": 3.1, "learning_rate": 0.00029765295887662993, "loss": 2.8818, "theoretical_loss": 3.5479505072348596, "tokens_seen": 1355424768 }, { "epoch": 3.1, "learning_rate": 0.00029764292878635905, "loss": 2.5931, "theoretical_loss": 3.547935086939514, "tokens_seen": 1355490304 }, { "epoch": 3.1, "learning_rate": 0.0002976328986960883, "loss": 2.7165, "theoretical_loss": 3.5479196675984417, "tokens_seen": 1355555840 }, { "epoch": 3.1, "learning_rate": 0.0002976228686058174, "loss": 2.7553, "theoretical_loss": 3.5479042492115376, "tokens_seen": 1355621376 }, { "epoch": 3.1, "learning_rate": 0.00029761283851554665, "loss": 2.7269, "theoretical_loss": 3.5478888317786965, "tokens_seen": 1355686912 }, { "epoch": 3.1, "learning_rate": 0.00029760280842527583, "loss": 2.7565, "theoretical_loss": 3.5478734152998133, "tokens_seen": 1355752448 }, { "epoch": 3.1, "learning_rate": 0.000297592778335005, "loss": 2.7962, "theoretical_loss": 3.5478579997747826, "tokens_seen": 1355817984 }, { "epoch": 3.1, "learning_rate": 0.0002975827482447342, "loss": 2.7079, "theoretical_loss": 3.5478425852034987, "tokens_seen": 1355883520 }, { "epoch": 3.1, "learning_rate": 0.00029757271815446343, "loss": 2.9102, "theoretical_loss": 3.5478271715858574, "tokens_seen": 1355949056 }, { "epoch": 3.1, "learning_rate": 0.00029756268806419256, "loss": 2.7323, "theoretical_loss": 3.547811758921754, "tokens_seen": 1356014592 }, { "epoch": 3.1, "learning_rate": 0.0002975526579739218, "loss": 2.659, "theoretical_loss": 3.547796347211082, "tokens_seen": 1356080128 }, { "epoch": 3.1, "learning_rate": 0.0002975426278836509, "loss": 2.7985, "theoretical_loss": 3.547780936453738, "tokens_seen": 1356145664 }, { "epoch": 3.1, "learning_rate": 0.00029753259779338016, "loss": 2.8339, "theoretical_loss": 3.5477655266496155, "tokens_seen": 1356211200 }, { "epoch": 3.1, "learning_rate": 0.00029752256770310934, "loss": 2.8563, "theoretical_loss": 3.54775011779861, "tokens_seen": 1356276736 }, { "epoch": 3.1, "learning_rate": 0.0002975125376128385, "loss": 2.9102, "theoretical_loss": 3.547734709900617, "tokens_seen": 1356342272 }, { "epoch": 3.1, "learning_rate": 0.0002975025075225677, "loss": 2.8732, "theoretical_loss": 3.5477193029555307, "tokens_seen": 1356407808 }, { "epoch": 3.1, "learning_rate": 0.0002974924774322969, "loss": 2.8634, "theoretical_loss": 3.547703896963247, "tokens_seen": 1356473344 }, { "epoch": 3.1, "learning_rate": 0.00029748244734202606, "loss": 2.9113, "theoretical_loss": 3.5476884919236604, "tokens_seen": 1356538880 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8693079948425293, "objective/train/theoretical_loss": 3.5476846408126104, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.5476846408126104, "tokens_seen": 1356555264 }, { "epoch": 3.1, "learning_rate": 0.0002974724172517553, "loss": 2.8089, "theoretical_loss": 3.547673087836666, "tokens_seen": 1356604416 }, { "epoch": 3.1, "learning_rate": 0.0002974623871614844, "loss": 2.5266, "theoretical_loss": 3.547657684702159, "tokens_seen": 1356669952 }, { "epoch": 3.1, "learning_rate": 0.00029745235707121366, "loss": 2.7954, "theoretical_loss": 3.547642282520034, "tokens_seen": 1356735488 }, { "epoch": 3.1, "learning_rate": 0.0002974423269809428, "loss": 2.7176, "theoretical_loss": 3.547626881290187, "tokens_seen": 1356801024 }, { "epoch": 3.1, "learning_rate": 0.000297432296890672, "loss": 2.8036, "theoretical_loss": 3.5476114810125123, "tokens_seen": 1356866560 }, { "epoch": 3.1, "learning_rate": 0.0002974222668004012, "loss": 2.6837, "theoretical_loss": 3.547596081686906, "tokens_seen": 1356932096 }, { "epoch": 3.1, "learning_rate": 0.0002974122367101304, "loss": 2.687, "theoretical_loss": 3.5475806833132624, "tokens_seen": 1356997632 }, { "epoch": 3.1, "learning_rate": 0.00029740220661985956, "loss": 2.7826, "theoretical_loss": 3.547565285891477, "tokens_seen": 1357063168 }, { "epoch": 3.1, "learning_rate": 0.0002973921765295888, "loss": 2.8809, "theoretical_loss": 3.547549889421445, "tokens_seen": 1357128704 }, { "epoch": 3.1, "learning_rate": 0.0002973821464393179, "loss": 2.7153, "theoretical_loss": 3.5475344939030613, "tokens_seen": 1357194240 }, { "epoch": 3.1, "learning_rate": 0.00029737211634904716, "loss": 2.7634, "theoretical_loss": 3.5475190993362213, "tokens_seen": 1357259776 }, { "epoch": 3.1, "learning_rate": 0.0002973620862587763, "loss": 2.9342, "theoretical_loss": 3.5475037057208207, "tokens_seen": 1357325312 }, { "epoch": 3.1, "learning_rate": 0.0002973520561685055, "loss": 2.7401, "theoretical_loss": 3.5474883130567543, "tokens_seen": 1357390848 }, { "epoch": 3.1, "learning_rate": 0.0002973420260782347, "loss": 2.8594, "theoretical_loss": 3.5474729213439176, "tokens_seen": 1357456384 }, { "epoch": 3.1, "learning_rate": 0.0002973319959879639, "loss": 2.9328, "theoretical_loss": 3.547457530582206, "tokens_seen": 1357521920 }, { "epoch": 3.1, "learning_rate": 0.00029732196589769307, "loss": 2.8834, "theoretical_loss": 3.547442140771514, "tokens_seen": 1357587456 }, { "epoch": 3.1, "learning_rate": 0.00029731193580742225, "loss": 2.7115, "theoretical_loss": 3.547426751911738, "tokens_seen": 1357652992 }, { "epoch": 3.1, "learning_rate": 0.00029730190571715143, "loss": 2.7948, "theoretical_loss": 3.547411364002773, "tokens_seen": 1357718528 }, { "epoch": 3.1, "learning_rate": 0.00029729187562688067, "loss": 2.8134, "theoretical_loss": 3.547395977044514, "tokens_seen": 1357784064 }, { "epoch": 3.1, "learning_rate": 0.00029728184553660985, "loss": 2.8188, "theoretical_loss": 3.5473805910368563, "tokens_seen": 1357849600 }, { "epoch": 3.1, "learning_rate": 0.00029727181544633903, "loss": 2.7345, "theoretical_loss": 3.5473652059796965, "tokens_seen": 1357915136 }, { "epoch": 3.1, "learning_rate": 0.00029726178535606826, "loss": 2.7828, "theoretical_loss": 3.5473498218729285, "tokens_seen": 1357980672 }, { "epoch": 3.1, "learning_rate": 0.0002972517552657974, "loss": 2.7541, "theoretical_loss": 3.5473344387164483, "tokens_seen": 1358046208 }, { "epoch": 3.1, "learning_rate": 0.0002972417251755266, "loss": 2.898, "theoretical_loss": 3.5473190565101516, "tokens_seen": 1358111744 }, { "epoch": 3.1, "learning_rate": 0.00029723169508525575, "loss": 2.7855, "theoretical_loss": 3.5473036752539335, "tokens_seen": 1358177280 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.995497703552246, "objective/train/theoretical_loss": 3.547299830088317, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.547299830088317, "tokens_seen": 1358193664 }, { "epoch": 3.1, "learning_rate": 0.000297221664994985, "loss": 2.8009, "theoretical_loss": 3.54728829494769, "tokens_seen": 1358242816 }, { "epoch": 3.1, "learning_rate": 0.00029721163490471417, "loss": 2.8588, "theoretical_loss": 3.547272915591316, "tokens_seen": 1358308352 }, { "epoch": 3.1, "learning_rate": 0.00029720160481444335, "loss": 2.9148, "theoretical_loss": 3.5472575371847075, "tokens_seen": 1358373888 }, { "epoch": 3.1, "learning_rate": 0.00029719157472417253, "loss": 2.7222, "theoretical_loss": 3.5472421597277597, "tokens_seen": 1358439424 }, { "epoch": 3.1, "learning_rate": 0.0002971815446339017, "loss": 2.9243, "theoretical_loss": 3.547226783220368, "tokens_seen": 1358504960 }, { "epoch": 3.1, "learning_rate": 0.0002971715145436309, "loss": 2.8792, "theoretical_loss": 3.547211407662428, "tokens_seen": 1358570496 }, { "epoch": 3.1, "learning_rate": 0.00029716148445336013, "loss": 2.8497, "theoretical_loss": 3.547196033053836, "tokens_seen": 1358636032 }, { "epoch": 3.1, "learning_rate": 0.00029715145436308926, "loss": 2.6531, "theoretical_loss": 3.5471806593944866, "tokens_seen": 1358701568 }, { "epoch": 3.1, "learning_rate": 0.0002971414242728185, "loss": 2.8322, "theoretical_loss": 3.5471652866842764, "tokens_seen": 1358767104 }, { "epoch": 3.1, "learning_rate": 0.0002971313941825476, "loss": 2.7348, "theoretical_loss": 3.5471499149231, "tokens_seen": 1358832640 }, { "epoch": 3.1, "learning_rate": 0.00029712136409227685, "loss": 2.8788, "theoretical_loss": 3.547134544110854, "tokens_seen": 1358898176 }, { "epoch": 3.1, "learning_rate": 0.00029711133400200603, "loss": 2.8181, "theoretical_loss": 3.5471191742474337, "tokens_seen": 1358963712 }, { "epoch": 3.1, "learning_rate": 0.0002971013039117352, "loss": 2.7542, "theoretical_loss": 3.547103805332734, "tokens_seen": 1359029248 }, { "epoch": 3.1, "learning_rate": 0.0002970912738214644, "loss": 2.9456, "theoretical_loss": 3.547088437366652, "tokens_seen": 1359094784 }, { "epoch": 3.1, "learning_rate": 0.00029708124373119363, "loss": 2.9855, "theoretical_loss": 3.5470730703490823, "tokens_seen": 1359160320 }, { "epoch": 3.1, "learning_rate": 0.00029707121364092276, "loss": 2.8148, "theoretical_loss": 3.547057704279921, "tokens_seen": 1359225856 }, { "epoch": 3.1, "learning_rate": 0.000297061183550652, "loss": 2.8493, "theoretical_loss": 3.5470423391590638, "tokens_seen": 1359291392 }, { "epoch": 3.1, "learning_rate": 0.0002970511534603811, "loss": 2.8153, "theoretical_loss": 3.547026974986407, "tokens_seen": 1359356928 }, { "epoch": 3.1, "learning_rate": 0.00029704112337011036, "loss": 2.5942, "theoretical_loss": 3.547011611761845, "tokens_seen": 1359422464 }, { "epoch": 3.1, "learning_rate": 0.00029703109327983954, "loss": 2.7476, "theoretical_loss": 3.546996249485275, "tokens_seen": 1359488000 }, { "epoch": 3.1, "learning_rate": 0.0002970210631895687, "loss": 2.894, "theoretical_loss": 3.5469808881565927, "tokens_seen": 1359553536 }, { "epoch": 3.1, "learning_rate": 0.0002970110330992979, "loss": 2.8628, "theoretical_loss": 3.546965527775693, "tokens_seen": 1359619072 }, { "epoch": 3.1, "learning_rate": 0.0002970010030090271, "loss": 2.7901, "theoretical_loss": 3.5469501683424727, "tokens_seen": 1359684608 }, { "epoch": 3.1, "learning_rate": 0.00029699097291875626, "loss": 2.7274, "theoretical_loss": 3.5469348098568267, "tokens_seen": 1359750144 }, { "epoch": 3.1, "learning_rate": 0.0002969809428284855, "loss": 2.9287, "theoretical_loss": 3.546919452318652, "tokens_seen": 1359815680 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.595344066619873, "objective/train/theoretical_loss": 3.546915613082138, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.546915613082138, "tokens_seen": 1359832064 }, { "epoch": 3.1, "learning_rate": 0.0002969709127382146, "loss": 2.5859, "theoretical_loss": 3.546904095727843, "tokens_seen": 1359881216 }, { "epoch": 3.1, "learning_rate": 0.00029696088264794386, "loss": 2.8002, "theoretical_loss": 3.546888740084297, "tokens_seen": 1359946752 }, { "epoch": 3.1, "learning_rate": 0.000296950852557673, "loss": 2.7294, "theoretical_loss": 3.5468733853879097, "tokens_seen": 1360012288 }, { "epoch": 3.1, "learning_rate": 0.0002969408224674022, "loss": 2.8196, "theoretical_loss": 3.5468580316385765, "tokens_seen": 1360077824 }, { "epoch": 3.1, "learning_rate": 0.0002969307923771314, "loss": 2.7816, "theoretical_loss": 3.5468426788361938, "tokens_seen": 1360143360 }, { "epoch": 3.1, "learning_rate": 0.0002969207622868606, "loss": 2.9772, "theoretical_loss": 3.546827326980657, "tokens_seen": 1360208896 }, { "epoch": 3.1, "learning_rate": 0.00029691073219658976, "loss": 2.654, "theoretical_loss": 3.546811976071863, "tokens_seen": 1360274432 }, { "epoch": 3.1, "learning_rate": 0.000296900702106319, "loss": 2.6236, "theoretical_loss": 3.546796626109707, "tokens_seen": 1360339968 }, { "epoch": 3.1, "learning_rate": 0.0002968906720160481, "loss": 2.8103, "theoretical_loss": 3.5467812770940847, "tokens_seen": 1360405504 }, { "epoch": 3.1, "learning_rate": 0.00029688064192577736, "loss": 2.7787, "theoretical_loss": 3.5467659290248936, "tokens_seen": 1360471040 }, { "epoch": 3.1, "learning_rate": 0.0002968706118355065, "loss": 2.919, "theoretical_loss": 3.5467505819020286, "tokens_seen": 1360536576 }, { "epoch": 3.1, "learning_rate": 0.0002968605817452357, "loss": 2.9061, "theoretical_loss": 3.5467352357253863, "tokens_seen": 1360602112 }, { "epoch": 3.1, "learning_rate": 0.0002968505516549649, "loss": 2.6908, "theoretical_loss": 3.5467198904948622, "tokens_seen": 1360667648 }, { "epoch": 3.1, "learning_rate": 0.0002968405215646941, "loss": 2.9049, "theoretical_loss": 3.546704546210353, "tokens_seen": 1360733184 }, { "epoch": 3.1, "learning_rate": 0.00029683049147442327, "loss": 2.7888, "theoretical_loss": 3.5466892028717547, "tokens_seen": 1360798720 }, { "epoch": 3.1, "learning_rate": 0.00029682046138415245, "loss": 2.8426, "theoretical_loss": 3.546673860478963, "tokens_seen": 1360864256 }, { "epoch": 3.1, "learning_rate": 0.00029681043129388163, "loss": 2.8318, "theoretical_loss": 3.5466585190318747, "tokens_seen": 1360929792 }, { "epoch": 3.1, "learning_rate": 0.00029680040120361087, "loss": 2.8557, "theoretical_loss": 3.5466431785303856, "tokens_seen": 1360995328 }, { "epoch": 3.1, "learning_rate": 0.00029679037111334, "loss": 2.7754, "theoretical_loss": 3.5466278389743917, "tokens_seen": 1361060864 }, { "epoch": 3.1, "learning_rate": 0.00029678034102306923, "loss": 2.7975, "theoretical_loss": 3.5466125003637896, "tokens_seen": 1361126400 }, { "epoch": 3.1, "learning_rate": 0.0002967703109327984, "loss": 2.8289, "theoretical_loss": 3.5465971626984754, "tokens_seen": 1361191936 }, { "epoch": 3.1, "learning_rate": 0.0002967602808425276, "loss": 2.9529, "theoretical_loss": 3.546581825978345, "tokens_seen": 1361257472 }, { "epoch": 3.1, "learning_rate": 0.00029675025075225677, "loss": 2.7043, "theoretical_loss": 3.546566490203295, "tokens_seen": 1361323008 }, { "epoch": 3.1, "learning_rate": 0.00029674022066198595, "loss": 2.9243, "theoretical_loss": 3.5465511553732223, "tokens_seen": 1361388544 }, { "epoch": 3.1, "learning_rate": 0.00029673019057171513, "loss": 2.6754, "theoretical_loss": 3.546535821488022, "tokens_seen": 1361454080 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8237411975860596, "objective/train/theoretical_loss": 3.5465319881643462, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.5465319881643462, "tokens_seen": 1361470464 }, { "epoch": 3.1, "learning_rate": 0.00029672016048144437, "loss": 2.7943, "theoretical_loss": 3.546520488547591, "tokens_seen": 1361519616 }, { "epoch": 3.1, "learning_rate": 0.0002967101303911735, "loss": 2.838, "theoretical_loss": 3.5465051565518255, "tokens_seen": 1361585152 }, { "epoch": 3.1, "learning_rate": 0.00029670010030090273, "loss": 2.8887, "theoretical_loss": 3.546489825500622, "tokens_seen": 1361650688 }, { "epoch": 3.1, "learning_rate": 0.00029669007021063186, "loss": 2.8882, "theoretical_loss": 3.5464744953938765, "tokens_seen": 1361716224 }, { "epoch": 3.1, "learning_rate": 0.0002966800401203611, "loss": 2.8805, "theoretical_loss": 3.546459166231486, "tokens_seen": 1361781760 }, { "epoch": 3.1, "learning_rate": 0.0002966700100300903, "loss": 2.9022, "theoretical_loss": 3.546443838013346, "tokens_seen": 1361847296 }, { "epoch": 3.1, "learning_rate": 0.00029665997993981946, "loss": 2.9126, "theoretical_loss": 3.546428510739353, "tokens_seen": 1361912832 }, { "epoch": 3.1, "learning_rate": 0.00029664994984954864, "loss": 2.781, "theoretical_loss": 3.5464131844094045, "tokens_seen": 1361978368 }, { "epoch": 3.1, "learning_rate": 0.0002966399197592778, "loss": 2.7799, "theoretical_loss": 3.546397859023396, "tokens_seen": 1362043904 }, { "epoch": 3.1, "learning_rate": 0.000296629889669007, "loss": 2.8164, "theoretical_loss": 3.546382534581224, "tokens_seen": 1362109440 }, { "epoch": 3.1, "learning_rate": 0.00029661985957873623, "loss": 2.6779, "theoretical_loss": 3.546367211082785, "tokens_seen": 1362174976 }, { "epoch": 3.1, "learning_rate": 0.00029660982948846536, "loss": 2.8901, "theoretical_loss": 3.546351888527976, "tokens_seen": 1362240512 }, { "epoch": 3.1, "learning_rate": 0.0002965997993981946, "loss": 2.9037, "theoretical_loss": 3.5463365669166933, "tokens_seen": 1362306048 }, { "epoch": 3.1, "learning_rate": 0.0002965897693079238, "loss": 2.9239, "theoretical_loss": 3.5463212462488327, "tokens_seen": 1362371584 }, { "epoch": 3.1, "learning_rate": 0.00029657973921765296, "loss": 2.6862, "theoretical_loss": 3.5463059265242913, "tokens_seen": 1362437120 }, { "epoch": 3.1, "learning_rate": 0.00029656970912738214, "loss": 2.8129, "theoretical_loss": 3.5462906077429657, "tokens_seen": 1362502656 }, { "epoch": 3.1, "learning_rate": 0.0002965596790371113, "loss": 2.6787, "theoretical_loss": 3.5462752899047523, "tokens_seen": 1362568192 }, { "epoch": 3.1, "learning_rate": 0.0002965496489468405, "loss": 2.6939, "theoretical_loss": 3.546259973009548, "tokens_seen": 1362633728 }, { "epoch": 3.1, "learning_rate": 0.00029653961885656974, "loss": 2.6647, "theoretical_loss": 3.5462446570572492, "tokens_seen": 1362699264 }, { "epoch": 3.1, "learning_rate": 0.0002965295887662989, "loss": 2.8995, "theoretical_loss": 3.546229342047752, "tokens_seen": 1362764800 }, { "epoch": 3.1, "learning_rate": 0.0002965195586760281, "loss": 2.8056, "theoretical_loss": 3.5462140279809535, "tokens_seen": 1362830336 }, { "epoch": 3.1, "learning_rate": 0.0002965095285857573, "loss": 2.7629, "theoretical_loss": 3.5461987148567506, "tokens_seen": 1362895872 }, { "epoch": 3.1, "learning_rate": 0.00029649949849548646, "loss": 2.8087, "theoretical_loss": 3.5461834026750396, "tokens_seen": 1362961408 }, { "epoch": 3.1, "learning_rate": 0.0002964894684052157, "loss": 2.7522, "theoretical_loss": 3.5461680914357174, "tokens_seen": 1363026944 }, { "epoch": 3.1, "learning_rate": 0.0002964794383149448, "loss": 2.7687, "theoretical_loss": 3.54615278113868, "tokens_seen": 1363092480 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.704561233520508, "objective/train/theoretical_loss": 3.546148953711641, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.546148953711641, "tokens_seen": 1363108864 }, { "epoch": 3.1, "learning_rate": 0.00029646940822467406, "loss": 2.7511, "theoretical_loss": 3.546137471783825, "tokens_seen": 1363158016 }, { "epoch": 3.1, "learning_rate": 0.0002964593781344032, "loss": 2.8456, "theoretical_loss": 3.546122163371049, "tokens_seen": 1363223552 }, { "epoch": 3.1, "learning_rate": 0.0002964493480441324, "loss": 2.7361, "theoretical_loss": 3.5461068559002484, "tokens_seen": 1363289088 }, { "epoch": 3.1, "learning_rate": 0.0002964393179538616, "loss": 2.7742, "theoretical_loss": 3.5460915493713205, "tokens_seen": 1363354624 }, { "epoch": 3.1, "learning_rate": 0.0002964292878635908, "loss": 2.9066, "theoretical_loss": 3.546076243784161, "tokens_seen": 1363420160 }, { "epoch": 3.1, "learning_rate": 0.00029641925777331996, "loss": 2.8325, "theoretical_loss": 3.5460609391386675, "tokens_seen": 1363485696 }, { "epoch": 3.1, "learning_rate": 0.0002964092276830492, "loss": 2.8843, "theoretical_loss": 3.546045635434737, "tokens_seen": 1363551232 }, { "epoch": 3.1, "learning_rate": 0.0002963991975927783, "loss": 2.6098, "theoretical_loss": 3.5460303326722658, "tokens_seen": 1363616768 }, { "epoch": 3.1, "learning_rate": 0.00029638916750250756, "loss": 2.7716, "theoretical_loss": 3.5460150308511507, "tokens_seen": 1363682304 }, { "epoch": 3.1, "learning_rate": 0.0002963791374122367, "loss": 2.847, "theoretical_loss": 3.545999729971289, "tokens_seen": 1363747840 }, { "epoch": 3.1, "learning_rate": 0.0002963691073219659, "loss": 2.9284, "theoretical_loss": 3.5459844300325774, "tokens_seen": 1363813376 }, { "epoch": 3.1, "learning_rate": 0.0002963590772316951, "loss": 2.8765, "theoretical_loss": 3.5459691310349126, "tokens_seen": 1363878912 }, { "epoch": 3.1, "learning_rate": 0.0002963490471414243, "loss": 2.987, "theoretical_loss": 3.5459538329781912, "tokens_seen": 1363944448 }, { "epoch": 3.1, "learning_rate": 0.00029633901705115347, "loss": 2.8149, "theoretical_loss": 3.5459385358623114, "tokens_seen": 1364009984 }, { "epoch": 3.1, "learning_rate": 0.00029632898696088265, "loss": 2.8383, "theoretical_loss": 3.545923239687169, "tokens_seen": 1364075520 }, { "epoch": 3.1, "learning_rate": 0.00029631895687061183, "loss": 2.7481, "theoretical_loss": 3.5459079444526616, "tokens_seen": 1364141056 }, { "epoch": 3.1, "learning_rate": 0.00029630892678034107, "loss": 2.7663, "theoretical_loss": 3.5458926501586854, "tokens_seen": 1364206592 }, { "epoch": 3.1, "learning_rate": 0.0002962988966900702, "loss": 2.8793, "theoretical_loss": 3.5458773568051374, "tokens_seen": 1364272128 }, { "epoch": 3.1, "learning_rate": 0.00029628886659979943, "loss": 2.9553, "theoretical_loss": 3.545862064391916, "tokens_seen": 1364337664 }, { "epoch": 3.1, "learning_rate": 0.0002962788365095286, "loss": 2.7472, "theoretical_loss": 3.5458467729189165, "tokens_seen": 1364403200 }, { "epoch": 3.1, "learning_rate": 0.0002962688064192578, "loss": 2.9015, "theoretical_loss": 3.545831482386037, "tokens_seen": 1364468736 }, { "epoch": 3.1, "learning_rate": 0.00029625877632898697, "loss": 2.8341, "theoretical_loss": 3.5458161927931746, "tokens_seen": 1364534272 }, { "epoch": 3.1, "learning_rate": 0.00029624874623871615, "loss": 2.7635, "theoretical_loss": 3.545800904140225, "tokens_seen": 1364599808 }, { "epoch": 3.1, "learning_rate": 0.00029623871614844533, "loss": 2.7728, "theoretical_loss": 3.545785616427087, "tokens_seen": 1364665344 }, { "epoch": 3.1, "learning_rate": 0.00029622868605817457, "loss": 2.7059, "theoretical_loss": 3.545770329653657, "tokens_seen": 1364730880 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.071437358856201, "objective/train/theoretical_loss": 3.545766508107117, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.545766508107117, "tokens_seen": 1364747264 }, { "epoch": 3.1, "learning_rate": 0.0002962186559679037, "loss": 2.8139, "theoretical_loss": 3.545755043819832, "tokens_seen": 1364796416 }, { "epoch": 3.1, "learning_rate": 0.00029620862587763293, "loss": 2.8462, "theoretical_loss": 3.5457397589255093, "tokens_seen": 1364861952 }, { "epoch": 3.1, "learning_rate": 0.00029619859578736206, "loss": 2.6715, "theoretical_loss": 3.5457244749705854, "tokens_seen": 1364927488 }, { "epoch": 3.1, "learning_rate": 0.0002961885656970913, "loss": 2.8886, "theoretical_loss": 3.545709191954959, "tokens_seen": 1364993024 }, { "epoch": 3.1, "learning_rate": 0.0002961785356068205, "loss": 2.8232, "theoretical_loss": 3.5456939098785254, "tokens_seen": 1365058560 }, { "epoch": 3.1, "learning_rate": 0.00029616850551654966, "loss": 2.6558, "theoretical_loss": 3.5456786287411832, "tokens_seen": 1365124096 }, { "epoch": 3.1, "learning_rate": 0.00029615847542627884, "loss": 2.7009, "theoretical_loss": 3.5456633485428295, "tokens_seen": 1365189632 }, { "epoch": 3.1, "learning_rate": 0.000296148445336008, "loss": 2.7263, "theoretical_loss": 3.5456480692833603, "tokens_seen": 1365255168 }, { "epoch": 3.1, "learning_rate": 0.0002961384152457372, "loss": 2.821, "theoretical_loss": 3.5456327909626744, "tokens_seen": 1365320704 }, { "epoch": 3.1, "learning_rate": 0.00029612838515546643, "loss": 2.8583, "theoretical_loss": 3.545617513580668, "tokens_seen": 1365386240 }, { "epoch": 3.1, "learning_rate": 0.00029611835506519556, "loss": 2.794, "theoretical_loss": 3.5456022371372384, "tokens_seen": 1365451776 }, { "epoch": 3.1, "learning_rate": 0.0002961083249749248, "loss": 2.7462, "theoretical_loss": 3.545586961632284, "tokens_seen": 1365517312 }, { "epoch": 3.1, "learning_rate": 0.000296098294884654, "loss": 2.8457, "theoretical_loss": 3.545571687065701, "tokens_seen": 1365582848 }, { "epoch": 3.1, "learning_rate": 0.00029608826479438316, "loss": 2.6061, "theoretical_loss": 3.545556413437387, "tokens_seen": 1365648384 }, { "epoch": 3.1, "learning_rate": 0.00029607823470411234, "loss": 2.7025, "theoretical_loss": 3.5455411407472397, "tokens_seen": 1365713920 }, { "epoch": 3.1, "learning_rate": 0.0002960682046138415, "loss": 2.9482, "theoretical_loss": 3.5455258689951563, "tokens_seen": 1365779456 }, { "epoch": 3.1, "learning_rate": 0.0002960581745235707, "loss": 2.7894, "theoretical_loss": 3.5455105981810338, "tokens_seen": 1365844992 }, { "epoch": 3.1, "learning_rate": 0.00029604814443329994, "loss": 2.7449, "theoretical_loss": 3.54549532830477, "tokens_seen": 1365910528 }, { "epoch": 3.1, "learning_rate": 0.00029603811434302906, "loss": 2.8159, "theoretical_loss": 3.5454800593662616, "tokens_seen": 1365976064 }, { "epoch": 3.1, "learning_rate": 0.0002960280842527583, "loss": 2.711, "theoretical_loss": 3.545464791365407, "tokens_seen": 1366041600 }, { "epoch": 3.1, "learning_rate": 0.0002960180541624874, "loss": 2.7588, "theoretical_loss": 3.545449524302103, "tokens_seen": 1366107136 }, { "epoch": 3.1, "learning_rate": 0.00029600802407221666, "loss": 2.6625, "theoretical_loss": 3.5454342581762477, "tokens_seen": 1366172672 }, { "epoch": 3.1, "learning_rate": 0.00029599799398194584, "loss": 2.8143, "theoretical_loss": 3.545418992987738, "tokens_seen": 1366238208 }, { "epoch": 3.1, "learning_rate": 0.000295987963891675, "loss": 2.9363, "theoretical_loss": 3.5454037287364715, "tokens_seen": 1366303744 }, { "epoch": 3.1, "learning_rate": 0.0002959779338014042, "loss": 2.8026, "theoretical_loss": 3.545388465422345, "tokens_seen": 1366369280 }, { "epoch": 3.1, "objective/train/docs_used": 2151444, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8739490509033203, "objective/train/theoretical_loss": 3.54538464974023, "objective/train/tokens_used": 1366811104, "theoretical_loss": 3.54538464974023, "tokens_seen": 1366385664 }, { "epoch": 3.1, "learning_rate": 0.0002959679037111334, "loss": 2.7696, "theoretical_loss": 3.5453732030452576, "tokens_seen": 1366434816 }, { "epoch": 3.1, "learning_rate": 0.00029595787362086257, "loss": 2.7583, "theoretical_loss": 3.545357941605106, "tokens_seen": 1366500352 }, { "epoch": 3.1, "learning_rate": 0.0002959478435305918, "loss": 2.8351, "theoretical_loss": 3.545342681101787, "tokens_seen": 1366565888 }, { "epoch": 3.1, "learning_rate": 0.00029593781344032093, "loss": 2.7347, "theoretical_loss": 3.5453274215352, "tokens_seen": 1366631424 }, { "epoch": 3.1, "learning_rate": 0.00029592778335005016, "loss": 2.7786, "theoretical_loss": 3.5453121629052413, "tokens_seen": 1366696960 }, { "epoch": 3.1, "learning_rate": 0.00029591775325977935, "loss": 2.8485, "theoretical_loss": 3.545296905211808, "tokens_seen": 1366762496 }, { "epoch": 3.1, "learning_rate": 0.00029590772316950853, "loss": 2.788, "theoretical_loss": 3.545281648454799, "tokens_seen": 1366828032 }, { "epoch": 4.0, "learning_rate": 0.0002958976930792377, "loss": 3.6679, "theoretical_loss": 3.5452656775404883, "tokens_seen": 1366896640 }, { "epoch": 4.0, "learning_rate": 0.0002958876629889669, "loss": 2.8656, "theoretical_loss": 3.545250422699902, "tokens_seen": 1366962176 }, { "epoch": 4.0, "learning_rate": 0.00029587763289869607, "loss": 2.9513, "theoretical_loss": 3.5452351687954278, "tokens_seen": 1367027712 }, { "epoch": 4.0, "learning_rate": 0.0002958676028084253, "loss": 2.9817, "theoretical_loss": 3.545219915826963, "tokens_seen": 1367093248 }, { "epoch": 4.0, "learning_rate": 0.00029585757271815443, "loss": 2.8466, "theoretical_loss": 3.5452046637944057, "tokens_seen": 1367158784 }, { "epoch": 4.0, "learning_rate": 0.00029584754262788367, "loss": 2.8708, "theoretical_loss": 3.545189412697653, "tokens_seen": 1367224320 }, { "epoch": 4.0, "learning_rate": 0.0002958375125376128, "loss": 2.8222, "theoretical_loss": 3.5451741625366036, "tokens_seen": 1367289856 }, { "epoch": 4.0, "learning_rate": 0.00029582748244734203, "loss": 2.8281, "theoretical_loss": 3.545158913311154, "tokens_seen": 1367355392 }, { "epoch": 4.0, "learning_rate": 0.0002958174523570712, "loss": 2.8536, "theoretical_loss": 3.545143665021203, "tokens_seen": 1367420928 }, { "epoch": 4.0, "learning_rate": 0.0002958074222668004, "loss": 2.8566, "theoretical_loss": 3.5451284176666475, "tokens_seen": 1367486464 }, { "epoch": 4.0, "learning_rate": 0.0002957973921765296, "loss": 2.8604, "theoretical_loss": 3.5451131712473862, "tokens_seen": 1367552000 }, { "epoch": 4.0, "learning_rate": 0.0002957873620862588, "loss": 2.899, "theoretical_loss": 3.5450979257633164, "tokens_seen": 1367617536 }, { "epoch": 4.0, "learning_rate": 0.000295777331995988, "loss": 2.8339, "theoretical_loss": 3.5450826812143363, "tokens_seen": 1367683072 }, { "epoch": 4.0, "learning_rate": 0.00029576730190571717, "loss": 2.8399, "theoretical_loss": 3.545067437600343, "tokens_seen": 1367748608 }, { "epoch": 4.0, "learning_rate": 0.00029575727181544635, "loss": 2.8946, "theoretical_loss": 3.545052194921235, "tokens_seen": 1367814144 }, { "epoch": 4.0, "learning_rate": 0.00029574724172517553, "loss": 2.6767, "theoretical_loss": 3.54503695317691, "tokens_seen": 1367879680 }, { "epoch": 4.0, "learning_rate": 0.00029573721163490477, "loss": 2.8126, "theoretical_loss": 3.545021712367266, "tokens_seen": 1367945216 }, { "epoch": 4.0, "objective/train/docs_used": 2186216, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7966411113739014, "objective/train/theoretical_loss": 3.5450064724922004, "objective/train/tokens_used": 1388470752, "theoretical_loss": 3.5450064724922004, "tokens_seen": 1368010752 }, { "epoch": 4.0, "learning_rate": 0.0002957271815446339, "loss": 2.8872, "theoretical_loss": 3.5450064724922004, "tokens_seen": 1368010752 }, { "epoch": 4.0, "learning_rate": 0.00029571715145436313, "loss": 2.7719, "theoretical_loss": 3.5449912335516114, "tokens_seen": 1368076288 }, { "epoch": 4.0, "learning_rate": 0.00029570712136409226, "loss": 2.8786, "theoretical_loss": 3.544975995545397, "tokens_seen": 1368141824 }, { "epoch": 4.0, "learning_rate": 0.0002956970912738215, "loss": 2.7486, "theoretical_loss": 3.544960758473456, "tokens_seen": 1368207360 }, { "epoch": 4.0, "learning_rate": 0.0002956870611835507, "loss": 2.7137, "theoretical_loss": 3.544945522335685, "tokens_seen": 1368272896 }, { "epoch": 4.0, "learning_rate": 0.00029567703109327986, "loss": 2.8411, "theoretical_loss": 3.544930287131983, "tokens_seen": 1368338432 }, { "epoch": 4.0, "learning_rate": 0.00029566700100300904, "loss": 3.0249, "theoretical_loss": 3.5449150528622475, "tokens_seen": 1368403968 }, { "epoch": 4.0, "learning_rate": 0.0002956569709127382, "loss": 2.6501, "theoretical_loss": 3.544899819526376, "tokens_seen": 1368469504 }, { "epoch": 4.0, "learning_rate": 0.0002956469408224674, "loss": 2.7094, "theoretical_loss": 3.5448845871242676, "tokens_seen": 1368535040 }, { "epoch": 4.0, "learning_rate": 0.00029563691073219663, "loss": 2.7465, "theoretical_loss": 3.5448693556558197, "tokens_seen": 1368600576 }, { "epoch": 4.0, "learning_rate": 0.00029562688064192576, "loss": 2.8897, "theoretical_loss": 3.5448541251209305, "tokens_seen": 1368666112 }, { "epoch": 4.0, "learning_rate": 0.000295616850551655, "loss": 2.6196, "theoretical_loss": 3.544838895519498, "tokens_seen": 1368731648 }, { "epoch": 4.0, "learning_rate": 0.0002956068204613842, "loss": 2.6384, "theoretical_loss": 3.544823666851421, "tokens_seen": 1368797184 }, { "epoch": 4.0, "learning_rate": 0.00029559679037111336, "loss": 2.9304, "theoretical_loss": 3.544808439116596, "tokens_seen": 1368862720 }, { "epoch": 4.0, "learning_rate": 0.00029558676028084254, "loss": 2.8146, "theoretical_loss": 3.5447932123149233, "tokens_seen": 1368928256 }, { "epoch": 4.0, "learning_rate": 0.0002955767301905717, "loss": 2.7593, "theoretical_loss": 3.5447779864462996, "tokens_seen": 1368993792 }, { "epoch": 4.0, "learning_rate": 0.0002955667001003009, "loss": 2.8995, "theoretical_loss": 3.544762761510623, "tokens_seen": 1369059328 }, { "epoch": 4.0, "learning_rate": 0.00029555667001003014, "loss": 2.7063, "theoretical_loss": 3.544747537507792, "tokens_seen": 1369124864 }, { "epoch": 4.0, "learning_rate": 0.00029554663991975926, "loss": 2.8052, "theoretical_loss": 3.544732314437705, "tokens_seen": 1369190400 }, { "epoch": 4.0, "learning_rate": 0.0002955366098294885, "loss": 2.6933, "theoretical_loss": 3.54471709230026, "tokens_seen": 1369255936 }, { "epoch": 4.0, "learning_rate": 0.0002955265797392176, "loss": 2.8125, "theoretical_loss": 3.5447018710953557, "tokens_seen": 1369321472 }, { "epoch": 4.0, "learning_rate": 0.00029551654964894686, "loss": 3.0092, "theoretical_loss": 3.5446866508228903, "tokens_seen": 1369387008 }, { "epoch": 4.0, "learning_rate": 0.00029550651955867604, "loss": 2.8747, "theoretical_loss": 3.5446714314827608, "tokens_seen": 1369452544 }, { "epoch": 4.0, "learning_rate": 0.0002954964894684052, "loss": 2.837, "theoretical_loss": 3.5446562130748664, "tokens_seen": 1369518080 }, { "epoch": 4.0, "learning_rate": 0.0002954864593781344, "loss": 2.9028, "theoretical_loss": 3.5446409955991056, "tokens_seen": 1369583616 }, { "epoch": 4.0, "objective/train/docs_used": 2189251, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0046889781951904, "objective/train/theoretical_loss": 3.5446257790553766, "objective/train/tokens_used": 1390109152, "theoretical_loss": 3.5446257790553766, "tokens_seen": 1369649152 }, { "epoch": 4.0, "learning_rate": 0.0002954764292878636, "loss": 2.8095, "theoretical_loss": 3.5446257790553766, "tokens_seen": 1369649152 }, { "epoch": 4.0, "learning_rate": 0.00029546639919759277, "loss": 2.9119, "theoretical_loss": 3.5446105634435776, "tokens_seen": 1369714688 }, { "epoch": 4.0, "learning_rate": 0.000295456369107322, "loss": 2.9271, "theoretical_loss": 3.5445953487636066, "tokens_seen": 1369780224 }, { "epoch": 4.0, "learning_rate": 0.00029544633901705113, "loss": 2.9497, "theoretical_loss": 3.5445801350153623, "tokens_seen": 1369845760 }, { "epoch": 4.0, "learning_rate": 0.00029543630892678037, "loss": 2.8257, "theoretical_loss": 3.5445649221987434, "tokens_seen": 1369911296 }, { "epoch": 4.0, "learning_rate": 0.00029542627883650955, "loss": 2.7887, "theoretical_loss": 3.544549710313648, "tokens_seen": 1369976832 }, { "epoch": 4.0, "learning_rate": 0.00029541624874623873, "loss": 2.7144, "theoretical_loss": 3.544534499359974, "tokens_seen": 1370042368 }, { "epoch": 4.0, "learning_rate": 0.0002954062186559679, "loss": 2.9205, "theoretical_loss": 3.5445192893376203, "tokens_seen": 1370107904 }, { "epoch": 4.0, "learning_rate": 0.0002953961885656971, "loss": 2.9587, "theoretical_loss": 3.5445040802464858, "tokens_seen": 1370173440 }, { "epoch": 4.0, "learning_rate": 0.00029538615847542627, "loss": 2.8563, "theoretical_loss": 3.544488872086468, "tokens_seen": 1370238976 }, { "epoch": 4.0, "learning_rate": 0.0002953761283851555, "loss": 3.0079, "theoretical_loss": 3.544473664857466, "tokens_seen": 1370304512 }, { "epoch": 4.0, "learning_rate": 0.00029536609829488463, "loss": 2.9525, "theoretical_loss": 3.5444584585593777, "tokens_seen": 1370370048 }, { "epoch": 4.0, "learning_rate": 0.00029535606820461387, "loss": 2.8555, "theoretical_loss": 3.544443253192102, "tokens_seen": 1370435584 }, { "epoch": 4.0, "learning_rate": 0.000295346038114343, "loss": 2.6072, "theoretical_loss": 3.544428048755538, "tokens_seen": 1370501120 }, { "epoch": 4.0, "learning_rate": 0.00029533600802407223, "loss": 2.8672, "theoretical_loss": 3.5444128452495836, "tokens_seen": 1370566656 }, { "epoch": 4.0, "learning_rate": 0.0002953259779338014, "loss": 2.7622, "theoretical_loss": 3.544397642674137, "tokens_seen": 1370632192 }, { "epoch": 4.0, "learning_rate": 0.0002953159478435306, "loss": 2.8314, "theoretical_loss": 3.5443824410290974, "tokens_seen": 1370697728 }, { "epoch": 4.0, "learning_rate": 0.0002953059177532598, "loss": 2.7615, "theoretical_loss": 3.544367240314363, "tokens_seen": 1370763264 }, { "epoch": 4.0, "learning_rate": 0.000295295887662989, "loss": 2.7992, "theoretical_loss": 3.5443520405298323, "tokens_seen": 1370828800 }, { "epoch": 4.0, "learning_rate": 0.00029528585757271814, "loss": 2.8715, "theoretical_loss": 3.5443368416754044, "tokens_seen": 1370894336 }, { "epoch": 4.0, "learning_rate": 0.00029527582748244737, "loss": 2.8487, "theoretical_loss": 3.5443216437509775, "tokens_seen": 1370959872 }, { "epoch": 4.0, "learning_rate": 0.0002952657973921765, "loss": 2.9333, "theoretical_loss": 3.5443064467564502, "tokens_seen": 1371025408 }, { "epoch": 4.0, "learning_rate": 0.00029525576730190573, "loss": 2.8526, "theoretical_loss": 3.544291250691722, "tokens_seen": 1371090944 }, { "epoch": 4.0, "learning_rate": 0.0002952457372116349, "loss": 2.9029, "theoretical_loss": 3.544276055556691, "tokens_seen": 1371156480 }, { "epoch": 4.0, "learning_rate": 0.0002952357071213641, "loss": 2.8708, "theoretical_loss": 3.5442608613512547, "tokens_seen": 1371222016 }, { "epoch": 4.0, "objective/train/docs_used": 2192065, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.598870277404785, "objective/train/theoretical_loss": 3.5442456680753134, "objective/train/tokens_used": 1391747552, "theoretical_loss": 3.5442456680753134, "tokens_seen": 1371287552 }, { "epoch": 4.0, "learning_rate": 0.0002952256770310933, "loss": 2.7539, "theoretical_loss": 3.5442456680753134, "tokens_seen": 1371287552 }, { "epoch": 4.0, "learning_rate": 0.00029521564694082246, "loss": 2.835, "theoretical_loss": 3.544230475728766, "tokens_seen": 1371353088 }, { "epoch": 4.0, "learning_rate": 0.00029520561685055164, "loss": 2.8126, "theoretical_loss": 3.54421528431151, "tokens_seen": 1371418624 }, { "epoch": 4.0, "learning_rate": 0.0002951955867602809, "loss": 2.7729, "theoretical_loss": 3.544200093823445, "tokens_seen": 1371484160 }, { "epoch": 4.0, "learning_rate": 0.00029518555667001, "loss": 2.8836, "theoretical_loss": 3.544184904264469, "tokens_seen": 1371549696 }, { "epoch": 4.0, "learning_rate": 0.00029517552657973924, "loss": 2.7575, "theoretical_loss": 3.5441697156344816, "tokens_seen": 1371615232 }, { "epoch": 4.0, "learning_rate": 0.00029516549648946836, "loss": 2.8716, "theoretical_loss": 3.5441545279333813, "tokens_seen": 1371680768 }, { "epoch": 4.0, "learning_rate": 0.0002951554663991976, "loss": 2.9261, "theoretical_loss": 3.544139341161067, "tokens_seen": 1371746304 }, { "epoch": 4.0, "learning_rate": 0.0002951454363089268, "loss": 2.8004, "theoretical_loss": 3.544124155317437, "tokens_seen": 1371811840 }, { "epoch": 4.0, "learning_rate": 0.00029513540621865596, "loss": 2.8019, "theoretical_loss": 3.544108970402391, "tokens_seen": 1371877376 }, { "epoch": 4.0, "learning_rate": 0.00029512537612838514, "loss": 2.836, "theoretical_loss": 3.5440937864158277, "tokens_seen": 1371942912 }, { "epoch": 4.0, "learning_rate": 0.0002951153460381144, "loss": 2.7628, "theoretical_loss": 3.5440786033576455, "tokens_seen": 1372008448 }, { "epoch": 4.0, "learning_rate": 0.0002951053159478435, "loss": 2.7452, "theoretical_loss": 3.544063421227743, "tokens_seen": 1372073984 }, { "epoch": 4.0, "learning_rate": 0.00029509528585757274, "loss": 2.9494, "theoretical_loss": 3.54404824002602, "tokens_seen": 1372139520 }, { "epoch": 4.0, "learning_rate": 0.00029508525576730187, "loss": 2.8709, "theoretical_loss": 3.5440330597523753, "tokens_seen": 1372205056 }, { "epoch": 4.0, "learning_rate": 0.0002950752256770311, "loss": 2.8239, "theoretical_loss": 3.5440178804067073, "tokens_seen": 1372270592 }, { "epoch": 4.0, "learning_rate": 0.0002950651955867603, "loss": 2.7772, "theoretical_loss": 3.5440027019889153, "tokens_seen": 1372336128 }, { "epoch": 4.0, "learning_rate": 0.00029505516549648946, "loss": 2.8405, "theoretical_loss": 3.543987524498898, "tokens_seen": 1372401664 }, { "epoch": 4.0, "learning_rate": 0.00029504513540621865, "loss": 2.7064, "theoretical_loss": 3.5439723479365552, "tokens_seen": 1372467200 }, { "epoch": 4.0, "learning_rate": 0.0002950351053159478, "loss": 2.7792, "theoretical_loss": 3.543957172301785, "tokens_seen": 1372532736 }, { "epoch": 4.0, "learning_rate": 0.00029502507522567706, "loss": 2.8092, "theoretical_loss": 3.5439419975944864, "tokens_seen": 1372598272 }, { "epoch": 4.0, "learning_rate": 0.00029501504513540624, "loss": 2.8057, "theoretical_loss": 3.5439268238145596, "tokens_seen": 1372663808 }, { "epoch": 4.0, "learning_rate": 0.0002950050150451354, "loss": 2.886, "theoretical_loss": 3.5439116509619017, "tokens_seen": 1372729344 }, { "epoch": 4.0, "learning_rate": 0.0002949949849548646, "loss": 2.8468, "theoretical_loss": 3.543896479036414, "tokens_seen": 1372794880 }, { "epoch": 4.0, "learning_rate": 0.0002949849548645938, "loss": 2.812, "theoretical_loss": 3.543881308037994, "tokens_seen": 1372860416 }, { "epoch": 4.0, "objective/train/docs_used": 2194290, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0445005893707275, "objective/train/theoretical_loss": 3.5438661379665413, "objective/train/tokens_used": 1393385952, "theoretical_loss": 3.5438661379665413, "tokens_seen": 1372925952 }, { "epoch": 4.0, "learning_rate": 0.00029497492477432297, "loss": 2.846, "theoretical_loss": 3.5438661379665413, "tokens_seen": 1372925952 }, { "epoch": 4.0, "learning_rate": 0.0002949648946840522, "loss": 2.9781, "theoretical_loss": 3.543850968821955, "tokens_seen": 1372991488 }, { "epoch": 4.0, "learning_rate": 0.00029495486459378133, "loss": 2.762, "theoretical_loss": 3.5438358006041346, "tokens_seen": 1373057024 }, { "epoch": 4.0, "learning_rate": 0.00029494483450351057, "loss": 2.7783, "theoretical_loss": 3.5438206333129783, "tokens_seen": 1373122560 }, { "epoch": 4.0, "learning_rate": 0.00029493480441323975, "loss": 2.8554, "theoretical_loss": 3.5438054669483865, "tokens_seen": 1373188096 }, { "epoch": 4.0, "learning_rate": 0.00029492477432296893, "loss": 2.7314, "theoretical_loss": 3.543790301510257, "tokens_seen": 1373253632 }, { "epoch": 4.0, "learning_rate": 0.0002949147442326981, "loss": 2.9165, "theoretical_loss": 3.5437751369984904, "tokens_seen": 1373319168 }, { "epoch": 4.0, "learning_rate": 0.0002949047141424273, "loss": 2.7627, "theoretical_loss": 3.543759973412985, "tokens_seen": 1373384704 }, { "epoch": 4.0, "learning_rate": 0.00029489468405215647, "loss": 2.8851, "theoretical_loss": 3.5437448107536405, "tokens_seen": 1373450240 }, { "epoch": 4.0, "learning_rate": 0.0002948846539618857, "loss": 2.7225, "theoretical_loss": 3.5437296490203556, "tokens_seen": 1373515776 }, { "epoch": 4.0, "learning_rate": 0.00029487462387161483, "loss": 2.9884, "theoretical_loss": 3.5437144882130296, "tokens_seen": 1373581312 }, { "epoch": 4.0, "learning_rate": 0.00029486459378134407, "loss": 2.7431, "theoretical_loss": 3.5436993283315625, "tokens_seen": 1373646848 }, { "epoch": 4.0, "learning_rate": 0.0002948545636910732, "loss": 2.934, "theoretical_loss": 3.543684169375853, "tokens_seen": 1373712384 }, { "epoch": 4.0, "learning_rate": 0.00029484453360080243, "loss": 2.7874, "theoretical_loss": 3.5436690113458007, "tokens_seen": 1373777920 }, { "epoch": 4.0, "learning_rate": 0.0002948345035105316, "loss": 2.7791, "theoretical_loss": 3.5436538542413047, "tokens_seen": 1373843456 }, { "epoch": 4.0, "learning_rate": 0.0002948244734202608, "loss": 2.6596, "theoretical_loss": 3.5436386980622645, "tokens_seen": 1373908992 }, { "epoch": 4.0, "learning_rate": 0.00029481444332999, "loss": 2.7802, "theoretical_loss": 3.543623542808579, "tokens_seen": 1373974528 }, { "epoch": 4.0, "learning_rate": 0.0002948044132397192, "loss": 2.658, "theoretical_loss": 3.5436083884801484, "tokens_seen": 1374040064 }, { "epoch": 4.0, "learning_rate": 0.00029479438314944834, "loss": 2.9017, "theoretical_loss": 3.5435932350768713, "tokens_seen": 1374105600 }, { "epoch": 4.0, "learning_rate": 0.00029478435305917757, "loss": 2.8505, "theoretical_loss": 3.543578082598647, "tokens_seen": 1374171136 }, { "epoch": 4.0, "learning_rate": 0.0002947743229689067, "loss": 2.7699, "theoretical_loss": 3.543562931045376, "tokens_seen": 1374236672 }, { "epoch": 4.0, "learning_rate": 0.00029476429287863593, "loss": 2.891, "theoretical_loss": 3.543547780416957, "tokens_seen": 1374302208 }, { "epoch": 4.0, "learning_rate": 0.0002947542627883651, "loss": 2.8186, "theoretical_loss": 3.5435326307132895, "tokens_seen": 1374367744 }, { "epoch": 4.0, "learning_rate": 0.0002947442326980943, "loss": 2.864, "theoretical_loss": 3.5435174819342725, "tokens_seen": 1374433280 }, { "epoch": 4.0, "learning_rate": 0.0002947342026078235, "loss": 2.8008, "theoretical_loss": 3.5435023340798066, "tokens_seen": 1374498816 }, { "epoch": 4.0, "objective/train/docs_used": 2197147, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.651865005493164, "objective/train/theoretical_loss": 3.54348718714979, "objective/train/tokens_used": 1395024352, "theoretical_loss": 3.54348718714979, "tokens_seen": 1374564352 }, { "epoch": 4.0, "learning_rate": 0.00029472417251755266, "loss": 2.8907, "theoretical_loss": 3.54348718714979, "tokens_seen": 1374564352 }, { "epoch": 4.0, "learning_rate": 0.00029471414242728184, "loss": 2.6371, "theoretical_loss": 3.543472041144123, "tokens_seen": 1374629888 }, { "epoch": 4.0, "learning_rate": 0.0002947041123370111, "loss": 2.8398, "theoretical_loss": 3.5434568960627058, "tokens_seen": 1374695424 }, { "epoch": 4.0, "learning_rate": 0.0002946940822467402, "loss": 2.7189, "theoretical_loss": 3.543441751905436, "tokens_seen": 1374760960 }, { "epoch": 4.0, "learning_rate": 0.00029468405215646944, "loss": 2.7607, "theoretical_loss": 3.543426608672215, "tokens_seen": 1374826496 }, { "epoch": 4.0, "learning_rate": 0.00029467402206619856, "loss": 2.939, "theoretical_loss": 3.543411466362942, "tokens_seen": 1374892032 }, { "epoch": 4.0, "learning_rate": 0.0002946639919759278, "loss": 2.8438, "theoretical_loss": 3.5433963249775156, "tokens_seen": 1374957568 }, { "epoch": 4.0, "learning_rate": 0.000294653961885657, "loss": 2.7658, "theoretical_loss": 3.5433811845158356, "tokens_seen": 1375023104 }, { "epoch": 4.0, "learning_rate": 0.00029464393179538616, "loss": 2.8636, "theoretical_loss": 3.5433660449778026, "tokens_seen": 1375088640 }, { "epoch": 4.0, "learning_rate": 0.00029463390170511534, "loss": 2.7696, "theoretical_loss": 3.543350906363316, "tokens_seen": 1375154176 }, { "epoch": 4.0, "learning_rate": 0.0002946238716148446, "loss": 2.8509, "theoretical_loss": 3.543335768672275, "tokens_seen": 1375219712 }, { "epoch": 4.0, "learning_rate": 0.0002946138415245737, "loss": 2.8828, "theoretical_loss": 3.5433206319045794, "tokens_seen": 1375285248 }, { "epoch": 4.0, "learning_rate": 0.00029460381143430294, "loss": 2.8752, "theoretical_loss": 3.5433054960601287, "tokens_seen": 1375350784 }, { "epoch": 4.0, "learning_rate": 0.00029459378134403207, "loss": 2.5587, "theoretical_loss": 3.5432903611388227, "tokens_seen": 1375416320 }, { "epoch": 4.0, "learning_rate": 0.0002945837512537613, "loss": 3.0124, "theoretical_loss": 3.543275227140562, "tokens_seen": 1375481856 }, { "epoch": 4.0, "learning_rate": 0.0002945737211634905, "loss": 2.7206, "theoretical_loss": 3.5432600940652446, "tokens_seen": 1375547392 }, { "epoch": 4.0, "learning_rate": 0.00029456369107321966, "loss": 2.6344, "theoretical_loss": 3.5432449619127717, "tokens_seen": 1375612928 }, { "epoch": 4.0, "learning_rate": 0.00029455366098294885, "loss": 2.7922, "theoretical_loss": 3.5432298306830425, "tokens_seen": 1375678464 }, { "epoch": 4.0, "learning_rate": 0.000294543630892678, "loss": 2.8798, "theoretical_loss": 3.543214700375957, "tokens_seen": 1375744000 }, { "epoch": 4.0, "learning_rate": 0.0002945336008024072, "loss": 2.8332, "theoretical_loss": 3.5431995709914146, "tokens_seen": 1375809536 }, { "epoch": 4.0, "learning_rate": 0.00029452357071213644, "loss": 2.7588, "theoretical_loss": 3.5431844425293155, "tokens_seen": 1375875072 }, { "epoch": 4.0, "learning_rate": 0.00029451354062186557, "loss": 2.8776, "theoretical_loss": 3.543169314989559, "tokens_seen": 1375940608 }, { "epoch": 4.0, "learning_rate": 0.0002945035105315948, "loss": 2.8144, "theoretical_loss": 3.543154188372046, "tokens_seen": 1376006144 }, { "epoch": 4.0, "learning_rate": 0.00029449348044132393, "loss": 2.827, "theoretical_loss": 3.543139062676675, "tokens_seen": 1376071680 }, { "epoch": 4.0, "learning_rate": 0.00029448345035105317, "loss": 2.7169, "theoretical_loss": 3.5431239379033466, "tokens_seen": 1376137216 }, { "epoch": 4.0, "objective/train/docs_used": 2200122, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.83168363571167, "objective/train/theoretical_loss": 3.543108814051961, "objective/train/tokens_used": 1396662752, "theoretical_loss": 3.543108814051961, "tokens_seen": 1376202752 }, { "epoch": 4.0, "learning_rate": 0.00029447342026078235, "loss": 2.8436, "theoretical_loss": 3.543108814051961, "tokens_seen": 1376202752 }, { "epoch": 4.0, "learning_rate": 0.00029446339017051153, "loss": 2.8371, "theoretical_loss": 3.543093691122418, "tokens_seen": 1376268288 }, { "epoch": 4.0, "learning_rate": 0.0002944533600802407, "loss": 2.66, "theoretical_loss": 3.5430785691146163, "tokens_seen": 1376333824 }, { "epoch": 4.0, "learning_rate": 0.00029444332998996995, "loss": 2.8212, "theoretical_loss": 3.543063448028457, "tokens_seen": 1376399360 }, { "epoch": 4.0, "learning_rate": 0.0002944332998996991, "loss": 2.8184, "theoretical_loss": 3.5430483278638403, "tokens_seen": 1376464896 }, { "epoch": 4.0, "learning_rate": 0.0002944232698094283, "loss": 2.7217, "theoretical_loss": 3.5430332086206655, "tokens_seen": 1376530432 }, { "epoch": 4.0, "learning_rate": 0.00029441323971915744, "loss": 2.7513, "theoretical_loss": 3.543018090298833, "tokens_seen": 1376595968 }, { "epoch": 4.0, "learning_rate": 0.00029440320962888667, "loss": 2.8687, "theoretical_loss": 3.543002972898242, "tokens_seen": 1376661504 }, { "epoch": 4.0, "learning_rate": 0.00029439317953861585, "loss": 2.7792, "theoretical_loss": 3.5429878564187933, "tokens_seen": 1376727040 }, { "epoch": 4.0, "learning_rate": 0.00029438314944834503, "loss": 2.8904, "theoretical_loss": 3.542972740860387, "tokens_seen": 1376792576 }, { "epoch": 4.0, "learning_rate": 0.0002943731193580742, "loss": 2.8445, "theoretical_loss": 3.5429576262229228, "tokens_seen": 1376858112 }, { "epoch": 4.0, "learning_rate": 0.0002943630892678034, "loss": 2.7805, "theoretical_loss": 3.5429425125063005, "tokens_seen": 1376923648 }, { "epoch": 4.0, "learning_rate": 0.0002943530591775326, "loss": 2.7639, "theoretical_loss": 3.5429273997104205, "tokens_seen": 1376989184 }, { "epoch": 4.0, "learning_rate": 0.0002943430290872618, "loss": 2.7523, "theoretical_loss": 3.542912287835183, "tokens_seen": 1377054720 }, { "epoch": 4.0, "learning_rate": 0.00029433299899699094, "loss": 2.8276, "theoretical_loss": 3.542897176880488, "tokens_seen": 1377120256 }, { "epoch": 4.0, "learning_rate": 0.0002943229689067202, "loss": 2.8003, "theoretical_loss": 3.5428820668462357, "tokens_seen": 1377185792 }, { "epoch": 4.0, "learning_rate": 0.0002943129388164493, "loss": 2.6753, "theoretical_loss": 3.542866957732326, "tokens_seen": 1377251328 }, { "epoch": 4.0, "learning_rate": 0.00029430290872617854, "loss": 2.7577, "theoretical_loss": 3.542851849538659, "tokens_seen": 1377316864 }, { "epoch": 4.0, "learning_rate": 0.00029429287863590777, "loss": 2.6351, "theoretical_loss": 3.5428367422651355, "tokens_seen": 1377382400 }, { "epoch": 4.0, "learning_rate": 0.0002942828485456369, "loss": 2.6099, "theoretical_loss": 3.542821635911655, "tokens_seen": 1377447936 }, { "epoch": 4.0, "learning_rate": 0.00029427281845536613, "loss": 2.8837, "theoretical_loss": 3.542806530478118, "tokens_seen": 1377513472 }, { "epoch": 4.0, "learning_rate": 0.0002942627883650953, "loss": 2.992, "theoretical_loss": 3.542791425964424, "tokens_seen": 1377579008 }, { "epoch": 4.0, "learning_rate": 0.0002942527582748245, "loss": 2.9325, "theoretical_loss": 3.542776322370475, "tokens_seen": 1377644544 }, { "epoch": 4.0, "learning_rate": 0.0002942427281845537, "loss": 2.8058, "theoretical_loss": 3.5427612196961693, "tokens_seen": 1377710080 }, { "epoch": 4.0, "learning_rate": 0.00029423269809428286, "loss": 2.8841, "theoretical_loss": 3.5427461179414084, "tokens_seen": 1377775616 }, { "epoch": 4.0, "objective/train/docs_used": 2203056, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7526707649230957, "objective/train/theoretical_loss": 3.5427310171060924, "objective/train/tokens_used": 1398301152, "theoretical_loss": 3.5427310171060924, "tokens_seen": 1377841152 }, { "epoch": 4.0, "learning_rate": 0.00029422266800401204, "loss": 2.898, "theoretical_loss": 3.5427310171060924, "tokens_seen": 1377841152 }, { "epoch": 4.0, "learning_rate": 0.0002942126379137413, "loss": 2.8522, "theoretical_loss": 3.5427159171901206, "tokens_seen": 1377906688 }, { "epoch": 4.0, "learning_rate": 0.0002942026078234704, "loss": 2.7464, "theoretical_loss": 3.5427008181933948, "tokens_seen": 1377972224 }, { "epoch": 4.0, "learning_rate": 0.00029419257773319964, "loss": 2.7572, "theoretical_loss": 3.542685720115814, "tokens_seen": 1378037760 }, { "epoch": 4.0, "learning_rate": 0.00029418254764292876, "loss": 2.9644, "theoretical_loss": 3.5426706229572797, "tokens_seen": 1378103296 }, { "epoch": 4.0, "learning_rate": 0.000294172517552658, "loss": 2.9055, "theoretical_loss": 3.5426555267176916, "tokens_seen": 1378168832 }, { "epoch": 4.0, "learning_rate": 0.0002941624874623872, "loss": 2.936, "theoretical_loss": 3.5426404313969497, "tokens_seen": 1378234368 }, { "epoch": 4.0, "learning_rate": 0.00029415245737211636, "loss": 2.9607, "theoretical_loss": 3.542625336994955, "tokens_seen": 1378299904 }, { "epoch": 4.0, "learning_rate": 0.00029414242728184554, "loss": 2.9551, "theoretical_loss": 3.542610243511608, "tokens_seen": 1378365440 }, { "epoch": 4.0, "learning_rate": 0.0002941323971915748, "loss": 2.829, "theoretical_loss": 3.5425951509468088, "tokens_seen": 1378430976 }, { "epoch": 4.0, "learning_rate": 0.0002941223671013039, "loss": 2.8326, "theoretical_loss": 3.542580059300458, "tokens_seen": 1378496512 }, { "epoch": 4.0, "learning_rate": 0.00029411233701103314, "loss": 2.842, "theoretical_loss": 3.5425649685724556, "tokens_seen": 1378562048 }, { "epoch": 4.0, "learning_rate": 0.00029410230692076227, "loss": 2.7336, "theoretical_loss": 3.5425498787627023, "tokens_seen": 1378627584 }, { "epoch": 4.0, "learning_rate": 0.0002940922768304915, "loss": 2.8998, "theoretical_loss": 3.542534789871099, "tokens_seen": 1378693120 }, { "epoch": 4.0, "learning_rate": 0.0002940822467402207, "loss": 2.814, "theoretical_loss": 3.542519701897546, "tokens_seen": 1378758656 }, { "epoch": 4.0, "learning_rate": 0.00029407221664994986, "loss": 2.6451, "theoretical_loss": 3.5425046148419432, "tokens_seen": 1378824192 }, { "epoch": 4.0, "learning_rate": 0.00029406218655967905, "loss": 2.8887, "theoretical_loss": 3.542489528704192, "tokens_seen": 1378889728 }, { "epoch": 4.0, "learning_rate": 0.0002940521564694082, "loss": 2.9911, "theoretical_loss": 3.5424744434841924, "tokens_seen": 1378955264 }, { "epoch": 4.0, "learning_rate": 0.0002940421263791374, "loss": 2.8639, "theoretical_loss": 3.542459359181845, "tokens_seen": 1379020800 }, { "epoch": 4.0, "learning_rate": 0.00029403209628886664, "loss": 3.0051, "theoretical_loss": 3.5424442757970507, "tokens_seen": 1379086336 }, { "epoch": 4.0, "learning_rate": 0.00029402206619859577, "loss": 2.8092, "theoretical_loss": 3.54242919332971, "tokens_seen": 1379151872 }, { "epoch": 4.0, "learning_rate": 0.000294012036108325, "loss": 2.8152, "theoretical_loss": 3.542414111779723, "tokens_seen": 1379217408 }, { "epoch": 4.0, "learning_rate": 0.00029400200601805413, "loss": 2.7687, "theoretical_loss": 3.542399031146991, "tokens_seen": 1379282944 }, { "epoch": 4.0, "learning_rate": 0.00029399197592778337, "loss": 2.6523, "theoretical_loss": 3.542383951431414, "tokens_seen": 1379348480 }, { "epoch": 4.0, "learning_rate": 0.00029398194583751255, "loss": 2.8993, "theoretical_loss": 3.5423688726328932, "tokens_seen": 1379414016 }, { "epoch": 4.0, "objective/train/docs_used": 2205936, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8732426166534424, "objective/train/theoretical_loss": 3.542353794751329, "objective/train/tokens_used": 1399939552, "theoretical_loss": 3.542353794751329, "tokens_seen": 1379479552 }, { "epoch": 4.0, "learning_rate": 0.00029397191574724173, "loss": 2.9306, "theoretical_loss": 3.542353794751329, "tokens_seen": 1379479552 }, { "epoch": 4.0, "learning_rate": 0.0002939618856569709, "loss": 2.9492, "theoretical_loss": 3.542338717786622, "tokens_seen": 1379545088 }, { "epoch": 4.0, "learning_rate": 0.00029395185556670015, "loss": 2.7763, "theoretical_loss": 3.5423236417386734, "tokens_seen": 1379610624 }, { "epoch": 4.0, "learning_rate": 0.0002939418254764293, "loss": 2.8085, "theoretical_loss": 3.5423085666073835, "tokens_seen": 1379676160 }, { "epoch": 4.0, "learning_rate": 0.0002939317953861585, "loss": 2.8219, "theoretical_loss": 3.542293492392653, "tokens_seen": 1379741696 }, { "epoch": 4.0, "learning_rate": 0.00029392176529588764, "loss": 2.8291, "theoretical_loss": 3.5422784190943823, "tokens_seen": 1379807232 }, { "epoch": 4.0, "learning_rate": 0.00029391173520561687, "loss": 2.7737, "theoretical_loss": 3.5422633467124727, "tokens_seen": 1379872768 }, { "epoch": 4.0, "learning_rate": 0.00029390170511534605, "loss": 2.81, "theoretical_loss": 3.542248275246825, "tokens_seen": 1379938304 }, { "epoch": 4.0, "learning_rate": 0.00029389167502507523, "loss": 2.9606, "theoretical_loss": 3.5422332046973395, "tokens_seen": 1380003840 }, { "epoch": 4.0, "learning_rate": 0.0002938816449348044, "loss": 2.7962, "theoretical_loss": 3.5422181350639175, "tokens_seen": 1380069376 }, { "epoch": 4.0, "learning_rate": 0.0002938716148445336, "loss": 2.9089, "theoretical_loss": 3.5422030663464597, "tokens_seen": 1380134912 }, { "epoch": 4.0, "learning_rate": 0.0002938615847542628, "loss": 2.9406, "theoretical_loss": 3.542187998544867, "tokens_seen": 1380200448 }, { "epoch": 4.0, "learning_rate": 0.000293851554663992, "loss": 2.9201, "theoretical_loss": 3.54217293165904, "tokens_seen": 1380265984 }, { "epoch": 4.0, "learning_rate": 0.00029384152457372114, "loss": 2.7778, "theoretical_loss": 3.5421578656888797, "tokens_seen": 1380331520 }, { "epoch": 4.0, "learning_rate": 0.0002938314944834504, "loss": 2.878, "theoretical_loss": 3.542142800634287, "tokens_seen": 1380397056 }, { "epoch": 4.0, "learning_rate": 0.0002938214643931795, "loss": 2.86, "theoretical_loss": 3.5421277364951624, "tokens_seen": 1380462592 }, { "epoch": 4.0, "learning_rate": 0.00029381143430290874, "loss": 2.8134, "theoretical_loss": 3.542112673271408, "tokens_seen": 1380528128 }, { "epoch": 4.0, "learning_rate": 0.0002938014042126379, "loss": 2.8065, "theoretical_loss": 3.542097610962923, "tokens_seen": 1380593664 }, { "epoch": 4.0, "learning_rate": 0.0002937913741223671, "loss": 3.0055, "theoretical_loss": 3.54208254956961, "tokens_seen": 1380659200 }, { "epoch": 4.0, "learning_rate": 0.0002937813440320963, "loss": 2.6776, "theoretical_loss": 3.5420674890913686, "tokens_seen": 1380724736 }, { "epoch": 4.0, "learning_rate": 0.0002937713139418255, "loss": 2.7465, "theoretical_loss": 3.5420524295281006, "tokens_seen": 1380790272 }, { "epoch": 4.0, "learning_rate": 0.00029376128385155464, "loss": 2.8715, "theoretical_loss": 3.542037370879707, "tokens_seen": 1380855808 }, { "epoch": 4.0, "learning_rate": 0.0002937512537612839, "loss": 2.9015, "theoretical_loss": 3.542022313146088, "tokens_seen": 1380921344 }, { "epoch": 4.0, "learning_rate": 0.000293741223671013, "loss": 2.8376, "theoretical_loss": 3.542007256327146, "tokens_seen": 1380986880 }, { "epoch": 4.0, "learning_rate": 0.00029373119358074224, "loss": 2.7311, "theoretical_loss": 3.54199220042278, "tokens_seen": 1381052416 }, { "epoch": 4.0, "objective/train/docs_used": 2207757, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.856968879699707, "objective/train/theoretical_loss": 3.5419771454328934, "objective/train/tokens_used": 1401577952, "theoretical_loss": 3.5419771454328934, "tokens_seen": 1381117952 }, { "epoch": 4.0, "learning_rate": 0.0002937211634904714, "loss": 2.9605, "theoretical_loss": 3.5419771454328934, "tokens_seen": 1381117952 }, { "epoch": 4.0, "learning_rate": 0.0002937111334002006, "loss": 2.7668, "theoretical_loss": 3.5419620913573864, "tokens_seen": 1381183488 }, { "epoch": 4.0, "learning_rate": 0.0002937011033099298, "loss": 2.8067, "theoretical_loss": 3.541947038196159, "tokens_seen": 1381249024 }, { "epoch": 4.0, "learning_rate": 0.00029369107321965896, "loss": 2.7071, "theoretical_loss": 3.5419319859491134, "tokens_seen": 1381314560 }, { "epoch": 4.0, "learning_rate": 0.00029368104312938815, "loss": 2.7099, "theoretical_loss": 3.5419169346161503, "tokens_seen": 1381380096 }, { "epoch": 4.0, "learning_rate": 0.0002936710130391174, "loss": 2.7778, "theoretical_loss": 3.5419018841971712, "tokens_seen": 1381445632 }, { "epoch": 4.0, "learning_rate": 0.0002936609829488465, "loss": 2.7411, "theoretical_loss": 3.541886834692077, "tokens_seen": 1381511168 }, { "epoch": 4.0, "learning_rate": 0.00029365095285857574, "loss": 2.7583, "theoretical_loss": 3.541871786100769, "tokens_seen": 1381576704 }, { "epoch": 4.0, "learning_rate": 0.00029364092276830487, "loss": 2.8834, "theoretical_loss": 3.5418567384231476, "tokens_seen": 1381642240 }, { "epoch": 4.0, "learning_rate": 0.0002936308926780341, "loss": 2.7815, "theoretical_loss": 3.541841691659115, "tokens_seen": 1381707776 }, { "epoch": 4.0, "learning_rate": 0.0002936208625877633, "loss": 2.8566, "theoretical_loss": 3.5418266458085723, "tokens_seen": 1381773312 }, { "epoch": 4.0, "learning_rate": 0.00029361083249749247, "loss": 2.8056, "theoretical_loss": 3.54181160087142, "tokens_seen": 1381838848 }, { "epoch": 4.0, "learning_rate": 0.00029360080240722165, "loss": 2.8356, "theoretical_loss": 3.5417965568475607, "tokens_seen": 1381904384 }, { "epoch": 4.0, "learning_rate": 0.0002935907723169509, "loss": 2.8893, "theoretical_loss": 3.5417815137368938, "tokens_seen": 1381969920 }, { "epoch": 4.0, "learning_rate": 0.00029358074222668, "loss": 2.7534, "theoretical_loss": 3.5417664715393222, "tokens_seen": 1382035456 }, { "epoch": 4.0, "learning_rate": 0.00029357071213640925, "loss": 2.8381, "theoretical_loss": 3.5417514302547457, "tokens_seen": 1382100992 }, { "epoch": 4.0, "learning_rate": 0.0002935606820461384, "loss": 2.7108, "theoretical_loss": 3.541736389883067, "tokens_seen": 1382166528 }, { "epoch": 4.0, "learning_rate": 0.0002935506519558676, "loss": 2.7682, "theoretical_loss": 3.5417213504241865, "tokens_seen": 1382232064 }, { "epoch": 4.0, "learning_rate": 0.00029354062186559684, "loss": 2.7435, "theoretical_loss": 3.5417063118780057, "tokens_seen": 1382297600 }, { "epoch": 4.0, "learning_rate": 0.00029353059177532597, "loss": 2.7429, "theoretical_loss": 3.5416912742444264, "tokens_seen": 1382363136 }, { "epoch": 4.0, "learning_rate": 0.0002935205616850552, "loss": 2.8963, "theoretical_loss": 3.5416762375233497, "tokens_seen": 1382428672 }, { "epoch": 4.0, "learning_rate": 0.00029351053159478433, "loss": 2.8467, "theoretical_loss": 3.5416612017146765, "tokens_seen": 1382494208 }, { "epoch": 4.0, "learning_rate": 0.00029350050150451357, "loss": 2.7956, "theoretical_loss": 3.541646166818309, "tokens_seen": 1382559744 }, { "epoch": 4.0, "learning_rate": 0.00029349047141424275, "loss": 2.8737, "theoretical_loss": 3.541631132834148, "tokens_seen": 1382625280 }, { "epoch": 4.0, "learning_rate": 0.00029348044132397193, "loss": 2.8632, "theoretical_loss": 3.5416160997620953, "tokens_seen": 1382690816 }, { "epoch": 4.0, "objective/train/docs_used": 2210638, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.982017993927002, "objective/train/theoretical_loss": 3.541601067602052, "objective/train/tokens_used": 1403216352, "theoretical_loss": 3.541601067602052, "tokens_seen": 1382756352 }, { "epoch": 4.0, "learning_rate": 0.0002934704112337011, "loss": 2.8908, "theoretical_loss": 3.541601067602052, "tokens_seen": 1382756352 }, { "epoch": 4.0, "learning_rate": 0.00029346038114343035, "loss": 2.8853, "theoretical_loss": 3.5415860363539196, "tokens_seen": 1382821888 }, { "epoch": 4.0, "learning_rate": 0.0002934503510531595, "loss": 2.7033, "theoretical_loss": 3.5415710060176, "tokens_seen": 1382887424 }, { "epoch": 4.0, "learning_rate": 0.0002934403209628887, "loss": 2.7992, "theoretical_loss": 3.541555976592994, "tokens_seen": 1382952960 }, { "epoch": 4.0, "learning_rate": 0.00029343029087261784, "loss": 2.7197, "theoretical_loss": 3.5415409480800033, "tokens_seen": 1383018496 }, { "epoch": 4.0, "learning_rate": 0.00029342026078234707, "loss": 2.7679, "theoretical_loss": 3.54152592047853, "tokens_seen": 1383084032 }, { "epoch": 4.0, "learning_rate": 0.00029341023069207625, "loss": 2.8135, "theoretical_loss": 3.541510893788475, "tokens_seen": 1383149568 }, { "epoch": 4.0, "learning_rate": 0.00029340020060180543, "loss": 2.7511, "theoretical_loss": 3.54149586800974, "tokens_seen": 1383215104 }, { "epoch": 4.0, "learning_rate": 0.0002933901705115346, "loss": 2.8027, "theoretical_loss": 3.541480843142227, "tokens_seen": 1383280640 }, { "epoch": 4.01, "learning_rate": 0.0002933801404212638, "loss": 2.7138, "theoretical_loss": 3.5414658191858366, "tokens_seen": 1383346176 }, { "epoch": 4.01, "learning_rate": 0.000293370110330993, "loss": 2.5907, "theoretical_loss": 3.541450796140471, "tokens_seen": 1383411712 }, { "epoch": 4.01, "learning_rate": 0.0002933600802407222, "loss": 2.8243, "theoretical_loss": 3.541435774006032, "tokens_seen": 1383477248 }, { "epoch": 4.01, "learning_rate": 0.00029335005015045134, "loss": 2.9759, "theoretical_loss": 3.541420752782421, "tokens_seen": 1383542784 }, { "epoch": 4.01, "learning_rate": 0.0002933400200601806, "loss": 2.8011, "theoretical_loss": 3.5414057324695394, "tokens_seen": 1383608320 }, { "epoch": 4.01, "learning_rate": 0.0002933299899699097, "loss": 2.8462, "theoretical_loss": 3.5413907130672886, "tokens_seen": 1383673856 }, { "epoch": 4.01, "learning_rate": 0.00029331995987963894, "loss": 2.6767, "theoretical_loss": 3.5413756945755717, "tokens_seen": 1383739392 }, { "epoch": 4.01, "learning_rate": 0.0002933099297893681, "loss": 2.8206, "theoretical_loss": 3.5413606769942887, "tokens_seen": 1383804928 }, { "epoch": 4.01, "learning_rate": 0.0002932998996990973, "loss": 2.8448, "theoretical_loss": 3.5413456603233424, "tokens_seen": 1383870464 }, { "epoch": 4.01, "learning_rate": 0.0002932898696088265, "loss": 2.9201, "theoretical_loss": 3.541330644562634, "tokens_seen": 1383936000 }, { "epoch": 4.01, "learning_rate": 0.0002932798395185557, "loss": 2.8651, "theoretical_loss": 3.5413156297120647, "tokens_seen": 1384001536 }, { "epoch": 4.01, "learning_rate": 0.00029326980942828484, "loss": 2.6644, "theoretical_loss": 3.5413006157715374, "tokens_seen": 1384067072 }, { "epoch": 4.01, "learning_rate": 0.0002932597793380141, "loss": 2.8338, "theoretical_loss": 3.5412856027409534, "tokens_seen": 1384132608 }, { "epoch": 4.01, "learning_rate": 0.0002932497492477432, "loss": 2.6252, "theoretical_loss": 3.5412705906202144, "tokens_seen": 1384198144 }, { "epoch": 4.01, "learning_rate": 0.00029323971915747244, "loss": 2.9308, "theoretical_loss": 3.5412555794092224, "tokens_seen": 1384263680 }, { "epoch": 4.01, "learning_rate": 0.0002932296890672016, "loss": 2.8902, "theoretical_loss": 3.5412405691078783, "tokens_seen": 1384329216 }, { "epoch": 4.01, "objective/train/docs_used": 2213544, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9514365196228027, "objective/train/theoretical_loss": 3.541225559716085, "objective/train/tokens_used": 1404854752, "theoretical_loss": 3.541225559716085, "tokens_seen": 1384394752 }, { "epoch": 4.01, "learning_rate": 0.0002932196589769308, "loss": 2.9652, "theoretical_loss": 3.541225559716085, "tokens_seen": 1384394752 }, { "epoch": 4.01, "learning_rate": 0.00029320962888666, "loss": 2.8947, "theoretical_loss": 3.541210551233744, "tokens_seen": 1384460288 }, { "epoch": 4.01, "learning_rate": 0.00029319959879638916, "loss": 2.8794, "theoretical_loss": 3.541195543660757, "tokens_seen": 1384525824 }, { "epoch": 4.01, "learning_rate": 0.00029318956870611835, "loss": 2.826, "theoretical_loss": 3.541180536997026, "tokens_seen": 1384591360 }, { "epoch": 4.01, "learning_rate": 0.0002931795386158476, "loss": 2.767, "theoretical_loss": 3.5411655312424526, "tokens_seen": 1384656896 }, { "epoch": 4.01, "learning_rate": 0.0002931695085255767, "loss": 2.9667, "theoretical_loss": 3.5411505263969394, "tokens_seen": 1384722432 }, { "epoch": 4.01, "learning_rate": 0.00029315947843530594, "loss": 2.8677, "theoretical_loss": 3.541135522460387, "tokens_seen": 1384787968 }, { "epoch": 4.01, "learning_rate": 0.00029314944834503507, "loss": 2.9568, "theoretical_loss": 3.541120519432699, "tokens_seen": 1384853504 }, { "epoch": 4.01, "learning_rate": 0.0002931394182547643, "loss": 2.9234, "theoretical_loss": 3.5411055173137758, "tokens_seen": 1384919040 }, { "epoch": 4.01, "learning_rate": 0.0002931293881644935, "loss": 2.7795, "theoretical_loss": 3.5410905161035204, "tokens_seen": 1384984576 }, { "epoch": 4.01, "learning_rate": 0.00029311935807422267, "loss": 2.7128, "theoretical_loss": 3.5410755158018343, "tokens_seen": 1385050112 }, { "epoch": 4.01, "learning_rate": 0.00029310932798395185, "loss": 2.9299, "theoretical_loss": 3.5410605164086197, "tokens_seen": 1385115648 }, { "epoch": 4.01, "learning_rate": 0.0002930992978936811, "loss": 2.9394, "theoretical_loss": 3.541045517923778, "tokens_seen": 1385181184 }, { "epoch": 4.01, "learning_rate": 0.0002930892678034102, "loss": 2.8347, "theoretical_loss": 3.5410305203472126, "tokens_seen": 1385246720 }, { "epoch": 4.01, "learning_rate": 0.00029307923771313945, "loss": 2.6765, "theoretical_loss": 3.5410155236788237, "tokens_seen": 1385312256 }, { "epoch": 4.01, "learning_rate": 0.0002930692076228686, "loss": 2.7597, "theoretical_loss": 3.541000527918515, "tokens_seen": 1385377792 }, { "epoch": 4.01, "learning_rate": 0.0002930591775325978, "loss": 2.8407, "theoretical_loss": 3.5409855330661877, "tokens_seen": 1385443328 }, { "epoch": 4.01, "learning_rate": 0.000293049147442327, "loss": 2.805, "theoretical_loss": 3.540970539121744, "tokens_seen": 1385508864 }, { "epoch": 4.01, "learning_rate": 0.00029303911735205617, "loss": 2.8514, "theoretical_loss": 3.540955546085086, "tokens_seen": 1385574400 }, { "epoch": 4.01, "learning_rate": 0.00029302908726178535, "loss": 2.8827, "theoretical_loss": 3.5409405539561156, "tokens_seen": 1385639936 }, { "epoch": 4.01, "learning_rate": 0.00029301905717151453, "loss": 2.8893, "theoretical_loss": 3.540925562734735, "tokens_seen": 1385705472 }, { "epoch": 4.01, "learning_rate": 0.0002930090270812437, "loss": 2.8602, "theoretical_loss": 3.540910572420847, "tokens_seen": 1385771008 }, { "epoch": 4.01, "learning_rate": 0.00029299899699097295, "loss": 2.7256, "theoretical_loss": 3.540895583014353, "tokens_seen": 1385836544 }, { "epoch": 4.01, "learning_rate": 0.0002929889669007021, "loss": 3.0421, "theoretical_loss": 3.540880594515155, "tokens_seen": 1385902080 }, { "epoch": 4.01, "learning_rate": 0.0002929789368104313, "loss": 2.8308, "theoretical_loss": 3.5408656069231563, "tokens_seen": 1385967616 }, { "epoch": 4.01, "objective/train/docs_used": 2214935, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8946726322174072, "objective/train/theoretical_loss": 3.540850620238258, "objective/train/tokens_used": 1406493152, "theoretical_loss": 3.540850620238258, "tokens_seen": 1386033152 }, { "epoch": 4.01, "learning_rate": 0.0002929689067201605, "loss": 2.9381, "theoretical_loss": 3.540850620238258, "tokens_seen": 1386033152 }, { "epoch": 4.01, "learning_rate": 0.0002929588766298897, "loss": 2.848, "theoretical_loss": 3.540835634460363, "tokens_seen": 1386098688 }, { "epoch": 4.01, "learning_rate": 0.00029294884653961885, "loss": 2.8273, "theoretical_loss": 3.5408206495893726, "tokens_seen": 1386164224 }, { "epoch": 4.01, "learning_rate": 0.00029293881644934804, "loss": 2.8898, "theoretical_loss": 3.5408056656251903, "tokens_seen": 1386229760 }, { "epoch": 4.01, "learning_rate": 0.0002929287863590772, "loss": 2.8554, "theoretical_loss": 3.5407906825677173, "tokens_seen": 1386295296 }, { "epoch": 4.01, "learning_rate": 0.00029291875626880645, "loss": 2.9821, "theoretical_loss": 3.540775700416856, "tokens_seen": 1386360832 }, { "epoch": 4.01, "learning_rate": 0.0002929087261785356, "loss": 2.7077, "theoretical_loss": 3.5407607191725097, "tokens_seen": 1386426368 }, { "epoch": 4.01, "learning_rate": 0.0002928986960882648, "loss": 2.8975, "theoretical_loss": 3.5407457388345795, "tokens_seen": 1386491904 }, { "epoch": 4.01, "learning_rate": 0.00029288866599799394, "loss": 2.9732, "theoretical_loss": 3.540730759402968, "tokens_seen": 1386557440 }, { "epoch": 4.01, "learning_rate": 0.0002928786359077232, "loss": 2.8303, "theoretical_loss": 3.540715780877578, "tokens_seen": 1386622976 }, { "epoch": 4.01, "learning_rate": 0.00029286860581745236, "loss": 2.8143, "theoretical_loss": 3.5407008032583116, "tokens_seen": 1386688512 }, { "epoch": 4.01, "learning_rate": 0.00029285857572718154, "loss": 2.7759, "theoretical_loss": 3.5406858265450714, "tokens_seen": 1386754048 }, { "epoch": 4.01, "learning_rate": 0.0002928485456369107, "loss": 2.7524, "theoretical_loss": 3.5406708507377593, "tokens_seen": 1386819584 }, { "epoch": 4.01, "learning_rate": 0.0002928385155466399, "loss": 2.8641, "theoretical_loss": 3.5406558758362783, "tokens_seen": 1386885120 }, { "epoch": 4.01, "learning_rate": 0.0002928284854563691, "loss": 2.6819, "theoretical_loss": 3.54064090184053, "tokens_seen": 1386950656 }, { "epoch": 4.01, "learning_rate": 0.0002928184553660983, "loss": 2.9394, "theoretical_loss": 3.540625928750417, "tokens_seen": 1387016192 }, { "epoch": 4.01, "learning_rate": 0.00029280842527582744, "loss": 2.7724, "theoretical_loss": 3.5406109565658426, "tokens_seen": 1387081728 }, { "epoch": 4.01, "learning_rate": 0.0002927983951855567, "loss": 2.9358, "theoretical_loss": 3.540595985286708, "tokens_seen": 1387147264 }, { "epoch": 4.01, "learning_rate": 0.0002927883650952859, "loss": 2.7792, "theoretical_loss": 3.5405810149129167, "tokens_seen": 1387212800 }, { "epoch": 4.01, "learning_rate": 0.00029277833500501504, "loss": 2.9202, "theoretical_loss": 3.5405660454443706, "tokens_seen": 1387278336 }, { "epoch": 4.01, "learning_rate": 0.0002927683049147443, "loss": 2.8228, "theoretical_loss": 3.5405510768809725, "tokens_seen": 1387343872 }, { "epoch": 4.01, "learning_rate": 0.0002927582748244734, "loss": 2.6575, "theoretical_loss": 3.540536109222625, "tokens_seen": 1387409408 }, { "epoch": 4.01, "learning_rate": 0.00029274824473420264, "loss": 2.9763, "theoretical_loss": 3.54052114246923, "tokens_seen": 1387474944 }, { "epoch": 4.01, "learning_rate": 0.0002927382146439318, "loss": 2.9908, "theoretical_loss": 3.540506176620691, "tokens_seen": 1387540480 }, { "epoch": 4.01, "learning_rate": 0.000292728184553661, "loss": 2.9085, "theoretical_loss": 3.54049121167691, "tokens_seen": 1387606016 }, { "epoch": 4.01, "objective/train/docs_used": 2217773, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.792294979095459, "objective/train/theoretical_loss": 3.5404762476377893, "objective/train/tokens_used": 1408131552, "theoretical_loss": 3.5404762476377893, "tokens_seen": 1387671552 }, { "epoch": 4.01, "learning_rate": 0.0002927181544633902, "loss": 2.9129, "theoretical_loss": 3.5404762476377893, "tokens_seen": 1387671552 }, { "epoch": 4.01, "learning_rate": 0.00029270812437311936, "loss": 2.8636, "theoretical_loss": 3.540461284503232, "tokens_seen": 1387737088 }, { "epoch": 4.01, "learning_rate": 0.00029269809428284855, "loss": 2.9906, "theoretical_loss": 3.540446322273141, "tokens_seen": 1387802624 }, { "epoch": 4.01, "learning_rate": 0.0002926880641925778, "loss": 2.71, "theoretical_loss": 3.5404313609474176, "tokens_seen": 1387868160 }, { "epoch": 4.01, "learning_rate": 0.0002926780341023069, "loss": 2.9371, "theoretical_loss": 3.5404164005259657, "tokens_seen": 1387933696 }, { "epoch": 4.01, "learning_rate": 0.00029266800401203614, "loss": 2.8496, "theoretical_loss": 3.5404014410086875, "tokens_seen": 1387999232 }, { "epoch": 4.01, "learning_rate": 0.00029265797392176527, "loss": 2.7576, "theoretical_loss": 3.540386482395485, "tokens_seen": 1388064768 }, { "epoch": 4.01, "learning_rate": 0.0002926479438314945, "loss": 2.7261, "theoretical_loss": 3.540371524686263, "tokens_seen": 1388130304 }, { "epoch": 4.01, "learning_rate": 0.0002926379137412237, "loss": 2.7526, "theoretical_loss": 3.540356567880922, "tokens_seen": 1388195840 }, { "epoch": 4.01, "learning_rate": 0.00029262788365095287, "loss": 2.5652, "theoretical_loss": 3.540341611979365, "tokens_seen": 1388261376 }, { "epoch": 4.01, "learning_rate": 0.00029261785356068205, "loss": 2.8918, "theoretical_loss": 3.540326656981496, "tokens_seen": 1388326912 }, { "epoch": 4.01, "learning_rate": 0.0002926078234704113, "loss": 2.8822, "theoretical_loss": 3.5403117028872164, "tokens_seen": 1388392448 }, { "epoch": 4.01, "learning_rate": 0.0002925977933801404, "loss": 2.8667, "theoretical_loss": 3.54029674969643, "tokens_seen": 1388457984 }, { "epoch": 4.01, "learning_rate": 0.00029258776328986965, "loss": 2.722, "theoretical_loss": 3.5402817974090386, "tokens_seen": 1388523520 }, { "epoch": 4.01, "learning_rate": 0.0002925777331995988, "loss": 2.8804, "theoretical_loss": 3.540266846024946, "tokens_seen": 1388589056 }, { "epoch": 4.01, "learning_rate": 0.000292567703109328, "loss": 2.7583, "theoretical_loss": 3.540251895544054, "tokens_seen": 1388654592 }, { "epoch": 4.01, "learning_rate": 0.0002925576730190572, "loss": 2.7609, "theoretical_loss": 3.540236945966266, "tokens_seen": 1388720128 }, { "epoch": 4.01, "learning_rate": 0.00029254764292878637, "loss": 2.8542, "theoretical_loss": 3.540221997291485, "tokens_seen": 1388785664 }, { "epoch": 4.01, "learning_rate": 0.00029253761283851555, "loss": 2.8195, "theoretical_loss": 3.540207049519613, "tokens_seen": 1388851200 }, { "epoch": 4.01, "learning_rate": 0.00029252758274824473, "loss": 2.9433, "theoretical_loss": 3.540192102650554, "tokens_seen": 1388916736 }, { "epoch": 4.01, "learning_rate": 0.0002925175526579739, "loss": 2.9159, "theoretical_loss": 3.54017715668421, "tokens_seen": 1388982272 }, { "epoch": 4.01, "learning_rate": 0.00029250752256770315, "loss": 2.8224, "theoretical_loss": 3.5401622116204843, "tokens_seen": 1389047808 }, { "epoch": 4.01, "learning_rate": 0.0002924974924774323, "loss": 2.7175, "theoretical_loss": 3.5401472674592798, "tokens_seen": 1389113344 }, { "epoch": 4.01, "learning_rate": 0.0002924874623871615, "loss": 2.737, "theoretical_loss": 3.540132324200499, "tokens_seen": 1389178880 }, { "epoch": 4.01, "learning_rate": 0.0002924774322968907, "loss": 2.8189, "theoretical_loss": 3.5401173818440457, "tokens_seen": 1389244416 }, { "epoch": 4.01, "objective/train/docs_used": 2220338, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6557087898254395, "objective/train/theoretical_loss": 3.5401024403898216, "objective/train/tokens_used": 1409769952, "theoretical_loss": 3.5401024403898216, "tokens_seen": 1389309952 }, { "epoch": 4.01, "learning_rate": 0.0002924674022066199, "loss": 2.7307, "theoretical_loss": 3.5401024403898216, "tokens_seen": 1389309952 }, { "epoch": 4.01, "learning_rate": 0.00029245737211634906, "loss": 2.8309, "theoretical_loss": 3.540087499837731, "tokens_seen": 1389375488 }, { "epoch": 4.01, "learning_rate": 0.00029244734202607824, "loss": 2.8332, "theoretical_loss": 3.540072560187676, "tokens_seen": 1389441024 }, { "epoch": 4.01, "learning_rate": 0.0002924373119358074, "loss": 2.9498, "theoretical_loss": 3.54005762143956, "tokens_seen": 1389506560 }, { "epoch": 4.01, "learning_rate": 0.00029242728184553665, "loss": 2.8773, "theoretical_loss": 3.5400426835932857, "tokens_seen": 1389572096 }, { "epoch": 4.01, "learning_rate": 0.0002924172517552658, "loss": 2.607, "theoretical_loss": 3.5400277466487564, "tokens_seen": 1389637632 }, { "epoch": 4.01, "learning_rate": 0.000292407221664995, "loss": 2.6105, "theoretical_loss": 3.540012810605875, "tokens_seen": 1389703168 }, { "epoch": 4.01, "learning_rate": 0.00029239719157472414, "loss": 2.6736, "theoretical_loss": 3.539997875464544, "tokens_seen": 1389768704 }, { "epoch": 4.01, "learning_rate": 0.0002923871614844534, "loss": 2.7766, "theoretical_loss": 3.539982941224668, "tokens_seen": 1389834240 }, { "epoch": 4.01, "learning_rate": 0.00029237713139418256, "loss": 2.8401, "theoretical_loss": 3.5399680078861486, "tokens_seen": 1389899776 }, { "epoch": 4.01, "learning_rate": 0.00029236710130391174, "loss": 2.873, "theoretical_loss": 3.5399530754488895, "tokens_seen": 1389965312 }, { "epoch": 4.01, "learning_rate": 0.0002923570712136409, "loss": 2.8199, "theoretical_loss": 3.539938143912794, "tokens_seen": 1390030848 }, { "epoch": 4.01, "learning_rate": 0.0002923470411233701, "loss": 2.6863, "theoretical_loss": 3.539923213277765, "tokens_seen": 1390096384 }, { "epoch": 4.01, "learning_rate": 0.0002923370110330993, "loss": 2.7458, "theoretical_loss": 3.5399082835437055, "tokens_seen": 1390161920 }, { "epoch": 4.01, "learning_rate": 0.0002923269809428285, "loss": 2.8295, "theoretical_loss": 3.539893354710519, "tokens_seen": 1390227456 }, { "epoch": 4.01, "learning_rate": 0.00029231695085255765, "loss": 2.7702, "theoretical_loss": 3.5398784267781083, "tokens_seen": 1390292992 }, { "epoch": 4.01, "learning_rate": 0.0002923069207622869, "loss": 2.8144, "theoretical_loss": 3.5398634997463767, "tokens_seen": 1390358528 }, { "epoch": 4.01, "learning_rate": 0.00029229689067201606, "loss": 2.7565, "theoretical_loss": 3.539848573615228, "tokens_seen": 1390424064 }, { "epoch": 4.01, "learning_rate": 0.00029228686058174524, "loss": 3.0365, "theoretical_loss": 3.5398336483845645, "tokens_seen": 1390489600 }, { "epoch": 4.01, "learning_rate": 0.0002922768304914744, "loss": 2.9548, "theoretical_loss": 3.5398187240542898, "tokens_seen": 1390555136 }, { "epoch": 4.01, "learning_rate": 0.0002922668004012036, "loss": 2.7356, "theoretical_loss": 3.5398038006243073, "tokens_seen": 1390620672 }, { "epoch": 4.01, "learning_rate": 0.0002922567703109328, "loss": 2.9126, "theoretical_loss": 3.5397888780945204, "tokens_seen": 1390686208 }, { "epoch": 4.01, "learning_rate": 0.000292246740220662, "loss": 2.8318, "theoretical_loss": 3.539773956464832, "tokens_seen": 1390751744 }, { "epoch": 4.01, "learning_rate": 0.00029223671013039115, "loss": 2.9317, "theoretical_loss": 3.539759035735145, "tokens_seen": 1390817280 }, { "epoch": 4.01, "learning_rate": 0.0002922266800401204, "loss": 2.9535, "theoretical_loss": 3.539744115905364, "tokens_seen": 1390882816 }, { "epoch": 4.01, "objective/train/docs_used": 2223119, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9960737228393555, "objective/train/theoretical_loss": 3.539729196975391, "objective/train/tokens_used": 1411408352, "theoretical_loss": 3.539729196975391, "tokens_seen": 1390948352 }, { "epoch": 4.01, "learning_rate": 0.0002922166499498495, "loss": 2.8152, "theoretical_loss": 3.539729196975391, "tokens_seen": 1390948352 }, { "epoch": 4.01, "learning_rate": 0.00029220661985957875, "loss": 2.9229, "theoretical_loss": 3.53971427894513, "tokens_seen": 1391013888 }, { "epoch": 4.01, "learning_rate": 0.0002921965897693079, "loss": 2.9097, "theoretical_loss": 3.5396993618144847, "tokens_seen": 1391079424 }, { "epoch": 4.01, "learning_rate": 0.0002921865596790371, "loss": 2.927, "theoretical_loss": 3.539684445583358, "tokens_seen": 1391144960 }, { "epoch": 4.01, "learning_rate": 0.0002921765295887663, "loss": 2.8365, "theoretical_loss": 3.539669530251653, "tokens_seen": 1391210496 }, { "epoch": 4.01, "learning_rate": 0.00029216649949849547, "loss": 2.9421, "theoretical_loss": 3.5396546158192734, "tokens_seen": 1391276032 }, { "epoch": 4.01, "learning_rate": 0.00029215646940822465, "loss": 2.9009, "theoretical_loss": 3.5396397022861237, "tokens_seen": 1391341568 }, { "epoch": 4.01, "learning_rate": 0.0002921464393179539, "loss": 2.6919, "theoretical_loss": 3.5396247896521054, "tokens_seen": 1391407104 }, { "epoch": 4.01, "learning_rate": 0.000292136409227683, "loss": 2.8076, "theoretical_loss": 3.5396098779171226, "tokens_seen": 1391472640 }, { "epoch": 4.01, "learning_rate": 0.00029212637913741225, "loss": 2.9921, "theoretical_loss": 3.5395949670810793, "tokens_seen": 1391538176 }, { "epoch": 4.01, "learning_rate": 0.00029211634904714143, "loss": 2.759, "theoretical_loss": 3.539580057143879, "tokens_seen": 1391603712 }, { "epoch": 4.01, "learning_rate": 0.0002921063189568706, "loss": 2.6931, "theoretical_loss": 3.5395651481054244, "tokens_seen": 1391669248 }, { "epoch": 4.01, "learning_rate": 0.0002920962888665998, "loss": 2.6172, "theoretical_loss": 3.539550239965619, "tokens_seen": 1391734784 }, { "epoch": 4.01, "learning_rate": 0.000292086258776329, "loss": 2.8204, "theoretical_loss": 3.539535332724368, "tokens_seen": 1391800320 }, { "epoch": 4.01, "learning_rate": 0.00029207622868605815, "loss": 2.968, "theoretical_loss": 3.539520426381573, "tokens_seen": 1391865856 }, { "epoch": 4.01, "learning_rate": 0.0002920661985957874, "loss": 2.7746, "theoretical_loss": 3.539505520937138, "tokens_seen": 1391931392 }, { "epoch": 4.01, "learning_rate": 0.0002920561685055165, "loss": 2.8842, "theoretical_loss": 3.5394906163909674, "tokens_seen": 1391996928 }, { "epoch": 4.01, "learning_rate": 0.00029204613841524575, "loss": 2.9114, "theoretical_loss": 3.5394757127429637, "tokens_seen": 1392062464 }, { "epoch": 4.01, "learning_rate": 0.00029203610832497493, "loss": 2.7826, "theoretical_loss": 3.5394608099930314, "tokens_seen": 1392128000 }, { "epoch": 4.01, "learning_rate": 0.0002920260782347041, "loss": 2.7994, "theoretical_loss": 3.539445908141074, "tokens_seen": 1392193536 }, { "epoch": 4.01, "learning_rate": 0.00029201604814443335, "loss": 2.9248, "theoretical_loss": 3.539431007186994, "tokens_seen": 1392259072 }, { "epoch": 4.01, "learning_rate": 0.0002920060180541625, "loss": 2.7009, "theoretical_loss": 3.5394161071306964, "tokens_seen": 1392324608 }, { "epoch": 4.01, "learning_rate": 0.0002919959879638917, "loss": 3.0544, "theoretical_loss": 3.5394012079720847, "tokens_seen": 1392390144 }, { "epoch": 4.01, "learning_rate": 0.0002919859578736209, "loss": 2.8494, "theoretical_loss": 3.5393863097110616, "tokens_seen": 1392455680 }, { "epoch": 4.01, "learning_rate": 0.0002919759277833501, "loss": 2.816, "theoretical_loss": 3.5393714123475313, "tokens_seen": 1392521216 }, { "epoch": 4.01, "objective/train/docs_used": 2225839, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5613856315612793, "objective/train/theoretical_loss": 3.539356515881398, "objective/train/tokens_used": 1413046752, "theoretical_loss": 3.539356515881398, "tokens_seen": 1392586752 }, { "epoch": 4.01, "learning_rate": 0.00029196589769307926, "loss": 2.6381, "theoretical_loss": 3.539356515881398, "tokens_seen": 1392586752 }, { "epoch": 4.01, "learning_rate": 0.00029195586760280844, "loss": 2.7732, "theoretical_loss": 3.539341620312565, "tokens_seen": 1392652288 }, { "epoch": 4.01, "learning_rate": 0.0002919458375125376, "loss": 2.7954, "theoretical_loss": 3.5393267256409358, "tokens_seen": 1392717824 }, { "epoch": 4.01, "learning_rate": 0.00029193580742226685, "loss": 2.8395, "theoretical_loss": 3.5393118318664145, "tokens_seen": 1392783360 }, { "epoch": 4.01, "learning_rate": 0.000291925777331996, "loss": 2.9031, "theoretical_loss": 3.5392969389889046, "tokens_seen": 1392848896 }, { "epoch": 4.01, "learning_rate": 0.0002919157472417252, "loss": 2.9574, "theoretical_loss": 3.53928204700831, "tokens_seen": 1392914432 }, { "epoch": 4.01, "learning_rate": 0.00029190571715145434, "loss": 2.8458, "theoretical_loss": 3.5392671559245352, "tokens_seen": 1392979968 }, { "epoch": 4.01, "learning_rate": 0.0002918956870611836, "loss": 2.7342, "theoretical_loss": 3.539252265737482, "tokens_seen": 1393045504 }, { "epoch": 4.01, "learning_rate": 0.00029188565697091276, "loss": 2.6037, "theoretical_loss": 3.5392373764470566, "tokens_seen": 1393111040 }, { "epoch": 4.01, "learning_rate": 0.00029187562688064194, "loss": 2.7791, "theoretical_loss": 3.5392224880531615, "tokens_seen": 1393176576 }, { "epoch": 4.01, "learning_rate": 0.0002918655967903711, "loss": 2.9233, "theoretical_loss": 3.5392076005557005, "tokens_seen": 1393242112 }, { "epoch": 4.01, "learning_rate": 0.0002918555667001003, "loss": 2.7458, "theoretical_loss": 3.539192713954578, "tokens_seen": 1393307648 }, { "epoch": 4.01, "learning_rate": 0.0002918455366098295, "loss": 2.8284, "theoretical_loss": 3.5391778282496977, "tokens_seen": 1393373184 }, { "epoch": 4.01, "learning_rate": 0.0002918355065195587, "loss": 2.8562, "theoretical_loss": 3.539162943440963, "tokens_seen": 1393438720 }, { "epoch": 4.01, "learning_rate": 0.00029182547642928785, "loss": 2.8526, "theoretical_loss": 3.5391480595282787, "tokens_seen": 1393504256 }, { "epoch": 4.01, "learning_rate": 0.0002918154463390171, "loss": 2.799, "theoretical_loss": 3.539133176511548, "tokens_seen": 1393569792 }, { "epoch": 4.01, "learning_rate": 0.00029180541624874626, "loss": 2.7694, "theoretical_loss": 3.539118294390675, "tokens_seen": 1393635328 }, { "epoch": 4.01, "learning_rate": 0.00029179538615847544, "loss": 2.5988, "theoretical_loss": 3.5391034131655643, "tokens_seen": 1393700864 }, { "epoch": 4.01, "learning_rate": 0.0002917853560682046, "loss": 2.7843, "theoretical_loss": 3.539088532836119, "tokens_seen": 1393766400 }, { "epoch": 4.01, "learning_rate": 0.0002917753259779338, "loss": 2.9551, "theoretical_loss": 3.539073653402243, "tokens_seen": 1393831936 }, { "epoch": 4.01, "learning_rate": 0.000291765295887663, "loss": 2.9527, "theoretical_loss": 3.5390587748638414, "tokens_seen": 1393897472 }, { "epoch": 4.01, "learning_rate": 0.0002917552657973922, "loss": 2.83, "theoretical_loss": 3.5390438972208167, "tokens_seen": 1393963008 }, { "epoch": 4.01, "learning_rate": 0.00029174523570712135, "loss": 2.7873, "theoretical_loss": 3.539029020473074, "tokens_seen": 1394028544 }, { "epoch": 4.01, "learning_rate": 0.0002917352056168506, "loss": 2.7791, "theoretical_loss": 3.5390141446205177, "tokens_seen": 1394094080 }, { "epoch": 4.01, "learning_rate": 0.0002917251755265797, "loss": 2.8396, "theoretical_loss": 3.5389992696630506, "tokens_seen": 1394159616 }, { "epoch": 4.01, "objective/train/docs_used": 2228465, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1048030853271484, "objective/train/theoretical_loss": 3.5389843956005773, "objective/train/tokens_used": 1414685152, "theoretical_loss": 3.5389843956005773, "tokens_seen": 1394225152 }, { "epoch": 4.01, "learning_rate": 0.00029171514543630895, "loss": 2.8191, "theoretical_loss": 3.5389843956005773, "tokens_seen": 1394225152 }, { "epoch": 4.01, "learning_rate": 0.0002917051153460381, "loss": 2.885, "theoretical_loss": 3.5389695224330024, "tokens_seen": 1394290688 }, { "epoch": 4.01, "learning_rate": 0.0002916950852557673, "loss": 2.8193, "theoretical_loss": 3.538954650160229, "tokens_seen": 1394356224 }, { "epoch": 4.01, "learning_rate": 0.0002916850551654965, "loss": 2.7415, "theoretical_loss": 3.5389397787821624, "tokens_seen": 1394421760 }, { "epoch": 4.01, "learning_rate": 0.00029167502507522567, "loss": 2.7861, "theoretical_loss": 3.5389249082987058, "tokens_seen": 1394487296 }, { "epoch": 4.01, "learning_rate": 0.00029166499498495485, "loss": 2.7541, "theoretical_loss": 3.538910038709764, "tokens_seen": 1394552832 }, { "epoch": 4.01, "learning_rate": 0.0002916549648946841, "loss": 2.6574, "theoretical_loss": 3.5388951700152402, "tokens_seen": 1394618368 }, { "epoch": 4.01, "learning_rate": 0.0002916449348044132, "loss": 2.7274, "theoretical_loss": 3.5388803022150395, "tokens_seen": 1394683904 }, { "epoch": 4.01, "learning_rate": 0.00029163490471414245, "loss": 2.7971, "theoretical_loss": 3.5388654353090656, "tokens_seen": 1394749440 }, { "epoch": 4.01, "learning_rate": 0.00029162487462387163, "loss": 2.8638, "theoretical_loss": 3.538850569297223, "tokens_seen": 1394814976 }, { "epoch": 4.01, "learning_rate": 0.0002916148445336008, "loss": 2.9482, "theoretical_loss": 3.538835704179416, "tokens_seen": 1394880512 }, { "epoch": 4.01, "learning_rate": 0.00029160481444333, "loss": 2.8198, "theoretical_loss": 3.5388208399555485, "tokens_seen": 1394946048 }, { "epoch": 4.01, "learning_rate": 0.0002915947843530592, "loss": 2.7364, "theoretical_loss": 3.538805976625525, "tokens_seen": 1395011584 }, { "epoch": 4.01, "learning_rate": 0.00029158475426278835, "loss": 2.8717, "theoretical_loss": 3.5387911141892494, "tokens_seen": 1395077120 }, { "epoch": 4.01, "learning_rate": 0.0002915747241725176, "loss": 2.8386, "theoretical_loss": 3.5387762526466267, "tokens_seen": 1395142656 }, { "epoch": 4.01, "learning_rate": 0.0002915646940822467, "loss": 2.892, "theoretical_loss": 3.53876139199756, "tokens_seen": 1395208192 }, { "epoch": 4.01, "learning_rate": 0.00029155466399197595, "loss": 2.7973, "theoretical_loss": 3.5387465322419547, "tokens_seen": 1395273728 }, { "epoch": 4.01, "learning_rate": 0.0002915446339017051, "loss": 2.8027, "theoretical_loss": 3.5387316733797145, "tokens_seen": 1395339264 }, { "epoch": 4.01, "learning_rate": 0.0002915346038114343, "loss": 2.7417, "theoretical_loss": 3.5387168154107442, "tokens_seen": 1395404800 }, { "epoch": 4.01, "learning_rate": 0.0002915245737211635, "loss": 2.8626, "theoretical_loss": 3.538701958334948, "tokens_seen": 1395470336 }, { "epoch": 4.01, "learning_rate": 0.0002915145436308927, "loss": 2.7228, "theoretical_loss": 3.5386871021522297, "tokens_seen": 1395535872 }, { "epoch": 4.01, "learning_rate": 0.00029150451354062186, "loss": 2.7908, "theoretical_loss": 3.5386722468624945, "tokens_seen": 1395601408 }, { "epoch": 4.01, "learning_rate": 0.0002914944834503511, "loss": 2.8176, "theoretical_loss": 3.5386573924656464, "tokens_seen": 1395666944 }, { "epoch": 4.01, "learning_rate": 0.0002914844533600802, "loss": 2.7408, "theoretical_loss": 3.5386425389615903, "tokens_seen": 1395732480 }, { "epoch": 4.01, "learning_rate": 0.00029147442326980946, "loss": 2.9384, "theoretical_loss": 3.5386276863502295, "tokens_seen": 1395798016 }, { "epoch": 4.01, "objective/train/docs_used": 2229796, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.001620292663574, "objective/train/theoretical_loss": 3.5386128346314694, "objective/train/tokens_used": 1416323552, "theoretical_loss": 3.5386128346314694, "tokens_seen": 1395863552 }, { "epoch": 4.01, "learning_rate": 0.0002914643931795386, "loss": 2.8645, "theoretical_loss": 3.5386128346314694, "tokens_seen": 1395863552 }, { "epoch": 4.01, "learning_rate": 0.0002914543630892678, "loss": 2.9259, "theoretical_loss": 3.5385979838052144, "tokens_seen": 1395929088 }, { "epoch": 4.01, "learning_rate": 0.000291444332998997, "loss": 2.7608, "theoretical_loss": 3.5385831338713682, "tokens_seen": 1395994624 }, { "epoch": 4.01, "learning_rate": 0.0002914343029087262, "loss": 2.8191, "theoretical_loss": 3.5385682848298363, "tokens_seen": 1396060160 }, { "epoch": 4.01, "learning_rate": 0.00029142427281845536, "loss": 2.7759, "theoretical_loss": 3.5385534366805227, "tokens_seen": 1396125696 }, { "epoch": 4.01, "learning_rate": 0.00029141424272818454, "loss": 2.9095, "theoretical_loss": 3.538538589423332, "tokens_seen": 1396191232 }, { "epoch": 4.01, "learning_rate": 0.0002914042126379137, "loss": 2.9278, "theoretical_loss": 3.5385237430581684, "tokens_seen": 1396256768 }, { "epoch": 4.01, "learning_rate": 0.00029139418254764296, "loss": 2.7719, "theoretical_loss": 3.5385088975849373, "tokens_seen": 1396322304 }, { "epoch": 4.01, "learning_rate": 0.0002913841524573721, "loss": 2.8811, "theoretical_loss": 3.538494053003542, "tokens_seen": 1396387840 }, { "epoch": 4.01, "learning_rate": 0.0002913741223671013, "loss": 2.8774, "theoretical_loss": 3.5384792093138877, "tokens_seen": 1396453376 }, { "epoch": 4.01, "learning_rate": 0.00029136409227683045, "loss": 2.8451, "theoretical_loss": 3.5384643665158793, "tokens_seen": 1396518912 }, { "epoch": 4.01, "learning_rate": 0.0002913540621865597, "loss": 2.9325, "theoretical_loss": 3.5384495246094207, "tokens_seen": 1396584448 }, { "epoch": 4.01, "learning_rate": 0.00029134403209628886, "loss": 2.8831, "theoretical_loss": 3.5384346835944176, "tokens_seen": 1396649984 }, { "epoch": 4.01, "learning_rate": 0.00029133400200601805, "loss": 2.8005, "theoretical_loss": 3.538419843470774, "tokens_seen": 1396715520 }, { "epoch": 4.01, "learning_rate": 0.0002913239719157472, "loss": 2.8486, "theoretical_loss": 3.5384050042383937, "tokens_seen": 1396781056 }, { "epoch": 4.01, "learning_rate": 0.00029131394182547646, "loss": 2.7685, "theoretical_loss": 3.538390165897183, "tokens_seen": 1396846592 }, { "epoch": 4.01, "learning_rate": 0.0002913039117352056, "loss": 2.7998, "theoretical_loss": 3.5383753284470454, "tokens_seen": 1396912128 }, { "epoch": 4.01, "learning_rate": 0.0002912938816449348, "loss": 2.7979, "theoretical_loss": 3.538360491887886, "tokens_seen": 1396977664 }, { "epoch": 4.01, "learning_rate": 0.000291283851554664, "loss": 2.9738, "theoretical_loss": 3.5383456562196094, "tokens_seen": 1397043200 }, { "epoch": 4.01, "learning_rate": 0.0002912738214643932, "loss": 2.8792, "theoretical_loss": 3.5383308214421203, "tokens_seen": 1397108736 }, { "epoch": 4.01, "learning_rate": 0.0002912637913741224, "loss": 2.8865, "theoretical_loss": 3.5383159875553236, "tokens_seen": 1397174272 }, { "epoch": 4.01, "learning_rate": 0.00029125376128385155, "loss": 2.9825, "theoretical_loss": 3.538301154559124, "tokens_seen": 1397239808 }, { "epoch": 4.01, "learning_rate": 0.0002912437311935808, "loss": 2.8231, "theoretical_loss": 3.538286322453426, "tokens_seen": 1397305344 }, { "epoch": 4.01, "learning_rate": 0.0002912337011033099, "loss": 2.9478, "theoretical_loss": 3.5382714912381346, "tokens_seen": 1397370880 }, { "epoch": 4.01, "learning_rate": 0.00029122367101303915, "loss": 2.8181, "theoretical_loss": 3.5382566609131545, "tokens_seen": 1397436416 }, { "epoch": 4.01, "objective/train/docs_used": 2232784, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7363498210906982, "objective/train/theoretical_loss": 3.5382418314783903, "objective/train/tokens_used": 1417961952, "theoretical_loss": 3.5382418314783903, "tokens_seen": 1397501952 }, { "epoch": 4.01, "learning_rate": 0.00029121364092276833, "loss": 2.9439, "theoretical_loss": 3.5382418314783903, "tokens_seen": 1397501952 }, { "epoch": 4.01, "learning_rate": 0.0002912036108324975, "loss": 2.7419, "theoretical_loss": 3.5382270029337475, "tokens_seen": 1397567488 }, { "epoch": 4.01, "learning_rate": 0.0002911935807422267, "loss": 2.8537, "theoretical_loss": 3.5382121752791305, "tokens_seen": 1397633024 }, { "epoch": 4.01, "learning_rate": 0.00029118355065195587, "loss": 2.7499, "theoretical_loss": 3.5381973485144442, "tokens_seen": 1397698560 }, { "epoch": 4.01, "learning_rate": 0.00029117352056168505, "loss": 2.8034, "theoretical_loss": 3.5381825226395938, "tokens_seen": 1397764096 }, { "epoch": 4.01, "learning_rate": 0.0002911634904714143, "loss": 2.7316, "theoretical_loss": 3.538167697654483, "tokens_seen": 1397829632 }, { "epoch": 4.01, "learning_rate": 0.0002911534603811434, "loss": 2.7184, "theoretical_loss": 3.538152873559018, "tokens_seen": 1397895168 }, { "epoch": 4.01, "learning_rate": 0.00029114343029087265, "loss": 2.8313, "theoretical_loss": 3.538138050353103, "tokens_seen": 1397960704 }, { "epoch": 4.01, "learning_rate": 0.00029113340020060183, "loss": 2.7043, "theoretical_loss": 3.5381232280366435, "tokens_seen": 1398026240 }, { "epoch": 4.01, "learning_rate": 0.000291123370110331, "loss": 2.7872, "theoretical_loss": 3.5381084066095436, "tokens_seen": 1398091776 }, { "epoch": 4.01, "learning_rate": 0.0002911133400200602, "loss": 2.5918, "theoretical_loss": 3.538093586071709, "tokens_seen": 1398157312 }, { "epoch": 4.01, "learning_rate": 0.0002911033099297894, "loss": 2.7172, "theoretical_loss": 3.5380787664230446, "tokens_seen": 1398222848 }, { "epoch": 4.01, "learning_rate": 0.00029109327983951855, "loss": 2.7942, "theoretical_loss": 3.5380639476634546, "tokens_seen": 1398288384 }, { "epoch": 4.01, "learning_rate": 0.0002910832497492478, "loss": 2.6887, "theoretical_loss": 3.538049129792845, "tokens_seen": 1398353920 }, { "epoch": 4.01, "learning_rate": 0.0002910732196589769, "loss": 2.8669, "theoretical_loss": 3.5380343128111202, "tokens_seen": 1398419456 }, { "epoch": 4.01, "learning_rate": 0.00029106318956870615, "loss": 2.7888, "theoretical_loss": 3.5380194967181855, "tokens_seen": 1398484992 }, { "epoch": 4.01, "learning_rate": 0.0002910531594784353, "loss": 2.7274, "theoretical_loss": 3.5380046815139456, "tokens_seen": 1398550528 }, { "epoch": 4.01, "learning_rate": 0.0002910431293881645, "loss": 2.8105, "theoretical_loss": 3.5379898671983065, "tokens_seen": 1398616064 }, { "epoch": 4.01, "learning_rate": 0.0002910330992978937, "loss": 2.6997, "theoretical_loss": 3.537975053771172, "tokens_seen": 1398681600 }, { "epoch": 4.01, "learning_rate": 0.0002910230692076229, "loss": 2.9154, "theoretical_loss": 3.537960241232448, "tokens_seen": 1398747136 }, { "epoch": 4.01, "learning_rate": 0.00029101303911735206, "loss": 2.9072, "theoretical_loss": 3.537945429582039, "tokens_seen": 1398812672 }, { "epoch": 4.01, "learning_rate": 0.0002910030090270813, "loss": 2.685, "theoretical_loss": 3.5379306188198507, "tokens_seen": 1398878208 }, { "epoch": 4.01, "learning_rate": 0.0002909929789368104, "loss": 3.0675, "theoretical_loss": 3.5379158089457876, "tokens_seen": 1398943744 }, { "epoch": 4.01, "learning_rate": 0.00029098294884653966, "loss": 2.9681, "theoretical_loss": 3.5379009999597555, "tokens_seen": 1399009280 }, { "epoch": 4.01, "learning_rate": 0.0002909729187562688, "loss": 2.8822, "theoretical_loss": 3.5378861918616593, "tokens_seen": 1399074816 }, { "epoch": 4.01, "objective/train/docs_used": 2235644, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.674912691116333, "objective/train/theoretical_loss": 3.5378713846514045, "objective/train/tokens_used": 1419600352, "theoretical_loss": 3.5378713846514045, "tokens_seen": 1399140352 }, { "epoch": 4.01, "learning_rate": 0.000290962888665998, "loss": 2.7757, "theoretical_loss": 3.5378713846514045, "tokens_seen": 1399140352 }, { "epoch": 4.01, "learning_rate": 0.0002909528585757272, "loss": 2.7919, "theoretical_loss": 3.5378565783288956, "tokens_seen": 1399205888 }, { "epoch": 4.01, "learning_rate": 0.0002909428284854564, "loss": 2.6499, "theoretical_loss": 3.537841772894038, "tokens_seen": 1399271424 }, { "epoch": 4.01, "learning_rate": 0.00029093279839518556, "loss": 2.8305, "theoretical_loss": 3.5378269683467374, "tokens_seen": 1399336960 }, { "epoch": 4.01, "learning_rate": 0.00029092276830491474, "loss": 2.8868, "theoretical_loss": 3.5378121646868985, "tokens_seen": 1399402496 }, { "epoch": 4.01, "learning_rate": 0.0002909127382146439, "loss": 2.8022, "theoretical_loss": 3.5377973619144267, "tokens_seen": 1399468032 }, { "epoch": 4.01, "learning_rate": 0.00029090270812437316, "loss": 3.0074, "theoretical_loss": 3.5377825600292274, "tokens_seen": 1399533568 }, { "epoch": 4.01, "learning_rate": 0.0002908926780341023, "loss": 2.8259, "theoretical_loss": 3.5377677590312056, "tokens_seen": 1399599104 }, { "epoch": 4.01, "learning_rate": 0.0002908826479438315, "loss": 2.8885, "theoretical_loss": 3.537752958920267, "tokens_seen": 1399664640 }, { "epoch": 4.01, "learning_rate": 0.00029087261785356065, "loss": 3.0091, "theoretical_loss": 3.5377381596963167, "tokens_seen": 1399730176 }, { "epoch": 4.01, "learning_rate": 0.0002908625877632899, "loss": 2.8883, "theoretical_loss": 3.5377233613592596, "tokens_seen": 1399795712 }, { "epoch": 4.01, "learning_rate": 0.00029085255767301906, "loss": 2.8899, "theoretical_loss": 3.5377085639090016, "tokens_seen": 1399861248 }, { "epoch": 4.01, "learning_rate": 0.00029084252758274825, "loss": 2.868, "theoretical_loss": 3.5376937673454476, "tokens_seen": 1399926784 }, { "epoch": 4.01, "learning_rate": 0.0002908324974924774, "loss": 2.7506, "theoretical_loss": 3.5376789716685035, "tokens_seen": 1399992320 }, { "epoch": 4.01, "learning_rate": 0.00029082246740220666, "loss": 2.6721, "theoretical_loss": 3.5376641768780743, "tokens_seen": 1400057856 }, { "epoch": 4.01, "learning_rate": 0.0002908124373119358, "loss": 2.6752, "theoretical_loss": 3.537649382974066, "tokens_seen": 1400123392 }, { "epoch": 4.01, "learning_rate": 0.000290802407221665, "loss": 2.6965, "theoretical_loss": 3.5376345899563826, "tokens_seen": 1400188928 }, { "epoch": 4.01, "learning_rate": 0.00029079237713139415, "loss": 2.9194, "theoretical_loss": 3.5376197978249304, "tokens_seen": 1400254464 }, { "epoch": 4.01, "learning_rate": 0.0002907823470411234, "loss": 2.6873, "theoretical_loss": 3.537605006579615, "tokens_seen": 1400320000 }, { "epoch": 4.01, "learning_rate": 0.00029077231695085257, "loss": 2.8903, "theoretical_loss": 3.537590216220342, "tokens_seen": 1400385536 }, { "epoch": 4.01, "learning_rate": 0.00029076228686058175, "loss": 2.7346, "theoretical_loss": 3.5375754267470163, "tokens_seen": 1400451072 }, { "epoch": 4.01, "learning_rate": 0.00029075225677031093, "loss": 2.8468, "theoretical_loss": 3.537560638159544, "tokens_seen": 1400516608 }, { "epoch": 4.01, "learning_rate": 0.0002907422266800401, "loss": 2.8229, "theoretical_loss": 3.537545850457829, "tokens_seen": 1400582144 }, { "epoch": 4.01, "learning_rate": 0.0002907321965897693, "loss": 2.9674, "theoretical_loss": 3.5375310636417794, "tokens_seen": 1400647680 }, { "epoch": 4.01, "learning_rate": 0.00029072216649949853, "loss": 2.929, "theoretical_loss": 3.5375162777112985, "tokens_seen": 1400713216 }, { "epoch": 4.01, "objective/train/docs_used": 2238363, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.60848069190979, "objective/train/theoretical_loss": 3.537501492666293, "objective/train/tokens_used": 1421238752, "theoretical_loss": 3.537501492666293, "tokens_seen": 1400778752 }, { "epoch": 4.01, "learning_rate": 0.00029071213640922765, "loss": 2.8403, "theoretical_loss": 3.537501492666293, "tokens_seen": 1400778752 }, { "epoch": 4.01, "learning_rate": 0.0002907021063189569, "loss": 2.9379, "theoretical_loss": 3.537486708506668, "tokens_seen": 1400844288 }, { "epoch": 4.01, "learning_rate": 0.000290692076228686, "loss": 2.7811, "theoretical_loss": 3.537471925232329, "tokens_seen": 1400909824 }, { "epoch": 4.01, "learning_rate": 0.00029068204613841525, "loss": 2.7741, "theoretical_loss": 3.5374571428431816, "tokens_seen": 1400975360 }, { "epoch": 4.01, "learning_rate": 0.00029067201604814443, "loss": 2.8012, "theoretical_loss": 3.5374423613391324, "tokens_seen": 1401040896 }, { "epoch": 4.01, "learning_rate": 0.0002906619859578736, "loss": 2.7683, "theoretical_loss": 3.5374275807200855, "tokens_seen": 1401106432 }, { "epoch": 4.01, "learning_rate": 0.0002906519558676028, "loss": 2.9987, "theoretical_loss": 3.537412800985947, "tokens_seen": 1401171968 }, { "epoch": 4.01, "learning_rate": 0.00029064192577733203, "loss": 2.7542, "theoretical_loss": 3.537398022136623, "tokens_seen": 1401237504 }, { "epoch": 4.01, "learning_rate": 0.00029063189568706116, "loss": 2.8816, "theoretical_loss": 3.5373832441720188, "tokens_seen": 1401303040 }, { "epoch": 4.01, "learning_rate": 0.0002906218655967904, "loss": 2.8976, "theoretical_loss": 3.53736846709204, "tokens_seen": 1401368576 }, { "epoch": 4.01, "learning_rate": 0.0002906118355065195, "loss": 3.0079, "theoretical_loss": 3.5373536908965924, "tokens_seen": 1401434112 }, { "epoch": 4.01, "learning_rate": 0.00029060180541624876, "loss": 2.7066, "theoretical_loss": 3.5373389155855817, "tokens_seen": 1401499648 }, { "epoch": 4.01, "learning_rate": 0.00029059177532597794, "loss": 2.9719, "theoretical_loss": 3.537324141158914, "tokens_seen": 1401565184 }, { "epoch": 4.01, "learning_rate": 0.0002905817452357071, "loss": 2.9047, "theoretical_loss": 3.537309367616494, "tokens_seen": 1401630720 }, { "epoch": 4.01, "learning_rate": 0.0002905717151454363, "loss": 2.8361, "theoretical_loss": 3.537294594958228, "tokens_seen": 1401696256 }, { "epoch": 4.01, "learning_rate": 0.0002905616850551655, "loss": 2.7541, "theoretical_loss": 3.537279823184022, "tokens_seen": 1401761792 }, { "epoch": 4.01, "learning_rate": 0.00029055165496489466, "loss": 2.743, "theoretical_loss": 3.5372650522937823, "tokens_seen": 1401827328 }, { "epoch": 4.01, "learning_rate": 0.0002905416248746239, "loss": 2.8827, "theoretical_loss": 3.5372502822874132, "tokens_seen": 1401892864 }, { "epoch": 4.01, "learning_rate": 0.0002905315947843531, "loss": 2.5024, "theoretical_loss": 3.537235513164821, "tokens_seen": 1401958400 }, { "epoch": 4.01, "learning_rate": 0.00029052156469408226, "loss": 2.7444, "theoretical_loss": 3.5372207449259125, "tokens_seen": 1402023936 }, { "epoch": 4.01, "learning_rate": 0.0002905115346038115, "loss": 2.7326, "theoretical_loss": 3.537205977570592, "tokens_seen": 1402089472 }, { "epoch": 4.01, "learning_rate": 0.0002905015045135406, "loss": 2.899, "theoretical_loss": 3.537191211098767, "tokens_seen": 1402155008 }, { "epoch": 4.01, "learning_rate": 0.00029049147442326986, "loss": 2.7456, "theoretical_loss": 3.5371764455103416, "tokens_seen": 1402220544 }, { "epoch": 4.01, "learning_rate": 0.000290481444332999, "loss": 2.769, "theoretical_loss": 3.537161680805223, "tokens_seen": 1402286080 }, { "epoch": 4.01, "learning_rate": 0.0002904714142427282, "loss": 2.6664, "theoretical_loss": 3.5371469169833163, "tokens_seen": 1402351616 }, { "epoch": 4.01, "objective/train/docs_used": 2241241, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.815387487411499, "objective/train/theoretical_loss": 3.537132154044528, "objective/train/tokens_used": 1422877152, "theoretical_loss": 3.537132154044528, "tokens_seen": 1402417152 }, { "epoch": 4.01, "learning_rate": 0.0002904613841524574, "loss": 2.8868, "theoretical_loss": 3.537132154044528, "tokens_seen": 1402417152 }, { "epoch": 4.01, "learning_rate": 0.0002904513540621866, "loss": 2.9107, "theoretical_loss": 3.5371173919887635, "tokens_seen": 1402482688 }, { "epoch": 4.01, "learning_rate": 0.00029044132397191576, "loss": 2.7735, "theoretical_loss": 3.5371026308159292, "tokens_seen": 1402548224 }, { "epoch": 4.01, "learning_rate": 0.00029043129388164494, "loss": 2.8511, "theoretical_loss": 3.5370878705259305, "tokens_seen": 1402613760 }, { "epoch": 4.01, "learning_rate": 0.0002904212637913741, "loss": 2.8423, "theoretical_loss": 3.5370731111186737, "tokens_seen": 1402679296 }, { "epoch": 4.01, "learning_rate": 0.00029041123370110336, "loss": 2.7995, "theoretical_loss": 3.537058352594065, "tokens_seen": 1402744832 }, { "epoch": 4.01, "learning_rate": 0.0002904012036108325, "loss": 2.8876, "theoretical_loss": 3.53704359495201, "tokens_seen": 1402810368 }, { "epoch": 4.01, "learning_rate": 0.0002903911735205617, "loss": 2.7777, "theoretical_loss": 3.5370288381924144, "tokens_seen": 1402875904 }, { "epoch": 4.01, "learning_rate": 0.00029038114343029085, "loss": 2.7307, "theoretical_loss": 3.537014082315185, "tokens_seen": 1402941440 }, { "epoch": 4.01, "learning_rate": 0.0002903711133400201, "loss": 2.8149, "theoretical_loss": 3.536999327320227, "tokens_seen": 1403006976 }, { "epoch": 4.01, "learning_rate": 0.00029036108324974926, "loss": 2.7054, "theoretical_loss": 3.536984573207447, "tokens_seen": 1403072512 }, { "epoch": 4.01, "learning_rate": 0.00029035105315947845, "loss": 2.8128, "theoretical_loss": 3.5369698199767514, "tokens_seen": 1403138048 }, { "epoch": 4.01, "learning_rate": 0.0002903410230692076, "loss": 2.8197, "theoretical_loss": 3.5369550676280452, "tokens_seen": 1403203584 }, { "epoch": 4.01, "learning_rate": 0.00029033099297893686, "loss": 2.8393, "theoretical_loss": 3.5369403161612354, "tokens_seen": 1403269120 }, { "epoch": 4.01, "learning_rate": 0.000290320962888666, "loss": 2.9174, "theoretical_loss": 3.5369255655762277, "tokens_seen": 1403334656 }, { "epoch": 4.01, "learning_rate": 0.0002903109327983952, "loss": 2.7257, "theoretical_loss": 3.536910815872928, "tokens_seen": 1403400192 }, { "epoch": 4.01, "learning_rate": 0.00029030090270812435, "loss": 2.9271, "theoretical_loss": 3.536896067051243, "tokens_seen": 1403465728 }, { "epoch": 4.01, "learning_rate": 0.0002902908726178536, "loss": 2.8014, "theoretical_loss": 3.536881319111078, "tokens_seen": 1403531264 }, { "epoch": 4.01, "learning_rate": 0.00029028084252758277, "loss": 2.8533, "theoretical_loss": 3.5368665720523405, "tokens_seen": 1403596800 }, { "epoch": 4.01, "learning_rate": 0.00029027081243731195, "loss": 2.9876, "theoretical_loss": 3.5368518258749355, "tokens_seen": 1403662336 }, { "epoch": 4.01, "learning_rate": 0.00029026078234704113, "loss": 2.8311, "theoretical_loss": 3.5368370805787697, "tokens_seen": 1403727872 }, { "epoch": 4.01, "learning_rate": 0.0002902507522567703, "loss": 2.8495, "theoretical_loss": 3.5368223361637483, "tokens_seen": 1403793408 }, { "epoch": 4.01, "learning_rate": 0.0002902407221664995, "loss": 2.8288, "theoretical_loss": 3.5368075926297795, "tokens_seen": 1403858944 }, { "epoch": 4.01, "learning_rate": 0.00029023069207622873, "loss": 2.8688, "theoretical_loss": 3.536792849976768, "tokens_seen": 1403924480 }, { "epoch": 4.01, "learning_rate": 0.00029022066198595785, "loss": 2.836, "theoretical_loss": 3.5367781082046204, "tokens_seen": 1403990016 }, { "epoch": 4.01, "objective/train/docs_used": 2242645, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8241169452667236, "objective/train/theoretical_loss": 3.5367633673132426, "objective/train/tokens_used": 1424515552, "theoretical_loss": 3.5367633673132426, "tokens_seen": 1404055552 }, { "epoch": 4.01, "learning_rate": 0.0002902106318956871, "loss": 2.766, "theoretical_loss": 3.5367633673132426, "tokens_seen": 1404055552 }, { "epoch": 4.01, "learning_rate": 0.0002902006018054162, "loss": 2.6965, "theoretical_loss": 3.5367486273025417, "tokens_seen": 1404121088 }, { "epoch": 4.01, "learning_rate": 0.00029019057171514545, "loss": 2.6964, "theoretical_loss": 3.5367338881724235, "tokens_seen": 1404186624 }, { "epoch": 4.01, "learning_rate": 0.00029018054162487463, "loss": 2.807, "theoretical_loss": 3.536719149922794, "tokens_seen": 1404252160 }, { "epoch": 4.01, "learning_rate": 0.0002901705115346038, "loss": 2.8138, "theoretical_loss": 3.53670441255356, "tokens_seen": 1404317696 }, { "epoch": 4.01, "learning_rate": 0.000290160481444333, "loss": 2.8053, "theoretical_loss": 3.536689676064628, "tokens_seen": 1404383232 }, { "epoch": 4.01, "learning_rate": 0.00029015045135406223, "loss": 2.7539, "theoretical_loss": 3.5366749404559035, "tokens_seen": 1404448768 }, { "epoch": 4.01, "learning_rate": 0.00029014042126379136, "loss": 2.7519, "theoretical_loss": 3.5366602057272933, "tokens_seen": 1404514304 }, { "epoch": 4.01, "learning_rate": 0.0002901303911735206, "loss": 2.6874, "theoretical_loss": 3.536645471878704, "tokens_seen": 1404579840 }, { "epoch": 4.01, "learning_rate": 0.0002901203610832497, "loss": 2.7638, "theoretical_loss": 3.536630738910042, "tokens_seen": 1404645376 }, { "epoch": 4.01, "learning_rate": 0.00029011033099297896, "loss": 2.7806, "theoretical_loss": 3.536616006821214, "tokens_seen": 1404710912 }, { "epoch": 4.01, "learning_rate": 0.00029010030090270814, "loss": 2.8199, "theoretical_loss": 3.5366012756121252, "tokens_seen": 1404776448 }, { "epoch": 4.01, "learning_rate": 0.0002900902708124373, "loss": 2.7959, "theoretical_loss": 3.536586545282683, "tokens_seen": 1404841984 }, { "epoch": 4.01, "learning_rate": 0.0002900802407221665, "loss": 2.7299, "theoretical_loss": 3.5365718158327937, "tokens_seen": 1404907520 }, { "epoch": 4.01, "learning_rate": 0.0002900702106318957, "loss": 2.886, "theoretical_loss": 3.5365570872623637, "tokens_seen": 1404973056 }, { "epoch": 4.01, "learning_rate": 0.00029006018054162486, "loss": 2.7248, "theoretical_loss": 3.536542359571299, "tokens_seen": 1405038592 }, { "epoch": 4.01, "learning_rate": 0.0002900501504513541, "loss": 2.6942, "theoretical_loss": 3.5365276327595065, "tokens_seen": 1405104128 }, { "epoch": 4.01, "learning_rate": 0.0002900401203610832, "loss": 2.8446, "theoretical_loss": 3.5365129068268932, "tokens_seen": 1405169664 }, { "epoch": 4.01, "learning_rate": 0.00029003009027081246, "loss": 2.8931, "theoretical_loss": 3.536498181773365, "tokens_seen": 1405235200 }, { "epoch": 4.01, "learning_rate": 0.0002900200601805416, "loss": 2.8681, "theoretical_loss": 3.536483457598828, "tokens_seen": 1405300736 }, { "epoch": 4.01, "learning_rate": 0.0002900100300902708, "loss": 2.6255, "theoretical_loss": 3.53646873430319, "tokens_seen": 1405366272 }, { "epoch": 4.01, "learning_rate": 0.00029, "loss": 2.8231, "theoretical_loss": 3.5364540118863568, "tokens_seen": 1405431808 }, { "epoch": 4.01, "learning_rate": 0.0002899899699097292, "loss": 2.7025, "theoretical_loss": 3.536439290348235, "tokens_seen": 1405497344 }, { "epoch": 4.01, "learning_rate": 0.00028997993981945836, "loss": 2.753, "theoretical_loss": 3.5364245696887306, "tokens_seen": 1405562880 }, { "epoch": 4.01, "learning_rate": 0.0002899699097291876, "loss": 2.6731, "theoretical_loss": 3.536409849907751, "tokens_seen": 1405628416 }, { "epoch": 4.01, "objective/train/docs_used": 2245486, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.399740219116211, "objective/train/theoretical_loss": 3.536395131005203, "objective/train/tokens_used": 1426153952, "theoretical_loss": 3.536395131005203, "tokens_seen": 1405693952 }, { "epoch": 4.01, "learning_rate": 0.0002899598796389167, "loss": 2.6487, "theoretical_loss": 3.536395131005203, "tokens_seen": 1405693952 }, { "epoch": 4.01, "learning_rate": 0.00028994984954864596, "loss": 2.7413, "theoretical_loss": 3.536380412980993, "tokens_seen": 1405759488 }, { "epoch": 4.01, "learning_rate": 0.0002899398194583751, "loss": 3.0067, "theoretical_loss": 3.536365695835027, "tokens_seen": 1405825024 }, { "epoch": 4.01, "learning_rate": 0.0002899297893681043, "loss": 2.7007, "theoretical_loss": 3.536350979567213, "tokens_seen": 1405890560 }, { "epoch": 4.01, "learning_rate": 0.0002899197592778335, "loss": 2.6704, "theoretical_loss": 3.5363362641774554, "tokens_seen": 1405956096 }, { "epoch": 4.01, "learning_rate": 0.0002899097291875627, "loss": 2.8593, "theoretical_loss": 3.5363215496656633, "tokens_seen": 1406021632 }, { "epoch": 4.01, "learning_rate": 0.00028989969909729187, "loss": 2.8213, "theoretical_loss": 3.5363068360317422, "tokens_seen": 1406087168 }, { "epoch": 4.01, "learning_rate": 0.00028988966900702105, "loss": 2.7731, "theoretical_loss": 3.536292123275599, "tokens_seen": 1406152704 }, { "epoch": 4.01, "learning_rate": 0.00028987963891675023, "loss": 2.6756, "theoretical_loss": 3.5362774113971405, "tokens_seen": 1406218240 }, { "epoch": 4.01, "learning_rate": 0.00028986960882647946, "loss": 2.9259, "theoretical_loss": 3.536262700396273, "tokens_seen": 1406283776 }, { "epoch": 4.01, "learning_rate": 0.0002898595787362086, "loss": 2.687, "theoretical_loss": 3.536247990272904, "tokens_seen": 1406349312 }, { "epoch": 4.01, "learning_rate": 0.0002898495486459378, "loss": 2.9234, "theoretical_loss": 3.5362332810269397, "tokens_seen": 1406414848 }, { "epoch": 4.01, "learning_rate": 0.000289839518555667, "loss": 2.8896, "theoretical_loss": 3.536218572658287, "tokens_seen": 1406480384 }, { "epoch": 4.01, "learning_rate": 0.0002898294884653962, "loss": 2.8945, "theoretical_loss": 3.5362038651668533, "tokens_seen": 1406545920 }, { "epoch": 4.01, "learning_rate": 0.00028981945837512537, "loss": 2.9097, "theoretical_loss": 3.5361891585525447, "tokens_seen": 1406611456 }, { "epoch": 4.01, "learning_rate": 0.00028980942828485455, "loss": 2.9527, "theoretical_loss": 3.536174452815268, "tokens_seen": 1406676992 }, { "epoch": 4.01, "learning_rate": 0.00028979939819458373, "loss": 2.8613, "theoretical_loss": 3.5361597479549305, "tokens_seen": 1406742528 }, { "epoch": 4.01, "learning_rate": 0.00028978936810431297, "loss": 2.8901, "theoretical_loss": 3.5361450439714384, "tokens_seen": 1406808064 }, { "epoch": 4.01, "learning_rate": 0.00028977933801404215, "loss": 2.715, "theoretical_loss": 3.536130340864699, "tokens_seen": 1406873600 }, { "epoch": 4.01, "learning_rate": 0.00028976930792377133, "loss": 2.891, "theoretical_loss": 3.5361156386346195, "tokens_seen": 1406939136 }, { "epoch": 4.01, "learning_rate": 0.0002897592778335005, "loss": 2.9538, "theoretical_loss": 3.5361009372811067, "tokens_seen": 1407004672 }, { "epoch": 4.01, "learning_rate": 0.0002897492477432297, "loss": 2.9187, "theoretical_loss": 3.536086236804067, "tokens_seen": 1407070208 }, { "epoch": 4.01, "learning_rate": 0.00028973921765295893, "loss": 2.8456, "theoretical_loss": 3.5360715372034077, "tokens_seen": 1407135744 }, { "epoch": 4.01, "learning_rate": 0.00028972918756268805, "loss": 3.038, "theoretical_loss": 3.536056838479036, "tokens_seen": 1407201280 }, { "epoch": 4.01, "learning_rate": 0.0002897191574724173, "loss": 2.921, "theoretical_loss": 3.536042140630858, "tokens_seen": 1407266816 }, { "epoch": 4.01, "objective/train/docs_used": 2247961, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9485247135162354, "objective/train/theoretical_loss": 3.5360274436587815, "objective/train/tokens_used": 1427792352, "theoretical_loss": 3.5360274436587815, "tokens_seen": 1407332352 }, { "epoch": 4.01, "learning_rate": 0.0002897091273821464, "loss": 2.8617, "theoretical_loss": 3.5360274436587815, "tokens_seen": 1407332352 }, { "epoch": 4.01, "learning_rate": 0.00028969909729187565, "loss": 2.8261, "theoretical_loss": 3.5360127475627126, "tokens_seen": 1407397888 }, { "epoch": 4.01, "learning_rate": 0.00028968906720160483, "loss": 2.6986, "theoretical_loss": 3.5359980523425594, "tokens_seen": 1407463424 }, { "epoch": 4.01, "learning_rate": 0.000289679037111334, "loss": 2.8727, "theoretical_loss": 3.5359833579982283, "tokens_seen": 1407528960 }, { "epoch": 4.01, "learning_rate": 0.0002896690070210632, "loss": 2.7856, "theoretical_loss": 3.5359686645296264, "tokens_seen": 1407594496 }, { "epoch": 4.01, "learning_rate": 0.00028965897693079243, "loss": 2.9247, "theoretical_loss": 3.5359539719366606, "tokens_seen": 1407660032 }, { "epoch": 4.01, "learning_rate": 0.00028964894684052156, "loss": 2.7857, "theoretical_loss": 3.535939280219238, "tokens_seen": 1407725568 }, { "epoch": 4.01, "learning_rate": 0.0002896389167502508, "loss": 2.8722, "theoretical_loss": 3.5359245893772657, "tokens_seen": 1407791104 }, { "epoch": 4.01, "learning_rate": 0.0002896288866599799, "loss": 2.8138, "theoretical_loss": 3.5359098994106515, "tokens_seen": 1407856640 }, { "epoch": 4.01, "learning_rate": 0.00028961885656970916, "loss": 2.8799, "theoretical_loss": 3.5358952103193015, "tokens_seen": 1407922176 }, { "epoch": 4.01, "learning_rate": 0.00028960882647943834, "loss": 2.7534, "theoretical_loss": 3.535880522103123, "tokens_seen": 1407987712 }, { "epoch": 4.01, "learning_rate": 0.0002895987963891675, "loss": 2.954, "theoretical_loss": 3.5358658347620233, "tokens_seen": 1408053248 }, { "epoch": 4.01, "learning_rate": 0.0002895887662988967, "loss": 2.8164, "theoretical_loss": 3.53585114829591, "tokens_seen": 1408118784 }, { "epoch": 4.01, "learning_rate": 0.0002895787362086259, "loss": 2.741, "theoretical_loss": 3.535836462704689, "tokens_seen": 1408184320 }, { "epoch": 4.01, "learning_rate": 0.00028956870611835506, "loss": 2.8913, "theoretical_loss": 3.535821777988269, "tokens_seen": 1408249856 }, { "epoch": 4.01, "learning_rate": 0.0002895586760280843, "loss": 2.8056, "theoretical_loss": 3.5358070941465556, "tokens_seen": 1408315392 }, { "epoch": 4.01, "learning_rate": 0.0002895486459378134, "loss": 2.9056, "theoretical_loss": 3.5357924111794574, "tokens_seen": 1408380928 }, { "epoch": 4.01, "learning_rate": 0.00028953861584754266, "loss": 2.8582, "theoretical_loss": 3.535777729086881, "tokens_seen": 1408446464 }, { "epoch": 4.01, "learning_rate": 0.0002895285857572718, "loss": 2.8803, "theoretical_loss": 3.535763047868733, "tokens_seen": 1408512000 }, { "epoch": 4.01, "learning_rate": 0.000289518555667001, "loss": 2.7675, "theoretical_loss": 3.5357483675249224, "tokens_seen": 1408577536 }, { "epoch": 4.01, "learning_rate": 0.0002895085255767302, "loss": 2.8761, "theoretical_loss": 3.5357336880553545, "tokens_seen": 1408643072 }, { "epoch": 4.01, "learning_rate": 0.0002894984954864594, "loss": 2.9837, "theoretical_loss": 3.535719009459938, "tokens_seen": 1408708608 }, { "epoch": 4.01, "learning_rate": 0.00028948846539618856, "loss": 2.8386, "theoretical_loss": 3.5357043317385792, "tokens_seen": 1408774144 }, { "epoch": 4.01, "learning_rate": 0.0002894784353059178, "loss": 2.6922, "theoretical_loss": 3.5356896548911863, "tokens_seen": 1408839680 }, { "epoch": 4.01, "learning_rate": 0.0002894684052156469, "loss": 2.8795, "theoretical_loss": 3.5356749789176654, "tokens_seen": 1408905216 }, { "epoch": 4.01, "objective/train/docs_used": 2250734, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5423974990844727, "objective/train/theoretical_loss": 3.5356603038179246, "objective/train/tokens_used": 1429430752, "theoretical_loss": 3.5356603038179246, "tokens_seen": 1408970752 }, { "epoch": 4.01, "learning_rate": 0.00028945837512537616, "loss": 2.6661, "theoretical_loss": 3.5356603038179246, "tokens_seen": 1408970752 }, { "epoch": 4.01, "learning_rate": 0.0002894483450351053, "loss": 2.7127, "theoretical_loss": 3.5356456295918717, "tokens_seen": 1409036288 }, { "epoch": 4.01, "learning_rate": 0.0002894383149448345, "loss": 2.8381, "theoretical_loss": 3.5356309562394133, "tokens_seen": 1409101824 }, { "epoch": 4.01, "learning_rate": 0.0002894282848545637, "loss": 2.7324, "theoretical_loss": 3.535616283760457, "tokens_seen": 1409167360 }, { "epoch": 4.01, "learning_rate": 0.0002894182547642929, "loss": 2.8595, "theoretical_loss": 3.5356016121549096, "tokens_seen": 1409232896 }, { "epoch": 4.01, "learning_rate": 0.00028940822467402207, "loss": 2.9501, "theoretical_loss": 3.5355869414226797, "tokens_seen": 1409298432 }, { "epoch": 4.01, "learning_rate": 0.00028939819458375125, "loss": 2.8013, "theoretical_loss": 3.5355722715636744, "tokens_seen": 1409363968 }, { "epoch": 4.01, "learning_rate": 0.00028938816449348043, "loss": 2.7539, "theoretical_loss": 3.5355576025778, "tokens_seen": 1409429504 }, { "epoch": 4.01, "learning_rate": 0.00028937813440320966, "loss": 2.9444, "theoretical_loss": 3.5355429344649654, "tokens_seen": 1409495040 }, { "epoch": 4.01, "learning_rate": 0.0002893681043129388, "loss": 2.9294, "theoretical_loss": 3.535528267225077, "tokens_seen": 1409560576 }, { "epoch": 4.01, "learning_rate": 0.000289358074222668, "loss": 2.7892, "theoretical_loss": 3.5355136008580423, "tokens_seen": 1409626112 }, { "epoch": 4.01, "learning_rate": 0.0002893480441323972, "loss": 2.802, "theoretical_loss": 3.53549893536377, "tokens_seen": 1409691648 }, { "epoch": 4.01, "learning_rate": 0.0002893380140421264, "loss": 2.9594, "theoretical_loss": 3.535484270742166, "tokens_seen": 1409757184 }, { "epoch": 4.01, "learning_rate": 0.00028932798395185557, "loss": 2.7786, "theoretical_loss": 3.5354696069931384, "tokens_seen": 1409822720 }, { "epoch": 4.01, "learning_rate": 0.00028932798395185557, "loss": 2.8387, "theoretical_loss": 3.535454944116595, "tokens_seen": 1409888256 }, { "epoch": 4.01, "learning_rate": 0.00028931795386158475, "loss": 2.7988, "theoretical_loss": 3.535440282112444, "tokens_seen": 1409953792 }, { "epoch": 4.01, "learning_rate": 0.00028930792377131393, "loss": 2.8106, "theoretical_loss": 3.5354256209805914, "tokens_seen": 1410019328 }, { "epoch": 4.01, "learning_rate": 0.00028929789368104317, "loss": 2.8239, "theoretical_loss": 3.535410960720945, "tokens_seen": 1410084864 }, { "epoch": 4.01, "learning_rate": 0.0002892878635907723, "loss": 3.0423, "theoretical_loss": 3.5353963013334138, "tokens_seen": 1410150400 }, { "epoch": 4.01, "learning_rate": 0.00028927783350050153, "loss": 2.6956, "theoretical_loss": 3.535381642817904, "tokens_seen": 1410215936 }, { "epoch": 4.01, "learning_rate": 0.00028926780341023066, "loss": 2.6157, "theoretical_loss": 3.535366985174324, "tokens_seen": 1410281472 }, { "epoch": 4.01, "learning_rate": 0.0002892577733199599, "loss": 2.6736, "theoretical_loss": 3.5353523284025807, "tokens_seen": 1410347008 }, { "epoch": 4.01, "learning_rate": 0.0002892477432296891, "loss": 2.8654, "theoretical_loss": 3.5353376725025827, "tokens_seen": 1410412544 }, { "epoch": 4.01, "learning_rate": 0.00028923771313941825, "loss": 2.8617, "theoretical_loss": 3.5353230174742363, "tokens_seen": 1410478080 }, { "epoch": 4.01, "learning_rate": 0.00028922768304914744, "loss": 2.7329, "theoretical_loss": 3.5353083633174505, "tokens_seen": 1410543616 }, { "epoch": 4.01, "objective/train/docs_used": 2253723, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7853970527648926, "objective/train/theoretical_loss": 3.535293710032132, "objective/train/tokens_used": 1431069152, "theoretical_loss": 3.535293710032132, "tokens_seen": 1410609152 }, { "epoch": 4.01, "learning_rate": 0.0002892176529588766, "loss": 2.7304, "theoretical_loss": 3.535293710032132, "tokens_seen": 1410609152 }, { "epoch": 4.01, "learning_rate": 0.0002892076228686058, "loss": 2.7107, "theoretical_loss": 3.5352790576181894, "tokens_seen": 1410674688 }, { "epoch": 4.01, "learning_rate": 0.00028919759277833503, "loss": 2.8897, "theoretical_loss": 3.5352644060755294, "tokens_seen": 1410740224 }, { "epoch": 4.01, "learning_rate": 0.00028918756268806416, "loss": 2.7253, "theoretical_loss": 3.5352497554040605, "tokens_seen": 1410805760 }, { "epoch": 4.01, "learning_rate": 0.0002891775325977934, "loss": 2.6975, "theoretical_loss": 3.5352351056036904, "tokens_seen": 1410871296 }, { "epoch": 4.01, "learning_rate": 0.0002891675025075226, "loss": 2.9814, "theoretical_loss": 3.535220456674326, "tokens_seen": 1410936832 }, { "epoch": 4.01, "learning_rate": 0.00028915747241725176, "loss": 2.8821, "theoretical_loss": 3.535205808615876, "tokens_seen": 1411002368 }, { "epoch": 4.01, "learning_rate": 0.00028914744232698094, "loss": 2.8881, "theoretical_loss": 3.535191161428248, "tokens_seen": 1411067904 }, { "epoch": 4.01, "learning_rate": 0.0002891374122367101, "loss": 2.7615, "theoretical_loss": 3.5351765151113494, "tokens_seen": 1411133440 }, { "epoch": 4.01, "learning_rate": 0.0002891273821464393, "loss": 2.7358, "theoretical_loss": 3.535161869665088, "tokens_seen": 1411198976 }, { "epoch": 4.01, "learning_rate": 0.00028911735205616854, "loss": 2.7443, "theoretical_loss": 3.5351472250893723, "tokens_seen": 1411264512 }, { "epoch": 4.01, "learning_rate": 0.00028910732196589766, "loss": 2.8429, "theoretical_loss": 3.5351325813841097, "tokens_seen": 1411330048 }, { "epoch": 4.01, "learning_rate": 0.0002890972918756269, "loss": 2.862, "theoretical_loss": 3.5351179385492078, "tokens_seen": 1411395584 }, { "epoch": 4.01, "learning_rate": 0.000289087261785356, "loss": 2.8669, "theoretical_loss": 3.5351032965845746, "tokens_seen": 1411461120 }, { "epoch": 4.01, "learning_rate": 0.00028907723169508526, "loss": 2.8906, "theoretical_loss": 3.535088655490118, "tokens_seen": 1411526656 }, { "epoch": 4.01, "learning_rate": 0.00028906720160481444, "loss": 2.7846, "theoretical_loss": 3.5350740152657467, "tokens_seen": 1411592192 }, { "epoch": 4.01, "learning_rate": 0.0002890571715145436, "loss": 2.6058, "theoretical_loss": 3.535059375911367, "tokens_seen": 1411657728 }, { "epoch": 4.01, "learning_rate": 0.0002890471414242728, "loss": 2.8684, "theoretical_loss": 3.5350447374268876, "tokens_seen": 1411723264 }, { "epoch": 4.01, "learning_rate": 0.000289037111334002, "loss": 2.892, "theoretical_loss": 3.535030099812217, "tokens_seen": 1411788800 }, { "epoch": 4.01, "learning_rate": 0.0002890270812437312, "loss": 2.8633, "theoretical_loss": 3.5350154630672623, "tokens_seen": 1411854336 }, { "epoch": 4.01, "learning_rate": 0.0002890170511534604, "loss": 2.9704, "theoretical_loss": 3.535000827191932, "tokens_seen": 1411919872 }, { "epoch": 4.01, "learning_rate": 0.0002890070210631896, "loss": 2.8323, "theoretical_loss": 3.534986192186134, "tokens_seen": 1411985408 }, { "epoch": 4.01, "learning_rate": 0.00028899699097291876, "loss": 2.8654, "theoretical_loss": 3.534971558049776, "tokens_seen": 1412050944 }, { "epoch": 4.01, "learning_rate": 0.000288986960882648, "loss": 2.7407, "theoretical_loss": 3.534956924782766, "tokens_seen": 1412116480 }, { "epoch": 4.01, "learning_rate": 0.0002889769307923771, "loss": 2.7315, "theoretical_loss": 3.5349422923850122, "tokens_seen": 1412182016 }, { "epoch": 4.01, "objective/train/docs_used": 2256780, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.3601598739624023, "objective/train/theoretical_loss": 3.5349276608564226, "objective/train/tokens_used": 1432707552, "theoretical_loss": 3.5349276608564226, "tokens_seen": 1412247552 }, { "epoch": 4.01, "learning_rate": 0.00028896690070210636, "loss": 2.7015, "theoretical_loss": 3.5349276608564226, "tokens_seen": 1412247552 }, { "epoch": 4.01, "learning_rate": 0.0002889568706118355, "loss": 2.729, "theoretical_loss": 3.5349130301969054, "tokens_seen": 1412313088 }, { "epoch": 4.01, "learning_rate": 0.0002889468405215647, "loss": 2.7639, "theoretical_loss": 3.5348984004063686, "tokens_seen": 1412378624 }, { "epoch": 4.01, "learning_rate": 0.0002889368104312939, "loss": 2.7924, "theoretical_loss": 3.5348837714847194, "tokens_seen": 1412444160 }, { "epoch": 4.01, "learning_rate": 0.0002889267803410231, "loss": 2.8733, "theoretical_loss": 3.5348691434318678, "tokens_seen": 1412509696 }, { "epoch": 4.01, "learning_rate": 0.00028891675025075227, "loss": 2.8404, "theoretical_loss": 3.53485451624772, "tokens_seen": 1412575232 }, { "epoch": 4.01, "learning_rate": 0.00028890672016048145, "loss": 2.777, "theoretical_loss": 3.5348398899321847, "tokens_seen": 1412640768 }, { "epoch": 4.01, "learning_rate": 0.00028889669007021063, "loss": 2.7747, "theoretical_loss": 3.5348252644851703, "tokens_seen": 1412706304 }, { "epoch": 4.01, "learning_rate": 0.00028888665997993986, "loss": 2.8545, "theoretical_loss": 3.534810639906585, "tokens_seen": 1412771840 }, { "epoch": 4.01, "learning_rate": 0.000288876629889669, "loss": 2.9021, "theoretical_loss": 3.5347960161963368, "tokens_seen": 1412837376 }, { "epoch": 4.01, "learning_rate": 0.00028886659979939823, "loss": 2.7463, "theoretical_loss": 3.5347813933543337, "tokens_seen": 1412902912 }, { "epoch": 4.01, "learning_rate": 0.0002888565697091274, "loss": 2.9262, "theoretical_loss": 3.534766771380484, "tokens_seen": 1412968448 }, { "epoch": 4.01, "learning_rate": 0.0002888465396188566, "loss": 2.9021, "theoretical_loss": 3.534752150274696, "tokens_seen": 1413033984 }, { "epoch": 4.01, "learning_rate": 0.00028883650952858577, "loss": 2.8476, "theoretical_loss": 3.5347375300368773, "tokens_seen": 1413099520 }, { "epoch": 4.01, "learning_rate": 0.00028882647943831495, "loss": 2.7263, "theoretical_loss": 3.5347229106669373, "tokens_seen": 1413165056 }, { "epoch": 4.01, "learning_rate": 0.00028881644934804413, "loss": 2.8941, "theoretical_loss": 3.534708292164783, "tokens_seen": 1413230592 }, { "epoch": 4.01, "learning_rate": 0.00028880641925777337, "loss": 2.87, "theoretical_loss": 3.534693674530324, "tokens_seen": 1413296128 }, { "epoch": 4.01, "learning_rate": 0.0002887963891675025, "loss": 2.8214, "theoretical_loss": 3.5346790577634675, "tokens_seen": 1413361664 }, { "epoch": 4.01, "learning_rate": 0.00028878635907723173, "loss": 2.8126, "theoretical_loss": 3.5346644418641215, "tokens_seen": 1413427200 }, { "epoch": 4.01, "learning_rate": 0.00028877632898696086, "loss": 2.6239, "theoretical_loss": 3.5346498268321955, "tokens_seen": 1413492736 }, { "epoch": 4.01, "learning_rate": 0.0002887662988966901, "loss": 2.7828, "theoretical_loss": 3.534635212667597, "tokens_seen": 1413558272 }, { "epoch": 4.01, "learning_rate": 0.0002887562688064193, "loss": 2.7866, "theoretical_loss": 3.534620599370234, "tokens_seen": 1413623808 }, { "epoch": 4.01, "learning_rate": 0.00028874623871614845, "loss": 2.7881, "theoretical_loss": 3.534605986940016, "tokens_seen": 1413689344 }, { "epoch": 4.01, "learning_rate": 0.00028873620862587764, "loss": 2.7211, "theoretical_loss": 3.5345913753768503, "tokens_seen": 1413754880 }, { "epoch": 4.01, "learning_rate": 0.0002887261785356068, "loss": 2.9098, "theoretical_loss": 3.534576764680646, "tokens_seen": 1413820416 }, { "epoch": 4.01, "objective/train/docs_used": 2259451, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7442383766174316, "objective/train/theoretical_loss": 3.5345621548513106, "objective/train/tokens_used": 1434345952, "theoretical_loss": 3.5345621548513106, "tokens_seen": 1413885952 }, { "epoch": 4.01, "learning_rate": 0.000288716148445336, "loss": 2.7587, "theoretical_loss": 3.5345621548513106, "tokens_seen": 1413885952 }, { "epoch": 4.01, "learning_rate": 0.00028870611835506523, "loss": 2.7807, "theoretical_loss": 3.5345475458887536, "tokens_seen": 1413951488 }, { "epoch": 4.01, "learning_rate": 0.00028869608826479436, "loss": 2.8811, "theoretical_loss": 3.5345329377928825, "tokens_seen": 1414017024 }, { "epoch": 4.01, "learning_rate": 0.0002886860581745236, "loss": 2.7918, "theoretical_loss": 3.534518330563606, "tokens_seen": 1414082560 }, { "epoch": 4.01, "learning_rate": 0.0002886760280842528, "loss": 2.8627, "theoretical_loss": 3.5345037242008326, "tokens_seen": 1414148096 }, { "epoch": 4.01, "learning_rate": 0.00028866599799398196, "loss": 2.806, "theoretical_loss": 3.5344891187044705, "tokens_seen": 1414213632 }, { "epoch": 4.01, "learning_rate": 0.00028865596790371114, "loss": 2.8539, "theoretical_loss": 3.5344745140744287, "tokens_seen": 1414279168 }, { "epoch": 4.01, "learning_rate": 0.0002886459378134403, "loss": 2.8883, "theoretical_loss": 3.5344599103106153, "tokens_seen": 1414344704 }, { "epoch": 4.01, "learning_rate": 0.0002886359077231695, "loss": 2.8557, "theoretical_loss": 3.5344453074129385, "tokens_seen": 1414410240 }, { "epoch": 4.01, "learning_rate": 0.00028862587763289874, "loss": 2.869, "theoretical_loss": 3.5344307053813075, "tokens_seen": 1414475776 }, { "epoch": 4.01, "learning_rate": 0.00028861584754262786, "loss": 2.8077, "theoretical_loss": 3.53441610421563, "tokens_seen": 1414541312 }, { "epoch": 4.01, "learning_rate": 0.0002886058174523571, "loss": 2.7458, "theoretical_loss": 3.534401503915815, "tokens_seen": 1414606848 }, { "epoch": 4.01, "learning_rate": 0.0002885957873620862, "loss": 2.9604, "theoretical_loss": 3.5343869044817717, "tokens_seen": 1414672384 }, { "epoch": 4.01, "learning_rate": 0.00028858575727181546, "loss": 2.9699, "theoretical_loss": 3.5343723059134073, "tokens_seen": 1414737920 }, { "epoch": 4.01, "learning_rate": 0.00028857572718154464, "loss": 2.7893, "theoretical_loss": 3.5343577082106314, "tokens_seen": 1414803456 }, { "epoch": 4.01, "learning_rate": 0.0002885656970912738, "loss": 2.8862, "theoretical_loss": 3.534343111373352, "tokens_seen": 1414868992 }, { "epoch": 4.01, "learning_rate": 0.000288555667001003, "loss": 2.7972, "theoretical_loss": 3.5343285154014783, "tokens_seen": 1414934528 }, { "epoch": 4.01, "learning_rate": 0.0002885456369107322, "loss": 2.8993, "theoretical_loss": 3.534313920294918, "tokens_seen": 1415000064 }, { "epoch": 4.01, "learning_rate": 0.00028853560682046137, "loss": 2.7741, "theoretical_loss": 3.5342993260535804, "tokens_seen": 1415065600 }, { "epoch": 4.01, "learning_rate": 0.0002885255767301906, "loss": 2.8317, "theoretical_loss": 3.5342847326773743, "tokens_seen": 1415131136 }, { "epoch": 4.01, "learning_rate": 0.00028851554663991973, "loss": 2.7777, "theoretical_loss": 3.5342701401662078, "tokens_seen": 1415196672 }, { "epoch": 4.01, "learning_rate": 0.00028850551654964896, "loss": 2.9339, "theoretical_loss": 3.53425554851999, "tokens_seen": 1415262208 }, { "epoch": 4.01, "learning_rate": 0.00028849548645937815, "loss": 2.8349, "theoretical_loss": 3.5342409577386293, "tokens_seen": 1415327744 }, { "epoch": 4.01, "learning_rate": 0.0002884854563691073, "loss": 2.8417, "theoretical_loss": 3.5342263678220345, "tokens_seen": 1415393280 }, { "epoch": 4.01, "learning_rate": 0.0002884754262788365, "loss": 2.7898, "theoretical_loss": 3.534211778770114, "tokens_seen": 1415458816 }, { "epoch": 4.01, "objective/train/docs_used": 2260925, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.080103635787964, "objective/train/theoretical_loss": 3.534197190582778, "objective/train/tokens_used": 1435984352, "theoretical_loss": 3.534197190582778, "tokens_seen": 1415524352 }, { "epoch": 4.01, "learning_rate": 0.0002884653961885657, "loss": 2.9029, "theoretical_loss": 3.534197190582778, "tokens_seen": 1415524352 }, { "epoch": 4.01, "learning_rate": 0.00028845536609829487, "loss": 2.774, "theoretical_loss": 3.534182603259933, "tokens_seen": 1415589888 }, { "epoch": 4.01, "learning_rate": 0.0002884453360080241, "loss": 2.8261, "theoretical_loss": 3.534168016801489, "tokens_seen": 1415655424 }, { "epoch": 4.01, "learning_rate": 0.00028843530591775323, "loss": 2.6492, "theoretical_loss": 3.534153431207355, "tokens_seen": 1415720960 }, { "epoch": 4.01, "learning_rate": 0.00028842527582748247, "loss": 2.7765, "theoretical_loss": 3.534138846477439, "tokens_seen": 1415786496 }, { "epoch": 4.01, "learning_rate": 0.0002884152457372116, "loss": 2.7967, "theoretical_loss": 3.53412426261165, "tokens_seen": 1415852032 }, { "epoch": 4.01, "learning_rate": 0.00028840521564694083, "loss": 2.7535, "theoretical_loss": 3.534109679609897, "tokens_seen": 1415917568 }, { "epoch": 4.01, "learning_rate": 0.00028839518555667, "loss": 2.7502, "theoretical_loss": 3.5340950974720893, "tokens_seen": 1415983104 }, { "epoch": 4.01, "learning_rate": 0.0002883851554663992, "loss": 2.7835, "theoretical_loss": 3.5340805161981352, "tokens_seen": 1416048640 }, { "epoch": 4.01, "learning_rate": 0.0002883751253761284, "loss": 2.8884, "theoretical_loss": 3.534065935787943, "tokens_seen": 1416114176 }, { "epoch": 4.01, "learning_rate": 0.0002883650952858576, "loss": 2.7772, "theoretical_loss": 3.5340513562414224, "tokens_seen": 1416179712 }, { "epoch": 4.01, "learning_rate": 0.00028835506519558674, "loss": 2.9724, "theoretical_loss": 3.534036777558482, "tokens_seen": 1416245248 }, { "epoch": 4.01, "learning_rate": 0.00028834503510531597, "loss": 2.8868, "theoretical_loss": 3.534022199739031, "tokens_seen": 1416310784 }, { "epoch": 4.02, "learning_rate": 0.0002883350050150451, "loss": 2.9964, "theoretical_loss": 3.534007622782978, "tokens_seen": 1416376320 }, { "epoch": 4.02, "learning_rate": 0.00028832497492477433, "loss": 2.8978, "theoretical_loss": 3.533993046690232, "tokens_seen": 1416441856 }, { "epoch": 4.02, "learning_rate": 0.0002883149448345035, "loss": 2.8986, "theoretical_loss": 3.533978471460701, "tokens_seen": 1416507392 }, { "epoch": 4.02, "learning_rate": 0.0002883049147442327, "loss": 2.8537, "theoretical_loss": 3.5339638970942957, "tokens_seen": 1416572928 }, { "epoch": 4.02, "learning_rate": 0.0002882948846539619, "loss": 2.6971, "theoretical_loss": 3.5339493235909236, "tokens_seen": 1416638464 }, { "epoch": 4.02, "learning_rate": 0.00028828485456369106, "loss": 2.6552, "theoretical_loss": 3.533934750950495, "tokens_seen": 1416704000 }, { "epoch": 4.02, "learning_rate": 0.0002882748244734203, "loss": 2.866, "theoretical_loss": 3.5339201791729176, "tokens_seen": 1416769536 }, { "epoch": 4.02, "learning_rate": 0.0002882647943831495, "loss": 2.8338, "theoretical_loss": 3.5339056082581006, "tokens_seen": 1416835072 }, { "epoch": 4.02, "learning_rate": 0.00028825476429287866, "loss": 2.8111, "theoretical_loss": 3.5338910382059536, "tokens_seen": 1416900608 }, { "epoch": 4.02, "learning_rate": 0.00028824473420260784, "loss": 2.7807, "theoretical_loss": 3.533876469016386, "tokens_seen": 1416966144 }, { "epoch": 4.02, "learning_rate": 0.000288234704112337, "loss": 2.7101, "theoretical_loss": 3.5338619006893053, "tokens_seen": 1417031680 }, { "epoch": 4.02, "learning_rate": 0.0002882246740220662, "loss": 2.877, "theoretical_loss": 3.533847333224622, "tokens_seen": 1417097216 }, { "epoch": 4.02, "objective/train/docs_used": 2264585, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7073395252227783, "objective/train/theoretical_loss": 3.533832766622244, "objective/train/tokens_used": 1437622752, "theoretical_loss": 3.533832766622244, "tokens_seen": 1417162752 }, { "epoch": 4.02, "learning_rate": 0.00028821464393179543, "loss": 2.7643, "theoretical_loss": 3.533832766622244, "tokens_seen": 1417162752 }, { "epoch": 4.02, "learning_rate": 0.00028820461384152456, "loss": 2.9035, "theoretical_loss": 3.5338182008820818, "tokens_seen": 1417228288 }, { "epoch": 4.02, "learning_rate": 0.0002881945837512538, "loss": 2.9556, "theoretical_loss": 3.533803636004044, "tokens_seen": 1417293824 }, { "epoch": 4.02, "learning_rate": 0.000288184553660983, "loss": 2.7314, "theoretical_loss": 3.5337890719880383, "tokens_seen": 1417359360 }, { "epoch": 4.02, "learning_rate": 0.00028817452357071216, "loss": 2.8306, "theoretical_loss": 3.5337745088339756, "tokens_seen": 1417424896 }, { "epoch": 4.02, "learning_rate": 0.00028816449348044134, "loss": 2.6174, "theoretical_loss": 3.5337599465417644, "tokens_seen": 1417490432 }, { "epoch": 4.02, "learning_rate": 0.0002881544633901705, "loss": 2.8407, "theoretical_loss": 3.5337453851113136, "tokens_seen": 1417555968 }, { "epoch": 4.02, "learning_rate": 0.0002881444332998997, "loss": 2.885, "theoretical_loss": 3.533730824542533, "tokens_seen": 1417621504 }, { "epoch": 4.02, "learning_rate": 0.00028813440320962894, "loss": 2.7733, "theoretical_loss": 3.533716264835331, "tokens_seen": 1417687040 }, { "epoch": 4.02, "learning_rate": 0.00028812437311935806, "loss": 2.7506, "theoretical_loss": 3.5337017059896176, "tokens_seen": 1417752576 }, { "epoch": 4.02, "learning_rate": 0.0002881143430290873, "loss": 2.852, "theoretical_loss": 3.533687148005301, "tokens_seen": 1417818112 }, { "epoch": 4.02, "learning_rate": 0.0002881043129388164, "loss": 2.7361, "theoretical_loss": 3.5336725908822917, "tokens_seen": 1417883648 }, { "epoch": 4.02, "learning_rate": 0.00028809428284854566, "loss": 2.913, "theoretical_loss": 3.5336580346204975, "tokens_seen": 1417949184 }, { "epoch": 4.02, "learning_rate": 0.00028808425275827484, "loss": 2.7006, "theoretical_loss": 3.533643479219829, "tokens_seen": 1418014720 }, { "epoch": 4.02, "learning_rate": 0.000288074222668004, "loss": 2.8162, "theoretical_loss": 3.5336289246801944, "tokens_seen": 1418080256 }, { "epoch": 4.02, "learning_rate": 0.0002880641925777332, "loss": 2.6224, "theoretical_loss": 3.5336143710015038, "tokens_seen": 1418145792 }, { "epoch": 4.02, "learning_rate": 0.0002880541624874624, "loss": 2.8501, "theoretical_loss": 3.533599818183666, "tokens_seen": 1418211328 }, { "epoch": 4.02, "learning_rate": 0.00028804413239719157, "loss": 2.8446, "theoretical_loss": 3.5335852662265905, "tokens_seen": 1418276864 }, { "epoch": 4.02, "learning_rate": 0.0002880341023069208, "loss": 2.7528, "theoretical_loss": 3.5335707151301863, "tokens_seen": 1418342400 }, { "epoch": 4.02, "learning_rate": 0.00028802407221664993, "loss": 2.9536, "theoretical_loss": 3.533556164894363, "tokens_seen": 1418407936 }, { "epoch": 4.02, "learning_rate": 0.00028801404212637916, "loss": 2.8631, "theoretical_loss": 3.53354161551903, "tokens_seen": 1418473472 }, { "epoch": 4.02, "learning_rate": 0.00028800401203610835, "loss": 2.8684, "theoretical_loss": 3.5335270670040964, "tokens_seen": 1418539008 }, { "epoch": 4.02, "learning_rate": 0.0002879939819458375, "loss": 2.8894, "theoretical_loss": 3.533512519349472, "tokens_seen": 1418604544 }, { "epoch": 4.02, "learning_rate": 0.0002879839518555667, "loss": 2.8188, "theoretical_loss": 3.533497972555066, "tokens_seen": 1418670080 }, { "epoch": 4.02, "learning_rate": 0.0002879739217652959, "loss": 2.823, "theoretical_loss": 3.533483426620788, "tokens_seen": 1418735616 }, { "epoch": 4.02, "objective/train/docs_used": 2267285, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8112001419067383, "objective/train/theoretical_loss": 3.5334688815465465, "objective/train/tokens_used": 1439261152, "theoretical_loss": 3.5334688815465465, "tokens_seen": 1418801152 }, { "epoch": 4.02, "learning_rate": 0.00028796389167502507, "loss": 2.7625, "theoretical_loss": 3.5334688815465465, "tokens_seen": 1418801152 }, { "epoch": 4.02, "learning_rate": 0.0002879538615847543, "loss": 2.9218, "theoretical_loss": 3.533454337332252, "tokens_seen": 1418866688 }, { "epoch": 4.02, "learning_rate": 0.00028794383149448343, "loss": 2.7279, "theoretical_loss": 3.533439793977813, "tokens_seen": 1418932224 }, { "epoch": 4.02, "learning_rate": 0.00028793380140421267, "loss": 2.7815, "theoretical_loss": 3.53342525148314, "tokens_seen": 1418997760 }, { "epoch": 4.02, "learning_rate": 0.0002879237713139418, "loss": 2.8123, "theoretical_loss": 3.533410709848142, "tokens_seen": 1419063296 }, { "epoch": 4.02, "learning_rate": 0.00028791374122367103, "loss": 2.8023, "theoretical_loss": 3.533396169072728, "tokens_seen": 1419128832 }, { "epoch": 4.02, "learning_rate": 0.0002879037111334002, "loss": 2.7164, "theoretical_loss": 3.533381629156808, "tokens_seen": 1419194368 }, { "epoch": 4.02, "learning_rate": 0.0002878936810431294, "loss": 2.8362, "theoretical_loss": 3.5333670901002914, "tokens_seen": 1419259904 }, { "epoch": 4.02, "learning_rate": 0.0002878836509528586, "loss": 2.7743, "theoretical_loss": 3.5333525519030884, "tokens_seen": 1419325440 }, { "epoch": 4.02, "learning_rate": 0.0002878736208625878, "loss": 2.9206, "theoretical_loss": 3.533338014565107, "tokens_seen": 1419390976 }, { "epoch": 4.02, "learning_rate": 0.00028786359077231694, "loss": 2.8359, "theoretical_loss": 3.533323478086258, "tokens_seen": 1419456512 }, { "epoch": 4.02, "learning_rate": 0.00028785356068204617, "loss": 2.7432, "theoretical_loss": 3.5333089424664506, "tokens_seen": 1419522048 }, { "epoch": 4.02, "learning_rate": 0.0002878435305917753, "loss": 2.8711, "theoretical_loss": 3.5332944077055943, "tokens_seen": 1419587584 }, { "epoch": 4.02, "learning_rate": 0.00028783350050150453, "loss": 2.8521, "theoretical_loss": 3.533279873803599, "tokens_seen": 1419653120 }, { "epoch": 4.02, "learning_rate": 0.0002878234704112337, "loss": 2.872, "theoretical_loss": 3.533265340760374, "tokens_seen": 1419718656 }, { "epoch": 4.02, "learning_rate": 0.0002878134403209629, "loss": 2.7627, "theoretical_loss": 3.533250808575829, "tokens_seen": 1419784192 }, { "epoch": 4.02, "learning_rate": 0.0002878034102306921, "loss": 2.7624, "theoretical_loss": 3.5332362772498733, "tokens_seen": 1419849728 }, { "epoch": 4.02, "learning_rate": 0.00028779338014042126, "loss": 2.8514, "theoretical_loss": 3.5332217467824174, "tokens_seen": 1419915264 }, { "epoch": 4.02, "learning_rate": 0.00028778335005015044, "loss": 2.8662, "theoretical_loss": 3.53320721717337, "tokens_seen": 1419980800 }, { "epoch": 4.02, "learning_rate": 0.0002877733199598797, "loss": 2.9504, "theoretical_loss": 3.5331926884226417, "tokens_seen": 1420046336 }, { "epoch": 4.02, "learning_rate": 0.0002877632898696088, "loss": 2.7046, "theoretical_loss": 3.5331781605301416, "tokens_seen": 1420111872 }, { "epoch": 4.02, "learning_rate": 0.00028775325977933804, "loss": 2.8517, "theoretical_loss": 3.533163633495779, "tokens_seen": 1420177408 }, { "epoch": 4.02, "learning_rate": 0.00028774322968906716, "loss": 2.8296, "theoretical_loss": 3.5331491073194643, "tokens_seen": 1420242944 }, { "epoch": 4.02, "learning_rate": 0.0002877331995987964, "loss": 2.8303, "theoretical_loss": 3.5331345820011073, "tokens_seen": 1420308480 }, { "epoch": 4.02, "learning_rate": 0.0002877231695085256, "loss": 2.8838, "theoretical_loss": 3.5331200575406174, "tokens_seen": 1420374016 }, { "epoch": 4.02, "objective/train/docs_used": 2268625, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.820002794265747, "objective/train/theoretical_loss": 3.5331055339379045, "objective/train/tokens_used": 1440899552, "theoretical_loss": 3.5331055339379045, "tokens_seen": 1420439552 }, { "epoch": 4.02, "learning_rate": 0.00028771313941825476, "loss": 2.6904, "theoretical_loss": 3.5331055339379045, "tokens_seen": 1420439552 }, { "epoch": 4.02, "learning_rate": 0.00028770310932798394, "loss": 2.8535, "theoretical_loss": 3.533091011192878, "tokens_seen": 1420505088 }, { "epoch": 4.02, "learning_rate": 0.0002876930792377132, "loss": 2.8985, "theoretical_loss": 3.533076489305448, "tokens_seen": 1420570624 }, { "epoch": 4.02, "learning_rate": 0.0002876830491474423, "loss": 2.8344, "theoretical_loss": 3.5330619682755247, "tokens_seen": 1420636160 }, { "epoch": 4.02, "learning_rate": 0.00028767301905717154, "loss": 2.8612, "theoretical_loss": 3.533047448103017, "tokens_seen": 1420701696 }, { "epoch": 4.02, "learning_rate": 0.00028766298896690067, "loss": 2.7277, "theoretical_loss": 3.5330329287878355, "tokens_seen": 1420767232 }, { "epoch": 4.02, "learning_rate": 0.0002876529588766299, "loss": 2.8598, "theoretical_loss": 3.53301841032989, "tokens_seen": 1420832768 }, { "epoch": 4.02, "learning_rate": 0.0002876429287863591, "loss": 2.8394, "theoretical_loss": 3.5330038927290897, "tokens_seen": 1420898304 }, { "epoch": 4.02, "learning_rate": 0.00028763289869608826, "loss": 2.8167, "theoretical_loss": 3.5329893759853452, "tokens_seen": 1420963840 }, { "epoch": 4.02, "learning_rate": 0.00028762286860581745, "loss": 2.7837, "theoretical_loss": 3.532974860098566, "tokens_seen": 1421029376 }, { "epoch": 4.02, "learning_rate": 0.0002876128385155466, "loss": 2.8762, "theoretical_loss": 3.5329603450686617, "tokens_seen": 1421094912 }, { "epoch": 4.02, "learning_rate": 0.0002876028084252758, "loss": 2.7372, "theoretical_loss": 3.5329458308955433, "tokens_seen": 1421160448 }, { "epoch": 4.02, "learning_rate": 0.00028759277833500504, "loss": 2.8884, "theoretical_loss": 3.532931317579119, "tokens_seen": 1421225984 }, { "epoch": 4.02, "learning_rate": 0.00028758274824473417, "loss": 2.7973, "theoretical_loss": 3.5329168051193003, "tokens_seen": 1421291520 }, { "epoch": 4.02, "learning_rate": 0.0002875727181544634, "loss": 2.7072, "theoretical_loss": 3.5329022935159964, "tokens_seen": 1421357056 }, { "epoch": 4.02, "learning_rate": 0.00028756268806419253, "loss": 2.7847, "theoretical_loss": 3.5328877827691176, "tokens_seen": 1421422592 }, { "epoch": 4.02, "learning_rate": 0.00028755265797392177, "loss": 2.8145, "theoretical_loss": 3.5328732728785734, "tokens_seen": 1421488128 }, { "epoch": 4.02, "learning_rate": 0.00028754262788365095, "loss": 2.8626, "theoretical_loss": 3.532858763844274, "tokens_seen": 1421553664 }, { "epoch": 4.02, "learning_rate": 0.00028753259779338013, "loss": 2.874, "theoretical_loss": 3.53284425566613, "tokens_seen": 1421619200 }, { "epoch": 4.02, "learning_rate": 0.00028752256770310936, "loss": 2.8438, "theoretical_loss": 3.5328297483440503, "tokens_seen": 1421684736 }, { "epoch": 4.02, "learning_rate": 0.00028751253761283855, "loss": 2.781, "theoretical_loss": 3.5328152418779464, "tokens_seen": 1421750272 }, { "epoch": 4.02, "learning_rate": 0.0002875025075225677, "loss": 2.8511, "theoretical_loss": 3.5328007362677267, "tokens_seen": 1421815808 }, { "epoch": 4.02, "learning_rate": 0.0002874924774322969, "loss": 2.8032, "theoretical_loss": 3.532786231513302, "tokens_seen": 1421881344 }, { "epoch": 4.02, "learning_rate": 0.0002874824473420261, "loss": 2.7867, "theoretical_loss": 3.5327717276145827, "tokens_seen": 1421946880 }, { "epoch": 4.02, "learning_rate": 0.00028747241725175527, "loss": 3.0004, "theoretical_loss": 3.5327572245714784, "tokens_seen": 1422012416 }, { "epoch": 4.02, "objective/train/docs_used": 2271378, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.784115791320801, "objective/train/theoretical_loss": 3.532742722383899, "objective/train/tokens_used": 1442537952, "theoretical_loss": 3.532742722383899, "tokens_seen": 1422077952 }, { "epoch": 4.02, "learning_rate": 0.0002874623871614845, "loss": 2.8675, "theoretical_loss": 3.532742722383899, "tokens_seen": 1422077952 }, { "epoch": 4.02, "learning_rate": 0.00028745235707121363, "loss": 2.8633, "theoretical_loss": 3.532728221051756, "tokens_seen": 1422143488 }, { "epoch": 4.02, "learning_rate": 0.00028744232698094287, "loss": 2.9558, "theoretical_loss": 3.5327137205749577, "tokens_seen": 1422209024 }, { "epoch": 4.02, "learning_rate": 0.000287432296890672, "loss": 2.7409, "theoretical_loss": 3.532699220953415, "tokens_seen": 1422274560 }, { "epoch": 4.02, "learning_rate": 0.00028742226680040123, "loss": 2.9229, "theoretical_loss": 3.5326847221870388, "tokens_seen": 1422340096 }, { "epoch": 4.02, "learning_rate": 0.0002874122367101304, "loss": 2.7913, "theoretical_loss": 3.532670224275738, "tokens_seen": 1422405632 }, { "epoch": 4.02, "learning_rate": 0.0002874022066198596, "loss": 2.6591, "theoretical_loss": 3.5326557272194234, "tokens_seen": 1422471168 }, { "epoch": 4.02, "learning_rate": 0.0002873921765295888, "loss": 2.7389, "theoretical_loss": 3.532641231018005, "tokens_seen": 1422536704 }, { "epoch": 4.02, "learning_rate": 0.000287382146439318, "loss": 2.7146, "theoretical_loss": 3.532626735671393, "tokens_seen": 1422602240 }, { "epoch": 4.02, "learning_rate": 0.00028737211634904714, "loss": 2.8805, "theoretical_loss": 3.532612241179498, "tokens_seen": 1422667776 }, { "epoch": 4.02, "learning_rate": 0.00028736208625877637, "loss": 2.829, "theoretical_loss": 3.53259774754223, "tokens_seen": 1422733312 }, { "epoch": 4.02, "learning_rate": 0.0002873520561685055, "loss": 2.7053, "theoretical_loss": 3.5325832547594995, "tokens_seen": 1422798848 }, { "epoch": 4.02, "learning_rate": 0.00028734202607823473, "loss": 2.8933, "theoretical_loss": 3.532568762831216, "tokens_seen": 1422864384 }, { "epoch": 4.02, "learning_rate": 0.0002873319959879639, "loss": 2.9166, "theoretical_loss": 3.5325542717572906, "tokens_seen": 1422929920 }, { "epoch": 4.02, "learning_rate": 0.0002873219658976931, "loss": 2.8476, "theoretical_loss": 3.5325397815376327, "tokens_seen": 1422995456 }, { "epoch": 4.02, "learning_rate": 0.0002873119358074223, "loss": 2.7795, "theoretical_loss": 3.5325252921721537, "tokens_seen": 1423060992 }, { "epoch": 4.02, "learning_rate": 0.00028730190571715146, "loss": 2.8351, "theoretical_loss": 3.532510803660763, "tokens_seen": 1423126528 }, { "epoch": 4.02, "learning_rate": 0.00028729187562688064, "loss": 2.7599, "theoretical_loss": 3.5324963160033716, "tokens_seen": 1423192064 }, { "epoch": 4.02, "learning_rate": 0.0002872818455366099, "loss": 2.6999, "theoretical_loss": 3.5324818291998894, "tokens_seen": 1423257600 }, { "epoch": 4.02, "learning_rate": 0.000287271815446339, "loss": 2.8664, "theoretical_loss": 3.532467343250227, "tokens_seen": 1423323136 }, { "epoch": 4.02, "learning_rate": 0.00028726178535606824, "loss": 2.9275, "theoretical_loss": 3.5324528581542944, "tokens_seen": 1423388672 }, { "epoch": 4.02, "learning_rate": 0.00028725175526579736, "loss": 2.7064, "theoretical_loss": 3.5324383739120027, "tokens_seen": 1423454208 }, { "epoch": 4.02, "learning_rate": 0.0002872417251755266, "loss": 2.8992, "theoretical_loss": 3.532423890523261, "tokens_seen": 1423519744 }, { "epoch": 4.02, "learning_rate": 0.0002872316950852558, "loss": 2.8451, "theoretical_loss": 3.5324094079879806, "tokens_seen": 1423585280 }, { "epoch": 4.02, "learning_rate": 0.00028722166499498496, "loss": 2.7908, "theoretical_loss": 3.5323949263060728, "tokens_seen": 1423650816 }, { "epoch": 4.02, "objective/train/docs_used": 2274114, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5525717735290527, "objective/train/theoretical_loss": 3.5323804454774463, "objective/train/tokens_used": 1444176352, "theoretical_loss": 3.5323804454774463, "tokens_seen": 1423716352 }, { "epoch": 4.02, "learning_rate": 0.00028721163490471414, "loss": 2.801, "theoretical_loss": 3.5323804454774463, "tokens_seen": 1423716352 }, { "epoch": 4.02, "learning_rate": 0.0002872016048144434, "loss": 2.8731, "theoretical_loss": 3.5323659655020125, "tokens_seen": 1423781888 }, { "epoch": 4.02, "learning_rate": 0.0002871915747241725, "loss": 2.847, "theoretical_loss": 3.5323514863796817, "tokens_seen": 1423847424 }, { "epoch": 4.02, "learning_rate": 0.00028718154463390174, "loss": 2.8652, "theoretical_loss": 3.5323370081103644, "tokens_seen": 1423912960 }, { "epoch": 4.02, "learning_rate": 0.00028717151454363087, "loss": 2.7683, "theoretical_loss": 3.532322530693971, "tokens_seen": 1423978496 }, { "epoch": 4.02, "learning_rate": 0.0002871614844533601, "loss": 2.8694, "theoretical_loss": 3.5323080541304126, "tokens_seen": 1424044032 }, { "epoch": 4.02, "learning_rate": 0.0002871514543630893, "loss": 2.906, "theoretical_loss": 3.5322935784195986, "tokens_seen": 1424109568 }, { "epoch": 4.02, "learning_rate": 0.00028714142427281846, "loss": 2.7909, "theoretical_loss": 3.5322791035614403, "tokens_seen": 1424175104 }, { "epoch": 4.02, "learning_rate": 0.00028713139418254765, "loss": 2.7061, "theoretical_loss": 3.532264629555848, "tokens_seen": 1424240640 }, { "epoch": 4.02, "learning_rate": 0.0002871213640922768, "loss": 2.5518, "theoretical_loss": 3.532250156402732, "tokens_seen": 1424306176 }, { "epoch": 4.02, "learning_rate": 0.000287111334002006, "loss": 2.8456, "theoretical_loss": 3.5322356841020035, "tokens_seen": 1424371712 }, { "epoch": 4.02, "learning_rate": 0.00028710130391173524, "loss": 2.8376, "theoretical_loss": 3.532221212653573, "tokens_seen": 1424437248 }, { "epoch": 4.02, "learning_rate": 0.00028709127382146437, "loss": 2.8059, "theoretical_loss": 3.5322067420573506, "tokens_seen": 1424502784 }, { "epoch": 4.02, "learning_rate": 0.0002870812437311936, "loss": 2.7788, "theoretical_loss": 3.532192272313247, "tokens_seen": 1424568320 }, { "epoch": 4.02, "learning_rate": 0.00028707121364092273, "loss": 2.624, "theoretical_loss": 3.532177803421173, "tokens_seen": 1424633856 }, { "epoch": 4.02, "learning_rate": 0.00028706118355065197, "loss": 2.8853, "theoretical_loss": 3.53216333538104, "tokens_seen": 1424699392 }, { "epoch": 4.02, "learning_rate": 0.00028705115346038115, "loss": 2.7808, "theoretical_loss": 3.532148868192757, "tokens_seen": 1424764928 }, { "epoch": 4.02, "learning_rate": 0.00028704112337011033, "loss": 2.7515, "theoretical_loss": 3.532134401856236, "tokens_seen": 1424830464 }, { "epoch": 4.02, "learning_rate": 0.0002870310932798395, "loss": 2.7201, "theoretical_loss": 3.5321199363713873, "tokens_seen": 1424896000 }, { "epoch": 4.02, "learning_rate": 0.00028702106318956875, "loss": 2.9124, "theoretical_loss": 3.532105471738121, "tokens_seen": 1424961536 }, { "epoch": 4.02, "learning_rate": 0.0002870110330992979, "loss": 2.8336, "theoretical_loss": 3.532091007956349, "tokens_seen": 1425027072 }, { "epoch": 4.02, "learning_rate": 0.0002870010030090271, "loss": 2.8792, "theoretical_loss": 3.532076545025981, "tokens_seen": 1425092608 }, { "epoch": 4.02, "learning_rate": 0.00028699097291875624, "loss": 2.8434, "theoretical_loss": 3.532062082946928, "tokens_seen": 1425158144 }, { "epoch": 4.02, "learning_rate": 0.00028698094282848547, "loss": 2.8611, "theoretical_loss": 3.5320476217191006, "tokens_seen": 1425223680 }, { "epoch": 4.02, "learning_rate": 0.00028697091273821465, "loss": 2.7993, "theoretical_loss": 3.5320331613424103, "tokens_seen": 1425289216 }, { "epoch": 4.02, "objective/train/docs_used": 2277044, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6768486499786377, "objective/train/theoretical_loss": 3.5320187018167672, "objective/train/tokens_used": 1445814752, "theoretical_loss": 3.5320187018167672, "tokens_seen": 1425354752 }, { "epoch": 4.02, "learning_rate": 0.00028696088264794383, "loss": 2.8081, "theoretical_loss": 3.5320187018167672, "tokens_seen": 1425354752 }, { "epoch": 4.02, "learning_rate": 0.000286950852557673, "loss": 2.8994, "theoretical_loss": 3.532004243142082, "tokens_seen": 1425420288 }, { "epoch": 4.02, "learning_rate": 0.0002869408224674022, "loss": 2.817, "theoretical_loss": 3.531989785318266, "tokens_seen": 1425485824 }, { "epoch": 4.02, "learning_rate": 0.0002869307923771314, "loss": 2.8959, "theoretical_loss": 3.5319753283452298, "tokens_seen": 1425551360 }, { "epoch": 4.02, "learning_rate": 0.0002869207622868606, "loss": 2.8591, "theoretical_loss": 3.531960872222884, "tokens_seen": 1425616896 }, { "epoch": 4.02, "learning_rate": 0.00028691073219658974, "loss": 2.8467, "theoretical_loss": 3.53194641695114, "tokens_seen": 1425682432 }, { "epoch": 4.02, "learning_rate": 0.000286900702106319, "loss": 2.8773, "theoretical_loss": 3.5319319625299075, "tokens_seen": 1425747968 }, { "epoch": 4.02, "learning_rate": 0.0002868906720160481, "loss": 2.8347, "theoretical_loss": 3.531917508959099, "tokens_seen": 1425813504 }, { "epoch": 4.02, "learning_rate": 0.00028688064192577734, "loss": 2.8129, "theoretical_loss": 3.531903056238624, "tokens_seen": 1425879040 }, { "epoch": 4.02, "learning_rate": 0.0002868706118355065, "loss": 2.8532, "theoretical_loss": 3.531888604368394, "tokens_seen": 1425944576 }, { "epoch": 4.02, "learning_rate": 0.0002868605817452357, "loss": 2.9241, "theoretical_loss": 3.53187415334832, "tokens_seen": 1426010112 }, { "epoch": 4.02, "learning_rate": 0.0002868505516549649, "loss": 2.8147, "theoretical_loss": 3.5318597031783123, "tokens_seen": 1426075648 }, { "epoch": 4.02, "learning_rate": 0.0002868405215646941, "loss": 2.9195, "theoretical_loss": 3.5318452538582825, "tokens_seen": 1426141184 }, { "epoch": 4.02, "learning_rate": 0.00028683049147442324, "loss": 2.8901, "theoretical_loss": 3.5318308053881418, "tokens_seen": 1426206720 }, { "epoch": 4.02, "learning_rate": 0.0002868204613841525, "loss": 2.8002, "theoretical_loss": 3.5318163577678003, "tokens_seen": 1426272256 }, { "epoch": 4.02, "learning_rate": 0.0002868104312938816, "loss": 2.848, "theoretical_loss": 3.5318019109971694, "tokens_seen": 1426337792 }, { "epoch": 4.02, "learning_rate": 0.00028680040120361084, "loss": 2.8136, "theoretical_loss": 3.53178746507616, "tokens_seen": 1426403328 }, { "epoch": 4.02, "learning_rate": 0.00028679037111334, "loss": 2.8665, "theoretical_loss": 3.531773020004683, "tokens_seen": 1426468864 }, { "epoch": 4.02, "learning_rate": 0.0002867803410230692, "loss": 2.8243, "theoretical_loss": 3.53175857578265, "tokens_seen": 1426534400 }, { "epoch": 4.02, "learning_rate": 0.00028677031093279844, "loss": 2.8705, "theoretical_loss": 3.5317441324099708, "tokens_seen": 1426599936 }, { "epoch": 4.02, "learning_rate": 0.00028676028084252756, "loss": 2.7366, "theoretical_loss": 3.531729689886558, "tokens_seen": 1426665472 }, { "epoch": 4.02, "learning_rate": 0.0002867502507522568, "loss": 2.7143, "theoretical_loss": 3.5317152482123215, "tokens_seen": 1426731008 }, { "epoch": 4.02, "learning_rate": 0.000286740220661986, "loss": 2.9606, "theoretical_loss": 3.531700807387173, "tokens_seen": 1426796544 }, { "epoch": 4.02, "learning_rate": 0.00028673019057171516, "loss": 2.8006, "theoretical_loss": 3.531686367411023, "tokens_seen": 1426862080 }, { "epoch": 4.02, "learning_rate": 0.00028672016048144434, "loss": 2.7531, "theoretical_loss": 3.5316719282837834, "tokens_seen": 1426927616 }, { "epoch": 4.02, "objective/train/docs_used": 2279872, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9743733406066895, "objective/train/theoretical_loss": 3.5316574900053643, "objective/train/tokens_used": 1447453152, "theoretical_loss": 3.5316574900053643, "tokens_seen": 1426993152 }, { "epoch": 4.02, "learning_rate": 0.0002867101303911736, "loss": 2.8738, "theoretical_loss": 3.5316574900053643, "tokens_seen": 1426993152 }, { "epoch": 4.02, "learning_rate": 0.0002867001003009027, "loss": 2.9106, "theoretical_loss": 3.531643052575678, "tokens_seen": 1427058688 }, { "epoch": 4.02, "learning_rate": 0.00028669007021063194, "loss": 2.8353, "theoretical_loss": 3.5316286159946344, "tokens_seen": 1427124224 }, { "epoch": 4.02, "learning_rate": 0.00028668004012036107, "loss": 2.9007, "theoretical_loss": 3.531614180262146, "tokens_seen": 1427189760 }, { "epoch": 4.02, "learning_rate": 0.0002866700100300903, "loss": 2.8273, "theoretical_loss": 3.5315997453781227, "tokens_seen": 1427255296 }, { "epoch": 4.02, "learning_rate": 0.0002866599799398195, "loss": 2.6547, "theoretical_loss": 3.531585311342476, "tokens_seen": 1427320832 }, { "epoch": 4.02, "learning_rate": 0.00028664994984954866, "loss": 2.8277, "theoretical_loss": 3.5315708781551174, "tokens_seen": 1427386368 }, { "epoch": 4.02, "learning_rate": 0.00028663991975927785, "loss": 2.8797, "theoretical_loss": 3.531556445815958, "tokens_seen": 1427451904 }, { "epoch": 4.02, "learning_rate": 0.000286629889669007, "loss": 2.8523, "theoretical_loss": 3.531542014324909, "tokens_seen": 1427517440 }, { "epoch": 4.02, "learning_rate": 0.0002866198595787362, "loss": 2.7777, "theoretical_loss": 3.5315275836818816, "tokens_seen": 1427582976 }, { "epoch": 4.02, "learning_rate": 0.00028660982948846544, "loss": 2.967, "theoretical_loss": 3.531513153886787, "tokens_seen": 1427648512 }, { "epoch": 4.02, "learning_rate": 0.00028659979939819457, "loss": 2.8114, "theoretical_loss": 3.531498724939537, "tokens_seen": 1427714048 }, { "epoch": 4.02, "learning_rate": 0.0002865897693079238, "loss": 2.7179, "theoretical_loss": 3.5314842968400417, "tokens_seen": 1427779584 }, { "epoch": 4.02, "learning_rate": 0.00028657973921765293, "loss": 2.8742, "theoretical_loss": 3.5314698695882134, "tokens_seen": 1427845120 }, { "epoch": 4.02, "learning_rate": 0.00028656970912738217, "loss": 2.8095, "theoretical_loss": 3.5314554431839627, "tokens_seen": 1427910656 }, { "epoch": 4.02, "learning_rate": 0.00028655967903711135, "loss": 2.8949, "theoretical_loss": 3.531441017627202, "tokens_seen": 1427976192 }, { "epoch": 4.02, "learning_rate": 0.00028654964894684053, "loss": 2.7213, "theoretical_loss": 3.5314265929178412, "tokens_seen": 1428041728 }, { "epoch": 4.02, "learning_rate": 0.0002865396188565697, "loss": 2.8377, "theoretical_loss": 3.5314121690557925, "tokens_seen": 1428107264 }, { "epoch": 4.02, "learning_rate": 0.00028652958876629895, "loss": 2.9684, "theoretical_loss": 3.531397746040967, "tokens_seen": 1428172800 }, { "epoch": 4.02, "learning_rate": 0.0002865195586760281, "loss": 2.8363, "theoretical_loss": 3.5313833238732766, "tokens_seen": 1428238336 }, { "epoch": 4.02, "learning_rate": 0.0002865095285857573, "loss": 2.8737, "theoretical_loss": 3.531368902552632, "tokens_seen": 1428303872 }, { "epoch": 4.02, "learning_rate": 0.00028649949849548644, "loss": 2.8817, "theoretical_loss": 3.531354482078944, "tokens_seen": 1428369408 }, { "epoch": 4.02, "learning_rate": 0.00028648946840521567, "loss": 2.9634, "theoretical_loss": 3.531340062452126, "tokens_seen": 1428434944 }, { "epoch": 4.02, "learning_rate": 0.00028647943831494485, "loss": 2.7793, "theoretical_loss": 3.531325643672088, "tokens_seen": 1428500480 }, { "epoch": 4.02, "learning_rate": 0.00028646940822467403, "loss": 2.8163, "theoretical_loss": 3.531311225738741, "tokens_seen": 1428566016 }, { "epoch": 4.02, "objective/train/docs_used": 2281301, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6250953674316406, "objective/train/theoretical_loss": 3.531296808651997, "objective/train/tokens_used": 1449091552, "theoretical_loss": 3.531296808651997, "tokens_seen": 1428631552 }, { "epoch": 4.02, "learning_rate": 0.0002864593781344032, "loss": 2.6436, "theoretical_loss": 3.531296808651997, "tokens_seen": 1428631552 }, { "epoch": 4.02, "learning_rate": 0.0002864493480441324, "loss": 2.9037, "theoretical_loss": 3.5312823924117684, "tokens_seen": 1428697088 }, { "epoch": 4.02, "learning_rate": 0.0002864393179538616, "loss": 2.7452, "theoretical_loss": 3.5312679770179654, "tokens_seen": 1428762624 }, { "epoch": 4.02, "learning_rate": 0.0002864292878635908, "loss": 2.7801, "theoretical_loss": 3.5312535624705, "tokens_seen": 1428828160 }, { "epoch": 4.02, "learning_rate": 0.00028641925777331994, "loss": 2.8176, "theoretical_loss": 3.5312391487692834, "tokens_seen": 1428893696 }, { "epoch": 4.02, "learning_rate": 0.0002864092276830492, "loss": 2.7872, "theoretical_loss": 3.5312247359142273, "tokens_seen": 1428959232 }, { "epoch": 4.02, "learning_rate": 0.0002863991975927783, "loss": 2.9694, "theoretical_loss": 3.5312103239052433, "tokens_seen": 1429024768 }, { "epoch": 4.02, "learning_rate": 0.00028638916750250754, "loss": 2.8696, "theoretical_loss": 3.531195912742243, "tokens_seen": 1429090304 }, { "epoch": 4.02, "learning_rate": 0.0002863791374122367, "loss": 2.8827, "theoretical_loss": 3.5311815024251376, "tokens_seen": 1429155840 }, { "epoch": 4.02, "learning_rate": 0.0002863691073219659, "loss": 2.8968, "theoretical_loss": 3.5311670929538392, "tokens_seen": 1429221376 }, { "epoch": 4.02, "learning_rate": 0.0002863590772316951, "loss": 2.8654, "theoretical_loss": 3.5311526843282586, "tokens_seen": 1429286912 }, { "epoch": 4.02, "learning_rate": 0.0002863490471414243, "loss": 2.7642, "theoretical_loss": 3.531138276548308, "tokens_seen": 1429352448 }, { "epoch": 4.02, "learning_rate": 0.00028633901705115344, "loss": 2.6815, "theoretical_loss": 3.531123869613899, "tokens_seen": 1429417984 }, { "epoch": 4.02, "learning_rate": 0.0002863289869608827, "loss": 2.701, "theoretical_loss": 3.5311094635249427, "tokens_seen": 1429483520 }, { "epoch": 4.02, "learning_rate": 0.0002863189568706118, "loss": 2.9628, "theoretical_loss": 3.531095058281351, "tokens_seen": 1429549056 }, { "epoch": 4.02, "learning_rate": 0.00028630892678034104, "loss": 2.9763, "theoretical_loss": 3.531080653883036, "tokens_seen": 1429614592 }, { "epoch": 4.02, "learning_rate": 0.0002862988966900702, "loss": 2.8026, "theoretical_loss": 3.531066250329909, "tokens_seen": 1429680128 }, { "epoch": 4.02, "learning_rate": 0.0002862888665997994, "loss": 2.8425, "theoretical_loss": 3.5310518476218817, "tokens_seen": 1429745664 }, { "epoch": 4.02, "learning_rate": 0.0002862788365095286, "loss": 2.7776, "theoretical_loss": 3.5310374457588654, "tokens_seen": 1429811200 }, { "epoch": 4.02, "learning_rate": 0.00028626880641925776, "loss": 2.675, "theoretical_loss": 3.531023044740772, "tokens_seen": 1429876736 }, { "epoch": 4.02, "learning_rate": 0.00028625877632898694, "loss": 2.815, "theoretical_loss": 3.531008644567514, "tokens_seen": 1429942272 }, { "epoch": 4.02, "learning_rate": 0.0002862487462387162, "loss": 2.8741, "theoretical_loss": 3.5309942452390017, "tokens_seen": 1430007808 }, { "epoch": 4.02, "learning_rate": 0.0002862387161484453, "loss": 2.8158, "theoretical_loss": 3.530979846755148, "tokens_seen": 1430073344 }, { "epoch": 4.02, "learning_rate": 0.00028622868605817454, "loss": 2.8722, "theoretical_loss": 3.5309654491158637, "tokens_seen": 1430138880 }, { "epoch": 4.02, "learning_rate": 0.00028621865596790367, "loss": 2.9896, "theoretical_loss": 3.530951052321062, "tokens_seen": 1430204416 }, { "epoch": 4.02, "objective/train/docs_used": 2284182, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.485083818435669, "objective/train/theoretical_loss": 3.5309366563706535, "objective/train/tokens_used": 1450729952, "theoretical_loss": 3.5309366563706535, "tokens_seen": 1430269952 }, { "epoch": 4.02, "learning_rate": 0.0002862086258776329, "loss": 2.7703, "theoretical_loss": 3.5309366563706535, "tokens_seen": 1430269952 }, { "epoch": 4.02, "learning_rate": 0.0002861985957873621, "loss": 2.8166, "theoretical_loss": 3.53092226126455, "tokens_seen": 1430335488 }, { "epoch": 4.02, "learning_rate": 0.00028618856569709127, "loss": 2.7013, "theoretical_loss": 3.5309078670026635, "tokens_seen": 1430401024 }, { "epoch": 4.02, "learning_rate": 0.00028617853560682045, "loss": 2.7762, "theoretical_loss": 3.530893473584906, "tokens_seen": 1430466560 }, { "epoch": 4.02, "learning_rate": 0.0002861685055165497, "loss": 2.8509, "theoretical_loss": 3.530879081011189, "tokens_seen": 1430532096 }, { "epoch": 4.02, "learning_rate": 0.0002861584754262788, "loss": 2.8877, "theoretical_loss": 3.530864689281425, "tokens_seen": 1430597632 }, { "epoch": 4.02, "learning_rate": 0.00028614844533600805, "loss": 2.923, "theoretical_loss": 3.5308502983955248, "tokens_seen": 1430663168 }, { "epoch": 4.02, "learning_rate": 0.00028613841524573717, "loss": 2.7363, "theoretical_loss": 3.530835908353401, "tokens_seen": 1430728704 }, { "epoch": 4.02, "learning_rate": 0.0002861283851554664, "loss": 2.7651, "theoretical_loss": 3.5308215191549657, "tokens_seen": 1430794240 }, { "epoch": 4.02, "learning_rate": 0.0002861183550651956, "loss": 2.7929, "theoretical_loss": 3.53080713080013, "tokens_seen": 1430859776 }, { "epoch": 4.02, "learning_rate": 0.00028610832497492477, "loss": 2.7827, "theoretical_loss": 3.5307927432888064, "tokens_seen": 1430925312 }, { "epoch": 4.02, "learning_rate": 0.00028609829488465395, "loss": 2.8672, "theoretical_loss": 3.5307783566209068, "tokens_seen": 1430990848 }, { "epoch": 4.02, "learning_rate": 0.00028608826479438313, "loss": 2.8068, "theoretical_loss": 3.530763970796343, "tokens_seen": 1431056384 }, { "epoch": 4.02, "learning_rate": 0.0002860782347041123, "loss": 2.9021, "theoretical_loss": 3.530749585815027, "tokens_seen": 1431121920 }, { "epoch": 4.02, "learning_rate": 0.00028606820461384155, "loss": 2.8752, "theoretical_loss": 3.5307352016768707, "tokens_seen": 1431187456 }, { "epoch": 4.02, "learning_rate": 0.0002860581745235707, "loss": 2.8166, "theoretical_loss": 3.530720818381786, "tokens_seen": 1431252992 }, { "epoch": 4.02, "learning_rate": 0.0002860481444332999, "loss": 2.6978, "theoretical_loss": 3.5307064359296847, "tokens_seen": 1431318528 }, { "epoch": 4.02, "learning_rate": 0.0002860381143430291, "loss": 2.7668, "theoretical_loss": 3.530692054320479, "tokens_seen": 1431384064 }, { "epoch": 4.02, "learning_rate": 0.0002860280842527583, "loss": 2.9209, "theoretical_loss": 3.5306776735540817, "tokens_seen": 1431449600 }, { "epoch": 4.02, "learning_rate": 0.0002860180541624875, "loss": 2.9177, "theoretical_loss": 3.530663293630403, "tokens_seen": 1431515136 }, { "epoch": 4.02, "learning_rate": 0.00028600802407221664, "loss": 2.7628, "theoretical_loss": 3.530648914549357, "tokens_seen": 1431580672 }, { "epoch": 4.02, "learning_rate": 0.00028599799398194587, "loss": 2.7063, "theoretical_loss": 3.5306345363108544, "tokens_seen": 1431646208 }, { "epoch": 4.02, "learning_rate": 0.00028598796389167505, "loss": 2.8571, "theoretical_loss": 3.530620158914808, "tokens_seen": 1431711744 }, { "epoch": 4.02, "learning_rate": 0.00028597793380140423, "loss": 2.797, "theoretical_loss": 3.530605782361129, "tokens_seen": 1431777280 }, { "epoch": 4.02, "learning_rate": 0.0002859679037111334, "loss": 2.7589, "theoretical_loss": 3.53059140664973, "tokens_seen": 1431842816 }, { "epoch": 4.02, "objective/train/docs_used": 2287003, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8501064777374268, "objective/train/theoretical_loss": 3.5305770317805236, "objective/train/tokens_used": 1452368352, "theoretical_loss": 3.5305770317805236, "tokens_seen": 1431908352 }, { "epoch": 4.02, "learning_rate": 0.0002859578736208626, "loss": 2.7517, "theoretical_loss": 3.5305770317805236, "tokens_seen": 1431908352 }, { "epoch": 4.02, "learning_rate": 0.0002859478435305918, "loss": 3.0008, "theoretical_loss": 3.5305626577534213, "tokens_seen": 1431973888 }, { "epoch": 4.02, "learning_rate": 0.000285937813440321, "loss": 2.7781, "theoretical_loss": 3.5305482845683356, "tokens_seen": 1432039424 }, { "epoch": 4.02, "learning_rate": 0.00028592778335005014, "loss": 2.8546, "theoretical_loss": 3.530533912225178, "tokens_seen": 1432104960 }, { "epoch": 4.02, "learning_rate": 0.0002859177532597794, "loss": 2.8174, "theoretical_loss": 3.530519540723861, "tokens_seen": 1432170496 }, { "epoch": 4.02, "learning_rate": 0.0002859077231695085, "loss": 2.8284, "theoretical_loss": 3.5305051700642975, "tokens_seen": 1432236032 }, { "epoch": 4.02, "learning_rate": 0.00028589769307923774, "loss": 2.9252, "theoretical_loss": 3.5304908002463984, "tokens_seen": 1432301568 }, { "epoch": 4.02, "learning_rate": 0.0002858876629889669, "loss": 2.7039, "theoretical_loss": 3.5304764312700767, "tokens_seen": 1432367104 }, { "epoch": 4.02, "learning_rate": 0.0002858776328986961, "loss": 2.9947, "theoretical_loss": 3.5304620631352446, "tokens_seen": 1432432640 }, { "epoch": 4.02, "learning_rate": 0.0002858676028084253, "loss": 2.9135, "theoretical_loss": 3.530447695841814, "tokens_seen": 1432498176 }, { "epoch": 4.02, "learning_rate": 0.0002858575727181545, "loss": 2.8143, "theoretical_loss": 3.5304333293896972, "tokens_seen": 1432563712 }, { "epoch": 4.02, "learning_rate": 0.00028584754262788364, "loss": 2.7199, "theoretical_loss": 3.530418963778807, "tokens_seen": 1432629248 }, { "epoch": 4.02, "learning_rate": 0.0002858375125376129, "loss": 2.8832, "theoretical_loss": 3.5304045990090547, "tokens_seen": 1432694784 }, { "epoch": 4.02, "learning_rate": 0.000285827482447342, "loss": 2.7629, "theoretical_loss": 3.5303902350803535, "tokens_seen": 1432760320 }, { "epoch": 4.02, "learning_rate": 0.00028581745235707124, "loss": 2.9691, "theoretical_loss": 3.5303758719926153, "tokens_seen": 1432825856 }, { "epoch": 4.02, "learning_rate": 0.0002858074222668004, "loss": 2.934, "theoretical_loss": 3.5303615097457524, "tokens_seen": 1432891392 }, { "epoch": 4.02, "learning_rate": 0.0002857973921765296, "loss": 2.7951, "theoretical_loss": 3.530347148339677, "tokens_seen": 1432956928 }, { "epoch": 4.02, "learning_rate": 0.0002857873620862588, "loss": 2.7852, "theoretical_loss": 3.5303327877743014, "tokens_seen": 1433022464 }, { "epoch": 4.02, "learning_rate": 0.00028577733199598796, "loss": 2.9154, "theoretical_loss": 3.5303184280495383, "tokens_seen": 1433088000 }, { "epoch": 4.02, "learning_rate": 0.00028576730190571714, "loss": 2.7542, "theoretical_loss": 3.5303040691652994, "tokens_seen": 1433153536 }, { "epoch": 4.02, "learning_rate": 0.0002857572718154464, "loss": 3.0246, "theoretical_loss": 3.530289711121498, "tokens_seen": 1433219072 }, { "epoch": 4.02, "learning_rate": 0.0002857472417251755, "loss": 2.7935, "theoretical_loss": 3.5302753539180456, "tokens_seen": 1433284608 }, { "epoch": 4.02, "learning_rate": 0.00028573721163490474, "loss": 2.9075, "theoretical_loss": 3.5302609975548553, "tokens_seen": 1433350144 }, { "epoch": 4.02, "learning_rate": 0.00028572718154463387, "loss": 2.8914, "theoretical_loss": 3.5302466420318392, "tokens_seen": 1433415680 }, { "epoch": 4.02, "learning_rate": 0.0002857171514543631, "loss": 2.8215, "theoretical_loss": 3.5302322873489094, "tokens_seen": 1433481216 }, { "epoch": 4.02, "objective/train/docs_used": 2289434, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8615128993988037, "objective/train/theoretical_loss": 3.5302179335059787, "objective/train/tokens_used": 1454006752, "theoretical_loss": 3.5302179335059787, "tokens_seen": 1433546752 }, { "epoch": 4.02, "learning_rate": 0.0002857071213640923, "loss": 2.8407, "theoretical_loss": 3.5302179335059787, "tokens_seen": 1433546752 }, { "epoch": 4.02, "learning_rate": 0.00028569709127382147, "loss": 2.7375, "theoretical_loss": 3.5302035805029597, "tokens_seen": 1433612288 }, { "epoch": 4.02, "learning_rate": 0.00028568706118355065, "loss": 2.8452, "theoretical_loss": 3.530189228339765, "tokens_seen": 1433677824 }, { "epoch": 4.02, "learning_rate": 0.0002856770310932799, "loss": 2.8183, "theoretical_loss": 3.5301748770163055, "tokens_seen": 1433743360 }, { "epoch": 4.02, "learning_rate": 0.000285667001003009, "loss": 2.8334, "theoretical_loss": 3.5301605265324962, "tokens_seen": 1433808896 }, { "epoch": 4.02, "learning_rate": 0.00028565697091273825, "loss": 2.8934, "theoretical_loss": 3.5301461768882474, "tokens_seen": 1433874432 }, { "epoch": 4.02, "learning_rate": 0.00028564694082246737, "loss": 2.8116, "theoretical_loss": 3.530131828083473, "tokens_seen": 1433939968 }, { "epoch": 4.02, "learning_rate": 0.0002856369107321966, "loss": 2.8633, "theoretical_loss": 3.530117480118085, "tokens_seen": 1434005504 }, { "epoch": 4.02, "learning_rate": 0.0002856268806419258, "loss": 2.846, "theoretical_loss": 3.530103132991996, "tokens_seen": 1434071040 }, { "epoch": 4.02, "learning_rate": 0.00028561685055165497, "loss": 2.8742, "theoretical_loss": 3.5300887867051185, "tokens_seen": 1434136576 }, { "epoch": 4.02, "learning_rate": 0.00028560682046138415, "loss": 2.8567, "theoretical_loss": 3.530074441257365, "tokens_seen": 1434202112 }, { "epoch": 4.02, "learning_rate": 0.00028559679037111333, "loss": 2.8432, "theoretical_loss": 3.5300600966486484, "tokens_seen": 1434267648 }, { "epoch": 4.02, "learning_rate": 0.0002855867602808425, "loss": 2.7046, "theoretical_loss": 3.5300457528788813, "tokens_seen": 1434333184 }, { "epoch": 4.02, "learning_rate": 0.00028557673019057175, "loss": 2.7793, "theoretical_loss": 3.5300314099479753, "tokens_seen": 1434398720 }, { "epoch": 4.02, "learning_rate": 0.0002855667001003009, "loss": 2.7956, "theoretical_loss": 3.5300170678558445, "tokens_seen": 1434464256 }, { "epoch": 4.02, "learning_rate": 0.0002855566700100301, "loss": 2.8481, "theoretical_loss": 3.5300027266024006, "tokens_seen": 1434529792 }, { "epoch": 4.02, "learning_rate": 0.0002855466399197593, "loss": 2.8455, "theoretical_loss": 3.529988386187556, "tokens_seen": 1434595328 }, { "epoch": 4.02, "learning_rate": 0.0002855366098294885, "loss": 2.8134, "theoretical_loss": 3.5299740466112244, "tokens_seen": 1434660864 }, { "epoch": 4.02, "learning_rate": 0.00028552657973921765, "loss": 2.7242, "theoretical_loss": 3.529959707873318, "tokens_seen": 1434726400 }, { "epoch": 4.02, "learning_rate": 0.00028551654964894684, "loss": 3.0073, "theoretical_loss": 3.529945369973749, "tokens_seen": 1434791936 }, { "epoch": 4.02, "learning_rate": 0.000285506519558676, "loss": 2.8203, "theoretical_loss": 3.529931032912431, "tokens_seen": 1434857472 }, { "epoch": 4.02, "learning_rate": 0.00028549648946840525, "loss": 2.7252, "theoretical_loss": 3.529916696689276, "tokens_seen": 1434923008 }, { "epoch": 4.02, "learning_rate": 0.0002854864593781344, "loss": 2.7359, "theoretical_loss": 3.5299023613041967, "tokens_seen": 1434988544 }, { "epoch": 4.02, "learning_rate": 0.0002854764292878636, "loss": 2.7369, "theoretical_loss": 3.529888026757106, "tokens_seen": 1435054080 }, { "epoch": 4.02, "learning_rate": 0.00028546639919759274, "loss": 2.8739, "theoretical_loss": 3.529873693047917, "tokens_seen": 1435119616 }, { "epoch": 4.02, "objective/train/docs_used": 2292216, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.743633270263672, "objective/train/theoretical_loss": 3.529859360176542, "objective/train/tokens_used": 1455645152, "theoretical_loss": 3.529859360176542, "tokens_seen": 1435185152 }, { "epoch": 4.02, "learning_rate": 0.000285456369107322, "loss": 2.7383, "theoretical_loss": 3.529859360176542, "tokens_seen": 1435185152 }, { "epoch": 4.02, "learning_rate": 0.00028544633901705116, "loss": 2.7443, "theoretical_loss": 3.5298450281428937, "tokens_seen": 1435250688 }, { "epoch": 4.02, "learning_rate": 0.00028543630892678034, "loss": 2.8421, "theoretical_loss": 3.5298306969468856, "tokens_seen": 1435316224 }, { "epoch": 4.02, "learning_rate": 0.0002854262788365095, "loss": 2.8082, "theoretical_loss": 3.5298163665884297, "tokens_seen": 1435381760 }, { "epoch": 4.02, "learning_rate": 0.0002854162487462387, "loss": 2.8635, "theoretical_loss": 3.529802037067439, "tokens_seen": 1435447296 }, { "epoch": 4.02, "learning_rate": 0.0002854062186559679, "loss": 2.7785, "theoretical_loss": 3.529787708383827, "tokens_seen": 1435512832 }, { "epoch": 4.02, "learning_rate": 0.0002853961885656971, "loss": 2.6734, "theoretical_loss": 3.529773380537505, "tokens_seen": 1435578368 }, { "epoch": 4.02, "learning_rate": 0.00028538615847542624, "loss": 2.5636, "theoretical_loss": 3.529759053528388, "tokens_seen": 1435643904 }, { "epoch": 4.02, "learning_rate": 0.0002853761283851555, "loss": 2.7064, "theoretical_loss": 3.529744727356387, "tokens_seen": 1435709440 }, { "epoch": 4.02, "learning_rate": 0.00028536609829488466, "loss": 2.8481, "theoretical_loss": 3.529730402021416, "tokens_seen": 1435774976 }, { "epoch": 4.02, "learning_rate": 0.00028535606820461384, "loss": 2.6548, "theoretical_loss": 3.5297160775233873, "tokens_seen": 1435840512 }, { "epoch": 4.02, "learning_rate": 0.000285346038114343, "loss": 2.8591, "theoretical_loss": 3.5297017538622137, "tokens_seen": 1435906048 }, { "epoch": 4.02, "learning_rate": 0.0002853360080240722, "loss": 3.0108, "theoretical_loss": 3.529687431037809, "tokens_seen": 1435971584 }, { "epoch": 4.02, "learning_rate": 0.0002853259779338014, "loss": 2.8205, "theoretical_loss": 3.5296731090500852, "tokens_seen": 1436037120 }, { "epoch": 4.02, "learning_rate": 0.0002853159478435306, "loss": 2.6961, "theoretical_loss": 3.529658787898956, "tokens_seen": 1436102656 }, { "epoch": 4.02, "learning_rate": 0.00028530591775325975, "loss": 2.8586, "theoretical_loss": 3.5296444675843333, "tokens_seen": 1436168192 }, { "epoch": 4.02, "learning_rate": 0.000285295887662989, "loss": 2.7166, "theoretical_loss": 3.5296301481061314, "tokens_seen": 1436233728 }, { "epoch": 4.02, "learning_rate": 0.0002852858575727181, "loss": 2.8291, "theoretical_loss": 3.5296158294642623, "tokens_seen": 1436299264 }, { "epoch": 4.02, "learning_rate": 0.00028527582748244735, "loss": 2.8129, "theoretical_loss": 3.5296015116586394, "tokens_seen": 1436364800 }, { "epoch": 4.02, "learning_rate": 0.0002852657973921766, "loss": 2.8138, "theoretical_loss": 3.5295871946891753, "tokens_seen": 1436430336 }, { "epoch": 4.02, "learning_rate": 0.0002852557673019057, "loss": 2.9153, "theoretical_loss": 3.5295728785557836, "tokens_seen": 1436495872 }, { "epoch": 4.02, "learning_rate": 0.00028524573721163494, "loss": 2.7715, "theoretical_loss": 3.529558563258377, "tokens_seen": 1436561408 }, { "epoch": 4.02, "learning_rate": 0.00028523570712136407, "loss": 2.7811, "theoretical_loss": 3.529544248796869, "tokens_seen": 1436626944 }, { "epoch": 4.02, "learning_rate": 0.0002852256770310933, "loss": 2.8237, "theoretical_loss": 3.5295299351711718, "tokens_seen": 1436692480 }, { "epoch": 4.02, "learning_rate": 0.0002852156469408225, "loss": 2.9034, "theoretical_loss": 3.529515622381199, "tokens_seen": 1436758016 }, { "epoch": 4.02, "objective/train/docs_used": 2294933, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.915731191635132, "objective/train/theoretical_loss": 3.5295013104268635, "objective/train/tokens_used": 1457283552, "theoretical_loss": 3.5295013104268635, "tokens_seen": 1436823552 }, { "epoch": 4.02, "learning_rate": 0.00028520561685055167, "loss": 2.8142, "theoretical_loss": 3.5295013104268635, "tokens_seen": 1436823552 }, { "epoch": 4.02, "learning_rate": 0.00028519558676028085, "loss": 2.9926, "theoretical_loss": 3.529486999308079, "tokens_seen": 1436889088 }, { "epoch": 4.02, "learning_rate": 0.0002851855566700101, "loss": 2.8784, "theoretical_loss": 3.5294726890247574, "tokens_seen": 1436954624 }, { "epoch": 4.02, "learning_rate": 0.0002851755265797392, "loss": 2.698, "theoretical_loss": 3.5294583795768135, "tokens_seen": 1437020160 }, { "epoch": 4.02, "learning_rate": 0.00028516549648946845, "loss": 2.7694, "theoretical_loss": 3.529444070964159, "tokens_seen": 1437085696 }, { "epoch": 4.02, "learning_rate": 0.00028515546639919757, "loss": 2.8964, "theoretical_loss": 3.529429763186708, "tokens_seen": 1437151232 }, { "epoch": 4.02, "learning_rate": 0.0002851454363089268, "loss": 2.8211, "theoretical_loss": 3.529415456244373, "tokens_seen": 1437216768 }, { "epoch": 4.02, "learning_rate": 0.000285135406218656, "loss": 2.8137, "theoretical_loss": 3.5294011501370672, "tokens_seen": 1437282304 }, { "epoch": 4.02, "learning_rate": 0.00028512537612838517, "loss": 2.8804, "theoretical_loss": 3.529386844864704, "tokens_seen": 1437347840 }, { "epoch": 4.02, "learning_rate": 0.00028511534603811435, "loss": 2.8994, "theoretical_loss": 3.5293725404271967, "tokens_seen": 1437413376 }, { "epoch": 4.02, "learning_rate": 0.00028510531594784353, "loss": 2.7837, "theoretical_loss": 3.529358236824459, "tokens_seen": 1437478912 }, { "epoch": 4.02, "learning_rate": 0.0002850952858575727, "loss": 2.8886, "theoretical_loss": 3.529343934056403, "tokens_seen": 1437544448 }, { "epoch": 4.02, "learning_rate": 0.00028508525576730195, "loss": 2.7414, "theoretical_loss": 3.5293296321229426, "tokens_seen": 1437609984 }, { "epoch": 4.02, "learning_rate": 0.0002850752256770311, "loss": 2.8795, "theoretical_loss": 3.529315331023991, "tokens_seen": 1437675520 }, { "epoch": 4.02, "learning_rate": 0.0002850651955867603, "loss": 2.8019, "theoretical_loss": 3.529301030759461, "tokens_seen": 1437741056 }, { "epoch": 4.02, "learning_rate": 0.0002850551654964895, "loss": 2.7557, "theoretical_loss": 3.529286731329267, "tokens_seen": 1437806592 }, { "epoch": 4.02, "learning_rate": 0.0002850451354062187, "loss": 2.8984, "theoretical_loss": 3.5292724327333214, "tokens_seen": 1437872128 }, { "epoch": 4.02, "learning_rate": 0.00028503510531594785, "loss": 2.8631, "theoretical_loss": 3.5292581349715375, "tokens_seen": 1437937664 }, { "epoch": 4.02, "learning_rate": 0.00028502507522567704, "loss": 2.8873, "theoretical_loss": 3.529243838043829, "tokens_seen": 1438003200 }, { "epoch": 4.02, "learning_rate": 0.0002850150451354062, "loss": 2.881, "theoretical_loss": 3.529229541950109, "tokens_seen": 1438068736 }, { "epoch": 4.02, "learning_rate": 0.00028500501504513545, "loss": 2.9393, "theoretical_loss": 3.529215246690291, "tokens_seen": 1438134272 }, { "epoch": 4.02, "learning_rate": 0.0002849949849548646, "loss": 2.9248, "theoretical_loss": 3.529200952264288, "tokens_seen": 1438199808 }, { "epoch": 4.02, "learning_rate": 0.0002849849548645938, "loss": 2.7545, "theoretical_loss": 3.5291866586720144, "tokens_seen": 1438265344 }, { "epoch": 4.02, "learning_rate": 0.00028497492477432294, "loss": 2.7675, "theoretical_loss": 3.5291723659133822, "tokens_seen": 1438330880 }, { "epoch": 4.02, "learning_rate": 0.0002849648946840522, "loss": 2.7203, "theoretical_loss": 3.529158073988306, "tokens_seen": 1438396416 }, { "epoch": 4.02, "objective/train/docs_used": 2297709, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.932583808898926, "objective/train/theoretical_loss": 3.529143782896698, "objective/train/tokens_used": 1458921952, "theoretical_loss": 3.529143782896698, "tokens_seen": 1438461952 }, { "epoch": 4.02, "learning_rate": 0.00028495486459378136, "loss": 2.8901, "theoretical_loss": 3.529143782896698, "tokens_seen": 1438461952 }, { "epoch": 4.02, "learning_rate": 0.00028494483450351054, "loss": 2.7154, "theoretical_loss": 3.529129492638473, "tokens_seen": 1438527488 }, { "epoch": 4.02, "learning_rate": 0.0002849348044132397, "loss": 2.7771, "theoretical_loss": 3.529115203213543, "tokens_seen": 1438593024 }, { "epoch": 4.02, "learning_rate": 0.0002849247743229689, "loss": 2.8522, "theoretical_loss": 3.5291009146218224, "tokens_seen": 1438658560 }, { "epoch": 4.02, "learning_rate": 0.0002849147442326981, "loss": 2.9326, "theoretical_loss": 3.5290866268632244, "tokens_seen": 1438724096 }, { "epoch": 4.02, "learning_rate": 0.0002849047141424273, "loss": 2.7163, "theoretical_loss": 3.529072339937663, "tokens_seen": 1438789632 }, { "epoch": 4.02, "learning_rate": 0.00028489468405215644, "loss": 2.9269, "theoretical_loss": 3.5290580538450502, "tokens_seen": 1438855168 }, { "epoch": 4.02, "learning_rate": 0.0002848846539618857, "loss": 2.9121, "theoretical_loss": 3.5290437685853013, "tokens_seen": 1438920704 }, { "epoch": 4.02, "learning_rate": 0.00028487462387161486, "loss": 2.9328, "theoretical_loss": 3.529029484158329, "tokens_seen": 1438986240 }, { "epoch": 4.02, "learning_rate": 0.00028486459378134404, "loss": 2.729, "theoretical_loss": 3.5290152005640465, "tokens_seen": 1439051776 }, { "epoch": 4.02, "learning_rate": 0.0002848545636910732, "loss": 2.8051, "theoretical_loss": 3.529000917802368, "tokens_seen": 1439117312 }, { "epoch": 4.02, "learning_rate": 0.0002848445336008024, "loss": 2.8972, "theoretical_loss": 3.5289866358732063, "tokens_seen": 1439182848 }, { "epoch": 4.02, "learning_rate": 0.0002848345035105316, "loss": 2.9502, "theoretical_loss": 3.5289723547764758, "tokens_seen": 1439248384 }, { "epoch": 4.02, "learning_rate": 0.0002848244734202608, "loss": 2.4815, "theoretical_loss": 3.5289580745120896, "tokens_seen": 1439313920 }, { "epoch": 4.02, "learning_rate": 0.00028481444332998995, "loss": 2.8397, "theoretical_loss": 3.528943795079961, "tokens_seen": 1439379456 }, { "epoch": 4.02, "learning_rate": 0.0002848044132397192, "loss": 2.8792, "theoretical_loss": 3.5289295164800043, "tokens_seen": 1439444992 }, { "epoch": 4.02, "learning_rate": 0.0002847943831494483, "loss": 2.9376, "theoretical_loss": 3.528915238712133, "tokens_seen": 1439510528 }, { "epoch": 4.02, "learning_rate": 0.00028478435305917755, "loss": 2.9624, "theoretical_loss": 3.5289009617762606, "tokens_seen": 1439576064 }, { "epoch": 4.02, "learning_rate": 0.0002847743229689067, "loss": 2.8275, "theoretical_loss": 3.5288866856723002, "tokens_seen": 1439641600 }, { "epoch": 4.02, "learning_rate": 0.0002847642928786359, "loss": 2.6517, "theoretical_loss": 3.528872410400166, "tokens_seen": 1439707136 }, { "epoch": 4.02, "learning_rate": 0.0002847542627883651, "loss": 2.7328, "theoretical_loss": 3.528858135959772, "tokens_seen": 1439772672 }, { "epoch": 4.02, "learning_rate": 0.00028474423269809427, "loss": 2.7853, "theoretical_loss": 3.5288438623510316, "tokens_seen": 1439838208 }, { "epoch": 4.02, "learning_rate": 0.00028473420260782345, "loss": 2.8058, "theoretical_loss": 3.528829589573858, "tokens_seen": 1439903744 }, { "epoch": 4.02, "learning_rate": 0.0002847241725175527, "loss": 2.7918, "theoretical_loss": 3.5288153176281654, "tokens_seen": 1439969280 }, { "epoch": 4.02, "learning_rate": 0.0002847141424272818, "loss": 2.6452, "theoretical_loss": 3.528801046513867, "tokens_seen": 1440034816 }, { "epoch": 4.02, "objective/train/docs_used": 2300538, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.939342498779297, "objective/train/theoretical_loss": 3.5287867762308776, "objective/train/tokens_used": 1460560352, "theoretical_loss": 3.5287867762308776, "tokens_seen": 1440100352 }, { "epoch": 4.02, "learning_rate": 0.00028470411233701105, "loss": 2.8372, "theoretical_loss": 3.5287867762308776, "tokens_seen": 1440100352 }, { "epoch": 4.02, "learning_rate": 0.00028469408224674023, "loss": 2.9294, "theoretical_loss": 3.52877250677911, "tokens_seen": 1440165888 }, { "epoch": 4.02, "learning_rate": 0.0002846840521564694, "loss": 2.7679, "theoretical_loss": 3.528758238158478, "tokens_seen": 1440231424 }, { "epoch": 4.02, "learning_rate": 0.0002846740220661986, "loss": 2.8035, "theoretical_loss": 3.528743970368896, "tokens_seen": 1440296960 }, { "epoch": 4.02, "learning_rate": 0.0002846639919759278, "loss": 2.8159, "theoretical_loss": 3.5287297034102774, "tokens_seen": 1440362496 }, { "epoch": 4.02, "learning_rate": 0.00028465396188565695, "loss": 2.7826, "theoretical_loss": 3.528715437282536, "tokens_seen": 1440428032 }, { "epoch": 4.02, "learning_rate": 0.0002846439317953862, "loss": 2.8481, "theoretical_loss": 3.5287011719855856, "tokens_seen": 1440493568 }, { "epoch": 4.02, "learning_rate": 0.0002846339017051153, "loss": 2.8179, "theoretical_loss": 3.52868690751934, "tokens_seen": 1440559104 }, { "epoch": 4.02, "learning_rate": 0.00028462387161484455, "loss": 2.7066, "theoretical_loss": 3.528672643883713, "tokens_seen": 1440624640 }, { "epoch": 4.02, "learning_rate": 0.0002846138415245737, "loss": 2.6906, "theoretical_loss": 3.5286583810786185, "tokens_seen": 1440690176 }, { "epoch": 4.02, "learning_rate": 0.0002846038114343029, "loss": 2.8594, "theoretical_loss": 3.528644119103971, "tokens_seen": 1440755712 }, { "epoch": 4.02, "learning_rate": 0.0002845937813440321, "loss": 2.8765, "theoretical_loss": 3.5286298579596833, "tokens_seen": 1440821248 }, { "epoch": 4.02, "learning_rate": 0.0002845837512537613, "loss": 2.9327, "theoretical_loss": 3.5286155976456697, "tokens_seen": 1440886784 }, { "epoch": 4.02, "learning_rate": 0.00028457372116349046, "loss": 2.805, "theoretical_loss": 3.5286013381618444, "tokens_seen": 1440952320 }, { "epoch": 4.02, "learning_rate": 0.0002845636910732197, "loss": 2.8015, "theoretical_loss": 3.528587079508121, "tokens_seen": 1441017856 }, { "epoch": 4.02, "learning_rate": 0.0002845536609829488, "loss": 2.8557, "theoretical_loss": 3.5285728216844134, "tokens_seen": 1441083392 }, { "epoch": 4.02, "learning_rate": 0.00028454363089267805, "loss": 2.6905, "theoretical_loss": 3.528558564690636, "tokens_seen": 1441148928 }, { "epoch": 4.02, "learning_rate": 0.00028453360080240724, "loss": 2.9369, "theoretical_loss": 3.528544308526702, "tokens_seen": 1441214464 }, { "epoch": 4.02, "learning_rate": 0.0002845235707121364, "loss": 2.9404, "theoretical_loss": 3.528530053192526, "tokens_seen": 1441280000 }, { "epoch": 4.02, "learning_rate": 0.00028451354062186565, "loss": 2.8141, "theoretical_loss": 3.528515798688021, "tokens_seen": 1441345536 }, { "epoch": 4.02, "learning_rate": 0.0002845035105315948, "loss": 2.7012, "theoretical_loss": 3.5285015450131025, "tokens_seen": 1441411072 }, { "epoch": 4.02, "learning_rate": 0.000284493480441324, "loss": 2.8453, "theoretical_loss": 3.528487292167684, "tokens_seen": 1441476608 }, { "epoch": 4.02, "learning_rate": 0.00028448345035105314, "loss": 2.8699, "theoretical_loss": 3.5284730401516793, "tokens_seen": 1441542144 }, { "epoch": 4.02, "learning_rate": 0.0002844734202607824, "loss": 2.9608, "theoretical_loss": 3.5284587889650014, "tokens_seen": 1441607680 }, { "epoch": 4.02, "learning_rate": 0.00028446339017051156, "loss": 2.8566, "theoretical_loss": 3.5284445386075665, "tokens_seen": 1441673216 }, { "epoch": 4.02, "objective/train/docs_used": 2303233, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8414344787597656, "objective/train/theoretical_loss": 3.528430289079287, "objective/train/tokens_used": 1462198752, "theoretical_loss": 3.528430289079287, "tokens_seen": 1441738752 }, { "epoch": 4.02, "learning_rate": 0.00028445336008024074, "loss": 2.8321, "theoretical_loss": 3.528430289079287, "tokens_seen": 1441738752 }, { "epoch": 4.02, "learning_rate": 0.0002844433299899699, "loss": 2.8099, "theoretical_loss": 3.528416040380077, "tokens_seen": 1441804288 }, { "epoch": 4.02, "learning_rate": 0.0002844332998996991, "loss": 2.8019, "theoretical_loss": 3.5284017925098516, "tokens_seen": 1441869824 }, { "epoch": 4.02, "learning_rate": 0.0002844232698094283, "loss": 2.6543, "theoretical_loss": 3.528387545468524, "tokens_seen": 1441935360 }, { "epoch": 4.02, "learning_rate": 0.0002844132397191575, "loss": 2.9052, "theoretical_loss": 3.5283732992560095, "tokens_seen": 1442000896 }, { "epoch": 4.02, "learning_rate": 0.00028440320962888664, "loss": 2.9871, "theoretical_loss": 3.5283590538722205, "tokens_seen": 1442066432 }, { "epoch": 4.02, "learning_rate": 0.0002843931795386159, "loss": 2.9411, "theoretical_loss": 3.528344809317072, "tokens_seen": 1442131968 }, { "epoch": 4.02, "learning_rate": 0.00028438314944834506, "loss": 2.8516, "theoretical_loss": 3.528330565590479, "tokens_seen": 1442197504 }, { "epoch": 4.02, "learning_rate": 0.00028437311935807424, "loss": 2.9452, "theoretical_loss": 3.528316322692354, "tokens_seen": 1442263040 }, { "epoch": 4.02, "learning_rate": 0.0002843630892678034, "loss": 2.8428, "theoretical_loss": 3.5283020806226117, "tokens_seen": 1442328576 }, { "epoch": 4.02, "learning_rate": 0.0002843530591775326, "loss": 2.9333, "theoretical_loss": 3.5282878393811674, "tokens_seen": 1442394112 }, { "epoch": 4.02, "learning_rate": 0.0002843430290872618, "loss": 2.8881, "theoretical_loss": 3.528273598967934, "tokens_seen": 1442459648 }, { "epoch": 4.02, "learning_rate": 0.000284332998996991, "loss": 2.8725, "theoretical_loss": 3.528259359382826, "tokens_seen": 1442525184 }, { "epoch": 4.02, "learning_rate": 0.00028432296890672015, "loss": 3.0231, "theoretical_loss": 3.5282451206257583, "tokens_seen": 1442590720 }, { "epoch": 4.02, "learning_rate": 0.0002843129388164494, "loss": 2.8351, "theoretical_loss": 3.5282308826966444, "tokens_seen": 1442656256 }, { "epoch": 4.02, "learning_rate": 0.0002843029087261785, "loss": 2.8007, "theoretical_loss": 3.528216645595399, "tokens_seen": 1442721792 }, { "epoch": 4.02, "learning_rate": 0.00028429287863590775, "loss": 2.6848, "theoretical_loss": 3.5282024093219357, "tokens_seen": 1442787328 }, { "epoch": 4.02, "learning_rate": 0.0002842828485456369, "loss": 2.8692, "theoretical_loss": 3.528188173876169, "tokens_seen": 1442852864 }, { "epoch": 4.02, "learning_rate": 0.0002842728184553661, "loss": 2.6925, "theoretical_loss": 3.528173939258014, "tokens_seen": 1442918400 }, { "epoch": 4.02, "learning_rate": 0.0002842627883650953, "loss": 2.7144, "theoretical_loss": 3.528159705467384, "tokens_seen": 1442983936 }, { "epoch": 4.02, "learning_rate": 0.00028425275827482447, "loss": 2.8501, "theoretical_loss": 3.528145472504194, "tokens_seen": 1443049472 }, { "epoch": 4.02, "learning_rate": 0.00028424272818455365, "loss": 2.7674, "theoretical_loss": 3.528131240368358, "tokens_seen": 1443115008 }, { "epoch": 4.02, "learning_rate": 0.0002842326980942829, "loss": 2.764, "theoretical_loss": 3.52811700905979, "tokens_seen": 1443180544 }, { "epoch": 4.02, "learning_rate": 0.000284222668004012, "loss": 2.7544, "theoretical_loss": 3.528102778578405, "tokens_seen": 1443246080 }, { "epoch": 4.02, "learning_rate": 0.00028421263791374125, "loss": 2.9636, "theoretical_loss": 3.528088548924117, "tokens_seen": 1443311616 }, { "epoch": 4.02, "objective/train/docs_used": 2304576, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.725557804107666, "objective/train/theoretical_loss": 3.5280743200968403, "objective/train/tokens_used": 1463837152, "theoretical_loss": 3.5280743200968403, "tokens_seen": 1443377152 }, { "epoch": 4.02, "learning_rate": 0.00028420260782347043, "loss": 2.7942, "theoretical_loss": 3.5280743200968403, "tokens_seen": 1443377152 }, { "epoch": 4.02, "learning_rate": 0.0002841925777331996, "loss": 2.9213, "theoretical_loss": 3.5280600920964895, "tokens_seen": 1443442688 }, { "epoch": 4.02, "learning_rate": 0.0002841825476429288, "loss": 2.838, "theoretical_loss": 3.528045864922979, "tokens_seen": 1443508224 }, { "epoch": 4.02, "learning_rate": 0.000284172517552658, "loss": 2.6569, "theoretical_loss": 3.5280316385762234, "tokens_seen": 1443573760 }, { "epoch": 4.02, "learning_rate": 0.00028416248746238715, "loss": 2.8673, "theoretical_loss": 3.5280174130561366, "tokens_seen": 1443639296 }, { "epoch": 4.02, "learning_rate": 0.0002841524573721164, "loss": 2.8548, "theoretical_loss": 3.528003188362633, "tokens_seen": 1443704832 }, { "epoch": 4.02, "learning_rate": 0.0002841424272818455, "loss": 2.9165, "theoretical_loss": 3.527988964495628, "tokens_seen": 1443770368 }, { "epoch": 4.02, "learning_rate": 0.00028413239719157475, "loss": 2.6359, "theoretical_loss": 3.5279747414550355, "tokens_seen": 1443835904 }, { "epoch": 4.02, "learning_rate": 0.0002841223671013039, "loss": 2.9002, "theoretical_loss": 3.5279605192407697, "tokens_seen": 1443901440 }, { "epoch": 4.02, "learning_rate": 0.0002841123370110331, "loss": 2.6564, "theoretical_loss": 3.527946297852745, "tokens_seen": 1443966976 }, { "epoch": 4.02, "learning_rate": 0.0002841023069207623, "loss": 2.8703, "theoretical_loss": 3.527932077290876, "tokens_seen": 1444032512 }, { "epoch": 4.02, "learning_rate": 0.0002840922768304915, "loss": 2.866, "theoretical_loss": 3.527917857555078, "tokens_seen": 1444098048 }, { "epoch": 4.02, "learning_rate": 0.00028408224674022066, "loss": 2.7805, "theoretical_loss": 3.527903638645265, "tokens_seen": 1444163584 }, { "epoch": 4.02, "learning_rate": 0.0002840722166499499, "loss": 2.9146, "theoretical_loss": 3.527889420561351, "tokens_seen": 1444229120 }, { "epoch": 4.02, "learning_rate": 0.000284062186559679, "loss": 2.789, "theoretical_loss": 3.5278752033032514, "tokens_seen": 1444294656 }, { "epoch": 4.02, "learning_rate": 0.00028405215646940825, "loss": 2.6822, "theoretical_loss": 3.52786098687088, "tokens_seen": 1444360192 }, { "epoch": 4.02, "learning_rate": 0.0002840421263791374, "loss": 2.7277, "theoretical_loss": 3.5278467712641524, "tokens_seen": 1444425728 }, { "epoch": 4.02, "learning_rate": 0.0002840320962888666, "loss": 2.9813, "theoretical_loss": 3.5278325564829824, "tokens_seen": 1444491264 }, { "epoch": 4.02, "learning_rate": 0.0002840220661985958, "loss": 2.7339, "theoretical_loss": 3.5278183425272847, "tokens_seen": 1444556800 }, { "epoch": 4.02, "learning_rate": 0.000284012036108325, "loss": 2.945, "theoretical_loss": 3.5278041293969737, "tokens_seen": 1444622336 }, { "epoch": 4.02, "learning_rate": 0.00028400200601805416, "loss": 2.9026, "theoretical_loss": 3.5277899170919644, "tokens_seen": 1444687872 }, { "epoch": 4.02, "learning_rate": 0.00028399197592778334, "loss": 2.9526, "theoretical_loss": 3.5277757056121715, "tokens_seen": 1444753408 }, { "epoch": 4.02, "learning_rate": 0.0002839819458375125, "loss": 2.6139, "theoretical_loss": 3.5277614949575096, "tokens_seen": 1444818944 }, { "epoch": 4.02, "learning_rate": 0.00028397191574724176, "loss": 2.7065, "theoretical_loss": 3.5277472851278935, "tokens_seen": 1444884480 }, { "epoch": 4.02, "learning_rate": 0.0002839618856569709, "loss": 2.7831, "theoretical_loss": 3.5277330761232375, "tokens_seen": 1444950016 }, { "epoch": 4.02, "objective/train/docs_used": 2307316, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.894901990890503, "objective/train/theoretical_loss": 3.5277188679434563, "objective/train/tokens_used": 1465475552, "theoretical_loss": 3.5277188679434563, "tokens_seen": 1445015552 }, { "epoch": 4.02, "learning_rate": 0.0002839518555667001, "loss": 2.896, "theoretical_loss": 3.5277188679434563, "tokens_seen": 1445015552 }, { "epoch": 4.02, "learning_rate": 0.00028394182547642925, "loss": 2.8606, "theoretical_loss": 3.5277046605884648, "tokens_seen": 1445081088 }, { "epoch": 4.02, "learning_rate": 0.0002839317953861585, "loss": 2.8823, "theoretical_loss": 3.5276904540581784, "tokens_seen": 1445146624 }, { "epoch": 4.02, "learning_rate": 0.00028392176529588766, "loss": 2.6875, "theoretical_loss": 3.52767624835251, "tokens_seen": 1445212160 }, { "epoch": 4.02, "learning_rate": 0.00028391173520561684, "loss": 2.8269, "theoretical_loss": 3.527662043471376, "tokens_seen": 1445277696 }, { "epoch": 4.02, "learning_rate": 0.000283901705115346, "loss": 2.6482, "theoretical_loss": 3.5276478394146906, "tokens_seen": 1445343232 }, { "epoch": 4.02, "learning_rate": 0.00028389167502507526, "loss": 2.6911, "theoretical_loss": 3.527633636182369, "tokens_seen": 1445408768 }, { "epoch": 4.02, "learning_rate": 0.0002838816449348044, "loss": 2.9369, "theoretical_loss": 3.527619433774325, "tokens_seen": 1445474304 }, { "epoch": 4.02, "learning_rate": 0.0002838716148445336, "loss": 2.7443, "theoretical_loss": 3.5276052321904743, "tokens_seen": 1445539840 }, { "epoch": 4.02, "learning_rate": 0.00028386158475426275, "loss": 2.5586, "theoretical_loss": 3.527591031430731, "tokens_seen": 1445605376 }, { "epoch": 4.02, "learning_rate": 0.000283851554663992, "loss": 2.7669, "theoretical_loss": 3.527576831495011, "tokens_seen": 1445670912 }, { "epoch": 4.02, "learning_rate": 0.00028384152457372117, "loss": 2.8203, "theoretical_loss": 3.527562632383228, "tokens_seen": 1445736448 }, { "epoch": 4.02, "learning_rate": 0.00028383149448345035, "loss": 2.8827, "theoretical_loss": 3.527548434095297, "tokens_seen": 1445801984 }, { "epoch": 4.02, "learning_rate": 0.00028382146439317953, "loss": 2.8727, "theoretical_loss": 3.5275342366311335, "tokens_seen": 1445867520 }, { "epoch": 4.02, "learning_rate": 0.0002838114343029087, "loss": 2.7643, "theoretical_loss": 3.527520039990652, "tokens_seen": 1445933056 }, { "epoch": 4.02, "learning_rate": 0.0002838014042126379, "loss": 2.8202, "theoretical_loss": 3.5275058441737674, "tokens_seen": 1445998592 }, { "epoch": 4.02, "learning_rate": 0.0002837913741223671, "loss": 2.8303, "theoretical_loss": 3.5274916491803947, "tokens_seen": 1446064128 }, { "epoch": 4.02, "learning_rate": 0.0002837813440320963, "loss": 2.8881, "theoretical_loss": 3.527477455010448, "tokens_seen": 1446129664 }, { "epoch": 4.02, "learning_rate": 0.0002837713139418255, "loss": 2.7698, "theoretical_loss": 3.5274632616638435, "tokens_seen": 1446195200 }, { "epoch": 4.02, "learning_rate": 0.00028376128385155467, "loss": 2.6697, "theoretical_loss": 3.527449069140496, "tokens_seen": 1446260736 }, { "epoch": 4.02, "learning_rate": 0.00028375125376128385, "loss": 2.7419, "theoretical_loss": 3.5274348774403195, "tokens_seen": 1446326272 }, { "epoch": 4.02, "learning_rate": 0.0002837412236710131, "loss": 2.8873, "theoretical_loss": 3.5274206865632296, "tokens_seen": 1446391808 }, { "epoch": 4.02, "learning_rate": 0.0002837311935807422, "loss": 2.8936, "theoretical_loss": 3.527406496509141, "tokens_seen": 1446457344 }, { "epoch": 4.02, "learning_rate": 0.00028372116349047145, "loss": 2.7196, "theoretical_loss": 3.5273923072779687, "tokens_seen": 1446522880 }, { "epoch": 4.02, "learning_rate": 0.00028371113340020063, "loss": 2.7861, "theoretical_loss": 3.5273781188696276, "tokens_seen": 1446588416 }, { "epoch": 4.02, "objective/train/docs_used": 2309991, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.974025011062622, "objective/train/theoretical_loss": 3.5273639312840337, "objective/train/tokens_used": 1467113952, "theoretical_loss": 3.5273639312840337, "tokens_seen": 1446653952 }, { "epoch": 4.02, "learning_rate": 0.0002837011033099298, "loss": 2.8363, "theoretical_loss": 3.5273639312840337, "tokens_seen": 1446653952 }, { "epoch": 4.02, "learning_rate": 0.000283691073219659, "loss": 2.7616, "theoretical_loss": 3.527349744521101, "tokens_seen": 1446719488 }, { "epoch": 4.02, "learning_rate": 0.0002836810431293882, "loss": 2.7984, "theoretical_loss": 3.527335558580744, "tokens_seen": 1446785024 }, { "epoch": 4.02, "learning_rate": 0.00028367101303911735, "loss": 2.5406, "theoretical_loss": 3.5273213734628794, "tokens_seen": 1446850560 }, { "epoch": 4.02, "learning_rate": 0.0002836609829488466, "loss": 2.8205, "theoretical_loss": 3.5273071891674213, "tokens_seen": 1446916096 }, { "epoch": 4.02, "learning_rate": 0.0002836509528585757, "loss": 2.8166, "theoretical_loss": 3.5272930056942844, "tokens_seen": 1446981632 }, { "epoch": 4.02, "learning_rate": 0.00028364092276830495, "loss": 2.7216, "theoretical_loss": 3.527278823043385, "tokens_seen": 1447047168 }, { "epoch": 4.02, "learning_rate": 0.0002836308926780341, "loss": 2.5795, "theoretical_loss": 3.5272646412146367, "tokens_seen": 1447112704 }, { "epoch": 4.02, "learning_rate": 0.0002836208625877633, "loss": 2.8096, "theoretical_loss": 3.5272504602079557, "tokens_seen": 1447178240 }, { "epoch": 4.02, "learning_rate": 0.0002836108324974925, "loss": 2.7762, "theoretical_loss": 3.5272362800232564, "tokens_seen": 1447243776 }, { "epoch": 4.02, "learning_rate": 0.0002836008024072217, "loss": 2.7395, "theoretical_loss": 3.527222100660455, "tokens_seen": 1447309312 }, { "epoch": 4.02, "learning_rate": 0.00028359077231695086, "loss": 2.6693, "theoretical_loss": 3.527207922119466, "tokens_seen": 1447374848 }, { "epoch": 4.02, "learning_rate": 0.0002835807422266801, "loss": 2.8451, "theoretical_loss": 3.527193744400204, "tokens_seen": 1447440384 }, { "epoch": 4.02, "learning_rate": 0.0002835707121364092, "loss": 2.894, "theoretical_loss": 3.5271795675025848, "tokens_seen": 1447505920 }, { "epoch": 4.02, "learning_rate": 0.00028356068204613846, "loss": 2.6221, "theoretical_loss": 3.5271653914265233, "tokens_seen": 1447571456 }, { "epoch": 4.02, "learning_rate": 0.0002835506519558676, "loss": 2.7639, "theoretical_loss": 3.5271512161719354, "tokens_seen": 1447636992 }, { "epoch": 4.02, "learning_rate": 0.0002835406218655968, "loss": 2.7076, "theoretical_loss": 3.5271370417387353, "tokens_seen": 1447702528 }, { "epoch": 4.02, "learning_rate": 0.000283530591775326, "loss": 2.7043, "theoretical_loss": 3.527122868126839, "tokens_seen": 1447768064 }, { "epoch": 4.02, "learning_rate": 0.0002835205616850552, "loss": 2.7165, "theoretical_loss": 3.5271086953361617, "tokens_seen": 1447833600 }, { "epoch": 4.02, "learning_rate": 0.00028351053159478436, "loss": 2.8366, "theoretical_loss": 3.5270945233666176, "tokens_seen": 1447899136 }, { "epoch": 4.02, "learning_rate": 0.00028350050150451354, "loss": 2.789, "theoretical_loss": 3.5270803522181238, "tokens_seen": 1447964672 }, { "epoch": 4.02, "learning_rate": 0.0002834904714142427, "loss": 2.8352, "theoretical_loss": 3.5270661818905937, "tokens_seen": 1448030208 }, { "epoch": 4.02, "learning_rate": 0.00028348044132397196, "loss": 2.879, "theoretical_loss": 3.527052012383944, "tokens_seen": 1448095744 }, { "epoch": 4.02, "learning_rate": 0.0002834704112337011, "loss": 2.9106, "theoretical_loss": 3.527037843698089, "tokens_seen": 1448161280 }, { "epoch": 4.02, "learning_rate": 0.0002834603811434303, "loss": 2.6842, "theoretical_loss": 3.527023675832945, "tokens_seen": 1448226816 }, { "epoch": 4.02, "objective/train/docs_used": 2312674, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.774665117263794, "objective/train/theoretical_loss": 3.527009508788426, "objective/train/tokens_used": 1468752352, "theoretical_loss": 3.527009508788426, "tokens_seen": 1448292352 }, { "epoch": 4.02, "learning_rate": 0.00028345035105315945, "loss": 2.7777, "theoretical_loss": 3.527009508788426, "tokens_seen": 1448292352 }, { "epoch": 4.02, "learning_rate": 0.0002834403209628887, "loss": 2.8366, "theoretical_loss": 3.526995342564449, "tokens_seen": 1448357888 }, { "epoch": 4.02, "learning_rate": 0.00028343029087261786, "loss": 2.7484, "theoretical_loss": 3.526981177160928, "tokens_seen": 1448423424 }, { "epoch": 4.02, "learning_rate": 0.00028342026078234705, "loss": 2.7468, "theoretical_loss": 3.526967012577779, "tokens_seen": 1448488960 }, { "epoch": 4.02, "learning_rate": 0.0002834102306920762, "loss": 2.8877, "theoretical_loss": 3.5269528488149167, "tokens_seen": 1448554496 }, { "epoch": 4.02, "learning_rate": 0.00028340020060180546, "loss": 2.7603, "theoretical_loss": 3.526938685872257, "tokens_seen": 1448620032 }, { "epoch": 4.02, "learning_rate": 0.0002833901705115346, "loss": 2.7645, "theoretical_loss": 3.526924523749716, "tokens_seen": 1448685568 }, { "epoch": 4.02, "learning_rate": 0.0002833801404212638, "loss": 2.7846, "theoretical_loss": 3.5269103624472082, "tokens_seen": 1448751104 }, { "epoch": 4.02, "learning_rate": 0.00028337011033099295, "loss": 2.8288, "theoretical_loss": 3.5268962019646493, "tokens_seen": 1448816640 }, { "epoch": 4.02, "learning_rate": 0.0002833600802407222, "loss": 2.9194, "theoretical_loss": 3.526882042301954, "tokens_seen": 1448882176 }, { "epoch": 4.02, "learning_rate": 0.00028335005015045137, "loss": 2.8203, "theoretical_loss": 3.526867883459039, "tokens_seen": 1448947712 }, { "epoch": 4.02, "learning_rate": 0.00028334002006018055, "loss": 2.8498, "theoretical_loss": 3.526853725435819, "tokens_seen": 1449013248 }, { "epoch": 4.02, "learning_rate": 0.00028332998996990973, "loss": 2.743, "theoretical_loss": 3.5268395682322096, "tokens_seen": 1449078784 }, { "epoch": 4.02, "learning_rate": 0.0002833199598796389, "loss": 2.7512, "theoretical_loss": 3.5268254118481264, "tokens_seen": 1449144320 }, { "epoch": 4.02, "learning_rate": 0.0002833099297893681, "loss": 2.7873, "theoretical_loss": 3.5268112562834846, "tokens_seen": 1449209856 }, { "epoch": 4.02, "learning_rate": 0.0002832998996990973, "loss": 2.867, "theoretical_loss": 3.5267971015382003, "tokens_seen": 1449275392 }, { "epoch": 4.03, "learning_rate": 0.00028328986960882645, "loss": 2.8512, "theoretical_loss": 3.5267829476121886, "tokens_seen": 1449340928 }, { "epoch": 4.03, "learning_rate": 0.0002832798395185557, "loss": 2.8321, "theoretical_loss": 3.526768794505365, "tokens_seen": 1449406464 }, { "epoch": 4.03, "learning_rate": 0.0002832698094282848, "loss": 2.9252, "theoretical_loss": 3.5267546422176452, "tokens_seen": 1449472000 }, { "epoch": 4.03, "learning_rate": 0.00028325977933801405, "loss": 2.9508, "theoretical_loss": 3.526740490748945, "tokens_seen": 1449537536 }, { "epoch": 4.03, "learning_rate": 0.00028324974924774323, "loss": 2.7421, "theoretical_loss": 3.526726340099179, "tokens_seen": 1449603072 }, { "epoch": 4.03, "learning_rate": 0.0002832397191574724, "loss": 2.7558, "theoretical_loss": 3.526712190268264, "tokens_seen": 1449668608 }, { "epoch": 4.03, "learning_rate": 0.0002832296890672016, "loss": 2.779, "theoretical_loss": 3.5266980412561155, "tokens_seen": 1449734144 }, { "epoch": 4.03, "learning_rate": 0.00028321965897693083, "loss": 2.735, "theoretical_loss": 3.526683893062648, "tokens_seen": 1449799680 }, { "epoch": 4.03, "learning_rate": 0.00028320962888665996, "loss": 2.7151, "theoretical_loss": 3.5266697456877782, "tokens_seen": 1449865216 }, { "epoch": 4.03, "objective/train/docs_used": 2315519, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.801711082458496, "objective/train/theoretical_loss": 3.5266555991314217, "objective/train/tokens_used": 1470390752, "theoretical_loss": 3.5266555991314217, "tokens_seen": 1449930752 }, { "epoch": 4.03, "learning_rate": 0.0002831995987963892, "loss": 2.8101, "theoretical_loss": 3.5266555991314217, "tokens_seen": 1449930752 }, { "epoch": 4.03, "learning_rate": 0.0002831895687061183, "loss": 2.7378, "theoretical_loss": 3.526641453393493, "tokens_seen": 1449996288 }, { "epoch": 4.03, "learning_rate": 0.00028317953861584755, "loss": 2.7103, "theoretical_loss": 3.526627308473909, "tokens_seen": 1450061824 }, { "epoch": 4.03, "learning_rate": 0.00028316950852557674, "loss": 2.8864, "theoretical_loss": 3.526613164372585, "tokens_seen": 1450127360 }, { "epoch": 4.03, "learning_rate": 0.0002831594784353059, "loss": 2.7434, "theoretical_loss": 3.526599021089437, "tokens_seen": 1450192896 }, { "epoch": 4.03, "learning_rate": 0.0002831494483450351, "loss": 2.72, "theoretical_loss": 3.5265848786243796, "tokens_seen": 1450258432 }, { "epoch": 4.03, "learning_rate": 0.0002831394182547643, "loss": 2.8056, "theoretical_loss": 3.5265707369773294, "tokens_seen": 1450323968 }, { "epoch": 4.03, "learning_rate": 0.00028312938816449346, "loss": 2.7324, "theoretical_loss": 3.526556596148202, "tokens_seen": 1450389504 }, { "epoch": 4.03, "learning_rate": 0.0002831193580742227, "loss": 2.8234, "theoretical_loss": 3.526542456136914, "tokens_seen": 1450455040 }, { "epoch": 4.03, "learning_rate": 0.0002831093279839518, "loss": 2.8417, "theoretical_loss": 3.5265283169433794, "tokens_seen": 1450520576 }, { "epoch": 4.03, "learning_rate": 0.00028309929789368106, "loss": 2.8495, "theoretical_loss": 3.526514178567515, "tokens_seen": 1450586112 }, { "epoch": 4.03, "learning_rate": 0.0002830892678034102, "loss": 2.6602, "theoretical_loss": 3.5265000410092364, "tokens_seen": 1450651648 }, { "epoch": 4.03, "learning_rate": 0.0002830792377131394, "loss": 2.9249, "theoretical_loss": 3.5264859042684598, "tokens_seen": 1450717184 }, { "epoch": 4.03, "learning_rate": 0.0002830692076228686, "loss": 2.6536, "theoretical_loss": 3.5264717683451003, "tokens_seen": 1450782720 }, { "epoch": 4.03, "learning_rate": 0.0002830591775325978, "loss": 2.8391, "theoretical_loss": 3.5264576332390742, "tokens_seen": 1450848256 }, { "epoch": 4.03, "learning_rate": 0.00028304914744232696, "loss": 2.9355, "theoretical_loss": 3.5264434989502966, "tokens_seen": 1450913792 }, { "epoch": 4.03, "learning_rate": 0.0002830391173520562, "loss": 2.6805, "theoretical_loss": 3.5264293654786845, "tokens_seen": 1450979328 }, { "epoch": 4.03, "learning_rate": 0.0002830290872617854, "loss": 2.7477, "theoretical_loss": 3.526415232824153, "tokens_seen": 1451044864 }, { "epoch": 4.03, "learning_rate": 0.00028301905717151456, "loss": 2.8103, "theoretical_loss": 3.526401100986618, "tokens_seen": 1451110400 }, { "epoch": 4.03, "learning_rate": 0.00028300902708124374, "loss": 2.726, "theoretical_loss": 3.5263869699659955, "tokens_seen": 1451175936 }, { "epoch": 4.03, "learning_rate": 0.0002829989969909729, "loss": 2.7771, "theoretical_loss": 3.5263728397622014, "tokens_seen": 1451241472 }, { "epoch": 4.03, "learning_rate": 0.00028298896690070216, "loss": 2.7694, "theoretical_loss": 3.5263587103751517, "tokens_seen": 1451307008 }, { "epoch": 4.03, "learning_rate": 0.0002829789368104313, "loss": 2.7529, "theoretical_loss": 3.5263445818047616, "tokens_seen": 1451372544 }, { "epoch": 4.03, "learning_rate": 0.0002829689067201605, "loss": 2.885, "theoretical_loss": 3.526330454050948, "tokens_seen": 1451438080 }, { "epoch": 4.03, "learning_rate": 0.00028295887662988965, "loss": 2.8978, "theoretical_loss": 3.5263163271136264, "tokens_seen": 1451503616 }, { "epoch": 4.03, "objective/train/docs_used": 2316982, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.108663558959961, "objective/train/theoretical_loss": 3.526302200992713, "objective/train/tokens_used": 1472029152, "theoretical_loss": 3.526302200992713, "tokens_seen": 1451569152 }, { "epoch": 4.03, "learning_rate": 0.0002829488465396189, "loss": 2.7427, "theoretical_loss": 3.526302200992713, "tokens_seen": 1451569152 }, { "epoch": 4.03, "learning_rate": 0.00028293881644934806, "loss": 2.8175, "theoretical_loss": 3.5262880756881234, "tokens_seen": 1451634688 }, { "epoch": 4.03, "learning_rate": 0.00028292878635907725, "loss": 2.6576, "theoretical_loss": 3.526273951199774, "tokens_seen": 1451700224 }, { "epoch": 4.03, "learning_rate": 0.0002829187562688064, "loss": 2.8576, "theoretical_loss": 3.5262598275275803, "tokens_seen": 1451765760 }, { "epoch": 4.03, "learning_rate": 0.00028290872617853566, "loss": 2.8464, "theoretical_loss": 3.5262457046714584, "tokens_seen": 1451831296 }, { "epoch": 4.03, "learning_rate": 0.0002828986960882648, "loss": 2.9265, "theoretical_loss": 3.5262315826313246, "tokens_seen": 1451896832 }, { "epoch": 4.03, "learning_rate": 0.000282888665997994, "loss": 2.7776, "theoretical_loss": 3.5262174614070947, "tokens_seen": 1451962368 }, { "epoch": 4.03, "learning_rate": 0.00028287863590772315, "loss": 2.6981, "theoretical_loss": 3.526203340998684, "tokens_seen": 1452027904 }, { "epoch": 4.03, "learning_rate": 0.0002828686058174524, "loss": 2.7806, "theoretical_loss": 3.5261892214060104, "tokens_seen": 1452093440 }, { "epoch": 4.03, "learning_rate": 0.00028285857572718157, "loss": 2.7613, "theoretical_loss": 3.5261751026289887, "tokens_seen": 1452158976 }, { "epoch": 4.03, "learning_rate": 0.00028284854563691075, "loss": 2.7791, "theoretical_loss": 3.526160984667535, "tokens_seen": 1452224512 }, { "epoch": 4.03, "learning_rate": 0.00028283851554663993, "loss": 2.7989, "theoretical_loss": 3.5261468675215655, "tokens_seen": 1452290048 }, { "epoch": 4.03, "learning_rate": 0.0002828284854563691, "loss": 2.973, "theoretical_loss": 3.5261327511909966, "tokens_seen": 1452355584 }, { "epoch": 4.03, "learning_rate": 0.0002828184553660983, "loss": 2.8691, "theoretical_loss": 3.5261186356757444, "tokens_seen": 1452421120 }, { "epoch": 4.03, "learning_rate": 0.0002828084252758275, "loss": 2.7968, "theoretical_loss": 3.526104520975724, "tokens_seen": 1452486656 }, { "epoch": 4.03, "learning_rate": 0.00028279839518555665, "loss": 2.8762, "theoretical_loss": 3.5260904070908534, "tokens_seen": 1452552192 }, { "epoch": 4.03, "learning_rate": 0.0002827883650952859, "loss": 2.7309, "theoretical_loss": 3.526076294021047, "tokens_seen": 1452617728 }, { "epoch": 4.03, "learning_rate": 0.000282778335005015, "loss": 2.9766, "theoretical_loss": 3.5260621817662217, "tokens_seen": 1452683264 }, { "epoch": 4.03, "learning_rate": 0.00028276830491474425, "loss": 2.9617, "theoretical_loss": 3.5260480703262935, "tokens_seen": 1452748800 }, { "epoch": 4.03, "learning_rate": 0.00028275827482447343, "loss": 2.9091, "theoretical_loss": 3.526033959701179, "tokens_seen": 1452814336 }, { "epoch": 4.03, "learning_rate": 0.0002827482447342026, "loss": 2.8628, "theoretical_loss": 3.526019849890794, "tokens_seen": 1452879872 }, { "epoch": 4.03, "learning_rate": 0.0002827382146439318, "loss": 2.8154, "theoretical_loss": 3.5260057408950547, "tokens_seen": 1452945408 }, { "epoch": 4.03, "learning_rate": 0.00028272818455366103, "loss": 2.8178, "theoretical_loss": 3.5259916327138776, "tokens_seen": 1453010944 }, { "epoch": 4.03, "learning_rate": 0.00028271815446339016, "loss": 2.8064, "theoretical_loss": 3.525977525347179, "tokens_seen": 1453076480 }, { "epoch": 4.03, "learning_rate": 0.0002827081243731194, "loss": 2.868, "theoretical_loss": 3.5259634187948743, "tokens_seen": 1453142016 }, { "epoch": 4.03, "objective/train/docs_used": 2320009, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.631333351135254, "objective/train/theoretical_loss": 3.5259493130568806, "objective/train/tokens_used": 1473667552, "theoretical_loss": 3.5259493130568806, "tokens_seen": 1453207552 }, { "epoch": 4.03, "learning_rate": 0.0002826980942828485, "loss": 2.8306, "theoretical_loss": 3.5259493130568806, "tokens_seen": 1453207552 }, { "epoch": 4.03, "learning_rate": 0.00028268806419257775, "loss": 2.8745, "theoretical_loss": 3.5259352081331143, "tokens_seen": 1453273088 }, { "epoch": 4.03, "learning_rate": 0.00028267803410230694, "loss": 2.6975, "theoretical_loss": 3.5259211040234915, "tokens_seen": 1453338624 }, { "epoch": 4.03, "learning_rate": 0.0002826680040120361, "loss": 2.9088, "theoretical_loss": 3.525907000727928, "tokens_seen": 1453404160 }, { "epoch": 4.03, "learning_rate": 0.0002826579739217653, "loss": 2.6072, "theoretical_loss": 3.5258928982463402, "tokens_seen": 1453469696 }, { "epoch": 4.03, "learning_rate": 0.0002826479438314945, "loss": 2.7275, "theoretical_loss": 3.525878796578645, "tokens_seen": 1453535232 }, { "epoch": 4.03, "learning_rate": 0.00028263791374122366, "loss": 2.9449, "theoretical_loss": 3.525864695724758, "tokens_seen": 1453600768 }, { "epoch": 4.03, "learning_rate": 0.0002826278836509529, "loss": 2.6886, "theoretical_loss": 3.525850595684597, "tokens_seen": 1453666304 }, { "epoch": 4.03, "learning_rate": 0.000282617853560682, "loss": 2.8436, "theoretical_loss": 3.5258364964580764, "tokens_seen": 1453731840 }, { "epoch": 4.03, "learning_rate": 0.00028260782347041126, "loss": 2.7917, "theoretical_loss": 3.5258223980451135, "tokens_seen": 1453797376 }, { "epoch": 4.03, "learning_rate": 0.0002825977933801404, "loss": 2.8145, "theoretical_loss": 3.525808300445625, "tokens_seen": 1453862912 }, { "epoch": 4.03, "learning_rate": 0.0002825877632898696, "loss": 2.8127, "theoretical_loss": 3.525794203659527, "tokens_seen": 1453928448 }, { "epoch": 4.03, "learning_rate": 0.0002825777331995988, "loss": 2.8766, "theoretical_loss": 3.525780107686735, "tokens_seen": 1453993984 }, { "epoch": 4.03, "learning_rate": 0.000282567703109328, "loss": 2.7761, "theoretical_loss": 3.5257660125271677, "tokens_seen": 1454059520 }, { "epoch": 4.03, "learning_rate": 0.00028255767301905716, "loss": 2.651, "theoretical_loss": 3.5257519181807395, "tokens_seen": 1454125056 }, { "epoch": 4.03, "learning_rate": 0.0002825476429287864, "loss": 2.7854, "theoretical_loss": 3.5257378246473676, "tokens_seen": 1454190592 }, { "epoch": 4.03, "learning_rate": 0.0002825376128385155, "loss": 2.8281, "theoretical_loss": 3.525723731926968, "tokens_seen": 1454256128 }, { "epoch": 4.03, "learning_rate": 0.00028252758274824476, "loss": 2.8453, "theoretical_loss": 3.525709640019458, "tokens_seen": 1454321664 }, { "epoch": 4.03, "learning_rate": 0.0002825175526579739, "loss": 2.783, "theoretical_loss": 3.5256955489247535, "tokens_seen": 1454387200 }, { "epoch": 4.03, "learning_rate": 0.0002825075225677031, "loss": 2.8611, "theoretical_loss": 3.5256814586427705, "tokens_seen": 1454452736 }, { "epoch": 4.03, "learning_rate": 0.0002824974924774323, "loss": 2.982, "theoretical_loss": 3.5256673691734264, "tokens_seen": 1454518272 }, { "epoch": 4.03, "learning_rate": 0.0002824874623871615, "loss": 2.7827, "theoretical_loss": 3.5256532805166376, "tokens_seen": 1454583808 }, { "epoch": 4.03, "learning_rate": 0.00028247743229689067, "loss": 2.7928, "theoretical_loss": 3.5256391926723207, "tokens_seen": 1454649344 }, { "epoch": 4.03, "learning_rate": 0.00028246740220661985, "loss": 2.8628, "theoretical_loss": 3.5256251056403918, "tokens_seen": 1454714880 }, { "epoch": 4.03, "learning_rate": 0.00028245737211634903, "loss": 2.8171, "theoretical_loss": 3.5256110194207673, "tokens_seen": 1454780416 }, { "epoch": 4.03, "objective/train/docs_used": 2322855, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7167165279388428, "objective/train/theoretical_loss": 3.5255969340133646, "objective/train/tokens_used": 1475305952, "theoretical_loss": 3.5255969340133646, "tokens_seen": 1454845952 }, { "epoch": 4.03, "learning_rate": 0.00028244734202607826, "loss": 2.7041, "theoretical_loss": 3.5255969340133646, "tokens_seen": 1454845952 }, { "epoch": 4.03, "learning_rate": 0.0002824373119358074, "loss": 2.7819, "theoretical_loss": 3.5255828494180994, "tokens_seen": 1454911488 }, { "epoch": 4.03, "learning_rate": 0.0002824272818455366, "loss": 2.8796, "theoretical_loss": 3.5255687656348886, "tokens_seen": 1454977024 }, { "epoch": 4.03, "learning_rate": 0.0002824172517552658, "loss": 2.6816, "theoretical_loss": 3.5255546826636497, "tokens_seen": 1455042560 }, { "epoch": 4.03, "learning_rate": 0.000282407221664995, "loss": 2.7862, "theoretical_loss": 3.525540600504298, "tokens_seen": 1455108096 }, { "epoch": 4.03, "learning_rate": 0.00028239719157472417, "loss": 2.84, "theoretical_loss": 3.5255265191567506, "tokens_seen": 1455173632 }, { "epoch": 4.03, "learning_rate": 0.00028238716148445335, "loss": 2.7716, "theoretical_loss": 3.5255124386209244, "tokens_seen": 1455239168 }, { "epoch": 4.03, "learning_rate": 0.00028237713139418253, "loss": 2.8009, "theoretical_loss": 3.5254983588967357, "tokens_seen": 1455304704 }, { "epoch": 4.03, "learning_rate": 0.00028236710130391177, "loss": 2.8904, "theoretical_loss": 3.5254842799841013, "tokens_seen": 1455370240 }, { "epoch": 4.03, "learning_rate": 0.0002823570712136409, "loss": 2.6549, "theoretical_loss": 3.5254702018829382, "tokens_seen": 1455435776 }, { "epoch": 4.03, "learning_rate": 0.00028234704112337013, "loss": 2.6821, "theoretical_loss": 3.5254561245931626, "tokens_seen": 1455501312 }, { "epoch": 4.03, "learning_rate": 0.00028233701103309926, "loss": 2.7226, "theoretical_loss": 3.5254420481146913, "tokens_seen": 1455566848 }, { "epoch": 4.03, "learning_rate": 0.0002823269809428285, "loss": 2.9145, "theoretical_loss": 3.5254279724474413, "tokens_seen": 1455632384 }, { "epoch": 4.03, "learning_rate": 0.0002823169508525577, "loss": 2.581, "theoretical_loss": 3.525413897591329, "tokens_seen": 1455697920 }, { "epoch": 4.03, "learning_rate": 0.00028230692076228685, "loss": 2.8284, "theoretical_loss": 3.5253998235462713, "tokens_seen": 1455763456 }, { "epoch": 4.03, "learning_rate": 0.00028229689067201604, "loss": 2.8848, "theoretical_loss": 3.5253857503121853, "tokens_seen": 1455828992 }, { "epoch": 4.03, "learning_rate": 0.0002822868605817452, "loss": 2.7473, "theoretical_loss": 3.5253716778889874, "tokens_seen": 1455894528 }, { "epoch": 4.03, "learning_rate": 0.00028227683049147445, "loss": 2.7646, "theoretical_loss": 3.525357606276594, "tokens_seen": 1455960064 }, { "epoch": 4.03, "learning_rate": 0.00028226680040120363, "loss": 2.9, "theoretical_loss": 3.5253435354749225, "tokens_seen": 1456025600 }, { "epoch": 4.03, "learning_rate": 0.0002822567703109328, "loss": 2.8391, "theoretical_loss": 3.525329465483889, "tokens_seen": 1456091136 }, { "epoch": 4.03, "learning_rate": 0.000282246740220662, "loss": 2.8082, "theoretical_loss": 3.5253153963034114, "tokens_seen": 1456156672 }, { "epoch": 4.03, "learning_rate": 0.00028223671013039123, "loss": 2.6534, "theoretical_loss": 3.525301327933406, "tokens_seen": 1456222208 }, { "epoch": 4.03, "learning_rate": 0.00028222668004012036, "loss": 2.8061, "theoretical_loss": 3.5252872603737893, "tokens_seen": 1456287744 }, { "epoch": 4.03, "learning_rate": 0.0002822166499498496, "loss": 2.9553, "theoretical_loss": 3.5252731936244786, "tokens_seen": 1456353280 }, { "epoch": 4.03, "learning_rate": 0.0002822066198595787, "loss": 2.7912, "theoretical_loss": 3.5252591276853904, "tokens_seen": 1456418816 }, { "epoch": 4.03, "objective/train/docs_used": 2325909, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.682508945465088, "objective/train/theoretical_loss": 3.5252450625564418, "objective/train/tokens_used": 1476944352, "theoretical_loss": 3.5252450625564418, "tokens_seen": 1456484352 }, { "epoch": 4.03, "learning_rate": 0.00028219658976930795, "loss": 2.9194, "theoretical_loss": 3.5252450625564418, "tokens_seen": 1456484352 }, { "epoch": 4.03, "learning_rate": 0.00028218655967903714, "loss": 2.8939, "theoretical_loss": 3.5252309982375496, "tokens_seen": 1456549888 }, { "epoch": 4.03, "learning_rate": 0.0002821765295887663, "loss": 2.8139, "theoretical_loss": 3.525216934728631, "tokens_seen": 1456615424 }, { "epoch": 4.03, "learning_rate": 0.0002821664994984955, "loss": 2.6936, "theoretical_loss": 3.5252028720296025, "tokens_seen": 1456680960 }, { "epoch": 4.03, "learning_rate": 0.0002821564694082247, "loss": 2.7542, "theoretical_loss": 3.5251888101403814, "tokens_seen": 1456746496 }, { "epoch": 4.03, "learning_rate": 0.00028214643931795386, "loss": 2.9296, "theoretical_loss": 3.5251747490608842, "tokens_seen": 1456812032 }, { "epoch": 4.03, "learning_rate": 0.0002821364092276831, "loss": 2.9562, "theoretical_loss": 3.525160688791028, "tokens_seen": 1456877568 }, { "epoch": 4.03, "learning_rate": 0.0002821263791374122, "loss": 2.7776, "theoretical_loss": 3.52514662933073, "tokens_seen": 1456943104 }, { "epoch": 4.03, "learning_rate": 0.00028211634904714146, "loss": 2.7584, "theoretical_loss": 3.525132570679907, "tokens_seen": 1457008640 }, { "epoch": 4.03, "learning_rate": 0.0002821063189568706, "loss": 2.9047, "theoretical_loss": 3.5251185128384757, "tokens_seen": 1457074176 }, { "epoch": 4.03, "learning_rate": 0.0002820962888665998, "loss": 2.8773, "theoretical_loss": 3.5251044558063542, "tokens_seen": 1457139712 }, { "epoch": 4.03, "learning_rate": 0.000282086258776329, "loss": 2.9543, "theoretical_loss": 3.5250903995834584, "tokens_seen": 1457205248 }, { "epoch": 4.03, "learning_rate": 0.0002820762286860582, "loss": 2.7932, "theoretical_loss": 3.5250763441697055, "tokens_seen": 1457270784 }, { "epoch": 4.03, "learning_rate": 0.00028206619859578736, "loss": 2.7555, "theoretical_loss": 3.5250622895650126, "tokens_seen": 1457336320 }, { "epoch": 4.03, "learning_rate": 0.0002820561685055166, "loss": 2.7056, "theoretical_loss": 3.525048235769297, "tokens_seen": 1457401856 }, { "epoch": 4.03, "learning_rate": 0.0002820461384152457, "loss": 2.8801, "theoretical_loss": 3.525034182782475, "tokens_seen": 1457467392 }, { "epoch": 4.03, "learning_rate": 0.00028203610832497496, "loss": 2.7541, "theoretical_loss": 3.5250201306044655, "tokens_seen": 1457532928 }, { "epoch": 4.03, "learning_rate": 0.0002820260782347041, "loss": 2.8287, "theoretical_loss": 3.5250060792351836, "tokens_seen": 1457598464 }, { "epoch": 4.03, "learning_rate": 0.0002820160481444333, "loss": 2.7595, "theoretical_loss": 3.5249920286745473, "tokens_seen": 1457664000 }, { "epoch": 4.03, "learning_rate": 0.0002820060180541625, "loss": 2.6583, "theoretical_loss": 3.5249779789224736, "tokens_seen": 1457729536 }, { "epoch": 4.03, "learning_rate": 0.0002819959879638917, "loss": 2.6614, "theoretical_loss": 3.5249639299788793, "tokens_seen": 1457795072 }, { "epoch": 4.03, "learning_rate": 0.00028198595787362087, "loss": 2.7157, "theoretical_loss": 3.524949881843682, "tokens_seen": 1457860608 }, { "epoch": 4.03, "learning_rate": 0.00028197592778335005, "loss": 3.0084, "theoretical_loss": 3.524935834516799, "tokens_seen": 1457926144 }, { "epoch": 4.03, "learning_rate": 0.00028196589769307923, "loss": 3.0132, "theoretical_loss": 3.5249217879981467, "tokens_seen": 1457991680 }, { "epoch": 4.03, "learning_rate": 0.00028195586760280846, "loss": 2.8342, "theoretical_loss": 3.5249077422876427, "tokens_seen": 1458057216 }, { "epoch": 4.03, "objective/train/docs_used": 2328927, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0189876556396484, "objective/train/theoretical_loss": 3.5248936973852043, "objective/train/tokens_used": 1478582752, "theoretical_loss": 3.5248936973852043, "tokens_seen": 1458122752 }, { "epoch": 4.03, "learning_rate": 0.0002819458375125376, "loss": 2.8681, "theoretical_loss": 3.5248936973852043, "tokens_seen": 1458122752 }, { "epoch": 4.03, "learning_rate": 0.0002819358074222668, "loss": 2.7898, "theoretical_loss": 3.524879653290748, "tokens_seen": 1458188288 }, { "epoch": 4.03, "learning_rate": 0.000281925777331996, "loss": 2.7825, "theoretical_loss": 3.524865610004192, "tokens_seen": 1458253824 }, { "epoch": 4.03, "learning_rate": 0.0002819157472417252, "loss": 2.8859, "theoretical_loss": 3.524851567525454, "tokens_seen": 1458319360 }, { "epoch": 4.03, "learning_rate": 0.00028190571715145437, "loss": 2.9086, "theoretical_loss": 3.5248375258544495, "tokens_seen": 1458384896 }, { "epoch": 4.03, "learning_rate": 0.00028189568706118355, "loss": 2.7685, "theoretical_loss": 3.5248234849910967, "tokens_seen": 1458450432 }, { "epoch": 4.03, "learning_rate": 0.00028188565697091273, "loss": 2.5415, "theoretical_loss": 3.524809444935313, "tokens_seen": 1458515968 }, { "epoch": 4.03, "learning_rate": 0.00028187562688064197, "loss": 2.886, "theoretical_loss": 3.5247954056870148, "tokens_seen": 1458581504 }, { "epoch": 4.03, "learning_rate": 0.0002818655967903711, "loss": 2.9029, "theoretical_loss": 3.52478136724612, "tokens_seen": 1458647040 }, { "epoch": 4.03, "learning_rate": 0.00028185556670010033, "loss": 2.8779, "theoretical_loss": 3.5247673296125464, "tokens_seen": 1458712576 }, { "epoch": 4.03, "learning_rate": 0.00028184553660982946, "loss": 2.749, "theoretical_loss": 3.5247532927862104, "tokens_seen": 1458778112 }, { "epoch": 4.03, "learning_rate": 0.0002818355065195587, "loss": 2.9389, "theoretical_loss": 3.52473925676703, "tokens_seen": 1458843648 }, { "epoch": 4.03, "learning_rate": 0.0002818254764292879, "loss": 2.7908, "theoretical_loss": 3.524725221554922, "tokens_seen": 1458909184 }, { "epoch": 4.03, "learning_rate": 0.00028181544633901705, "loss": 2.7462, "theoretical_loss": 3.524711187149804, "tokens_seen": 1458974720 }, { "epoch": 4.03, "learning_rate": 0.00028180541624874624, "loss": 2.8493, "theoretical_loss": 3.5246971535515934, "tokens_seen": 1459040256 }, { "epoch": 4.03, "learning_rate": 0.0002817953861584754, "loss": 2.9262, "theoretical_loss": 3.524683120760207, "tokens_seen": 1459105792 }, { "epoch": 4.03, "learning_rate": 0.0002817853560682046, "loss": 2.9144, "theoretical_loss": 3.524669088775563, "tokens_seen": 1459171328 }, { "epoch": 4.03, "learning_rate": 0.00028177532597793383, "loss": 2.798, "theoretical_loss": 3.5246550575975792, "tokens_seen": 1459236864 }, { "epoch": 4.03, "learning_rate": 0.00028176529588766296, "loss": 2.7812, "theoretical_loss": 3.5246410272261715, "tokens_seen": 1459302400 }, { "epoch": 4.03, "learning_rate": 0.0002817552657973922, "loss": 2.8684, "theoretical_loss": 3.524626997661258, "tokens_seen": 1459367936 }, { "epoch": 4.03, "learning_rate": 0.0002817452357071214, "loss": 2.7663, "theoretical_loss": 3.5246129689027565, "tokens_seen": 1459433472 }, { "epoch": 4.03, "learning_rate": 0.00028173520561685056, "loss": 2.922, "theoretical_loss": 3.5245989409505842, "tokens_seen": 1459499008 }, { "epoch": 4.03, "learning_rate": 0.00028172517552657974, "loss": 2.9499, "theoretical_loss": 3.524584913804658, "tokens_seen": 1459564544 }, { "epoch": 4.03, "learning_rate": 0.0002817151454363089, "loss": 2.7222, "theoretical_loss": 3.524570887464896, "tokens_seen": 1459630080 }, { "epoch": 4.03, "learning_rate": 0.0002817051153460381, "loss": 2.9137, "theoretical_loss": 3.5245568619312158, "tokens_seen": 1459695616 }, { "epoch": 4.03, "objective/train/docs_used": 2330225, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9520325660705566, "objective/train/theoretical_loss": 3.524542837203535, "objective/train/tokens_used": 1480221152, "theoretical_loss": 3.524542837203535, "tokens_seen": 1459761152 }, { "epoch": 4.03, "learning_rate": 0.00028169508525576734, "loss": 2.9433, "theoretical_loss": 3.524542837203535, "tokens_seen": 1459761152 }, { "epoch": 4.03, "learning_rate": 0.00028168505516549646, "loss": 2.9466, "theoretical_loss": 3.52452881328177, "tokens_seen": 1459826688 }, { "epoch": 4.03, "learning_rate": 0.0002816750250752257, "loss": 2.8188, "theoretical_loss": 3.5245147901658394, "tokens_seen": 1459892224 }, { "epoch": 4.03, "learning_rate": 0.0002816649949849548, "loss": 2.5592, "theoretical_loss": 3.52450076785566, "tokens_seen": 1459957760 }, { "epoch": 4.03, "learning_rate": 0.00028165496489468406, "loss": 2.6693, "theoretical_loss": 3.5244867463511502, "tokens_seen": 1460023296 }, { "epoch": 4.03, "learning_rate": 0.00028164493480441324, "loss": 2.6378, "theoretical_loss": 3.5244727256522266, "tokens_seen": 1460088832 }, { "epoch": 4.03, "learning_rate": 0.0002816349047141424, "loss": 2.8048, "theoretical_loss": 3.5244587057588075, "tokens_seen": 1460154368 }, { "epoch": 4.03, "learning_rate": 0.0002816248746238716, "loss": 2.8763, "theoretical_loss": 3.52444468667081, "tokens_seen": 1460219904 }, { "epoch": 4.03, "learning_rate": 0.0002816148445336008, "loss": 2.735, "theoretical_loss": 3.524430668388152, "tokens_seen": 1460285440 }, { "epoch": 4.03, "learning_rate": 0.00028160481444332997, "loss": 2.9337, "theoretical_loss": 3.5244166509107506, "tokens_seen": 1460350976 }, { "epoch": 4.03, "learning_rate": 0.0002815947843530592, "loss": 2.8772, "theoretical_loss": 3.524402634238524, "tokens_seen": 1460416512 }, { "epoch": 4.03, "learning_rate": 0.00028158475426278833, "loss": 2.7582, "theoretical_loss": 3.5243886183713897, "tokens_seen": 1460482048 }, { "epoch": 4.03, "learning_rate": 0.00028157472417251756, "loss": 2.879, "theoretical_loss": 3.5243746033092656, "tokens_seen": 1460547584 }, { "epoch": 4.03, "learning_rate": 0.00028156469408224674, "loss": 2.7084, "theoretical_loss": 3.524360589052068, "tokens_seen": 1460613120 }, { "epoch": 4.03, "learning_rate": 0.0002815546639919759, "loss": 2.7716, "theoretical_loss": 3.5243465755997163, "tokens_seen": 1460678656 }, { "epoch": 4.03, "learning_rate": 0.0002815446339017051, "loss": 2.8369, "theoretical_loss": 3.524332562952127, "tokens_seen": 1460744192 }, { "epoch": 4.03, "learning_rate": 0.0002815346038114343, "loss": 2.8647, "theoretical_loss": 3.5243185511092183, "tokens_seen": 1460809728 }, { "epoch": 4.03, "learning_rate": 0.0002815245737211635, "loss": 2.8103, "theoretical_loss": 3.524304540070908, "tokens_seen": 1460875264 }, { "epoch": 4.03, "learning_rate": 0.0002815145436308927, "loss": 2.8475, "theoretical_loss": 3.524290529837113, "tokens_seen": 1460940800 }, { "epoch": 4.03, "learning_rate": 0.0002815045135406219, "loss": 2.7633, "theoretical_loss": 3.524276520407752, "tokens_seen": 1461006336 }, { "epoch": 4.03, "learning_rate": 0.00028149448345035107, "loss": 2.6986, "theoretical_loss": 3.524262511782742, "tokens_seen": 1461071872 }, { "epoch": 4.03, "learning_rate": 0.00028148445336008025, "loss": 2.9216, "theoretical_loss": 3.524248503962001, "tokens_seen": 1461137408 }, { "epoch": 4.03, "learning_rate": 0.00028147442326980943, "loss": 2.9144, "theoretical_loss": 3.5242344969454473, "tokens_seen": 1461202944 }, { "epoch": 4.03, "learning_rate": 0.00028146439317953866, "loss": 2.8097, "theoretical_loss": 3.5242204907329984, "tokens_seen": 1461268480 }, { "epoch": 4.03, "learning_rate": 0.0002814543630892678, "loss": 2.7353, "theoretical_loss": 3.524206485324571, "tokens_seen": 1461334016 }, { "epoch": 4.03, "objective/train/docs_used": 2333652, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0563559532165527, "objective/train/theoretical_loss": 3.5241924807200844, "objective/train/tokens_used": 1481859552, "theoretical_loss": 3.5241924807200844, "tokens_seen": 1461399552 }, { "epoch": 4.03, "learning_rate": 0.000281444332998997, "loss": 2.964, "theoretical_loss": 3.5241924807200844, "tokens_seen": 1461399552 }, { "epoch": 4.03, "learning_rate": 0.0002814343029087262, "loss": 2.9525, "theoretical_loss": 3.5241784769194555, "tokens_seen": 1461465088 }, { "epoch": 4.03, "learning_rate": 0.0002814242728184554, "loss": 2.9143, "theoretical_loss": 3.5241644739226023, "tokens_seen": 1461530624 }, { "epoch": 4.03, "learning_rate": 0.00028141424272818457, "loss": 2.7836, "theoretical_loss": 3.524150471729443, "tokens_seen": 1461596160 }, { "epoch": 4.03, "learning_rate": 0.00028140421263791375, "loss": 2.6784, "theoretical_loss": 3.524136470339895, "tokens_seen": 1461661696 }, { "epoch": 4.03, "learning_rate": 0.00028139418254764293, "loss": 2.7071, "theoretical_loss": 3.524122469753876, "tokens_seen": 1461727232 }, { "epoch": 4.03, "learning_rate": 0.00028138415245737217, "loss": 2.7578, "theoretical_loss": 3.5241084699713046, "tokens_seen": 1461792768 }, { "epoch": 4.03, "learning_rate": 0.0002813741223671013, "loss": 2.9143, "theoretical_loss": 3.5240944709920985, "tokens_seen": 1461858304 }, { "epoch": 4.03, "learning_rate": 0.00028136409227683053, "loss": 2.9192, "theoretical_loss": 3.5240804728161743, "tokens_seen": 1461923840 }, { "epoch": 4.03, "learning_rate": 0.00028135406218655966, "loss": 2.9359, "theoretical_loss": 3.5240664754434516, "tokens_seen": 1461989376 }, { "epoch": 4.03, "learning_rate": 0.0002813440320962889, "loss": 2.649, "theoretical_loss": 3.5240524788738474, "tokens_seen": 1462054912 }, { "epoch": 4.03, "learning_rate": 0.0002813340020060181, "loss": 2.8314, "theoretical_loss": 3.52403848310728, "tokens_seen": 1462120448 }, { "epoch": 4.03, "learning_rate": 0.00028132397191574725, "loss": 2.8844, "theoretical_loss": 3.5240244881436675, "tokens_seen": 1462185984 }, { "epoch": 4.03, "learning_rate": 0.00028131394182547644, "loss": 2.8238, "theoretical_loss": 3.524010493982927, "tokens_seen": 1462251520 }, { "epoch": 4.03, "learning_rate": 0.0002813039117352056, "loss": 2.7529, "theoretical_loss": 3.523996500624977, "tokens_seen": 1462317056 }, { "epoch": 4.03, "learning_rate": 0.0002812938816449348, "loss": 2.7015, "theoretical_loss": 3.523982508069736, "tokens_seen": 1462382592 }, { "epoch": 4.03, "learning_rate": 0.00028128385155466403, "loss": 2.8788, "theoretical_loss": 3.5239685163171206, "tokens_seen": 1462448128 }, { "epoch": 4.03, "learning_rate": 0.00028127382146439316, "loss": 2.8331, "theoretical_loss": 3.52395452536705, "tokens_seen": 1462513664 }, { "epoch": 4.03, "learning_rate": 0.0002812637913741224, "loss": 2.8112, "theoretical_loss": 3.523940535219442, "tokens_seen": 1462579200 }, { "epoch": 4.03, "learning_rate": 0.0002812537612838516, "loss": 2.7268, "theoretical_loss": 3.5239265458742146, "tokens_seen": 1462644736 }, { "epoch": 4.03, "learning_rate": 0.00028124373119358076, "loss": 2.7309, "theoretical_loss": 3.5239125573312853, "tokens_seen": 1462710272 }, { "epoch": 4.03, "learning_rate": 0.00028123370110330994, "loss": 2.7844, "theoretical_loss": 3.523898569590573, "tokens_seen": 1462775808 }, { "epoch": 4.03, "learning_rate": 0.0002812236710130391, "loss": 2.8759, "theoretical_loss": 3.523884582651995, "tokens_seen": 1462841344 }, { "epoch": 4.03, "learning_rate": 0.0002812136409227683, "loss": 2.8605, "theoretical_loss": 3.5238705965154695, "tokens_seen": 1462906880 }, { "epoch": 4.03, "learning_rate": 0.00028120361083249754, "loss": 2.799, "theoretical_loss": 3.523856611180915, "tokens_seen": 1462972416 }, { "epoch": 4.03, "objective/train/docs_used": 2336250, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.823498487472534, "objective/train/theoretical_loss": 3.5238426266482494, "objective/train/tokens_used": 1483497952, "theoretical_loss": 3.5238426266482494, "tokens_seen": 1463037952 }, { "epoch": 4.03, "learning_rate": 0.00028119358074222666, "loss": 2.8676, "theoretical_loss": 3.5238426266482494, "tokens_seen": 1463037952 }, { "epoch": 4.03, "learning_rate": 0.0002811835506519559, "loss": 2.7979, "theoretical_loss": 3.523828642917391, "tokens_seen": 1463103488 }, { "epoch": 4.03, "learning_rate": 0.000281173520561685, "loss": 2.8541, "theoretical_loss": 3.523814659988257, "tokens_seen": 1463169024 }, { "epoch": 4.03, "learning_rate": 0.00028116349047141426, "loss": 2.7814, "theoretical_loss": 3.5238006778607667, "tokens_seen": 1463234560 }, { "epoch": 4.03, "learning_rate": 0.00028115346038114344, "loss": 2.7738, "theoretical_loss": 3.5237866965348372, "tokens_seen": 1463300096 }, { "epoch": 4.03, "learning_rate": 0.0002811434302908726, "loss": 2.675, "theoretical_loss": 3.523772716010388, "tokens_seen": 1463365632 }, { "epoch": 4.03, "learning_rate": 0.0002811334002006018, "loss": 2.7372, "theoretical_loss": 3.523758736287336, "tokens_seen": 1463431168 }, { "epoch": 4.03, "learning_rate": 0.000281123370110331, "loss": 2.6784, "theoretical_loss": 3.5237447573655998, "tokens_seen": 1463496704 }, { "epoch": 4.03, "learning_rate": 0.00028111334002006017, "loss": 2.8192, "theoretical_loss": 3.5237307792450974, "tokens_seen": 1463562240 }, { "epoch": 4.03, "learning_rate": 0.0002811033099297894, "loss": 2.7604, "theoretical_loss": 3.5237168019257474, "tokens_seen": 1463627776 }, { "epoch": 4.03, "learning_rate": 0.00028109327983951853, "loss": 2.8178, "theoretical_loss": 3.523702825407468, "tokens_seen": 1463693312 }, { "epoch": 4.03, "learning_rate": 0.00028108324974924776, "loss": 2.8779, "theoretical_loss": 3.5236888496901773, "tokens_seen": 1463758848 }, { "epoch": 4.03, "learning_rate": 0.00028107321965897695, "loss": 2.9841, "theoretical_loss": 3.5236748747737936, "tokens_seen": 1463824384 }, { "epoch": 4.03, "learning_rate": 0.0002810631895687061, "loss": 2.7505, "theoretical_loss": 3.5236609006582347, "tokens_seen": 1463889920 }, { "epoch": 4.03, "learning_rate": 0.0002810531594784353, "loss": 2.8514, "theoretical_loss": 3.523646927343419, "tokens_seen": 1463955456 }, { "epoch": 4.03, "learning_rate": 0.0002810431293881645, "loss": 2.9135, "theoretical_loss": 3.5236329548292655, "tokens_seen": 1464020992 }, { "epoch": 4.03, "learning_rate": 0.00028103309929789367, "loss": 2.8037, "theoretical_loss": 3.5236189831156914, "tokens_seen": 1464086528 }, { "epoch": 4.03, "learning_rate": 0.0002810230692076229, "loss": 2.8603, "theoretical_loss": 3.5236050122026157, "tokens_seen": 1464152064 }, { "epoch": 4.03, "learning_rate": 0.00028101303911735203, "loss": 2.9547, "theoretical_loss": 3.523591042089957, "tokens_seen": 1464217600 }, { "epoch": 4.03, "learning_rate": 0.00028100300902708127, "loss": 2.639, "theoretical_loss": 3.523577072777633, "tokens_seen": 1464283136 }, { "epoch": 4.03, "learning_rate": 0.0002809929789368104, "loss": 2.7814, "theoretical_loss": 3.523563104265562, "tokens_seen": 1464348672 }, { "epoch": 4.03, "learning_rate": 0.00028098294884653963, "loss": 2.9459, "theoretical_loss": 3.5235491365536626, "tokens_seen": 1464414208 }, { "epoch": 4.03, "learning_rate": 0.0002809729187562688, "loss": 2.7375, "theoretical_loss": 3.5235351696418533, "tokens_seen": 1464479744 }, { "epoch": 4.03, "learning_rate": 0.000280962888665998, "loss": 2.7984, "theoretical_loss": 3.523521203530052, "tokens_seen": 1464545280 }, { "epoch": 4.03, "learning_rate": 0.00028095285857572717, "loss": 2.9075, "theoretical_loss": 3.523507238218178, "tokens_seen": 1464610816 }, { "epoch": 4.03, "objective/train/docs_used": 2337620, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9902091026306152, "objective/train/theoretical_loss": 3.5234932737061486, "objective/train/tokens_used": 1485136352, "theoretical_loss": 3.5234932737061486, "tokens_seen": 1464676352 }, { "epoch": 4.03, "learning_rate": 0.00028094282848545635, "loss": 2.9391, "theoretical_loss": 3.5234932737061486, "tokens_seen": 1464676352 }, { "epoch": 4.03, "learning_rate": 0.00028093279839518553, "loss": 2.8613, "theoretical_loss": 3.523479309993882, "tokens_seen": 1464741888 }, { "epoch": 4.03, "learning_rate": 0.00028092276830491477, "loss": 2.894, "theoretical_loss": 3.5234653470812987, "tokens_seen": 1464807424 }, { "epoch": 4.03, "learning_rate": 0.0002809127382146439, "loss": 2.7007, "theoretical_loss": 3.5234513849683147, "tokens_seen": 1464872960 }, { "epoch": 4.03, "learning_rate": 0.00028090270812437313, "loss": 2.7206, "theoretical_loss": 3.5234374236548494, "tokens_seen": 1464938496 }, { "epoch": 4.03, "learning_rate": 0.0002808926780341023, "loss": 2.8013, "theoretical_loss": 3.523423463140822, "tokens_seen": 1465004032 }, { "epoch": 4.03, "learning_rate": 0.0002808826479438315, "loss": 2.9203, "theoretical_loss": 3.5234095034261497, "tokens_seen": 1465069568 }, { "epoch": 4.03, "learning_rate": 0.0002808726178535607, "loss": 2.8058, "theoretical_loss": 3.523395544510752, "tokens_seen": 1465135104 }, { "epoch": 4.03, "learning_rate": 0.00028086258776328986, "loss": 2.7672, "theoretical_loss": 3.523381586394546, "tokens_seen": 1465200640 }, { "epoch": 4.03, "learning_rate": 0.00028085255767301904, "loss": 2.9842, "theoretical_loss": 3.523367629077452, "tokens_seen": 1465266176 }, { "epoch": 4.03, "learning_rate": 0.0002808425275827483, "loss": 2.7369, "theoretical_loss": 3.5233536725593875, "tokens_seen": 1465331712 }, { "epoch": 4.03, "learning_rate": 0.0002808324974924774, "loss": 2.7718, "theoretical_loss": 3.5233397168402707, "tokens_seen": 1465397248 }, { "epoch": 4.03, "learning_rate": 0.00028082246740220664, "loss": 2.7478, "theoretical_loss": 3.523325761920021, "tokens_seen": 1465462784 }, { "epoch": 4.03, "learning_rate": 0.00028081243731193576, "loss": 2.7843, "theoretical_loss": 3.5233118077985566, "tokens_seen": 1465528320 }, { "epoch": 4.03, "learning_rate": 0.000280802407221665, "loss": 2.8613, "theoretical_loss": 3.5232978544757962, "tokens_seen": 1465593856 }, { "epoch": 4.03, "learning_rate": 0.0002807923771313942, "loss": 2.7711, "theoretical_loss": 3.5232839019516575, "tokens_seen": 1465659392 }, { "epoch": 4.03, "learning_rate": 0.00028078234704112336, "loss": 2.8276, "theoretical_loss": 3.5232699502260605, "tokens_seen": 1465724928 }, { "epoch": 4.03, "learning_rate": 0.0002807723169508526, "loss": 2.6797, "theoretical_loss": 3.5232559992989225, "tokens_seen": 1465790464 }, { "epoch": 4.03, "learning_rate": 0.0002807622868605818, "loss": 2.8776, "theoretical_loss": 3.523242049170163, "tokens_seen": 1465856000 }, { "epoch": 4.03, "learning_rate": 0.00028075225677031096, "loss": 2.8836, "theoretical_loss": 3.5232280998397, "tokens_seen": 1465921536 }, { "epoch": 4.03, "learning_rate": 0.00028074222668004014, "loss": 2.8652, "theoretical_loss": 3.5232141513074526, "tokens_seen": 1465987072 }, { "epoch": 4.03, "learning_rate": 0.0002807321965897693, "loss": 2.8132, "theoretical_loss": 3.5232002035733396, "tokens_seen": 1466052608 }, { "epoch": 4.03, "learning_rate": 0.0002807221664994985, "loss": 2.8845, "theoretical_loss": 3.5231862566372785, "tokens_seen": 1466118144 }, { "epoch": 4.03, "learning_rate": 0.00028071213640922774, "loss": 2.704, "theoretical_loss": 3.5231723104991897, "tokens_seen": 1466183680 }, { "epoch": 4.03, "learning_rate": 0.00028070210631895686, "loss": 2.8581, "theoretical_loss": 3.52315836515899, "tokens_seen": 1466249216 }, { "epoch": 4.03, "objective/train/docs_used": 2340479, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0394797325134277, "objective/train/theoretical_loss": 3.5231444206165996, "objective/train/tokens_used": 1486774752, "theoretical_loss": 3.5231444206165996, "tokens_seen": 1466314752 }, { "epoch": 4.03, "learning_rate": 0.0002806920762286861, "loss": 2.9248, "theoretical_loss": 3.5231444206165996, "tokens_seen": 1466314752 }, { "epoch": 4.03, "learning_rate": 0.0002806820461384152, "loss": 2.6816, "theoretical_loss": 3.5231304768719367, "tokens_seen": 1466380288 }, { "epoch": 4.03, "learning_rate": 0.00028067201604814446, "loss": 2.636, "theoretical_loss": 3.52311653392492, "tokens_seen": 1466445824 }, { "epoch": 4.03, "learning_rate": 0.00028066198595787364, "loss": 2.6678, "theoretical_loss": 3.5231025917754684, "tokens_seen": 1466511360 }, { "epoch": 4.03, "learning_rate": 0.0002806519558676028, "loss": 2.7019, "theoretical_loss": 3.5230886504234995, "tokens_seen": 1466576896 }, { "epoch": 4.03, "learning_rate": 0.000280641925777332, "loss": 2.7007, "theoretical_loss": 3.5230747098689337, "tokens_seen": 1466642432 }, { "epoch": 4.03, "learning_rate": 0.0002806318956870612, "loss": 2.6109, "theoretical_loss": 3.523060770111689, "tokens_seen": 1466707968 }, { "epoch": 4.03, "learning_rate": 0.00028062186559679037, "loss": 2.9016, "theoretical_loss": 3.5230468311516843, "tokens_seen": 1466773504 }, { "epoch": 4.03, "learning_rate": 0.0002806118355065196, "loss": 2.7796, "theoretical_loss": 3.523032892988838, "tokens_seen": 1466839040 }, { "epoch": 4.03, "learning_rate": 0.00028060180541624873, "loss": 2.7781, "theoretical_loss": 3.52301895562307, "tokens_seen": 1466904576 }, { "epoch": 4.03, "learning_rate": 0.00028059177532597796, "loss": 2.8471, "theoretical_loss": 3.5230050190542976, "tokens_seen": 1466970112 }, { "epoch": 4.03, "learning_rate": 0.00028058174523570715, "loss": 2.9119, "theoretical_loss": 3.5229910832824407, "tokens_seen": 1467035648 }, { "epoch": 4.03, "learning_rate": 0.0002805717151454363, "loss": 2.6761, "theoretical_loss": 3.522977148307417, "tokens_seen": 1467101184 }, { "epoch": 4.03, "learning_rate": 0.0002805616850551655, "loss": 2.8502, "theoretical_loss": 3.522963214129147, "tokens_seen": 1467166720 }, { "epoch": 4.03, "learning_rate": 0.0002805516549648947, "loss": 2.7626, "theoretical_loss": 3.5229492807475484, "tokens_seen": 1467232256 }, { "epoch": 4.03, "learning_rate": 0.00028054162487462387, "loss": 2.7575, "theoretical_loss": 3.52293534816254, "tokens_seen": 1467297792 }, { "epoch": 4.03, "learning_rate": 0.0002805315947843531, "loss": 2.8302, "theoretical_loss": 3.5229214163740417, "tokens_seen": 1467363328 }, { "epoch": 4.03, "learning_rate": 0.00028052156469408223, "loss": 2.795, "theoretical_loss": 3.5229074853819715, "tokens_seen": 1467428864 }, { "epoch": 4.03, "learning_rate": 0.00028051153460381147, "loss": 2.8349, "theoretical_loss": 3.5228935551862484, "tokens_seen": 1467494400 }, { "epoch": 4.03, "learning_rate": 0.0002805015045135406, "loss": 2.841, "theoretical_loss": 3.5228796257867914, "tokens_seen": 1467559936 }, { "epoch": 4.03, "learning_rate": 0.00028049147442326983, "loss": 2.7806, "theoretical_loss": 3.5228656971835193, "tokens_seen": 1467625472 }, { "epoch": 4.03, "learning_rate": 0.000280481444332999, "loss": 2.8213, "theoretical_loss": 3.5228517693763517, "tokens_seen": 1467691008 }, { "epoch": 4.03, "learning_rate": 0.0002804714142427282, "loss": 2.7226, "theoretical_loss": 3.522837842365207, "tokens_seen": 1467756544 }, { "epoch": 4.03, "learning_rate": 0.00028046138415245737, "loss": 2.7844, "theoretical_loss": 3.5228239161500037, "tokens_seen": 1467822080 }, { "epoch": 4.03, "learning_rate": 0.00028045135406218655, "loss": 2.957, "theoretical_loss": 3.5228099907306616, "tokens_seen": 1467887616 }, { "epoch": 4.03, "objective/train/docs_used": 2343295, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5314743518829346, "objective/train/theoretical_loss": 3.5227960661071, "objective/train/tokens_used": 1488413152, "theoretical_loss": 3.5227960661071, "tokens_seen": 1467953152 }, { "epoch": 4.03, "learning_rate": 0.00028044132397191574, "loss": 2.8083, "theoretical_loss": 3.5227960661071, "tokens_seen": 1467953152 }, { "epoch": 4.03, "learning_rate": 0.00028043129388164497, "loss": 2.8907, "theoretical_loss": 3.5227821422792367, "tokens_seen": 1468018688 }, { "epoch": 4.03, "learning_rate": 0.0002804212637913741, "loss": 2.7478, "theoretical_loss": 3.5227682192469914, "tokens_seen": 1468084224 }, { "epoch": 4.03, "learning_rate": 0.00028041123370110333, "loss": 2.9081, "theoretical_loss": 3.522754297010283, "tokens_seen": 1468149760 }, { "epoch": 4.03, "learning_rate": 0.0002804012036108325, "loss": 2.8637, "theoretical_loss": 3.5227403755690307, "tokens_seen": 1468215296 }, { "epoch": 4.03, "learning_rate": 0.0002803911735205617, "loss": 2.9583, "theoretical_loss": 3.522726454923153, "tokens_seen": 1468280832 }, { "epoch": 4.03, "learning_rate": 0.0002803811434302909, "loss": 2.7966, "theoretical_loss": 3.5227125350725697, "tokens_seen": 1468346368 }, { "epoch": 4.03, "learning_rate": 0.00028037111334002006, "loss": 2.9969, "theoretical_loss": 3.5226986160172, "tokens_seen": 1468411904 }, { "epoch": 4.03, "learning_rate": 0.00028036108324974924, "loss": 2.7724, "theoretical_loss": 3.522684697756962, "tokens_seen": 1468477440 }, { "epoch": 4.03, "learning_rate": 0.0002803510531594785, "loss": 2.8143, "theoretical_loss": 3.5226707802917754, "tokens_seen": 1468542976 }, { "epoch": 4.03, "learning_rate": 0.0002803410230692076, "loss": 2.8407, "theoretical_loss": 3.5226568636215596, "tokens_seen": 1468608512 }, { "epoch": 4.03, "learning_rate": 0.00028033099297893684, "loss": 2.8914, "theoretical_loss": 3.522642947746233, "tokens_seen": 1468674048 }, { "epoch": 4.03, "learning_rate": 0.00028032096288866596, "loss": 2.9126, "theoretical_loss": 3.522629032665715, "tokens_seen": 1468739584 }, { "epoch": 4.03, "learning_rate": 0.0002803109327983952, "loss": 2.82, "theoretical_loss": 3.5226151183799246, "tokens_seen": 1468805120 }, { "epoch": 4.03, "learning_rate": 0.0002803009027081244, "loss": 2.7881, "theoretical_loss": 3.5226012048887823, "tokens_seen": 1468870656 }, { "epoch": 4.03, "learning_rate": 0.00028029087261785356, "loss": 2.9265, "theoretical_loss": 3.522587292192205, "tokens_seen": 1468936192 }, { "epoch": 4.03, "learning_rate": 0.00028028084252758274, "loss": 2.8501, "theoretical_loss": 3.5225733802901136, "tokens_seen": 1469001728 }, { "epoch": 4.03, "learning_rate": 0.000280270812437312, "loss": 2.948, "theoretical_loss": 3.5225594691824265, "tokens_seen": 1469067264 }, { "epoch": 4.03, "learning_rate": 0.0002802607823470411, "loss": 2.598, "theoretical_loss": 3.522545558869063, "tokens_seen": 1469132800 }, { "epoch": 4.03, "learning_rate": 0.00028025075225677034, "loss": 2.664, "theoretical_loss": 3.5225316493499426, "tokens_seen": 1469198336 }, { "epoch": 4.03, "learning_rate": 0.00028024072216649947, "loss": 2.6805, "theoretical_loss": 3.522517740624984, "tokens_seen": 1469263872 }, { "epoch": 4.03, "learning_rate": 0.0002802306920762287, "loss": 2.8274, "theoretical_loss": 3.522503832694108, "tokens_seen": 1469329408 }, { "epoch": 4.03, "learning_rate": 0.0002802206619859579, "loss": 2.6182, "theoretical_loss": 3.522489925557231, "tokens_seen": 1469394944 }, { "epoch": 4.03, "learning_rate": 0.00028021063189568706, "loss": 2.8327, "theoretical_loss": 3.5224760192142752, "tokens_seen": 1469460480 }, { "epoch": 4.03, "learning_rate": 0.00028020060180541624, "loss": 2.7532, "theoretical_loss": 3.522462113665158, "tokens_seen": 1469526016 }, { "epoch": 4.03, "objective/train/docs_used": 2346341, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0546634197235107, "objective/train/theoretical_loss": 3.5224482089097995, "objective/train/tokens_used": 1490051552, "theoretical_loss": 3.5224482089097995, "tokens_seen": 1469591552 }, { "epoch": 4.03, "learning_rate": 0.0002801905717151454, "loss": 3.0163, "theoretical_loss": 3.5224482089097995, "tokens_seen": 1469591552 }, { "epoch": 4.03, "learning_rate": 0.0002801805416248746, "loss": 2.6669, "theoretical_loss": 3.522434304948119, "tokens_seen": 1469657088 }, { "epoch": 4.03, "learning_rate": 0.00028017051153460384, "loss": 2.7303, "theoretical_loss": 3.5224204017800345, "tokens_seen": 1469722624 }, { "epoch": 4.03, "learning_rate": 0.00028016048144433297, "loss": 2.8091, "theoretical_loss": 3.5224064994054674, "tokens_seen": 1469788160 }, { "epoch": 4.03, "learning_rate": 0.0002801504513540622, "loss": 2.7591, "theoretical_loss": 3.522392597824336, "tokens_seen": 1469853696 }, { "epoch": 4.03, "learning_rate": 0.00028014042126379133, "loss": 2.7065, "theoretical_loss": 3.522378697036559, "tokens_seen": 1469919232 }, { "epoch": 4.03, "learning_rate": 0.00028013039117352057, "loss": 2.7937, "theoretical_loss": 3.522364797042057, "tokens_seen": 1469984768 }, { "epoch": 4.03, "learning_rate": 0.00028012036108324975, "loss": 2.7074, "theoretical_loss": 3.5223508978407487, "tokens_seen": 1470050304 }, { "epoch": 4.03, "learning_rate": 0.00028011033099297893, "loss": 2.7701, "theoretical_loss": 3.5223369994325533, "tokens_seen": 1470115840 }, { "epoch": 4.03, "learning_rate": 0.0002801003009027081, "loss": 2.8538, "theoretical_loss": 3.52232310181739, "tokens_seen": 1470181376 }, { "epoch": 4.03, "learning_rate": 0.00028009027081243735, "loss": 2.8078, "theoretical_loss": 3.52230920499518, "tokens_seen": 1470246912 }, { "epoch": 4.03, "learning_rate": 0.00028008024072216647, "loss": 2.944, "theoretical_loss": 3.52229530896584, "tokens_seen": 1470312448 }, { "epoch": 4.03, "learning_rate": 0.0002800702106318957, "loss": 2.7833, "theoretical_loss": 3.5222814137292917, "tokens_seen": 1470377984 }, { "epoch": 4.03, "learning_rate": 0.00028006018054162483, "loss": 2.8339, "theoretical_loss": 3.5222675192854536, "tokens_seen": 1470443520 }, { "epoch": 4.03, "learning_rate": 0.00028005015045135407, "loss": 2.792, "theoretical_loss": 3.522253625634245, "tokens_seen": 1470509056 }, { "epoch": 4.03, "learning_rate": 0.00028004012036108325, "loss": 2.7918, "theoretical_loss": 3.522239732775585, "tokens_seen": 1470574592 }, { "epoch": 4.03, "learning_rate": 0.00028003009027081243, "loss": 2.8287, "theoretical_loss": 3.5222258407093943, "tokens_seen": 1470640128 }, { "epoch": 4.03, "learning_rate": 0.00028002006018054167, "loss": 2.7506, "theoretical_loss": 3.5222119494355915, "tokens_seen": 1470705664 }, { "epoch": 4.03, "learning_rate": 0.0002800100300902708, "loss": 2.8767, "theoretical_loss": 3.522198058954096, "tokens_seen": 1470771200 }, { "epoch": 4.03, "learning_rate": 0.00028000000000000003, "loss": 2.7657, "theoretical_loss": 3.522184169264828, "tokens_seen": 1470836736 }, { "epoch": 4.03, "learning_rate": 0.0002799899699097292, "loss": 2.8922, "theoretical_loss": 3.522170280367707, "tokens_seen": 1470902272 }, { "epoch": 4.03, "learning_rate": 0.0002799799398194584, "loss": 2.846, "theoretical_loss": 3.5221563922626515, "tokens_seen": 1470967808 }, { "epoch": 4.03, "learning_rate": 0.0002799699097291876, "loss": 2.7896, "theoretical_loss": 3.522142504949582, "tokens_seen": 1471033344 }, { "epoch": 4.03, "learning_rate": 0.00027995987963891675, "loss": 2.9192, "theoretical_loss": 3.5221286184284173, "tokens_seen": 1471098880 }, { "epoch": 4.03, "learning_rate": 0.00027994984954864594, "loss": 2.754, "theoretical_loss": 3.522114732699078, "tokens_seen": 1471164416 }, { "epoch": 4.03, "objective/train/docs_used": 2348962, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7591745853424072, "objective/train/theoretical_loss": 3.522100847761483, "objective/train/tokens_used": 1491689952, "theoretical_loss": 3.522100847761483, "tokens_seen": 1471229952 }, { "epoch": 4.03, "learning_rate": 0.00027993981945837517, "loss": 2.8169, "theoretical_loss": 3.522100847761483, "tokens_seen": 1471229952 }, { "epoch": 4.03, "learning_rate": 0.0002799297893681043, "loss": 2.7051, "theoretical_loss": 3.5220869636155516, "tokens_seen": 1471295488 }, { "epoch": 4.03, "learning_rate": 0.00027991975927783353, "loss": 2.7762, "theoretical_loss": 3.522073080261204, "tokens_seen": 1471361024 }, { "epoch": 4.03, "learning_rate": 0.0002799097291875627, "loss": 2.6923, "theoretical_loss": 3.52205919769836, "tokens_seen": 1471426560 }, { "epoch": 4.03, "learning_rate": 0.0002798996990972919, "loss": 2.8842, "theoretical_loss": 3.5220453159269383, "tokens_seen": 1471492096 }, { "epoch": 4.03, "learning_rate": 0.0002798896690070211, "loss": 2.7543, "theoretical_loss": 3.5220314349468596, "tokens_seen": 1471557632 }, { "epoch": 4.03, "learning_rate": 0.00027987963891675026, "loss": 2.7242, "theoretical_loss": 3.5220175547580426, "tokens_seen": 1471623168 }, { "epoch": 4.03, "learning_rate": 0.00027986960882647944, "loss": 2.6378, "theoretical_loss": 3.5220036753604074, "tokens_seen": 1471688704 }, { "epoch": 4.03, "learning_rate": 0.0002798595787362087, "loss": 2.6363, "theoretical_loss": 3.521989796753874, "tokens_seen": 1471754240 }, { "epoch": 4.03, "learning_rate": 0.0002798495486459378, "loss": 2.8517, "theoretical_loss": 3.5219759189383613, "tokens_seen": 1471819776 }, { "epoch": 4.03, "learning_rate": 0.00027983951855566704, "loss": 2.7555, "theoretical_loss": 3.52196204191379, "tokens_seen": 1471885312 }, { "epoch": 4.03, "learning_rate": 0.00027982948846539616, "loss": 2.88, "theoretical_loss": 3.5219481656800786, "tokens_seen": 1471950848 }, { "epoch": 4.03, "learning_rate": 0.0002798194583751254, "loss": 2.8213, "theoretical_loss": 3.521934290237148, "tokens_seen": 1472016384 }, { "epoch": 4.03, "learning_rate": 0.0002798094282848546, "loss": 2.7494, "theoretical_loss": 3.5219204155849173, "tokens_seen": 1472081920 }, { "epoch": 4.03, "learning_rate": 0.00027979939819458376, "loss": 2.8313, "theoretical_loss": 3.5219065417233058, "tokens_seen": 1472147456 }, { "epoch": 4.03, "learning_rate": 0.00027978936810431294, "loss": 2.6861, "theoretical_loss": 3.5218926686522343, "tokens_seen": 1472212992 }, { "epoch": 4.03, "learning_rate": 0.0002797793380140422, "loss": 2.8688, "theoretical_loss": 3.521878796371622, "tokens_seen": 1472278528 }, { "epoch": 4.03, "learning_rate": 0.0002797693079237713, "loss": 2.6626, "theoretical_loss": 3.5218649248813887, "tokens_seen": 1472344064 }, { "epoch": 4.03, "learning_rate": 0.00027975927783350054, "loss": 2.6655, "theoretical_loss": 3.5218510541814543, "tokens_seen": 1472409600 }, { "epoch": 4.03, "learning_rate": 0.00027974924774322967, "loss": 2.8501, "theoretical_loss": 3.5218371842717384, "tokens_seen": 1472475136 }, { "epoch": 4.03, "learning_rate": 0.0002797392176529589, "loss": 2.668, "theoretical_loss": 3.5218233151521607, "tokens_seen": 1472540672 }, { "epoch": 4.03, "learning_rate": 0.0002797291875626881, "loss": 2.8214, "theoretical_loss": 3.521809446822642, "tokens_seen": 1472606208 }, { "epoch": 4.03, "learning_rate": 0.00027971915747241726, "loss": 2.8368, "theoretical_loss": 3.5217955792831006, "tokens_seen": 1472671744 }, { "epoch": 4.03, "learning_rate": 0.00027970912738214644, "loss": 2.836, "theoretical_loss": 3.521781712533458, "tokens_seen": 1472737280 }, { "epoch": 4.03, "learning_rate": 0.0002796990972918756, "loss": 2.8124, "theoretical_loss": 3.5217678465736326, "tokens_seen": 1472802816 }, { "epoch": 4.03, "objective/train/docs_used": 2350390, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7542591094970703, "objective/train/theoretical_loss": 3.521753981403545, "objective/train/tokens_used": 1493328352, "theoretical_loss": 3.521753981403545, "tokens_seen": 1472868352 }, { "epoch": 4.03, "learning_rate": 0.0002796890672016048, "loss": 2.6975, "theoretical_loss": 3.521753981403545, "tokens_seen": 1472868352 }, { "epoch": 4.03, "learning_rate": 0.00027967903711133404, "loss": 2.8421, "theoretical_loss": 3.521740117023115, "tokens_seen": 1472933888 }, { "epoch": 4.03, "learning_rate": 0.00027966900702106317, "loss": 2.785, "theoretical_loss": 3.521726253432263, "tokens_seen": 1472999424 }, { "epoch": 4.03, "learning_rate": 0.0002796589769307924, "loss": 2.7678, "theoretical_loss": 3.5217123906309076, "tokens_seen": 1473064960 }, { "epoch": 4.03, "learning_rate": 0.00027964894684052153, "loss": 2.6683, "theoretical_loss": 3.5216985286189697, "tokens_seen": 1473130496 }, { "epoch": 4.03, "learning_rate": 0.00027963891675025077, "loss": 2.7883, "theoretical_loss": 3.521684667396369, "tokens_seen": 1473196032 }, { "epoch": 4.03, "learning_rate": 0.00027962888665997995, "loss": 2.9431, "theoretical_loss": 3.5216708069630256, "tokens_seen": 1473261568 }, { "epoch": 4.03, "learning_rate": 0.00027961885656970913, "loss": 2.8779, "theoretical_loss": 3.521656947318859, "tokens_seen": 1473327104 }, { "epoch": 4.03, "learning_rate": 0.0002796088264794383, "loss": 2.7089, "theoretical_loss": 3.52164308846379, "tokens_seen": 1473392640 }, { "epoch": 4.03, "learning_rate": 0.00027959879638916755, "loss": 2.8148, "theoretical_loss": 3.5216292303977377, "tokens_seen": 1473458176 }, { "epoch": 4.03, "learning_rate": 0.00027958876629889667, "loss": 2.547, "theoretical_loss": 3.521615373120623, "tokens_seen": 1473523712 }, { "epoch": 4.03, "learning_rate": 0.0002795787362086259, "loss": 2.8451, "theoretical_loss": 3.521601516632365, "tokens_seen": 1473589248 }, { "epoch": 4.03, "learning_rate": 0.00027956870611835503, "loss": 2.8311, "theoretical_loss": 3.521587660932884, "tokens_seen": 1473654784 }, { "epoch": 4.03, "learning_rate": 0.00027955867602808427, "loss": 2.8761, "theoretical_loss": 3.5215738060221, "tokens_seen": 1473720320 }, { "epoch": 4.03, "learning_rate": 0.00027954864593781345, "loss": 2.8002, "theoretical_loss": 3.521559951899933, "tokens_seen": 1473785856 }, { "epoch": 4.03, "learning_rate": 0.00027953861584754263, "loss": 2.8759, "theoretical_loss": 3.521546098566303, "tokens_seen": 1473851392 }, { "epoch": 4.03, "learning_rate": 0.0002795285857572718, "loss": 2.9697, "theoretical_loss": 3.5215322460211307, "tokens_seen": 1473916928 }, { "epoch": 4.03, "learning_rate": 0.000279518555667001, "loss": 2.8321, "theoretical_loss": 3.5215183942643353, "tokens_seen": 1473982464 }, { "epoch": 4.03, "learning_rate": 0.0002795085255767302, "loss": 2.7588, "theoretical_loss": 3.521504543295838, "tokens_seen": 1474048000 }, { "epoch": 4.03, "learning_rate": 0.0002794984954864594, "loss": 2.9246, "theoretical_loss": 3.521490693115558, "tokens_seen": 1474113536 }, { "epoch": 4.03, "learning_rate": 0.00027948846539618854, "loss": 2.802, "theoretical_loss": 3.521476843723415, "tokens_seen": 1474179072 }, { "epoch": 4.03, "learning_rate": 0.0002794784353059178, "loss": 2.8974, "theoretical_loss": 3.5214629951193297, "tokens_seen": 1474244608 }, { "epoch": 4.03, "learning_rate": 0.0002794684052156469, "loss": 2.805, "theoretical_loss": 3.5214491473032226, "tokens_seen": 1474310144 }, { "epoch": 4.03, "learning_rate": 0.00027945837512537614, "loss": 2.6504, "theoretical_loss": 3.5214353002750136, "tokens_seen": 1474375680 }, { "epoch": 4.03, "learning_rate": 0.0002794483450351053, "loss": 2.981, "theoretical_loss": 3.5214214540346225, "tokens_seen": 1474441216 }, { "debugging/Self-BLEU-5": 0.5413186790053153, "debugging/distinct-1-grams": 0.7586577466221536, "debugging/distinct-2-grams": 0.9501756019233524, "debugging/entropy-1-grams": 6.2481304563552635, "debugging/entropy-2-grams": 7.236626443831751, "debugging/length": 524.25, "debugging/num_segments": 20, "epoch": 4.03, "objective/train/docs_used": 2353212, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0460939407348633, "objective/train/theoretical_loss": 3.5214076085819697, "objective/train/tokens_used": 1494966752, "theoretical_loss": 3.5214076085819697, "tokens_seen": 1474506752 }, { "epoch": 4.03, "learning_rate": 0.0002794383149448345, "loss": 2.8915, "theoretical_loss": 3.5214076085819697, "tokens_seen": 1474506752 }, { "epoch": 4.03, "learning_rate": 0.0002794282848545637, "loss": 2.8158, "theoretical_loss": 3.5213937639169757, "tokens_seen": 1474572288 }, { "epoch": 4.03, "learning_rate": 0.0002794182547642929, "loss": 2.7037, "theoretical_loss": 3.5213799200395597, "tokens_seen": 1474637824 }, { "epoch": 4.03, "learning_rate": 0.00027940822467402204, "loss": 2.6801, "theoretical_loss": 3.5213660769496427, "tokens_seen": 1474703360 }, { "epoch": 4.03, "learning_rate": 0.0002793981945837513, "loss": 2.8629, "theoretical_loss": 3.521352234647145, "tokens_seen": 1474768896 }, { "epoch": 4.03, "learning_rate": 0.0002793881644934804, "loss": 2.8844, "theoretical_loss": 3.5213383931319866, "tokens_seen": 1474834432 }, { "epoch": 4.03, "learning_rate": 0.00027937813440320964, "loss": 2.7328, "theoretical_loss": 3.5213245524040877, "tokens_seen": 1474899968 }, { "epoch": 4.03, "learning_rate": 0.0002793681043129388, "loss": 2.8296, "theoretical_loss": 3.521310712463368, "tokens_seen": 1474965504 }, { "epoch": 4.03, "learning_rate": 0.000279358074222668, "loss": 2.7793, "theoretical_loss": 3.521296873309749, "tokens_seen": 1475031040 }, { "epoch": 4.03, "learning_rate": 0.0002793480441323972, "loss": 2.9688, "theoretical_loss": 3.52128303494315, "tokens_seen": 1475096576 }, { "epoch": 4.03, "learning_rate": 0.00027933801404212636, "loss": 2.9019, "theoretical_loss": 3.5212691973634915, "tokens_seen": 1475162112 }, { "epoch": 4.03, "learning_rate": 0.00027932798395185554, "loss": 2.7495, "theoretical_loss": 3.521255360570694, "tokens_seen": 1475227648 }, { "epoch": 4.03, "learning_rate": 0.0002793179538615848, "loss": 2.5407, "theoretical_loss": 3.5212415245646778, "tokens_seen": 1475293184 }, { "epoch": 4.03, "learning_rate": 0.0002793079237713139, "loss": 2.6769, "theoretical_loss": 3.5212276893453627, "tokens_seen": 1475358720 }, { "epoch": 4.03, "learning_rate": 0.00027929789368104314, "loss": 2.6239, "theoretical_loss": 3.5212138549126695, "tokens_seen": 1475424256 }, { "epoch": 4.03, "learning_rate": 0.00027928786359077227, "loss": 2.7146, "theoretical_loss": 3.5212000212665187, "tokens_seen": 1475489792 }, { "epoch": 4.03, "learning_rate": 0.0002792778335005015, "loss": 2.816, "theoretical_loss": 3.52118618840683, "tokens_seen": 1475555328 }, { "epoch": 4.03, "learning_rate": 0.00027926780341023074, "loss": 2.773, "theoretical_loss": 3.5211723563335244, "tokens_seen": 1475620864 }, { "epoch": 4.03, "learning_rate": 0.00027925777331995987, "loss": 2.7312, "theoretical_loss": 3.5211585250465216, "tokens_seen": 1475686400 }, { "epoch": 4.03, "learning_rate": 0.0002792477432296891, "loss": 2.7807, "theoretical_loss": 3.5211446945457427, "tokens_seen": 1475751936 }, { "epoch": 4.03, "learning_rate": 0.0002792377131394183, "loss": 2.781, "theoretical_loss": 3.521130864831108, "tokens_seen": 1475817472 }, { "epoch": 4.03, "learning_rate": 0.00027922768304914746, "loss": 2.7553, "theoretical_loss": 3.521117035902537, "tokens_seen": 1475883008 }, { "epoch": 4.03, "learning_rate": 0.00027921765295887664, "loss": 2.8439, "theoretical_loss": 3.5211032077599516, "tokens_seen": 1475948544 }, { "epoch": 4.03, "learning_rate": 0.0002792076228686058, "loss": 2.8197, "theoretical_loss": 3.5210893804032715, "tokens_seen": 1476014080 }, { "epoch": 4.03, "learning_rate": 0.000279197592778335, "loss": 2.8621, "theoretical_loss": 3.5210755538324165, "tokens_seen": 1476079616 }, { "epoch": 4.03, "objective/train/docs_used": 2355969, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8160181045532227, "objective/train/theoretical_loss": 3.521061728047308, "objective/train/tokens_used": 1496605152, "theoretical_loss": 3.521061728047308, "tokens_seen": 1476145152 }, { "epoch": 4.03, "learning_rate": 0.00027918756268806424, "loss": 2.7449, "theoretical_loss": 3.521061728047308, "tokens_seen": 1476145152 }, { "epoch": 4.03, "learning_rate": 0.00027917753259779337, "loss": 2.9464, "theoretical_loss": 3.521047903047866, "tokens_seen": 1476210688 }, { "epoch": 4.03, "learning_rate": 0.0002791675025075226, "loss": 2.7526, "theoretical_loss": 3.521034078834011, "tokens_seen": 1476276224 }, { "epoch": 4.03, "learning_rate": 0.00027915747241725173, "loss": 2.8616, "theoretical_loss": 3.5210202554056638, "tokens_seen": 1476341760 }, { "epoch": 4.03, "learning_rate": 0.00027914744232698097, "loss": 2.7825, "theoretical_loss": 3.5210064327627446, "tokens_seen": 1476407296 }, { "epoch": 4.03, "learning_rate": 0.00027913741223671015, "loss": 2.7341, "theoretical_loss": 3.520992610905174, "tokens_seen": 1476472832 }, { "epoch": 4.03, "learning_rate": 0.00027912738214643933, "loss": 2.898, "theoretical_loss": 3.5209787898328724, "tokens_seen": 1476538368 }, { "epoch": 4.03, "learning_rate": 0.0002791173520561685, "loss": 2.6899, "theoretical_loss": 3.5209649695457603, "tokens_seen": 1476603904 }, { "epoch": 4.03, "learning_rate": 0.00027910732196589775, "loss": 2.8218, "theoretical_loss": 3.520951150043759, "tokens_seen": 1476669440 }, { "epoch": 4.03, "learning_rate": 0.00027909729187562687, "loss": 2.7045, "theoretical_loss": 3.520937331326788, "tokens_seen": 1476734976 }, { "epoch": 4.03, "learning_rate": 0.0002790872617853561, "loss": 2.7975, "theoretical_loss": 3.5209235133947683, "tokens_seen": 1476800512 }, { "epoch": 4.03, "learning_rate": 0.00027907723169508523, "loss": 2.7666, "theoretical_loss": 3.520909696247621, "tokens_seen": 1476866048 }, { "epoch": 4.03, "learning_rate": 0.00027906720160481447, "loss": 2.7816, "theoretical_loss": 3.5208958798852654, "tokens_seen": 1476931584 }, { "epoch": 4.03, "learning_rate": 0.00027905717151454365, "loss": 2.6644, "theoretical_loss": 3.5208820643076235, "tokens_seen": 1476997120 }, { "epoch": 4.03, "learning_rate": 0.00027904714142427283, "loss": 2.6161, "theoretical_loss": 3.5208682495146153, "tokens_seen": 1477062656 }, { "epoch": 4.03, "learning_rate": 0.000279037111334002, "loss": 2.6745, "theoretical_loss": 3.520854435506161, "tokens_seen": 1477128192 }, { "epoch": 4.03, "learning_rate": 0.0002790270812437312, "loss": 2.8477, "theoretical_loss": 3.520840622282182, "tokens_seen": 1477193728 }, { "epoch": 4.03, "learning_rate": 0.0002790170511534604, "loss": 2.9589, "theoretical_loss": 3.5208268098425988, "tokens_seen": 1477259264 }, { "epoch": 4.03, "learning_rate": 0.0002790070210631896, "loss": 2.8679, "theoretical_loss": 3.5208129981873313, "tokens_seen": 1477324800 }, { "epoch": 4.03, "learning_rate": 0.00027899699097291874, "loss": 2.7727, "theoretical_loss": 3.5207991873163014, "tokens_seen": 1477390336 }, { "epoch": 4.03, "learning_rate": 0.000278986960882648, "loss": 2.7081, "theoretical_loss": 3.520785377229429, "tokens_seen": 1477455872 }, { "epoch": 4.03, "learning_rate": 0.0002789769307923771, "loss": 2.9091, "theoretical_loss": 3.5207715679266345, "tokens_seen": 1477521408 }, { "epoch": 4.03, "learning_rate": 0.00027896690070210634, "loss": 2.8019, "theoretical_loss": 3.5207577594078394, "tokens_seen": 1477586944 }, { "epoch": 4.03, "learning_rate": 0.0002789568706118355, "loss": 2.6224, "theoretical_loss": 3.520743951672964, "tokens_seen": 1477652480 }, { "epoch": 4.03, "learning_rate": 0.0002789468405215647, "loss": 2.7785, "theoretical_loss": 3.5207301447219295, "tokens_seen": 1477718016 }, { "epoch": 4.03, "objective/train/docs_used": 2358390, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9614357948303223, "objective/train/theoretical_loss": 3.520716338554656, "objective/train/tokens_used": 1498243552, "theoretical_loss": 3.520716338554656, "tokens_seen": 1477783552 }, { "epoch": 4.03, "learning_rate": 0.0002789368104312939, "loss": 2.7433, "theoretical_loss": 3.520716338554656, "tokens_seen": 1477783552 }, { "epoch": 4.03, "learning_rate": 0.0002789267803410231, "loss": 2.7298, "theoretical_loss": 3.5207025331710646, "tokens_seen": 1477849088 }, { "epoch": 4.03, "learning_rate": 0.00027891675025075224, "loss": 2.6986, "theoretical_loss": 3.5206887285710757, "tokens_seen": 1477914624 }, { "epoch": 4.03, "learning_rate": 0.0002789067201604815, "loss": 2.645, "theoretical_loss": 3.5206749247546103, "tokens_seen": 1477980160 }, { "epoch": 4.03, "learning_rate": 0.0002788966900702106, "loss": 2.8666, "theoretical_loss": 3.5206611217215897, "tokens_seen": 1478045696 }, { "epoch": 4.03, "learning_rate": 0.00027888665997993984, "loss": 2.8381, "theoretical_loss": 3.5206473194719337, "tokens_seen": 1478111232 }, { "epoch": 4.03, "learning_rate": 0.000278876629889669, "loss": 2.7788, "theoretical_loss": 3.5206335180055643, "tokens_seen": 1478176768 }, { "epoch": 4.03, "learning_rate": 0.0002788665997993982, "loss": 2.6536, "theoretical_loss": 3.5206197173224014, "tokens_seen": 1478242304 }, { "epoch": 4.03, "learning_rate": 0.0002788565697091274, "loss": 2.7978, "theoretical_loss": 3.520605917422366, "tokens_seen": 1478307840 }, { "epoch": 4.03, "learning_rate": 0.00027884653961885656, "loss": 2.7895, "theoretical_loss": 3.520592118305379, "tokens_seen": 1478373376 }, { "epoch": 4.03, "learning_rate": 0.00027883650952858574, "loss": 2.826, "theoretical_loss": 3.5205783199713614, "tokens_seen": 1478438912 }, { "epoch": 4.03, "learning_rate": 0.000278826479438315, "loss": 2.7576, "theoretical_loss": 3.520564522420234, "tokens_seen": 1478504448 }, { "epoch": 4.03, "learning_rate": 0.0002788164493480441, "loss": 2.7405, "theoretical_loss": 3.5205507256519177, "tokens_seen": 1478569984 }, { "epoch": 4.03, "learning_rate": 0.00027880641925777334, "loss": 2.83, "theoretical_loss": 3.520536929666333, "tokens_seen": 1478635520 }, { "epoch": 4.03, "learning_rate": 0.00027879638916750247, "loss": 2.7709, "theoretical_loss": 3.520523134463402, "tokens_seen": 1478701056 }, { "epoch": 4.03, "learning_rate": 0.0002787863590772317, "loss": 2.9477, "theoretical_loss": 3.520509340043044, "tokens_seen": 1478766592 }, { "epoch": 4.03, "learning_rate": 0.0002787763289869609, "loss": 2.7507, "theoretical_loss": 3.520495546405181, "tokens_seen": 1478832128 }, { "epoch": 4.03, "learning_rate": 0.00027876629889669007, "loss": 2.7413, "theoretical_loss": 3.520481753549734, "tokens_seen": 1478897664 }, { "epoch": 4.03, "learning_rate": 0.00027875626880641925, "loss": 2.62, "theoretical_loss": 3.5204679614766228, "tokens_seen": 1478963200 }, { "epoch": 4.03, "learning_rate": 0.0002787462387161485, "loss": 2.8805, "theoretical_loss": 3.5204541701857694, "tokens_seen": 1479028736 }, { "epoch": 4.03, "learning_rate": 0.0002787362086258776, "loss": 2.7438, "theoretical_loss": 3.520440379677095, "tokens_seen": 1479094272 }, { "epoch": 4.03, "learning_rate": 0.00027872617853560685, "loss": 2.8471, "theoretical_loss": 3.52042658995052, "tokens_seen": 1479159808 }, { "epoch": 4.03, "learning_rate": 0.00027871614844533597, "loss": 2.7044, "theoretical_loss": 3.520412801005965, "tokens_seen": 1479225344 }, { "epoch": 4.03, "learning_rate": 0.0002787061183550652, "loss": 2.8616, "theoretical_loss": 3.5203990128433515, "tokens_seen": 1479290880 }, { "epoch": 4.03, "learning_rate": 0.0002786960882647944, "loss": 2.7227, "theoretical_loss": 3.5203852254626016, "tokens_seen": 1479356416 }, { "epoch": 4.03, "objective/train/docs_used": 2361177, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9877641201019287, "objective/train/theoretical_loss": 3.5203714388636342, "objective/train/tokens_used": 1499881952, "theoretical_loss": 3.5203714388636342, "tokens_seen": 1479421952 }, { "epoch": 4.03, "learning_rate": 0.00027868605817452357, "loss": 2.7605, "theoretical_loss": 3.5203714388636342, "tokens_seen": 1479421952 }, { "epoch": 4.03, "learning_rate": 0.00027867602808425275, "loss": 2.7057, "theoretical_loss": 3.520357653046372, "tokens_seen": 1479487488 }, { "epoch": 4.03, "learning_rate": 0.00027866599799398193, "loss": 2.7577, "theoretical_loss": 3.520343868010735, "tokens_seen": 1479553024 }, { "epoch": 4.03, "learning_rate": 0.0002786559679037111, "loss": 2.8428, "theoretical_loss": 3.5203300837566447, "tokens_seen": 1479618560 }, { "epoch": 4.03, "learning_rate": 0.00027864593781344035, "loss": 2.6605, "theoretical_loss": 3.5203163002840228, "tokens_seen": 1479684096 }, { "epoch": 4.03, "learning_rate": 0.0002786359077231695, "loss": 2.7393, "theoretical_loss": 3.520302517592789, "tokens_seen": 1479749632 }, { "epoch": 4.03, "learning_rate": 0.0002786258776328987, "loss": 2.8112, "theoretical_loss": 3.5202887356828656, "tokens_seen": 1479815168 }, { "epoch": 4.03, "learning_rate": 0.0002786158475426279, "loss": 2.7515, "theoretical_loss": 3.5202749545541736, "tokens_seen": 1479880704 }, { "epoch": 4.03, "learning_rate": 0.00027860581745235707, "loss": 2.8601, "theoretical_loss": 3.5202611742066336, "tokens_seen": 1479946240 }, { "epoch": 4.03, "learning_rate": 0.00027859578736208625, "loss": 2.8336, "theoretical_loss": 3.520247394640167, "tokens_seen": 1480011776 }, { "epoch": 4.03, "learning_rate": 0.00027858575727181544, "loss": 2.8129, "theoretical_loss": 3.5202336158546945, "tokens_seen": 1480077312 }, { "epoch": 4.03, "learning_rate": 0.0002785757271815446, "loss": 2.954, "theoretical_loss": 3.5202198378501377, "tokens_seen": 1480142848 }, { "epoch": 4.03, "learning_rate": 0.00027856569709127385, "loss": 2.8596, "theoretical_loss": 3.5202060606264185, "tokens_seen": 1480208384 }, { "epoch": 4.03, "learning_rate": 0.000278555667001003, "loss": 2.8291, "theoretical_loss": 3.5201922841834565, "tokens_seen": 1480273920 }, { "epoch": 4.03, "learning_rate": 0.0002785456369107322, "loss": 2.8633, "theoretical_loss": 3.520178508521174, "tokens_seen": 1480339456 }, { "epoch": 4.03, "learning_rate": 0.00027853560682046134, "loss": 2.8124, "theoretical_loss": 3.520164733639492, "tokens_seen": 1480404992 }, { "epoch": 4.03, "learning_rate": 0.0002785255767301906, "loss": 2.7259, "theoretical_loss": 3.5201509595383316, "tokens_seen": 1480470528 }, { "epoch": 4.03, "learning_rate": 0.0002785155466399198, "loss": 2.6304, "theoretical_loss": 3.520137186217614, "tokens_seen": 1480536064 }, { "epoch": 4.03, "learning_rate": 0.00027850551654964894, "loss": 2.715, "theoretical_loss": 3.5201234136772603, "tokens_seen": 1480601600 }, { "epoch": 4.03, "learning_rate": 0.0002784954864593782, "loss": 2.9055, "theoretical_loss": 3.520109641917192, "tokens_seen": 1480667136 }, { "epoch": 4.03, "learning_rate": 0.0002784854563691073, "loss": 2.726, "theoretical_loss": 3.5200958709373302, "tokens_seen": 1480732672 }, { "epoch": 4.03, "learning_rate": 0.00027847542627883654, "loss": 2.8721, "theoretical_loss": 3.5200821007375964, "tokens_seen": 1480798208 }, { "epoch": 4.03, "learning_rate": 0.0002784653961885657, "loss": 2.844, "theoretical_loss": 3.5200683313179115, "tokens_seen": 1480863744 }, { "epoch": 4.03, "learning_rate": 0.0002784553660982949, "loss": 2.8053, "theoretical_loss": 3.5200545626781974, "tokens_seen": 1480929280 }, { "epoch": 4.03, "learning_rate": 0.0002784453360080241, "loss": 2.8086, "theoretical_loss": 3.520040794818375, "tokens_seen": 1480994816 }, { "epoch": 4.03, "objective/train/docs_used": 2363744, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8411216735839844, "objective/train/theoretical_loss": 3.520027027738365, "objective/train/tokens_used": 1501520352, "theoretical_loss": 3.520027027738365, "tokens_seen": 1481060352 }, { "epoch": 4.03, "learning_rate": 0.0002784353059177533, "loss": 2.8294, "theoretical_loss": 3.520027027738365, "tokens_seen": 1481060352 }, { "epoch": 4.03, "learning_rate": 0.00027842527582748244, "loss": 2.803, "theoretical_loss": 3.5200132614380903, "tokens_seen": 1481125888 }, { "epoch": 4.03, "learning_rate": 0.0002784152457372117, "loss": 2.8597, "theoretical_loss": 3.519999495917471, "tokens_seen": 1481191424 }, { "epoch": 4.03, "learning_rate": 0.0002784052156469408, "loss": 2.7987, "theoretical_loss": 3.5199857311764284, "tokens_seen": 1481256960 }, { "epoch": 4.03, "learning_rate": 0.00027839518555667004, "loss": 2.7591, "theoretical_loss": 3.519971967214884, "tokens_seen": 1481322496 }, { "epoch": 4.03, "learning_rate": 0.0002783851554663992, "loss": 2.8149, "theoretical_loss": 3.5199582040327604, "tokens_seen": 1481388032 }, { "epoch": 4.03, "learning_rate": 0.0002783751253761284, "loss": 2.749, "theoretical_loss": 3.5199444416299777, "tokens_seen": 1481453568 }, { "epoch": 4.03, "learning_rate": 0.0002783650952858576, "loss": 2.7706, "theoretical_loss": 3.519930680006457, "tokens_seen": 1481519104 }, { "epoch": 4.03, "learning_rate": 0.00027835506519558676, "loss": 2.6618, "theoretical_loss": 3.519916919162121, "tokens_seen": 1481584640 }, { "epoch": 4.03, "learning_rate": 0.00027834503510531594, "loss": 2.8572, "theoretical_loss": 3.5199031590968897, "tokens_seen": 1481650176 }, { "epoch": 4.03, "learning_rate": 0.0002783350050150452, "loss": 2.776, "theoretical_loss": 3.5198893998106855, "tokens_seen": 1481715712 }, { "epoch": 4.03, "learning_rate": 0.0002783249749247743, "loss": 2.7473, "theoretical_loss": 3.51987564130343, "tokens_seen": 1481781248 }, { "epoch": 4.03, "learning_rate": 0.00027831494483450354, "loss": 2.6838, "theoretical_loss": 3.519861883575044, "tokens_seen": 1481846784 }, { "epoch": 4.03, "learning_rate": 0.00027830491474423267, "loss": 2.682, "theoretical_loss": 3.519848126625449, "tokens_seen": 1481912320 }, { "epoch": 4.03, "learning_rate": 0.0002782948846539619, "loss": 2.8338, "theoretical_loss": 3.519834370454567, "tokens_seen": 1481977856 }, { "epoch": 4.03, "learning_rate": 0.0002782848545636911, "loss": 2.8229, "theoretical_loss": 3.5198206150623186, "tokens_seen": 1482043392 }, { "epoch": 4.03, "learning_rate": 0.00027827482447342027, "loss": 2.8826, "theoretical_loss": 3.519806860448626, "tokens_seen": 1482108928 }, { "epoch": 4.03, "learning_rate": 0.00027826479438314945, "loss": 2.7761, "theoretical_loss": 3.5197931066134114, "tokens_seen": 1482174464 }, { "epoch": 4.03, "learning_rate": 0.0002782547642928787, "loss": 2.7713, "theoretical_loss": 3.5197793535565944, "tokens_seen": 1482240000 }, { "epoch": 4.03, "learning_rate": 0.0002782447342026078, "loss": 2.9383, "theoretical_loss": 3.5197656012780985, "tokens_seen": 1482305536 }, { "epoch": 4.04, "learning_rate": 0.00027823470411233705, "loss": 2.6216, "theoretical_loss": 3.519751849777844, "tokens_seen": 1482371072 }, { "epoch": 4.04, "learning_rate": 0.00027822467402206617, "loss": 2.7496, "theoretical_loss": 3.5197380990557527, "tokens_seen": 1482436608 }, { "epoch": 4.04, "learning_rate": 0.0002782146439317954, "loss": 2.7009, "theoretical_loss": 3.5197243491117467, "tokens_seen": 1482502144 }, { "epoch": 4.04, "learning_rate": 0.0002782046138415246, "loss": 2.7974, "theoretical_loss": 3.5197105999457468, "tokens_seen": 1482567680 }, { "epoch": 4.04, "learning_rate": 0.00027819458375125377, "loss": 2.6743, "theoretical_loss": 3.5196968515576748, "tokens_seen": 1482633216 }, { "epoch": 4.04, "objective/train/docs_used": 2366625, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0891759395599365, "objective/train/theoretical_loss": 3.519683103947453, "objective/train/tokens_used": 1503158752, "theoretical_loss": 3.519683103947453, "tokens_seen": 1482698752 }, { "epoch": 4.04, "learning_rate": 0.00027818455366098295, "loss": 2.6245, "theoretical_loss": 3.519683103947453, "tokens_seen": 1482698752 }, { "epoch": 4.04, "learning_rate": 0.00027817452357071213, "loss": 2.8145, "theoretical_loss": 3.5196693571150024, "tokens_seen": 1482764288 }, { "epoch": 4.04, "learning_rate": 0.0002781644934804413, "loss": 2.8423, "theoretical_loss": 3.5196556110602444, "tokens_seen": 1482829824 }, { "epoch": 4.04, "learning_rate": 0.00027815446339017055, "loss": 3.0341, "theoretical_loss": 3.5196418657831012, "tokens_seen": 1482895360 }, { "epoch": 4.04, "learning_rate": 0.0002781444332998997, "loss": 2.8969, "theoretical_loss": 3.519628121283494, "tokens_seen": 1482960896 }, { "epoch": 4.04, "learning_rate": 0.0002781344032096289, "loss": 2.8231, "theoretical_loss": 3.5196143775613447, "tokens_seen": 1483026432 }, { "epoch": 4.04, "learning_rate": 0.0002781243731193581, "loss": 2.7875, "theoretical_loss": 3.519600634616575, "tokens_seen": 1483091968 }, { "epoch": 4.04, "learning_rate": 0.0002781143430290873, "loss": 2.7503, "theoretical_loss": 3.5195868924491065, "tokens_seen": 1483157504 }, { "epoch": 4.04, "learning_rate": 0.00027810431293881645, "loss": 2.8313, "theoretical_loss": 3.519573151058861, "tokens_seen": 1483223040 }, { "epoch": 4.04, "learning_rate": 0.00027809428284854564, "loss": 2.6083, "theoretical_loss": 3.5195594104457597, "tokens_seen": 1483288576 }, { "epoch": 4.04, "learning_rate": 0.0002780842527582748, "loss": 2.7242, "theoretical_loss": 3.519545670609725, "tokens_seen": 1483354112 }, { "epoch": 4.04, "learning_rate": 0.00027807422266800405, "loss": 2.9997, "theoretical_loss": 3.519531931550678, "tokens_seen": 1483419648 }, { "epoch": 4.04, "learning_rate": 0.0002780641925777332, "loss": 2.8322, "theoretical_loss": 3.5195181932685413, "tokens_seen": 1483485184 }, { "epoch": 4.04, "learning_rate": 0.0002780541624874624, "loss": 2.8584, "theoretical_loss": 3.5195044557632356, "tokens_seen": 1483550720 }, { "epoch": 4.04, "learning_rate": 0.00027804413239719154, "loss": 2.7909, "theoretical_loss": 3.5194907190346836, "tokens_seen": 1483616256 }, { "epoch": 4.04, "learning_rate": 0.0002780341023069208, "loss": 2.7832, "theoretical_loss": 3.5194769830828063, "tokens_seen": 1483681792 }, { "epoch": 4.04, "learning_rate": 0.00027802407221664996, "loss": 2.8971, "theoretical_loss": 3.5194632479075265, "tokens_seen": 1483747328 }, { "epoch": 4.04, "learning_rate": 0.00027801404212637914, "loss": 2.813, "theoretical_loss": 3.5194495135087647, "tokens_seen": 1483812864 }, { "epoch": 4.04, "learning_rate": 0.0002780040120361083, "loss": 2.9228, "theoretical_loss": 3.5194357798864435, "tokens_seen": 1483878400 }, { "epoch": 4.04, "learning_rate": 0.0002779939819458375, "loss": 2.8067, "theoretical_loss": 3.5194220470404844, "tokens_seen": 1483943936 }, { "epoch": 4.04, "learning_rate": 0.0002779839518555667, "loss": 2.8954, "theoretical_loss": 3.5194083149708097, "tokens_seen": 1484009472 }, { "epoch": 4.04, "learning_rate": 0.0002779739217652959, "loss": 2.6199, "theoretical_loss": 3.519394583677341, "tokens_seen": 1484075008 }, { "epoch": 4.04, "learning_rate": 0.00027796389167502504, "loss": 2.6537, "theoretical_loss": 3.5193808531599995, "tokens_seen": 1484140544 }, { "epoch": 4.04, "learning_rate": 0.0002779538615847543, "loss": 2.8908, "theoretical_loss": 3.519367123418708, "tokens_seen": 1484206080 }, { "epoch": 4.04, "learning_rate": 0.00027794383149448346, "loss": 2.859, "theoretical_loss": 3.519353394453388, "tokens_seen": 1484271616 }, { "epoch": 4.04, "objective/train/docs_used": 2368081, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6597535610198975, "objective/train/theoretical_loss": 3.5193396662639613, "objective/train/tokens_used": 1504797152, "theoretical_loss": 3.5193396662639613, "tokens_seen": 1484337152 }, { "epoch": 4.04, "learning_rate": 0.00027793380140421264, "loss": 2.7323, "theoretical_loss": 3.5193396662639613, "tokens_seen": 1484337152 }, { "epoch": 4.04, "learning_rate": 0.0002779237713139418, "loss": 2.6867, "theoretical_loss": 3.5193259388503497, "tokens_seen": 1484402688 }, { "epoch": 4.04, "learning_rate": 0.000277913741223671, "loss": 2.6411, "theoretical_loss": 3.519312212212476, "tokens_seen": 1484468224 }, { "epoch": 4.04, "learning_rate": 0.0002779037111334002, "loss": 2.7564, "theoretical_loss": 3.5192984863502605, "tokens_seen": 1484533760 }, { "epoch": 4.04, "learning_rate": 0.0002778936810431294, "loss": 2.7535, "theoretical_loss": 3.519284761263627, "tokens_seen": 1484599296 }, { "epoch": 4.04, "learning_rate": 0.00027788365095285855, "loss": 2.8643, "theoretical_loss": 3.519271036952496, "tokens_seen": 1484664832 }, { "epoch": 4.04, "learning_rate": 0.0002778736208625878, "loss": 2.8335, "theoretical_loss": 3.51925731341679, "tokens_seen": 1484730368 }, { "epoch": 4.04, "learning_rate": 0.0002778635907723169, "loss": 2.8672, "theoretical_loss": 3.5192435906564308, "tokens_seen": 1484795904 }, { "epoch": 4.04, "learning_rate": 0.00027785356068204614, "loss": 2.8499, "theoretical_loss": 3.5192298686713404, "tokens_seen": 1484861440 }, { "epoch": 4.04, "learning_rate": 0.0002778435305917753, "loss": 2.8175, "theoretical_loss": 3.519216147461441, "tokens_seen": 1484926976 }, { "epoch": 4.04, "learning_rate": 0.0002778335005015045, "loss": 2.9578, "theoretical_loss": 3.5192024270266544, "tokens_seen": 1484992512 }, { "epoch": 4.04, "learning_rate": 0.0002778234704112337, "loss": 2.9681, "theoretical_loss": 3.5191887073669026, "tokens_seen": 1485058048 }, { "epoch": 4.04, "learning_rate": 0.00027781344032096287, "loss": 2.7502, "theoretical_loss": 3.519174988482108, "tokens_seen": 1485123584 }, { "epoch": 4.04, "learning_rate": 0.00027780341023069205, "loss": 2.8084, "theoretical_loss": 3.519161270372192, "tokens_seen": 1485189120 }, { "epoch": 4.04, "learning_rate": 0.0002777933801404213, "loss": 2.6702, "theoretical_loss": 3.519147553037077, "tokens_seen": 1485254656 }, { "epoch": 4.04, "learning_rate": 0.0002777833500501504, "loss": 2.7784, "theoretical_loss": 3.5191338364766853, "tokens_seen": 1485320192 }, { "epoch": 4.04, "learning_rate": 0.00027777331995987965, "loss": 2.9422, "theoretical_loss": 3.5191201206909386, "tokens_seen": 1485385728 }, { "epoch": 4.04, "learning_rate": 0.0002777632898696089, "loss": 2.7158, "theoretical_loss": 3.519106405679759, "tokens_seen": 1485451264 }, { "epoch": 4.04, "learning_rate": 0.000277753259779338, "loss": 2.8404, "theoretical_loss": 3.5190926914430687, "tokens_seen": 1485516800 }, { "epoch": 4.04, "learning_rate": 0.00027774322968906725, "loss": 2.6141, "theoretical_loss": 3.5190789779807896, "tokens_seen": 1485582336 }, { "epoch": 4.04, "learning_rate": 0.00027773319959879637, "loss": 2.9168, "theoretical_loss": 3.519065265292844, "tokens_seen": 1485647872 }, { "epoch": 4.04, "learning_rate": 0.0002777231695085256, "loss": 2.8519, "theoretical_loss": 3.519051553379154, "tokens_seen": 1485713408 }, { "epoch": 4.04, "learning_rate": 0.0002777131394182548, "loss": 2.9072, "theoretical_loss": 3.5190378422396416, "tokens_seen": 1485778944 }, { "epoch": 4.04, "learning_rate": 0.00027770310932798397, "loss": 2.8037, "theoretical_loss": 3.5190241318742297, "tokens_seen": 1485844480 }, { "epoch": 4.04, "learning_rate": 0.00027769307923771315, "loss": 2.8421, "theoretical_loss": 3.519010422282839, "tokens_seen": 1485910016 }, { "epoch": 4.04, "objective/train/docs_used": 2371006, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.975248336791992, "objective/train/theoretical_loss": 3.518996713465393, "objective/train/tokens_used": 1506435552, "theoretical_loss": 3.518996713465393, "tokens_seen": 1485975552 }, { "epoch": 4.04, "learning_rate": 0.00027768304914744233, "loss": 2.6855, "theoretical_loss": 3.518996713465393, "tokens_seen": 1485975552 }, { "epoch": 4.04, "learning_rate": 0.0002776730190571715, "loss": 2.8846, "theoretical_loss": 3.518983005421813, "tokens_seen": 1486041088 }, { "epoch": 4.04, "learning_rate": 0.00027766298896690075, "loss": 2.8003, "theoretical_loss": 3.518969298152022, "tokens_seen": 1486106624 }, { "epoch": 4.04, "learning_rate": 0.0002776529588766299, "loss": 2.7104, "theoretical_loss": 3.518955591655941, "tokens_seen": 1486172160 }, { "epoch": 4.04, "learning_rate": 0.0002776429287863591, "loss": 2.763, "theoretical_loss": 3.518941885933494, "tokens_seen": 1486237696 }, { "epoch": 4.04, "learning_rate": 0.0002776328986960883, "loss": 2.7266, "theoretical_loss": 3.5189281809846014, "tokens_seen": 1486303232 }, { "epoch": 4.04, "learning_rate": 0.0002776228686058175, "loss": 2.8342, "theoretical_loss": 3.5189144768091865, "tokens_seen": 1486368768 }, { "epoch": 4.04, "learning_rate": 0.00027761283851554665, "loss": 2.7155, "theoretical_loss": 3.5189007734071716, "tokens_seen": 1486434304 }, { "epoch": 4.04, "learning_rate": 0.00027760280842527584, "loss": 2.9492, "theoretical_loss": 3.518887070778478, "tokens_seen": 1486499840 }, { "epoch": 4.04, "learning_rate": 0.000277592778335005, "loss": 2.8418, "theoretical_loss": 3.518873368923029, "tokens_seen": 1486565376 }, { "epoch": 4.04, "learning_rate": 0.00027758274824473425, "loss": 2.7698, "theoretical_loss": 3.518859667840746, "tokens_seen": 1486630912 }, { "epoch": 4.04, "learning_rate": 0.0002775727181544634, "loss": 2.9919, "theoretical_loss": 3.5188459675315524, "tokens_seen": 1486696448 }, { "epoch": 4.04, "learning_rate": 0.0002775626880641926, "loss": 2.8412, "theoretical_loss": 3.5188322679953696, "tokens_seen": 1486761984 }, { "epoch": 4.04, "learning_rate": 0.00027755265797392174, "loss": 2.8626, "theoretical_loss": 3.5188185692321206, "tokens_seen": 1486827520 }, { "epoch": 4.04, "learning_rate": 0.000277542627883651, "loss": 2.8994, "theoretical_loss": 3.518804871241727, "tokens_seen": 1486893056 }, { "epoch": 4.04, "learning_rate": 0.00027753259779338016, "loss": 2.9771, "theoretical_loss": 3.518791174024111, "tokens_seen": 1486958592 }, { "epoch": 4.04, "learning_rate": 0.00027752256770310934, "loss": 2.778, "theoretical_loss": 3.518777477579196, "tokens_seen": 1487024128 }, { "epoch": 4.04, "learning_rate": 0.0002775125376128385, "loss": 2.8769, "theoretical_loss": 3.518763781906903, "tokens_seen": 1487089664 }, { "epoch": 4.04, "learning_rate": 0.0002775025075225677, "loss": 2.7666, "theoretical_loss": 3.518750087007156, "tokens_seen": 1487155200 }, { "epoch": 4.04, "learning_rate": 0.0002774924774322969, "loss": 2.9301, "theoretical_loss": 3.518736392879876, "tokens_seen": 1487220736 }, { "epoch": 4.04, "learning_rate": 0.0002774824473420261, "loss": 2.863, "theoretical_loss": 3.518722699524986, "tokens_seen": 1487286272 }, { "epoch": 4.04, "learning_rate": 0.00027747241725175524, "loss": 2.771, "theoretical_loss": 3.518709006942408, "tokens_seen": 1487351808 }, { "epoch": 4.04, "learning_rate": 0.0002774623871614845, "loss": 2.7233, "theoretical_loss": 3.5186953151320655, "tokens_seen": 1487417344 }, { "epoch": 4.04, "learning_rate": 0.00027745235707121366, "loss": 2.8454, "theoretical_loss": 3.518681624093879, "tokens_seen": 1487482880 }, { "epoch": 4.04, "learning_rate": 0.00027744232698094284, "loss": 2.5323, "theoretical_loss": 3.518667933827773, "tokens_seen": 1487548416 }, { "epoch": 4.04, "objective/train/docs_used": 2373648, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8053364753723145, "objective/train/theoretical_loss": 3.5186542443336686, "objective/train/tokens_used": 1508073952, "theoretical_loss": 3.5186542443336686, "tokens_seen": 1487613952 }, { "epoch": 4.04, "learning_rate": 0.000277432296890672, "loss": 2.7803, "theoretical_loss": 3.5186542443336686, "tokens_seen": 1487613952 }, { "epoch": 4.04, "learning_rate": 0.0002774222668004012, "loss": 2.8481, "theoretical_loss": 3.518640555611489, "tokens_seen": 1487679488 }, { "epoch": 4.04, "learning_rate": 0.0002774122367101304, "loss": 2.7327, "theoretical_loss": 3.518626867661156, "tokens_seen": 1487745024 }, { "epoch": 4.04, "learning_rate": 0.0002774022066198596, "loss": 2.8041, "theoretical_loss": 3.5186131804825926, "tokens_seen": 1487810560 }, { "epoch": 4.04, "learning_rate": 0.00027739217652958875, "loss": 2.8114, "theoretical_loss": 3.518599494075721, "tokens_seen": 1487876096 }, { "epoch": 4.04, "learning_rate": 0.000277382146439318, "loss": 2.7478, "theoretical_loss": 3.5185858084404638, "tokens_seen": 1487941632 }, { "epoch": 4.04, "learning_rate": 0.0002773721163490471, "loss": 2.7194, "theoretical_loss": 3.5185721235767433, "tokens_seen": 1488007168 }, { "epoch": 4.04, "learning_rate": 0.00027736208625877634, "loss": 2.6561, "theoretical_loss": 3.518558439484483, "tokens_seen": 1488072704 }, { "epoch": 4.04, "learning_rate": 0.0002773520561685055, "loss": 2.8045, "theoretical_loss": 3.518544756163604, "tokens_seen": 1488138240 }, { "epoch": 4.04, "learning_rate": 0.0002773420260782347, "loss": 2.8119, "theoretical_loss": 3.51853107361403, "tokens_seen": 1488203776 }, { "epoch": 4.04, "learning_rate": 0.0002773319959879639, "loss": 2.7071, "theoretical_loss": 3.518517391835683, "tokens_seen": 1488269312 }, { "epoch": 4.04, "learning_rate": 0.00027732196589769307, "loss": 2.8869, "theoretical_loss": 3.518503710828486, "tokens_seen": 1488334848 }, { "epoch": 4.04, "learning_rate": 0.00027731193580742225, "loss": 2.733, "theoretical_loss": 3.5184900305923605, "tokens_seen": 1488400384 }, { "epoch": 4.04, "learning_rate": 0.0002773019057171515, "loss": 2.656, "theoretical_loss": 3.5184763511272306, "tokens_seen": 1488465920 }, { "epoch": 4.04, "learning_rate": 0.0002772918756268806, "loss": 2.7542, "theoretical_loss": 3.5184626724330177, "tokens_seen": 1488531456 }, { "epoch": 4.04, "learning_rate": 0.00027728184553660985, "loss": 2.7707, "theoretical_loss": 3.518448994509645, "tokens_seen": 1488596992 }, { "epoch": 4.04, "learning_rate": 0.00027727181544633903, "loss": 2.8142, "theoretical_loss": 3.518435317357035, "tokens_seen": 1488662528 }, { "epoch": 4.04, "learning_rate": 0.0002772617853560682, "loss": 2.835, "theoretical_loss": 3.5184216409751103, "tokens_seen": 1488728064 }, { "epoch": 4.04, "learning_rate": 0.0002772517552657974, "loss": 2.7149, "theoretical_loss": 3.518407965363794, "tokens_seen": 1488793600 }, { "epoch": 4.04, "learning_rate": 0.00027724172517552657, "loss": 2.6845, "theoretical_loss": 3.5183942905230077, "tokens_seen": 1488859136 }, { "epoch": 4.04, "learning_rate": 0.00027723169508525575, "loss": 2.8189, "theoretical_loss": 3.518380616452675, "tokens_seen": 1488924672 }, { "epoch": 4.04, "learning_rate": 0.000277221664994985, "loss": 2.7413, "theoretical_loss": 3.518366943152719, "tokens_seen": 1488990208 }, { "epoch": 4.04, "learning_rate": 0.0002772116349047141, "loss": 2.6862, "theoretical_loss": 3.5183532706230602, "tokens_seen": 1489055744 }, { "epoch": 4.04, "learning_rate": 0.00027720160481444335, "loss": 2.8841, "theoretical_loss": 3.518339598863624, "tokens_seen": 1489121280 }, { "epoch": 4.04, "learning_rate": 0.0002771915747241725, "loss": 2.9179, "theoretical_loss": 3.518325927874332, "tokens_seen": 1489186816 }, { "epoch": 4.04, "objective/train/docs_used": 2376435, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.919553756713867, "objective/train/theoretical_loss": 3.518312257655106, "objective/train/tokens_used": 1509712352, "theoretical_loss": 3.518312257655106, "tokens_seen": 1489252352 }, { "epoch": 4.04, "learning_rate": 0.0002771815446339017, "loss": 2.7804, "theoretical_loss": 3.518312257655106, "tokens_seen": 1489252352 }, { "epoch": 4.04, "learning_rate": 0.0002771715145436309, "loss": 2.9358, "theoretical_loss": 3.5182985882058704, "tokens_seen": 1489317888 }, { "epoch": 4.04, "learning_rate": 0.0002771614844533601, "loss": 2.8078, "theoretical_loss": 3.518284919526547, "tokens_seen": 1489383424 }, { "epoch": 4.04, "learning_rate": 0.00027715145436308926, "loss": 2.7774, "theoretical_loss": 3.5182712516170582, "tokens_seen": 1489448960 }, { "epoch": 4.04, "learning_rate": 0.0002771414242728185, "loss": 2.7676, "theoretical_loss": 3.518257584477327, "tokens_seen": 1489514496 }, { "epoch": 4.04, "learning_rate": 0.0002771313941825476, "loss": 2.939, "theoretical_loss": 3.5182439181072773, "tokens_seen": 1489580032 }, { "epoch": 4.04, "learning_rate": 0.00027712136409227685, "loss": 2.7216, "theoretical_loss": 3.518230252506831, "tokens_seen": 1489645568 }, { "epoch": 4.04, "learning_rate": 0.000277111334002006, "loss": 2.9309, "theoretical_loss": 3.5182165876759104, "tokens_seen": 1489711104 }, { "epoch": 4.04, "learning_rate": 0.0002771013039117352, "loss": 2.8911, "theoretical_loss": 3.5182029236144396, "tokens_seen": 1489776640 }, { "epoch": 4.04, "learning_rate": 0.0002770912738214644, "loss": 2.735, "theoretical_loss": 3.51818926032234, "tokens_seen": 1489842176 }, { "epoch": 4.04, "learning_rate": 0.0002770812437311936, "loss": 2.8978, "theoretical_loss": 3.5181755977995355, "tokens_seen": 1489907712 }, { "epoch": 4.04, "learning_rate": 0.00027707121364092276, "loss": 2.8362, "theoretical_loss": 3.5181619360459484, "tokens_seen": 1489973248 }, { "epoch": 4.04, "learning_rate": 0.00027706118355065194, "loss": 2.8453, "theoretical_loss": 3.5181482750615016, "tokens_seen": 1490038784 }, { "epoch": 4.04, "learning_rate": 0.0002770511534603811, "loss": 2.8254, "theoretical_loss": 3.518134614846119, "tokens_seen": 1490104320 }, { "epoch": 4.04, "learning_rate": 0.00027704112337011036, "loss": 2.8674, "theoretical_loss": 3.5181209553997217, "tokens_seen": 1490169856 }, { "epoch": 4.04, "learning_rate": 0.0002770310932798395, "loss": 2.8006, "theoretical_loss": 3.518107296722234, "tokens_seen": 1490235392 }, { "epoch": 4.04, "learning_rate": 0.0002770210631895687, "loss": 2.719, "theoretical_loss": 3.518093638813578, "tokens_seen": 1490300928 }, { "epoch": 4.04, "learning_rate": 0.0002770110330992979, "loss": 2.7894, "theoretical_loss": 3.5180799816736767, "tokens_seen": 1490366464 }, { "epoch": 4.04, "learning_rate": 0.0002770010030090271, "loss": 2.8591, "theoretical_loss": 3.5180663253024536, "tokens_seen": 1490432000 }, { "epoch": 4.04, "learning_rate": 0.0002769909729187563, "loss": 2.8064, "theoretical_loss": 3.5180526696998315, "tokens_seen": 1490497536 }, { "epoch": 4.04, "learning_rate": 0.00027698094282848544, "loss": 2.8039, "theoretical_loss": 3.5180390148657326, "tokens_seen": 1490563072 }, { "epoch": 4.04, "learning_rate": 0.0002769709127382147, "loss": 2.7177, "theoretical_loss": 3.518025360800081, "tokens_seen": 1490628608 }, { "epoch": 4.04, "learning_rate": 0.00027696088264794386, "loss": 2.7279, "theoretical_loss": 3.518011707502799, "tokens_seen": 1490694144 }, { "epoch": 4.04, "learning_rate": 0.00027695085255767304, "loss": 2.7956, "theoretical_loss": 3.517998054973809, "tokens_seen": 1490759680 }, { "epoch": 4.04, "learning_rate": 0.0002769408224674022, "loss": 2.7374, "theoretical_loss": 3.517984403213035, "tokens_seen": 1490825216 }, { "epoch": 4.04, "objective/train/docs_used": 2379105, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.702008008956909, "objective/train/theoretical_loss": 3.5179707522204, "objective/train/tokens_used": 1511350752, "theoretical_loss": 3.5179707522204, "tokens_seen": 1490890752 }, { "epoch": 4.04, "learning_rate": 0.0002769307923771314, "loss": 2.6952, "theoretical_loss": 3.5179707522204, "tokens_seen": 1490890752 }, { "epoch": 4.04, "learning_rate": 0.0002769207622868606, "loss": 2.9088, "theoretical_loss": 3.517957101995826, "tokens_seen": 1490956288 }, { "epoch": 4.04, "learning_rate": 0.0002769107321965898, "loss": 2.8726, "theoretical_loss": 3.5179434525392375, "tokens_seen": 1491021824 }, { "epoch": 4.04, "learning_rate": 0.00027690070210631895, "loss": 2.7377, "theoretical_loss": 3.5179298038505564, "tokens_seen": 1491087360 }, { "epoch": 4.04, "learning_rate": 0.0002768906720160482, "loss": 2.9582, "theoretical_loss": 3.517916155929706, "tokens_seen": 1491152896 }, { "epoch": 4.04, "learning_rate": 0.0002768806419257773, "loss": 2.6937, "theoretical_loss": 3.5179025087766096, "tokens_seen": 1491218432 }, { "epoch": 4.04, "learning_rate": 0.00027687061183550655, "loss": 2.8607, "theoretical_loss": 3.5178888623911897, "tokens_seen": 1491283968 }, { "epoch": 4.04, "learning_rate": 0.0002768605817452357, "loss": 2.8903, "theoretical_loss": 3.51787521677337, "tokens_seen": 1491349504 }, { "epoch": 4.04, "learning_rate": 0.0002768505516549649, "loss": 2.8066, "theoretical_loss": 3.5178615719230737, "tokens_seen": 1491415040 }, { "epoch": 4.04, "learning_rate": 0.0002768405215646941, "loss": 2.7144, "theoretical_loss": 3.5178479278402235, "tokens_seen": 1491480576 }, { "epoch": 4.04, "learning_rate": 0.00027683049147442327, "loss": 2.8152, "theoretical_loss": 3.5178342845247426, "tokens_seen": 1491546112 }, { "epoch": 4.04, "learning_rate": 0.00027682046138415245, "loss": 2.8622, "theoretical_loss": 3.517820641976554, "tokens_seen": 1491611648 }, { "epoch": 4.04, "learning_rate": 0.0002768104312938817, "loss": 2.8043, "theoretical_loss": 3.517807000195581, "tokens_seen": 1491677184 }, { "epoch": 4.04, "learning_rate": 0.0002768004012036108, "loss": 2.8911, "theoretical_loss": 3.5177933591817467, "tokens_seen": 1491742720 }, { "epoch": 4.04, "learning_rate": 0.00027679037111334005, "loss": 3.0258, "theoretical_loss": 3.5177797189349747, "tokens_seen": 1491808256 }, { "epoch": 4.04, "learning_rate": 0.00027678034102306923, "loss": 2.8857, "theoretical_loss": 3.5177660794551873, "tokens_seen": 1491873792 }, { "epoch": 4.04, "learning_rate": 0.0002767703109327984, "loss": 2.8677, "theoretical_loss": 3.5177524407423086, "tokens_seen": 1491939328 }, { "epoch": 4.04, "learning_rate": 0.0002767602808425276, "loss": 2.9011, "theoretical_loss": 3.517738802796261, "tokens_seen": 1492004864 }, { "epoch": 4.04, "learning_rate": 0.00027675025075225677, "loss": 2.8226, "theoretical_loss": 3.517725165616968, "tokens_seen": 1492070400 }, { "epoch": 4.04, "learning_rate": 0.00027674022066198595, "loss": 2.8034, "theoretical_loss": 3.5177115292043535, "tokens_seen": 1492135936 }, { "epoch": 4.04, "learning_rate": 0.0002767301905717152, "loss": 2.839, "theoretical_loss": 3.5176978935583394, "tokens_seen": 1492201472 }, { "epoch": 4.04, "learning_rate": 0.0002767201604814443, "loss": 2.8634, "theoretical_loss": 3.51768425867885, "tokens_seen": 1492267008 }, { "epoch": 4.04, "learning_rate": 0.00027671013039117355, "loss": 2.8878, "theoretical_loss": 3.517670624565808, "tokens_seen": 1492332544 }, { "epoch": 4.04, "learning_rate": 0.0002767001003009027, "loss": 2.881, "theoretical_loss": 3.5176569912191367, "tokens_seen": 1492398080 }, { "epoch": 4.04, "learning_rate": 0.0002766900702106319, "loss": 2.8069, "theoretical_loss": 3.51764335863876, "tokens_seen": 1492463616 }, { "epoch": 4.04, "objective/train/docs_used": 2381939, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8083269596099854, "objective/train/theoretical_loss": 3.5176297268246, "objective/train/tokens_used": 1512989152, "theoretical_loss": 3.5176297268246, "tokens_seen": 1492529152 }, { "epoch": 4.04, "learning_rate": 0.0002766800401203611, "loss": 2.9097, "theoretical_loss": 3.5176297268246, "tokens_seen": 1492529152 }, { "epoch": 4.04, "learning_rate": 0.0002766700100300903, "loss": 2.8519, "theoretical_loss": 3.5176160957765816, "tokens_seen": 1492594688 }, { "epoch": 4.04, "learning_rate": 0.00027665997993981946, "loss": 2.8342, "theoretical_loss": 3.5176024654946265, "tokens_seen": 1492660224 }, { "epoch": 4.04, "learning_rate": 0.0002766499498495487, "loss": 2.7493, "theoretical_loss": 3.5175888359786587, "tokens_seen": 1492725760 }, { "epoch": 4.04, "learning_rate": 0.0002766399197592778, "loss": 2.7976, "theoretical_loss": 3.517575207228602, "tokens_seen": 1492791296 }, { "epoch": 4.04, "learning_rate": 0.00027662988966900705, "loss": 2.7719, "theoretical_loss": 3.5175615792443793, "tokens_seen": 1492856832 }, { "epoch": 4.04, "learning_rate": 0.0002766198595787362, "loss": 2.7408, "theoretical_loss": 3.517547952025913, "tokens_seen": 1492922368 }, { "epoch": 4.04, "learning_rate": 0.0002766098294884654, "loss": 2.7375, "theoretical_loss": 3.5175343255731284, "tokens_seen": 1492987904 }, { "epoch": 4.04, "learning_rate": 0.0002765997993981946, "loss": 2.7352, "theoretical_loss": 3.5175206998859476, "tokens_seen": 1493053440 }, { "epoch": 4.04, "learning_rate": 0.0002765897693079238, "loss": 2.7983, "theoretical_loss": 3.517507074964294, "tokens_seen": 1493118976 }, { "epoch": 4.04, "learning_rate": 0.00027657973921765296, "loss": 2.6339, "theoretical_loss": 3.5174934508080913, "tokens_seen": 1493184512 }, { "epoch": 4.04, "learning_rate": 0.00027656970912738214, "loss": 2.8492, "theoretical_loss": 3.517479827417263, "tokens_seen": 1493250048 }, { "epoch": 4.04, "learning_rate": 0.0002765596790371113, "loss": 2.6519, "theoretical_loss": 3.517466204791732, "tokens_seen": 1493315584 }, { "epoch": 4.04, "learning_rate": 0.00027654964894684056, "loss": 2.7458, "theoretical_loss": 3.5174525829314227, "tokens_seen": 1493381120 }, { "epoch": 4.04, "learning_rate": 0.0002765396188565697, "loss": 2.8531, "theoretical_loss": 3.517438961836257, "tokens_seen": 1493446656 }, { "epoch": 4.04, "learning_rate": 0.0002765295887662989, "loss": 2.6384, "theoretical_loss": 3.5174253415061605, "tokens_seen": 1493512192 }, { "epoch": 4.04, "learning_rate": 0.00027651955867602805, "loss": 2.8485, "theoretical_loss": 3.5174117219410546, "tokens_seen": 1493577728 }, { "epoch": 4.04, "learning_rate": 0.0002765095285857573, "loss": 2.7642, "theoretical_loss": 3.517398103140864, "tokens_seen": 1493643264 }, { "epoch": 4.04, "learning_rate": 0.00027649949849548646, "loss": 2.7804, "theoretical_loss": 3.517384485105511, "tokens_seen": 1493708800 }, { "epoch": 4.04, "learning_rate": 0.00027648946840521564, "loss": 2.8136, "theoretical_loss": 3.5173708678349205, "tokens_seen": 1493774336 }, { "epoch": 4.04, "learning_rate": 0.0002764794383149448, "loss": 2.6218, "theoretical_loss": 3.517357251329015, "tokens_seen": 1493839872 }, { "epoch": 4.04, "learning_rate": 0.00027646940822467406, "loss": 2.7406, "theoretical_loss": 3.517343635587719, "tokens_seen": 1493905408 }, { "epoch": 4.04, "learning_rate": 0.0002764593781344032, "loss": 2.7984, "theoretical_loss": 3.517330020610955, "tokens_seen": 1493970944 }, { "epoch": 4.04, "learning_rate": 0.0002764493480441324, "loss": 2.7442, "theoretical_loss": 3.517316406398647, "tokens_seen": 1494036480 }, { "epoch": 4.04, "learning_rate": 0.00027643931795386155, "loss": 2.7809, "theoretical_loss": 3.5173027929507183, "tokens_seen": 1494102016 }, { "epoch": 4.04, "objective/train/docs_used": 2384882, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.739278793334961, "objective/train/theoretical_loss": 3.5172891802670927, "objective/train/tokens_used": 1514627552, "theoretical_loss": 3.5172891802670927, "tokens_seen": 1494167552 }, { "epoch": 4.04, "learning_rate": 0.0002764292878635908, "loss": 2.8211, "theoretical_loss": 3.5172891802670927, "tokens_seen": 1494167552 }, { "epoch": 4.04, "learning_rate": 0.00027641925777331997, "loss": 2.9251, "theoretical_loss": 3.517275568347694, "tokens_seen": 1494233088 }, { "epoch": 4.04, "learning_rate": 0.00027640922768304915, "loss": 2.7545, "theoretical_loss": 3.5172619571924453, "tokens_seen": 1494298624 }, { "epoch": 4.04, "learning_rate": 0.00027639919759277833, "loss": 2.7895, "theoretical_loss": 3.5172483468012703, "tokens_seen": 1494364160 }, { "epoch": 4.04, "learning_rate": 0.0002763891675025075, "loss": 2.9374, "theoretical_loss": 3.5172347371740926, "tokens_seen": 1494429696 }, { "epoch": 4.04, "learning_rate": 0.0002763791374122367, "loss": 2.8711, "theoretical_loss": 3.5172211283108363, "tokens_seen": 1494495232 }, { "epoch": 4.04, "learning_rate": 0.0002763691073219659, "loss": 2.7881, "theoretical_loss": 3.5172075202114246, "tokens_seen": 1494560768 }, { "epoch": 4.04, "learning_rate": 0.00027635907723169505, "loss": 2.7828, "theoretical_loss": 3.517193912875781, "tokens_seen": 1494626304 }, { "epoch": 4.04, "learning_rate": 0.0002763490471414243, "loss": 2.8903, "theoretical_loss": 3.517180306303829, "tokens_seen": 1494691840 }, { "epoch": 4.04, "learning_rate": 0.0002763390170511534, "loss": 2.8827, "theoretical_loss": 3.517166700495493, "tokens_seen": 1494757376 }, { "epoch": 4.04, "learning_rate": 0.00027632898696088265, "loss": 2.6433, "theoretical_loss": 3.517153095450696, "tokens_seen": 1494822912 }, { "epoch": 4.04, "learning_rate": 0.00027631895687061183, "loss": 2.8249, "theoretical_loss": 3.5171394911693623, "tokens_seen": 1494888448 }, { "epoch": 4.04, "learning_rate": 0.000276308926780341, "loss": 2.8305, "theoretical_loss": 3.5171258876514147, "tokens_seen": 1494953984 }, { "epoch": 4.04, "learning_rate": 0.0002762988966900702, "loss": 2.7556, "theoretical_loss": 3.5171122848967777, "tokens_seen": 1495019520 }, { "epoch": 4.04, "learning_rate": 0.00027628886659979943, "loss": 2.8318, "theoretical_loss": 3.5170986829053748, "tokens_seen": 1495085056 }, { "epoch": 4.04, "learning_rate": 0.00027627883650952856, "loss": 2.8514, "theoretical_loss": 3.5170850816771293, "tokens_seen": 1495150592 }, { "epoch": 4.04, "learning_rate": 0.0002762688064192578, "loss": 2.8686, "theoretical_loss": 3.5170714812119654, "tokens_seen": 1495216128 }, { "epoch": 4.04, "learning_rate": 0.00027625877632898697, "loss": 2.8535, "theoretical_loss": 3.517057881509807, "tokens_seen": 1495281664 }, { "epoch": 4.04, "learning_rate": 0.00027624874623871615, "loss": 2.7955, "theoretical_loss": 3.517044282570577, "tokens_seen": 1495347200 }, { "epoch": 4.04, "learning_rate": 0.0002762387161484454, "loss": 2.8835, "theoretical_loss": 3.5170306843942, "tokens_seen": 1495412736 }, { "epoch": 4.04, "learning_rate": 0.0002762286860581745, "loss": 2.8776, "theoretical_loss": 3.5170170869805997, "tokens_seen": 1495478272 }, { "epoch": 4.04, "learning_rate": 0.00027621865596790375, "loss": 2.9181, "theoretical_loss": 3.5170034903297, "tokens_seen": 1495543808 }, { "epoch": 4.04, "learning_rate": 0.0002762086258776329, "loss": 2.8554, "theoretical_loss": 3.5169898944414237, "tokens_seen": 1495609344 }, { "epoch": 4.04, "learning_rate": 0.0002761985957873621, "loss": 2.8622, "theoretical_loss": 3.5169762993156954, "tokens_seen": 1495674880 }, { "epoch": 4.04, "learning_rate": 0.0002761885656970913, "loss": 2.5773, "theoretical_loss": 3.5169627049524395, "tokens_seen": 1495740416 }, { "epoch": 4.04, "objective/train/docs_used": 2387207, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6532058715820312, "objective/train/theoretical_loss": 3.516949111351579, "objective/train/tokens_used": 1516265952, "theoretical_loss": 3.516949111351579, "tokens_seen": 1495805952 }, { "epoch": 4.04, "learning_rate": 0.0002761785356068205, "loss": 2.6819, "theoretical_loss": 3.516949111351579, "tokens_seen": 1495805952 }, { "epoch": 4.04, "learning_rate": 0.00027616850551654966, "loss": 2.7751, "theoretical_loss": 3.516935518513037, "tokens_seen": 1495871488 }, { "epoch": 4.04, "learning_rate": 0.0002761584754262789, "loss": 2.8455, "theoretical_loss": 3.516921926436739, "tokens_seen": 1495937024 }, { "epoch": 4.04, "learning_rate": 0.000276148445336008, "loss": 2.7959, "theoretical_loss": 3.516908335122608, "tokens_seen": 1496002560 }, { "epoch": 4.04, "learning_rate": 0.00027613841524573725, "loss": 2.9721, "theoretical_loss": 3.5168947445705685, "tokens_seen": 1496068096 }, { "epoch": 4.04, "learning_rate": 0.0002761283851554664, "loss": 2.8614, "theoretical_loss": 3.516881154780543, "tokens_seen": 1496133632 }, { "epoch": 4.04, "learning_rate": 0.0002761183550651956, "loss": 2.7055, "theoretical_loss": 3.516867565752457, "tokens_seen": 1496199168 }, { "epoch": 4.04, "learning_rate": 0.0002761083249749248, "loss": 2.7687, "theoretical_loss": 3.5168539774862335, "tokens_seen": 1496264704 }, { "epoch": 4.04, "learning_rate": 0.000276098294884654, "loss": 2.7528, "theoretical_loss": 3.5168403899817964, "tokens_seen": 1496330240 }, { "epoch": 4.04, "learning_rate": 0.00027608826479438316, "loss": 2.8377, "theoretical_loss": 3.51682680323907, "tokens_seen": 1496395776 }, { "epoch": 4.04, "learning_rate": 0.00027607823470411234, "loss": 2.8915, "theoretical_loss": 3.5168132172579782, "tokens_seen": 1496461312 }, { "epoch": 4.04, "learning_rate": 0.0002760682046138415, "loss": 2.8225, "theoretical_loss": 3.516799632038445, "tokens_seen": 1496526848 }, { "epoch": 4.04, "learning_rate": 0.00027605817452357076, "loss": 2.9507, "theoretical_loss": 3.5167860475803936, "tokens_seen": 1496592384 }, { "epoch": 4.04, "learning_rate": 0.0002760481444332999, "loss": 2.8282, "theoretical_loss": 3.5167724638837488, "tokens_seen": 1496657920 }, { "epoch": 4.04, "learning_rate": 0.0002760381143430291, "loss": 2.7266, "theoretical_loss": 3.516758880948435, "tokens_seen": 1496723456 }, { "epoch": 4.04, "learning_rate": 0.00027602808425275825, "loss": 2.8117, "theoretical_loss": 3.516745298774375, "tokens_seen": 1496788992 }, { "epoch": 4.04, "learning_rate": 0.0002760180541624875, "loss": 2.734, "theoretical_loss": 3.5167317173614934, "tokens_seen": 1496854528 }, { "epoch": 4.04, "learning_rate": 0.00027600802407221666, "loss": 2.8197, "theoretical_loss": 3.5167181367097142, "tokens_seen": 1496920064 }, { "epoch": 4.04, "learning_rate": 0.00027599799398194584, "loss": 2.7994, "theoretical_loss": 3.5167045568189614, "tokens_seen": 1496985600 }, { "epoch": 4.04, "learning_rate": 0.000275987963891675, "loss": 2.7617, "theoretical_loss": 3.5166909776891595, "tokens_seen": 1497051136 }, { "epoch": 4.04, "learning_rate": 0.00027597793380140426, "loss": 2.6881, "theoretical_loss": 3.5166773993202316, "tokens_seen": 1497116672 }, { "epoch": 4.04, "learning_rate": 0.0002759679037111334, "loss": 2.4767, "theoretical_loss": 3.5166638217121022, "tokens_seen": 1497182208 }, { "epoch": 4.04, "learning_rate": 0.0002759578736208626, "loss": 2.9443, "theoretical_loss": 3.516650244864696, "tokens_seen": 1497247744 }, { "epoch": 4.04, "learning_rate": 0.00027594784353059175, "loss": 2.7825, "theoretical_loss": 3.5166366687779362, "tokens_seen": 1497313280 }, { "epoch": 4.04, "learning_rate": 0.000275937813440321, "loss": 2.9727, "theoretical_loss": 3.5166230934517473, "tokens_seen": 1497378816 }, { "epoch": 4.04, "objective/train/docs_used": 2390068, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.888827323913574, "objective/train/theoretical_loss": 3.516609518886053, "objective/train/tokens_used": 1517904352, "theoretical_loss": 3.516609518886053, "tokens_seen": 1497444352 }, { "epoch": 4.04, "learning_rate": 0.00027592778335005017, "loss": 2.8632, "theoretical_loss": 3.516609518886053, "tokens_seen": 1497444352 }, { "epoch": 4.04, "learning_rate": 0.00027591775325977935, "loss": 2.8823, "theoretical_loss": 3.516595945080778, "tokens_seen": 1497509888 }, { "epoch": 4.04, "learning_rate": 0.00027590772316950853, "loss": 2.8515, "theoretical_loss": 3.5165823720358462, "tokens_seen": 1497575424 }, { "epoch": 4.04, "learning_rate": 0.0002758976930792377, "loss": 2.8502, "theoretical_loss": 3.516568799751182, "tokens_seen": 1497640960 }, { "epoch": 4.04, "learning_rate": 0.0002758876629889669, "loss": 2.7185, "theoretical_loss": 3.516555228226709, "tokens_seen": 1497706496 }, { "epoch": 4.04, "learning_rate": 0.0002758776328986961, "loss": 2.8356, "theoretical_loss": 3.5165416574623514, "tokens_seen": 1497772032 }, { "epoch": 4.04, "learning_rate": 0.00027586760280842525, "loss": 2.8237, "theoretical_loss": 3.516528087458034, "tokens_seen": 1497837568 }, { "epoch": 4.04, "learning_rate": 0.0002758575727181545, "loss": 2.7791, "theoretical_loss": 3.5165145182136808, "tokens_seen": 1497903104 }, { "epoch": 4.04, "learning_rate": 0.0002758475426278836, "loss": 2.8814, "theoretical_loss": 3.516500949729215, "tokens_seen": 1497968640 }, { "epoch": 4.04, "learning_rate": 0.00027583751253761285, "loss": 2.8304, "theoretical_loss": 3.516487382004562, "tokens_seen": 1498034176 }, { "epoch": 4.04, "learning_rate": 0.00027582748244734203, "loss": 2.769, "theoretical_loss": 3.5164738150396455, "tokens_seen": 1498099712 }, { "epoch": 4.04, "learning_rate": 0.0002758174523570712, "loss": 2.826, "theoretical_loss": 3.51646024883439, "tokens_seen": 1498165248 }, { "epoch": 4.04, "learning_rate": 0.0002758074222668004, "loss": 2.8891, "theoretical_loss": 3.516446683388719, "tokens_seen": 1498230784 }, { "epoch": 4.04, "learning_rate": 0.00027579739217652963, "loss": 2.8239, "theoretical_loss": 3.516433118702558, "tokens_seen": 1498296320 }, { "epoch": 4.04, "learning_rate": 0.00027578736208625876, "loss": 2.7753, "theoretical_loss": 3.5164195547758297, "tokens_seen": 1498361856 }, { "epoch": 4.04, "learning_rate": 0.000275777331995988, "loss": 2.8767, "theoretical_loss": 3.51640599160846, "tokens_seen": 1498427392 }, { "epoch": 4.04, "learning_rate": 0.0002757673019057171, "loss": 2.9054, "theoretical_loss": 3.5163924292003723, "tokens_seen": 1498492928 }, { "epoch": 4.04, "learning_rate": 0.00027575727181544635, "loss": 2.812, "theoretical_loss": 3.5163788675514907, "tokens_seen": 1498558464 }, { "epoch": 4.04, "learning_rate": 0.00027574724172517554, "loss": 2.8725, "theoretical_loss": 3.51636530666174, "tokens_seen": 1498624000 }, { "epoch": 4.04, "learning_rate": 0.0002757372116349047, "loss": 2.7784, "theoretical_loss": 3.5163517465310443, "tokens_seen": 1498689536 }, { "epoch": 4.04, "learning_rate": 0.0002757271815446339, "loss": 2.8698, "theoretical_loss": 3.5163381871593278, "tokens_seen": 1498755072 }, { "epoch": 4.04, "learning_rate": 0.0002757171514543631, "loss": 2.7582, "theoretical_loss": 3.516324628546515, "tokens_seen": 1498820608 }, { "epoch": 4.04, "learning_rate": 0.00027570712136409226, "loss": 2.9928, "theoretical_loss": 3.51631107069253, "tokens_seen": 1498886144 }, { "epoch": 4.04, "learning_rate": 0.0002756970912738215, "loss": 2.8376, "theoretical_loss": 3.516297513597298, "tokens_seen": 1498951680 }, { "epoch": 4.04, "learning_rate": 0.0002756870611835506, "loss": 2.8625, "theoretical_loss": 3.5162839572607423, "tokens_seen": 1499017216 }, { "epoch": 4.04, "objective/train/docs_used": 2392821, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.763467311859131, "objective/train/theoretical_loss": 3.516270401682787, "objective/train/tokens_used": 1519542752, "theoretical_loss": 3.516270401682787, "tokens_seen": 1499082752 }, { "epoch": 4.04, "learning_rate": 0.00027567703109327986, "loss": 2.7864, "theoretical_loss": 3.516270401682787, "tokens_seen": 1499082752 }, { "epoch": 4.04, "learning_rate": 0.000275667001003009, "loss": 2.7565, "theoretical_loss": 3.5162568468633584, "tokens_seen": 1499148288 }, { "epoch": 4.04, "learning_rate": 0.0002756569709127382, "loss": 2.8455, "theoretical_loss": 3.516243292802379, "tokens_seen": 1499213824 }, { "epoch": 4.04, "learning_rate": 0.0002756469408224674, "loss": 2.818, "theoretical_loss": 3.5162297394997744, "tokens_seen": 1499279360 }, { "epoch": 4.04, "learning_rate": 0.0002756369107321966, "loss": 2.7564, "theoretical_loss": 3.516216186955468, "tokens_seen": 1499344896 }, { "epoch": 4.04, "learning_rate": 0.00027562688064192576, "loss": 2.9127, "theoretical_loss": 3.5162026351693854, "tokens_seen": 1499410432 }, { "epoch": 4.04, "learning_rate": 0.000275616850551655, "loss": 2.8925, "theoretical_loss": 3.5161890841414496, "tokens_seen": 1499475968 }, { "epoch": 4.04, "learning_rate": 0.0002756068204613841, "loss": 2.53, "theoretical_loss": 3.516175533871586, "tokens_seen": 1499541504 }, { "epoch": 4.04, "learning_rate": 0.00027559679037111336, "loss": 2.7772, "theoretical_loss": 3.5161619843597194, "tokens_seen": 1499607040 }, { "epoch": 4.04, "learning_rate": 0.0002755867602808425, "loss": 2.8511, "theoretical_loss": 3.516148435605773, "tokens_seen": 1499672576 }, { "epoch": 4.04, "learning_rate": 0.0002755767301905717, "loss": 2.9522, "theoretical_loss": 3.516134887609673, "tokens_seen": 1499738112 }, { "epoch": 4.04, "learning_rate": 0.0002755667001003009, "loss": 2.8747, "theoretical_loss": 3.5161213403713427, "tokens_seen": 1499803648 }, { "epoch": 4.04, "learning_rate": 0.0002755566700100301, "loss": 2.8272, "theoretical_loss": 3.516107793890707, "tokens_seen": 1499869184 }, { "epoch": 4.04, "learning_rate": 0.00027554663991975927, "loss": 2.9258, "theoretical_loss": 3.5160942481676893, "tokens_seen": 1499934720 }, { "epoch": 4.04, "learning_rate": 0.00027553660982948845, "loss": 2.7922, "theoretical_loss": 3.516080703202216, "tokens_seen": 1500000256 }, { "epoch": 4.04, "learning_rate": 0.00027552657973921763, "loss": 2.8127, "theoretical_loss": 3.5160671589942107, "tokens_seen": 1500065792 }, { "epoch": 4.04, "learning_rate": 0.00027551654964894686, "loss": 2.806, "theoretical_loss": 3.5160536155435977, "tokens_seen": 1500131328 }, { "epoch": 4.04, "learning_rate": 0.00027550651955867604, "loss": 2.853, "theoretical_loss": 3.516040072850302, "tokens_seen": 1500196864 }, { "epoch": 4.04, "learning_rate": 0.0002754964894684052, "loss": 2.9508, "theoretical_loss": 3.5160265309142487, "tokens_seen": 1500262400 }, { "epoch": 4.04, "learning_rate": 0.00027548645937813446, "loss": 2.8408, "theoretical_loss": 3.516012989735361, "tokens_seen": 1500327936 }, { "epoch": 4.04, "learning_rate": 0.0002754764292878636, "loss": 2.9051, "theoretical_loss": 3.515999449313564, "tokens_seen": 1500393472 }, { "epoch": 4.04, "learning_rate": 0.0002754663991975928, "loss": 2.7718, "theoretical_loss": 3.515985909648783, "tokens_seen": 1500459008 }, { "epoch": 4.04, "learning_rate": 0.00027545636910732195, "loss": 2.6773, "theoretical_loss": 3.515972370740942, "tokens_seen": 1500524544 }, { "epoch": 4.04, "learning_rate": 0.0002754463390170512, "loss": 2.817, "theoretical_loss": 3.515958832589966, "tokens_seen": 1500590080 }, { "epoch": 4.04, "learning_rate": 0.00027543630892678037, "loss": 2.8289, "theoretical_loss": 3.5159452951957793, "tokens_seen": 1500655616 }, { "epoch": 4.04, "objective/train/docs_used": 2394261, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.858182191848755, "objective/train/theoretical_loss": 3.5159317585583065, "objective/train/tokens_used": 1521181152, "theoretical_loss": 3.5159317585583065, "tokens_seen": 1500721152 }, { "epoch": 4.04, "learning_rate": 0.00027542627883650955, "loss": 2.7925, "theoretical_loss": 3.5159317585583065, "tokens_seen": 1500721152 }, { "epoch": 4.04, "learning_rate": 0.00027541624874623873, "loss": 2.6543, "theoretical_loss": 3.515918222677472, "tokens_seen": 1500786688 }, { "epoch": 4.04, "learning_rate": 0.0002754062186559679, "loss": 2.7254, "theoretical_loss": 3.515904687553202, "tokens_seen": 1500852224 }, { "epoch": 4.04, "learning_rate": 0.0002753961885656971, "loss": 2.8454, "theoretical_loss": 3.5158911531854193, "tokens_seen": 1500917760 }, { "epoch": 4.04, "learning_rate": 0.0002753861584754263, "loss": 2.8654, "theoretical_loss": 3.5158776195740495, "tokens_seen": 1500983296 }, { "epoch": 4.04, "learning_rate": 0.00027537612838515545, "loss": 2.7249, "theoretical_loss": 3.5158640867190174, "tokens_seen": 1501048832 }, { "epoch": 4.04, "learning_rate": 0.0002753660982948847, "loss": 2.9172, "theoretical_loss": 3.515850554620247, "tokens_seen": 1501114368 }, { "epoch": 4.04, "learning_rate": 0.0002753560682046138, "loss": 2.6442, "theoretical_loss": 3.5158370232776637, "tokens_seen": 1501179904 }, { "epoch": 4.04, "learning_rate": 0.00027534603811434305, "loss": 2.8119, "theoretical_loss": 3.515823492691192, "tokens_seen": 1501245440 }, { "epoch": 4.04, "learning_rate": 0.00027533600802407223, "loss": 2.7257, "theoretical_loss": 3.5158099628607564, "tokens_seen": 1501310976 }, { "epoch": 4.04, "learning_rate": 0.0002753259779338014, "loss": 2.6073, "theoretical_loss": 3.5157964337862824, "tokens_seen": 1501376512 }, { "epoch": 4.04, "learning_rate": 0.0002753159478435306, "loss": 2.8195, "theoretical_loss": 3.5157829054676943, "tokens_seen": 1501442048 }, { "epoch": 4.04, "learning_rate": 0.00027530591775325983, "loss": 2.9467, "theoretical_loss": 3.515769377904917, "tokens_seen": 1501507584 }, { "epoch": 4.04, "learning_rate": 0.00027529588766298896, "loss": 2.7763, "theoretical_loss": 3.5157558510978744, "tokens_seen": 1501573120 }, { "epoch": 4.04, "learning_rate": 0.0002752858575727182, "loss": 2.8658, "theoretical_loss": 3.5157423250464923, "tokens_seen": 1501638656 }, { "epoch": 4.04, "learning_rate": 0.0002752758274824473, "loss": 2.8394, "theoretical_loss": 3.515728799750695, "tokens_seen": 1501704192 }, { "epoch": 4.04, "learning_rate": 0.00027526579739217655, "loss": 2.7135, "theoretical_loss": 3.5157152752104084, "tokens_seen": 1501769728 }, { "epoch": 4.04, "learning_rate": 0.00027525576730190574, "loss": 2.7574, "theoretical_loss": 3.5157017514255564, "tokens_seen": 1501835264 }, { "epoch": 4.04, "learning_rate": 0.0002752457372116349, "loss": 2.7681, "theoretical_loss": 3.5156882283960638, "tokens_seen": 1501900800 }, { "epoch": 4.04, "learning_rate": 0.0002752357071213641, "loss": 2.7099, "theoretical_loss": 3.515674706121855, "tokens_seen": 1501966336 }, { "epoch": 4.04, "learning_rate": 0.0002752256770310933, "loss": 2.7024, "theoretical_loss": 3.5156611846028563, "tokens_seen": 1502031872 }, { "epoch": 4.04, "learning_rate": 0.00027521564694082246, "loss": 2.8299, "theoretical_loss": 3.5156476638389913, "tokens_seen": 1502097408 }, { "epoch": 4.04, "learning_rate": 0.0002752056168505517, "loss": 2.7981, "theoretical_loss": 3.5156341438301855, "tokens_seen": 1502162944 }, { "epoch": 4.04, "learning_rate": 0.0002751955867602808, "loss": 2.5789, "theoretical_loss": 3.5156206245763633, "tokens_seen": 1502228480 }, { "epoch": 4.04, "learning_rate": 0.00027518555667001006, "loss": 2.6059, "theoretical_loss": 3.51560710607745, "tokens_seen": 1502294016 }, { "epoch": 4.04, "objective/train/docs_used": 2397293, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5513722896575928, "objective/train/theoretical_loss": 3.515593588333371, "objective/train/tokens_used": 1522819552, "theoretical_loss": 3.515593588333371, "tokens_seen": 1502359552 }, { "epoch": 4.04, "learning_rate": 0.0002751755265797392, "loss": 2.7356, "theoretical_loss": 3.515593588333371, "tokens_seen": 1502359552 }, { "epoch": 4.04, "learning_rate": 0.0002751654964894684, "loss": 2.8311, "theoretical_loss": 3.5155800713440506, "tokens_seen": 1502425088 }, { "epoch": 4.04, "learning_rate": 0.0002751554663991976, "loss": 2.731, "theoretical_loss": 3.5155665551094133, "tokens_seen": 1502490624 }, { "epoch": 4.04, "learning_rate": 0.0002751454363089268, "loss": 2.739, "theoretical_loss": 3.515553039629385, "tokens_seen": 1502556160 }, { "epoch": 4.04, "learning_rate": 0.00027513540621865596, "loss": 2.9372, "theoretical_loss": 3.5155395249038897, "tokens_seen": 1502621696 }, { "epoch": 4.04, "learning_rate": 0.0002751253761283852, "loss": 2.8916, "theoretical_loss": 3.515526010932853, "tokens_seen": 1502687232 }, { "epoch": 4.04, "learning_rate": 0.0002751153460381143, "loss": 2.8096, "theoretical_loss": 3.5155124977162, "tokens_seen": 1502752768 }, { "epoch": 4.04, "learning_rate": 0.00027510531594784356, "loss": 2.8655, "theoretical_loss": 3.5154989852538554, "tokens_seen": 1502818304 }, { "epoch": 4.04, "learning_rate": 0.0002750952858575727, "loss": 2.8434, "theoretical_loss": 3.5154854735457444, "tokens_seen": 1502883840 }, { "epoch": 4.04, "learning_rate": 0.0002750852557673019, "loss": 2.7519, "theoretical_loss": 3.515471962591792, "tokens_seen": 1502949376 }, { "epoch": 4.04, "learning_rate": 0.0002750752256770311, "loss": 2.9016, "theoretical_loss": 3.515458452391923, "tokens_seen": 1503014912 }, { "epoch": 4.04, "learning_rate": 0.0002750651955867603, "loss": 2.6356, "theoretical_loss": 3.515444942946062, "tokens_seen": 1503080448 }, { "epoch": 4.04, "learning_rate": 0.00027505516549648947, "loss": 2.5718, "theoretical_loss": 3.5154314342541353, "tokens_seen": 1503145984 }, { "epoch": 4.04, "learning_rate": 0.00027504513540621865, "loss": 2.7371, "theoretical_loss": 3.5154179263160668, "tokens_seen": 1503211520 }, { "epoch": 4.04, "learning_rate": 0.00027503510531594783, "loss": 2.7979, "theoretical_loss": 3.515404419131782, "tokens_seen": 1503277056 }, { "epoch": 4.04, "learning_rate": 0.00027502507522567706, "loss": 2.7027, "theoretical_loss": 3.515390912701206, "tokens_seen": 1503342592 }, { "epoch": 4.04, "learning_rate": 0.0002750150451354062, "loss": 2.8674, "theoretical_loss": 3.5153774070242645, "tokens_seen": 1503408128 }, { "epoch": 4.04, "learning_rate": 0.0002750050150451354, "loss": 2.672, "theoretical_loss": 3.5153639021008813, "tokens_seen": 1503473664 }, { "epoch": 4.04, "learning_rate": 0.00027499498495486455, "loss": 2.97, "theoretical_loss": 3.5153503979309826, "tokens_seen": 1503539200 }, { "epoch": 4.04, "learning_rate": 0.0002749849548645938, "loss": 2.6541, "theoretical_loss": 3.5153368945144927, "tokens_seen": 1503604736 }, { "epoch": 4.04, "learning_rate": 0.00027497492477432297, "loss": 2.8665, "theoretical_loss": 3.5153233918513376, "tokens_seen": 1503670272 }, { "epoch": 4.04, "learning_rate": 0.00027496489468405215, "loss": 2.9002, "theoretical_loss": 3.5153098899414417, "tokens_seen": 1503735808 }, { "epoch": 4.04, "learning_rate": 0.00027495486459378133, "loss": 2.8918, "theoretical_loss": 3.5152963887847304, "tokens_seen": 1503801344 }, { "epoch": 4.04, "learning_rate": 0.00027494483450351057, "loss": 2.8033, "theoretical_loss": 3.5152828883811287, "tokens_seen": 1503866880 }, { "epoch": 4.04, "learning_rate": 0.0002749348044132397, "loss": 2.7736, "theoretical_loss": 3.5152693887305624, "tokens_seen": 1503932416 }, { "epoch": 4.04, "objective/train/docs_used": 2400761, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.818056583404541, "objective/train/theoretical_loss": 3.515255889832956, "objective/train/tokens_used": 1524457952, "theoretical_loss": 3.515255889832956, "tokens_seen": 1503997952 }, { "epoch": 4.04, "learning_rate": 0.00027492477432296893, "loss": 2.9131, "theoretical_loss": 3.515255889832956, "tokens_seen": 1503997952 }, { "epoch": 4.04, "learning_rate": 0.00027491474423269806, "loss": 2.8201, "theoretical_loss": 3.5152423916882354, "tokens_seen": 1504063488 }, { "epoch": 4.04, "learning_rate": 0.0002749047141424273, "loss": 2.8231, "theoretical_loss": 3.5152288942963246, "tokens_seen": 1504129024 }, { "epoch": 4.04, "learning_rate": 0.00027489468405215647, "loss": 2.7459, "theoretical_loss": 3.5152153976571503, "tokens_seen": 1504194560 }, { "epoch": 4.04, "learning_rate": 0.00027488465396188565, "loss": 2.7192, "theoretical_loss": 3.515201901770636, "tokens_seen": 1504260096 }, { "epoch": 4.04, "learning_rate": 0.00027487462387161483, "loss": 2.8439, "theoretical_loss": 3.5151884066367094, "tokens_seen": 1504325632 }, { "epoch": 4.04, "learning_rate": 0.000274864593781344, "loss": 2.7901, "theoretical_loss": 3.5151749122552935, "tokens_seen": 1504391168 }, { "epoch": 4.04, "learning_rate": 0.0002748545636910732, "loss": 2.6048, "theoretical_loss": 3.515161418626314, "tokens_seen": 1504456704 }, { "epoch": 4.04, "learning_rate": 0.00027484453360080243, "loss": 2.9216, "theoretical_loss": 3.515147925749697, "tokens_seen": 1504522240 }, { "epoch": 4.04, "learning_rate": 0.00027483450351053156, "loss": 2.825, "theoretical_loss": 3.5151344336253674, "tokens_seen": 1504587776 }, { "epoch": 4.04, "learning_rate": 0.0002748244734202608, "loss": 2.6789, "theoretical_loss": 3.51512094225325, "tokens_seen": 1504653312 }, { "epoch": 4.04, "learning_rate": 0.00027481444332999, "loss": 2.6761, "theoretical_loss": 3.5151074516332708, "tokens_seen": 1504718848 }, { "epoch": 4.04, "learning_rate": 0.00027480441323971916, "loss": 2.859, "theoretical_loss": 3.5150939617653547, "tokens_seen": 1504784384 }, { "epoch": 4.04, "learning_rate": 0.00027479438314944834, "loss": 2.8605, "theoretical_loss": 3.515080472649427, "tokens_seen": 1504849920 }, { "epoch": 4.04, "learning_rate": 0.0002747843530591775, "loss": 2.743, "theoretical_loss": 3.515066984285413, "tokens_seen": 1504915456 }, { "epoch": 4.04, "learning_rate": 0.00027477432296890675, "loss": 2.7542, "theoretical_loss": 3.5150534966732385, "tokens_seen": 1504980992 }, { "epoch": 4.04, "learning_rate": 0.00027476429287863594, "loss": 2.9426, "theoretical_loss": 3.5150400098128287, "tokens_seen": 1505046528 }, { "epoch": 4.04, "learning_rate": 0.0002747542627883651, "loss": 2.6663, "theoretical_loss": 3.5150265237041087, "tokens_seen": 1505112064 }, { "epoch": 4.04, "learning_rate": 0.0002747442326980943, "loss": 2.8451, "theoretical_loss": 3.5150130383470035, "tokens_seen": 1505177600 }, { "epoch": 4.04, "learning_rate": 0.0002747342026078235, "loss": 2.8045, "theoretical_loss": 3.51499955374144, "tokens_seen": 1505243136 }, { "epoch": 4.04, "learning_rate": 0.00027472417251755266, "loss": 2.8135, "theoretical_loss": 3.5149860698873416, "tokens_seen": 1505308672 }, { "epoch": 4.04, "learning_rate": 0.0002747141424272819, "loss": 2.7895, "theoretical_loss": 3.5149725867846353, "tokens_seen": 1505374208 }, { "epoch": 4.04, "learning_rate": 0.000274704112337011, "loss": 2.7598, "theoretical_loss": 3.514959104433246, "tokens_seen": 1505439744 }, { "epoch": 4.04, "learning_rate": 0.00027469408224674026, "loss": 2.8336, "theoretical_loss": 3.514945622833099, "tokens_seen": 1505505280 }, { "epoch": 4.04, "learning_rate": 0.0002746840521564694, "loss": 2.7421, "theoretical_loss": 3.5149321419841195, "tokens_seen": 1505570816 }, { "epoch": 4.04, "objective/train/docs_used": 2402245, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.215850830078125, "objective/train/theoretical_loss": 3.5149186618862336, "objective/train/tokens_used": 1526096352, "theoretical_loss": 3.5149186618862336, "tokens_seen": 1505636352 }, { "epoch": 4.04, "learning_rate": 0.0002746740220661986, "loss": 3.0183, "theoretical_loss": 3.5149186618862336, "tokens_seen": 1505636352 }, { "epoch": 4.04, "learning_rate": 0.0002746639919759278, "loss": 2.7317, "theoretical_loss": 3.514905182539366, "tokens_seen": 1505701888 }, { "epoch": 4.04, "learning_rate": 0.000274653961885657, "loss": 2.8778, "theoretical_loss": 3.5148917039434426, "tokens_seen": 1505767424 }, { "epoch": 4.04, "learning_rate": 0.00027464393179538616, "loss": 2.7683, "theoretical_loss": 3.5148782260983893, "tokens_seen": 1505832960 }, { "epoch": 4.04, "learning_rate": 0.0002746339017051154, "loss": 2.9054, "theoretical_loss": 3.514864749004131, "tokens_seen": 1505898496 }, { "epoch": 4.04, "learning_rate": 0.0002746238716148445, "loss": 2.7695, "theoretical_loss": 3.514851272660593, "tokens_seen": 1505964032 }, { "epoch": 4.04, "learning_rate": 0.00027461384152457376, "loss": 2.9685, "theoretical_loss": 3.5148377970677016, "tokens_seen": 1506029568 }, { "epoch": 4.04, "learning_rate": 0.0002746038114343029, "loss": 2.9741, "theoretical_loss": 3.514824322225382, "tokens_seen": 1506095104 }, { "epoch": 4.04, "learning_rate": 0.0002745937813440321, "loss": 2.7943, "theoretical_loss": 3.5148108481335596, "tokens_seen": 1506160640 }, { "epoch": 4.04, "learning_rate": 0.0002745837512537613, "loss": 2.7215, "theoretical_loss": 3.5147973747921597, "tokens_seen": 1506226176 }, { "epoch": 4.04, "learning_rate": 0.0002745737211634905, "loss": 2.6879, "theoretical_loss": 3.514783902201108, "tokens_seen": 1506291712 }, { "epoch": 4.04, "learning_rate": 0.00027456369107321967, "loss": 2.8933, "theoretical_loss": 3.5147704303603304, "tokens_seen": 1506357248 }, { "epoch": 4.04, "learning_rate": 0.00027455366098294885, "loss": 2.9616, "theoretical_loss": 3.5147569592697527, "tokens_seen": 1506422784 }, { "epoch": 4.04, "learning_rate": 0.00027454363089267803, "loss": 2.7796, "theoretical_loss": 3.5147434889292994, "tokens_seen": 1506488320 }, { "epoch": 4.04, "learning_rate": 0.00027453360080240726, "loss": 2.8167, "theoretical_loss": 3.5147300193388973, "tokens_seen": 1506553856 }, { "epoch": 4.04, "learning_rate": 0.0002745235707121364, "loss": 2.8134, "theoretical_loss": 3.514716550498471, "tokens_seen": 1506619392 }, { "epoch": 4.04, "learning_rate": 0.0002745135406218656, "loss": 2.823, "theoretical_loss": 3.514703082407947, "tokens_seen": 1506684928 }, { "epoch": 4.04, "learning_rate": 0.00027450351053159475, "loss": 2.8576, "theoretical_loss": 3.514689615067251, "tokens_seen": 1506750464 }, { "epoch": 4.04, "learning_rate": 0.000274493480441324, "loss": 2.8082, "theoretical_loss": 3.5146761484763074, "tokens_seen": 1506816000 }, { "epoch": 4.04, "learning_rate": 0.00027448345035105317, "loss": 2.8046, "theoretical_loss": 3.514662682635043, "tokens_seen": 1506881536 }, { "epoch": 4.04, "learning_rate": 0.00027447342026078235, "loss": 2.8358, "theoretical_loss": 3.5146492175433828, "tokens_seen": 1506947072 }, { "epoch": 4.04, "learning_rate": 0.00027446339017051153, "loss": 2.7734, "theoretical_loss": 3.5146357532012535, "tokens_seen": 1507012608 }, { "epoch": 4.04, "learning_rate": 0.00027445336008024077, "loss": 2.7991, "theoretical_loss": 3.5146222896085795, "tokens_seen": 1507078144 }, { "epoch": 4.04, "learning_rate": 0.0002744433299899699, "loss": 2.7615, "theoretical_loss": 3.5146088267652873, "tokens_seen": 1507143680 }, { "epoch": 4.04, "learning_rate": 0.00027443329989969913, "loss": 2.7701, "theoretical_loss": 3.514595364671302, "tokens_seen": 1507209216 }, { "epoch": 4.04, "objective/train/docs_used": 2404963, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8798160552978516, "objective/train/theoretical_loss": 3.51458190332655, "objective/train/tokens_used": 1527734752, "theoretical_loss": 3.51458190332655, "tokens_seen": 1507274752 }, { "epoch": 4.04, "learning_rate": 0.00027442326980942826, "loss": 2.8142, "theoretical_loss": 3.51458190332655, "tokens_seen": 1507274752 }, { "epoch": 4.04, "learning_rate": 0.0002744132397191575, "loss": 2.9103, "theoretical_loss": 3.5145684427309565, "tokens_seen": 1507340288 }, { "epoch": 4.04, "learning_rate": 0.00027440320962888667, "loss": 2.738, "theoretical_loss": 3.5145549828844476, "tokens_seen": 1507405824 }, { "epoch": 4.04, "learning_rate": 0.00027439317953861585, "loss": 2.853, "theoretical_loss": 3.5145415237869493, "tokens_seen": 1507471360 }, { "epoch": 4.04, "learning_rate": 0.00027438314944834503, "loss": 2.8973, "theoretical_loss": 3.5145280654383866, "tokens_seen": 1507536896 }, { "epoch": 4.04, "learning_rate": 0.0002743731193580742, "loss": 2.9099, "theoretical_loss": 3.5145146078386853, "tokens_seen": 1507602432 }, { "epoch": 4.04, "learning_rate": 0.0002743630892678034, "loss": 2.6709, "theoretical_loss": 3.5145011509877717, "tokens_seen": 1507667968 }, { "epoch": 4.04, "learning_rate": 0.00027435305917753263, "loss": 2.8722, "theoretical_loss": 3.514487694885572, "tokens_seen": 1507733504 }, { "epoch": 4.04, "learning_rate": 0.00027434302908726176, "loss": 2.7867, "theoretical_loss": 3.5144742395320105, "tokens_seen": 1507799040 }, { "epoch": 4.04, "learning_rate": 0.000274332998996991, "loss": 2.8405, "theoretical_loss": 3.5144607849270146, "tokens_seen": 1507864576 }, { "epoch": 4.04, "learning_rate": 0.0002743229689067202, "loss": 2.6726, "theoretical_loss": 3.5144473310705093, "tokens_seen": 1507930112 }, { "epoch": 4.04, "learning_rate": 0.00027431293881644936, "loss": 2.8271, "theoretical_loss": 3.5144338779624205, "tokens_seen": 1507995648 }, { "epoch": 4.04, "learning_rate": 0.00027430290872617854, "loss": 2.7809, "theoretical_loss": 3.5144204256026743, "tokens_seen": 1508061184 }, { "epoch": 4.04, "learning_rate": 0.0002742928786359077, "loss": 2.7673, "theoretical_loss": 3.514406973991196, "tokens_seen": 1508126720 }, { "epoch": 4.04, "learning_rate": 0.0002742828485456369, "loss": 2.6678, "theoretical_loss": 3.5143935231279118, "tokens_seen": 1508192256 }, { "epoch": 4.04, "learning_rate": 0.00027427281845536614, "loss": 2.7873, "theoretical_loss": 3.514380073012748, "tokens_seen": 1508257792 }, { "epoch": 4.04, "learning_rate": 0.00027426278836509526, "loss": 2.9519, "theoretical_loss": 3.51436662364563, "tokens_seen": 1508323328 }, { "epoch": 4.04, "learning_rate": 0.0002742527582748245, "loss": 2.8316, "theoretical_loss": 3.514353175026484, "tokens_seen": 1508388864 }, { "epoch": 4.04, "learning_rate": 0.0002742427281845536, "loss": 2.7793, "theoretical_loss": 3.5143397271552352, "tokens_seen": 1508454400 }, { "epoch": 4.04, "learning_rate": 0.00027423269809428286, "loss": 2.8322, "theoretical_loss": 3.5143262800318102, "tokens_seen": 1508519936 }, { "epoch": 4.04, "learning_rate": 0.00027422266800401204, "loss": 2.7552, "theoretical_loss": 3.5143128336561347, "tokens_seen": 1508585472 }, { "epoch": 4.04, "learning_rate": 0.0002742126379137412, "loss": 2.764, "theoretical_loss": 3.5142993880281352, "tokens_seen": 1508651008 }, { "epoch": 4.04, "learning_rate": 0.0002742026078234704, "loss": 2.7456, "theoretical_loss": 3.5142859431477365, "tokens_seen": 1508716544 }, { "epoch": 4.04, "learning_rate": 0.0002741925777331996, "loss": 2.8395, "theoretical_loss": 3.5142724990148655, "tokens_seen": 1508782080 }, { "epoch": 4.04, "learning_rate": 0.00027418254764292877, "loss": 2.8138, "theoretical_loss": 3.5142590556294477, "tokens_seen": 1508847616 }, { "epoch": 4.04, "objective/train/docs_used": 2407956, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.426378011703491, "objective/train/theoretical_loss": 3.5142456129914095, "objective/train/tokens_used": 1529373152, "theoretical_loss": 3.5142456129914095, "tokens_seen": 1508913152 }, { "epoch": 4.04, "learning_rate": 0.000274172517552658, "loss": 2.8681, "theoretical_loss": 3.5142456129914095, "tokens_seen": 1508913152 }, { "epoch": 4.04, "learning_rate": 0.00027416248746238713, "loss": 2.673, "theoretical_loss": 3.5142321711006765, "tokens_seen": 1508978688 }, { "epoch": 4.04, "learning_rate": 0.00027415245737211636, "loss": 2.8105, "theoretical_loss": 3.514218729957175, "tokens_seen": 1509044224 }, { "epoch": 4.04, "learning_rate": 0.00027414242728184554, "loss": 2.6535, "theoretical_loss": 3.514205289560831, "tokens_seen": 1509109760 }, { "epoch": 4.04, "learning_rate": 0.0002741323971915747, "loss": 2.7874, "theoretical_loss": 3.51419184991157, "tokens_seen": 1509175296 }, { "epoch": 4.04, "learning_rate": 0.0002741223671013039, "loss": 2.8471, "theoretical_loss": 3.5141784110093184, "tokens_seen": 1509240832 }, { "epoch": 4.04, "learning_rate": 0.0002741123370110331, "loss": 2.8899, "theoretical_loss": 3.5141649728540028, "tokens_seen": 1509306368 }, { "epoch": 4.04, "learning_rate": 0.00027410230692076227, "loss": 2.7839, "theoretical_loss": 3.514151535445548, "tokens_seen": 1509371904 }, { "epoch": 4.04, "learning_rate": 0.0002740922768304915, "loss": 2.8923, "theoretical_loss": 3.5141380987838815, "tokens_seen": 1509437440 }, { "epoch": 4.04, "learning_rate": 0.00027408224674022063, "loss": 2.6405, "theoretical_loss": 3.5141246628689276, "tokens_seen": 1509502976 }, { "epoch": 4.04, "learning_rate": 0.00027407221664994987, "loss": 2.8371, "theoretical_loss": 3.5141112277006146, "tokens_seen": 1509568512 }, { "epoch": 4.04, "learning_rate": 0.000274062186559679, "loss": 2.8227, "theoretical_loss": 3.514097793278867, "tokens_seen": 1509634048 }, { "epoch": 4.04, "learning_rate": 0.00027405215646940823, "loss": 2.9667, "theoretical_loss": 3.5140843596036113, "tokens_seen": 1509699584 }, { "epoch": 4.04, "learning_rate": 0.0002740421263791374, "loss": 2.8081, "theoretical_loss": 3.5140709266747736, "tokens_seen": 1509765120 }, { "epoch": 4.04, "learning_rate": 0.0002740320962888666, "loss": 2.7524, "theoretical_loss": 3.51405749449228, "tokens_seen": 1509830656 }, { "epoch": 4.04, "learning_rate": 0.0002740220661985958, "loss": 2.9183, "theoretical_loss": 3.5140440630560574, "tokens_seen": 1509896192 }, { "epoch": 4.04, "learning_rate": 0.00027401203610832495, "loss": 2.8215, "theoretical_loss": 3.5140306323660306, "tokens_seen": 1509961728 }, { "epoch": 4.04, "learning_rate": 0.0002740020060180542, "loss": 2.8555, "theoretical_loss": 3.5140172024221266, "tokens_seen": 1510027264 }, { "epoch": 4.04, "learning_rate": 0.00027399197592778337, "loss": 2.8717, "theoretical_loss": 3.5140037732242715, "tokens_seen": 1510092800 }, { "epoch": 4.04, "learning_rate": 0.00027398194583751255, "loss": 2.8453, "theoretical_loss": 3.5139903447723917, "tokens_seen": 1510158336 }, { "epoch": 4.04, "learning_rate": 0.00027397191574724173, "loss": 2.8292, "theoretical_loss": 3.5139769170664126, "tokens_seen": 1510223872 }, { "epoch": 4.04, "learning_rate": 0.00027396188565697097, "loss": 2.7883, "theoretical_loss": 3.513963490106261, "tokens_seen": 1510289408 }, { "epoch": 4.04, "learning_rate": 0.0002739518555667001, "loss": 2.66, "theoretical_loss": 3.513950063891863, "tokens_seen": 1510354944 }, { "epoch": 4.04, "learning_rate": 0.00027394182547642933, "loss": 2.7419, "theoretical_loss": 3.5139366384231447, "tokens_seen": 1510420480 }, { "epoch": 4.04, "learning_rate": 0.00027393179538615846, "loss": 2.6802, "theoretical_loss": 3.513923213700033, "tokens_seen": 1510486016 }, { "epoch": 4.04, "objective/train/docs_used": 2410605, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.634821653366089, "objective/train/theoretical_loss": 3.513909789722453, "objective/train/tokens_used": 1531011552, "theoretical_loss": 3.513909789722453, "tokens_seen": 1510551552 }, { "epoch": 4.04, "learning_rate": 0.0002739217652958877, "loss": 2.744, "theoretical_loss": 3.513909789722453, "tokens_seen": 1510551552 }, { "epoch": 4.04, "learning_rate": 0.00027391173520561687, "loss": 2.8355, "theoretical_loss": 3.513896366490332, "tokens_seen": 1510617088 }, { "epoch": 4.04, "learning_rate": 0.00027390170511534605, "loss": 2.8165, "theoretical_loss": 3.513882944003596, "tokens_seen": 1510682624 }, { "epoch": 4.04, "learning_rate": 0.00027389167502507524, "loss": 2.8232, "theoretical_loss": 3.513869522262171, "tokens_seen": 1510748160 }, { "epoch": 4.04, "learning_rate": 0.0002738816449348044, "loss": 2.7276, "theoretical_loss": 3.513856101265983, "tokens_seen": 1510813696 }, { "epoch": 4.04, "learning_rate": 0.0002738716148445336, "loss": 2.8608, "theoretical_loss": 3.5138426810149586, "tokens_seen": 1510879232 }, { "epoch": 4.04, "learning_rate": 0.00027386158475426283, "loss": 2.8343, "theoretical_loss": 3.5138292615090245, "tokens_seen": 1510944768 }, { "epoch": 4.04, "learning_rate": 0.00027385155466399196, "loss": 2.8274, "theoretical_loss": 3.513815842748107, "tokens_seen": 1511010304 }, { "epoch": 4.04, "learning_rate": 0.0002738415245737212, "loss": 2.7092, "theoretical_loss": 3.5138024247321313, "tokens_seen": 1511075840 }, { "epoch": 4.04, "learning_rate": 0.0002738314944834504, "loss": 2.8248, "theoretical_loss": 3.5137890074610256, "tokens_seen": 1511141376 }, { "epoch": 4.04, "learning_rate": 0.00027382146439317956, "loss": 2.9302, "theoretical_loss": 3.5137755909347144, "tokens_seen": 1511206912 }, { "epoch": 4.04, "learning_rate": 0.00027381143430290874, "loss": 2.9286, "theoretical_loss": 3.5137621751531256, "tokens_seen": 1511272448 }, { "epoch": 4.04, "learning_rate": 0.0002738014042126379, "loss": 2.7711, "theoretical_loss": 3.5137487601161843, "tokens_seen": 1511337984 }, { "epoch": 4.04, "learning_rate": 0.0002737913741223671, "loss": 2.8947, "theoretical_loss": 3.5137353458238176, "tokens_seen": 1511403520 }, { "epoch": 4.04, "learning_rate": 0.00027378134403209634, "loss": 2.8039, "theoretical_loss": 3.513721932275952, "tokens_seen": 1511469056 }, { "epoch": 4.04, "learning_rate": 0.00027377131394182546, "loss": 2.6943, "theoretical_loss": 3.5137085194725133, "tokens_seen": 1511534592 }, { "epoch": 4.04, "learning_rate": 0.0002737612838515547, "loss": 2.85, "theoretical_loss": 3.513695107413428, "tokens_seen": 1511600128 }, { "epoch": 4.04, "learning_rate": 0.0002737512537612838, "loss": 2.8384, "theoretical_loss": 3.5136816960986232, "tokens_seen": 1511665664 }, { "epoch": 4.04, "learning_rate": 0.00027374122367101306, "loss": 2.827, "theoretical_loss": 3.513668285528025, "tokens_seen": 1511731200 }, { "epoch": 4.04, "learning_rate": 0.00027373119358074224, "loss": 2.7805, "theoretical_loss": 3.513654875701559, "tokens_seen": 1511796736 }, { "epoch": 4.04, "learning_rate": 0.0002737211634904714, "loss": 2.9351, "theoretical_loss": 3.5136414666191533, "tokens_seen": 1511862272 }, { "epoch": 4.04, "learning_rate": 0.0002737111334002006, "loss": 2.8503, "theoretical_loss": 3.513628058280733, "tokens_seen": 1511927808 }, { "epoch": 4.04, "learning_rate": 0.0002737011033099298, "loss": 2.8318, "theoretical_loss": 3.5136146506862254, "tokens_seen": 1511993344 }, { "epoch": 4.04, "learning_rate": 0.00027369107321965897, "loss": 2.9151, "theoretical_loss": 3.513601243835556, "tokens_seen": 1512058880 }, { "epoch": 4.04, "learning_rate": 0.0002736810431293882, "loss": 2.9683, "theoretical_loss": 3.5135878377286525, "tokens_seen": 1512124416 }, { "epoch": 4.04, "objective/train/docs_used": 2413370, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7921218872070312, "objective/train/theoretical_loss": 3.5135744323654405, "objective/train/tokens_used": 1532649952, "theoretical_loss": 3.5135744323654405, "tokens_seen": 1512189952 }, { "epoch": 4.04, "learning_rate": 0.00027367101303911733, "loss": 2.828, "theoretical_loss": 3.5135744323654405, "tokens_seen": 1512189952 }, { "epoch": 4.04, "learning_rate": 0.00027366098294884656, "loss": 2.9204, "theoretical_loss": 3.513561027745847, "tokens_seen": 1512255488 }, { "epoch": 4.04, "learning_rate": 0.00027365095285857574, "loss": 2.6265, "theoretical_loss": 3.5135476238697985, "tokens_seen": 1512321024 }, { "epoch": 4.04, "learning_rate": 0.0002736409227683049, "loss": 2.7752, "theoretical_loss": 3.513534220737221, "tokens_seen": 1512386560 }, { "epoch": 4.04, "learning_rate": 0.0002736308926780341, "loss": 2.7291, "theoretical_loss": 3.5135208183480415, "tokens_seen": 1512452096 }, { "epoch": 4.04, "learning_rate": 0.0002736208625877633, "loss": 2.8239, "theoretical_loss": 3.513507416702187, "tokens_seen": 1512517632 }, { "epoch": 4.04, "learning_rate": 0.00027361083249749247, "loss": 2.8832, "theoretical_loss": 3.513494015799583, "tokens_seen": 1512583168 }, { "epoch": 4.04, "learning_rate": 0.0002736008024072217, "loss": 2.6427, "theoretical_loss": 3.513480615640157, "tokens_seen": 1512648704 }, { "epoch": 4.04, "learning_rate": 0.00027359077231695083, "loss": 2.851, "theoretical_loss": 3.513467216223835, "tokens_seen": 1512714240 }, { "epoch": 4.04, "learning_rate": 0.00027358074222668007, "loss": 2.7741, "theoretical_loss": 3.5134538175505443, "tokens_seen": 1512779776 }, { "epoch": 4.04, "learning_rate": 0.0002735707121364092, "loss": 2.7394, "theoretical_loss": 3.5134404196202107, "tokens_seen": 1512845312 }, { "epoch": 4.04, "learning_rate": 0.00027356068204613843, "loss": 2.8054, "theoretical_loss": 3.5134270224327615, "tokens_seen": 1512910848 }, { "epoch": 4.04, "learning_rate": 0.0002735506519558676, "loss": 2.6987, "theoretical_loss": 3.5134136259881226, "tokens_seen": 1512976384 }, { "epoch": 4.04, "learning_rate": 0.0002735406218655968, "loss": 2.8246, "theoretical_loss": 3.513400230286221, "tokens_seen": 1513041920 }, { "epoch": 4.04, "learning_rate": 0.00027353059177532597, "loss": 2.7681, "theoretical_loss": 3.5133868353269837, "tokens_seen": 1513107456 }, { "epoch": 4.04, "learning_rate": 0.00027352056168505515, "loss": 2.6657, "theoretical_loss": 3.5133734411103372, "tokens_seen": 1513172992 }, { "epoch": 4.04, "learning_rate": 0.00027351053159478433, "loss": 2.7511, "theoretical_loss": 3.5133600476362075, "tokens_seen": 1513238528 }, { "epoch": 4.04, "learning_rate": 0.00027350050150451357, "loss": 2.6744, "theoretical_loss": 3.513346654904522, "tokens_seen": 1513304064 }, { "epoch": 4.04, "learning_rate": 0.0002734904714142427, "loss": 2.7497, "theoretical_loss": 3.5133332629152076, "tokens_seen": 1513369600 }, { "epoch": 4.04, "learning_rate": 0.00027348044132397193, "loss": 2.8216, "theoretical_loss": 3.51331987166819, "tokens_seen": 1513435136 }, { "epoch": 4.04, "learning_rate": 0.0002734704112337011, "loss": 2.7922, "theoretical_loss": 3.513306481163397, "tokens_seen": 1513500672 }, { "epoch": 4.04, "learning_rate": 0.0002734603811434303, "loss": 2.8492, "theoretical_loss": 3.513293091400755, "tokens_seen": 1513566208 }, { "epoch": 4.04, "learning_rate": 0.0002734503510531595, "loss": 2.8451, "theoretical_loss": 3.5132797023801903, "tokens_seen": 1513631744 }, { "epoch": 4.04, "learning_rate": 0.00027344032096288866, "loss": 2.8148, "theoretical_loss": 3.51326631410163, "tokens_seen": 1513697280 }, { "epoch": 4.04, "learning_rate": 0.00027343029087261784, "loss": 2.7831, "theoretical_loss": 3.513252926565001, "tokens_seen": 1513762816 }, { "epoch": 4.04, "objective/train/docs_used": 2416326, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.805906057357788, "objective/train/theoretical_loss": 3.51323953977023, "objective/train/tokens_used": 1534288352, "theoretical_loss": 3.51323953977023, "tokens_seen": 1513828352 }, { "epoch": 4.04, "learning_rate": 0.0002734202607823471, "loss": 2.7337, "theoretical_loss": 3.51323953977023, "tokens_seen": 1513828352 }, { "epoch": 4.04, "learning_rate": 0.0002734102306920762, "loss": 2.7875, "theoretical_loss": 3.513226153717243, "tokens_seen": 1513893888 }, { "epoch": 4.04, "learning_rate": 0.00027340020060180544, "loss": 2.7197, "theoretical_loss": 3.5132127684059675, "tokens_seen": 1513959424 }, { "epoch": 4.04, "learning_rate": 0.00027339017051153456, "loss": 2.8788, "theoretical_loss": 3.513199383836331, "tokens_seen": 1514024960 }, { "epoch": 4.04, "learning_rate": 0.0002733801404212638, "loss": 2.8248, "theoretical_loss": 3.513186000008259, "tokens_seen": 1514090496 }, { "epoch": 4.04, "learning_rate": 0.000273370110330993, "loss": 2.5684, "theoretical_loss": 3.5131726169216786, "tokens_seen": 1514156032 }, { "epoch": 4.04, "learning_rate": 0.00027336008024072216, "loss": 2.7388, "theoretical_loss": 3.5131592345765172, "tokens_seen": 1514221568 }, { "epoch": 4.04, "learning_rate": 0.00027335005015045134, "loss": 2.8092, "theoretical_loss": 3.513145852972701, "tokens_seen": 1514287104 }, { "epoch": 4.04, "learning_rate": 0.0002733400200601806, "loss": 2.8759, "theoretical_loss": 3.5131324721101578, "tokens_seen": 1514352640 }, { "epoch": 4.04, "learning_rate": 0.0002733299899699097, "loss": 2.8278, "theoretical_loss": 3.5131190919888136, "tokens_seen": 1514418176 }, { "epoch": 4.04, "learning_rate": 0.00027331995987963894, "loss": 2.8353, "theoretical_loss": 3.513105712608595, "tokens_seen": 1514483712 }, { "epoch": 4.04, "learning_rate": 0.00027330992978936807, "loss": 2.6801, "theoretical_loss": 3.51309233396943, "tokens_seen": 1514549248 }, { "epoch": 4.04, "learning_rate": 0.0002732998996990973, "loss": 2.8289, "theoretical_loss": 3.5130789560712445, "tokens_seen": 1514614784 }, { "epoch": 4.04, "learning_rate": 0.0002732898696088265, "loss": 2.8181, "theoretical_loss": 3.513065578913966, "tokens_seen": 1514680320 }, { "epoch": 4.04, "learning_rate": 0.00027327983951855566, "loss": 2.742, "theoretical_loss": 3.5130522024975215, "tokens_seen": 1514745856 }, { "epoch": 4.04, "learning_rate": 0.0002732698094282849, "loss": 2.6822, "theoretical_loss": 3.5130388268218375, "tokens_seen": 1514811392 }, { "epoch": 4.04, "learning_rate": 0.000273259779338014, "loss": 2.808, "theoretical_loss": 3.5130254518868407, "tokens_seen": 1514876928 }, { "epoch": 4.04, "learning_rate": 0.00027324974924774326, "loss": 2.7535, "theoretical_loss": 3.5130120776924585, "tokens_seen": 1514942464 }, { "epoch": 4.04, "learning_rate": 0.00027323971915747244, "loss": 2.723, "theoretical_loss": 3.5129987042386173, "tokens_seen": 1515008000 }, { "epoch": 4.04, "learning_rate": 0.0002732296890672016, "loss": 2.9399, "theoretical_loss": 3.512985331525245, "tokens_seen": 1515073536 }, { "epoch": 4.04, "learning_rate": 0.0002732196589769308, "loss": 2.8336, "theoretical_loss": 3.512971959552268, "tokens_seen": 1515139072 }, { "epoch": 4.04, "learning_rate": 0.00027320962888666, "loss": 3.0453, "theoretical_loss": 3.5129585883196137, "tokens_seen": 1515204608 }, { "epoch": 4.04, "learning_rate": 0.00027319959879638917, "loss": 2.7721, "theoretical_loss": 3.512945217827208, "tokens_seen": 1515270144 }, { "epoch": 4.05, "learning_rate": 0.0002731895687061184, "loss": 2.8905, "theoretical_loss": 3.512931848074979, "tokens_seen": 1515335680 }, { "epoch": 4.05, "learning_rate": 0.00027317953861584753, "loss": 2.8151, "theoretical_loss": 3.5129184790628543, "tokens_seen": 1515401216 }, { "epoch": 4.05, "objective/train/docs_used": 2417812, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.016741991043091, "objective/train/theoretical_loss": 3.5129051107907587, "objective/train/tokens_used": 1535926752, "theoretical_loss": 3.5129051107907587, "tokens_seen": 1515466752 }, { "epoch": 4.05, "learning_rate": 0.00027316950852557676, "loss": 2.8398, "theoretical_loss": 3.5129051107907587, "tokens_seen": 1515466752 }, { "epoch": 4.05, "learning_rate": 0.00027315947843530594, "loss": 2.6966, "theoretical_loss": 3.5128917432586215, "tokens_seen": 1515532288 }, { "epoch": 4.05, "learning_rate": 0.0002731494483450351, "loss": 2.8471, "theoretical_loss": 3.512878376466368, "tokens_seen": 1515597824 }, { "epoch": 4.05, "learning_rate": 0.0002731394182547643, "loss": 2.9057, "theoretical_loss": 3.5128650104139263, "tokens_seen": 1515663360 }, { "epoch": 4.05, "learning_rate": 0.0002731293881644935, "loss": 2.7809, "theoretical_loss": 3.5128516451012235, "tokens_seen": 1515728896 }, { "epoch": 4.05, "learning_rate": 0.00027311935807422267, "loss": 2.9517, "theoretical_loss": 3.5128382805281864, "tokens_seen": 1515794432 }, { "epoch": 4.05, "learning_rate": 0.0002731093279839519, "loss": 2.6869, "theoretical_loss": 3.5128249166947416, "tokens_seen": 1515859968 }, { "epoch": 4.05, "learning_rate": 0.00027309929789368103, "loss": 2.8614, "theoretical_loss": 3.5128115536008173, "tokens_seen": 1515925504 }, { "epoch": 4.05, "learning_rate": 0.00027308926780341027, "loss": 2.8562, "theoretical_loss": 3.5127981912463397, "tokens_seen": 1515991040 }, { "epoch": 4.05, "learning_rate": 0.0002730792377131394, "loss": 2.5664, "theoretical_loss": 3.512784829631236, "tokens_seen": 1516056576 }, { "epoch": 4.05, "learning_rate": 0.00027306920762286863, "loss": 2.8484, "theoretical_loss": 3.512771468755434, "tokens_seen": 1516122112 }, { "epoch": 4.05, "learning_rate": 0.0002730591775325978, "loss": 2.6896, "theoretical_loss": 3.5127581086188595, "tokens_seen": 1516187648 }, { "epoch": 4.05, "learning_rate": 0.000273049147442327, "loss": 2.8232, "theoretical_loss": 3.5127447492214414, "tokens_seen": 1516253184 }, { "epoch": 4.05, "learning_rate": 0.00027303911735205617, "loss": 2.8227, "theoretical_loss": 3.512731390563106, "tokens_seen": 1516318720 }, { "epoch": 4.05, "learning_rate": 0.00027302908726178535, "loss": 2.728, "theoretical_loss": 3.5127180326437797, "tokens_seen": 1516384256 }, { "epoch": 4.05, "learning_rate": 0.00027301905717151453, "loss": 2.7456, "theoretical_loss": 3.5127046754633913, "tokens_seen": 1516449792 }, { "epoch": 4.05, "learning_rate": 0.00027300902708124377, "loss": 2.8497, "theoretical_loss": 3.512691319021867, "tokens_seen": 1516515328 }, { "epoch": 4.05, "learning_rate": 0.0002729989969909729, "loss": 2.8506, "theoretical_loss": 3.512677963319134, "tokens_seen": 1516580864 }, { "epoch": 4.05, "learning_rate": 0.00027298896690070213, "loss": 2.8157, "theoretical_loss": 3.512664608355119, "tokens_seen": 1516646400 }, { "epoch": 4.05, "learning_rate": 0.0002729789368104313, "loss": 2.7688, "theoretical_loss": 3.5126512541297505, "tokens_seen": 1516711936 }, { "epoch": 4.05, "learning_rate": 0.0002729689067201605, "loss": 2.7612, "theoretical_loss": 3.512637900642955, "tokens_seen": 1516777472 }, { "epoch": 4.05, "learning_rate": 0.0002729588766298897, "loss": 2.8258, "theoretical_loss": 3.51262454789466, "tokens_seen": 1516843008 }, { "epoch": 4.05, "learning_rate": 0.00027294884653961886, "loss": 2.7377, "theoretical_loss": 3.5126111958847925, "tokens_seen": 1516908544 }, { "epoch": 4.05, "learning_rate": 0.00027293881644934804, "loss": 2.7277, "theoretical_loss": 3.5125978446132797, "tokens_seen": 1516974080 }, { "epoch": 4.05, "learning_rate": 0.0002729287863590773, "loss": 2.886, "theoretical_loss": 3.5125844940800492, "tokens_seen": 1517039616 }, { "epoch": 4.05, "objective/train/docs_used": 2420464, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8288047313690186, "objective/train/theoretical_loss": 3.512571144285028, "objective/train/tokens_used": 1537565152, "theoretical_loss": 3.512571144285028, "tokens_seen": 1517105152 }, { "epoch": 4.05, "learning_rate": 0.0002729187562688064, "loss": 2.8338, "theoretical_loss": 3.512571144285028, "tokens_seen": 1517105152 }, { "epoch": 4.05, "learning_rate": 0.00027290872617853564, "loss": 2.7181, "theoretical_loss": 3.5125577952281435, "tokens_seen": 1517170688 }, { "epoch": 4.05, "learning_rate": 0.00027289869608826476, "loss": 2.7277, "theoretical_loss": 3.512544446909323, "tokens_seen": 1517236224 }, { "epoch": 4.05, "learning_rate": 0.000272888665997994, "loss": 2.7326, "theoretical_loss": 3.512531099328494, "tokens_seen": 1517301760 }, { "epoch": 4.05, "learning_rate": 0.0002728786359077232, "loss": 2.8267, "theoretical_loss": 3.512517752485584, "tokens_seen": 1517367296 }, { "epoch": 4.05, "learning_rate": 0.00027286860581745236, "loss": 2.8033, "theoretical_loss": 3.5125044063805193, "tokens_seen": 1517432832 }, { "epoch": 4.05, "learning_rate": 0.00027285857572718154, "loss": 2.7014, "theoretical_loss": 3.5124910610132285, "tokens_seen": 1517498368 }, { "epoch": 4.05, "learning_rate": 0.0002728485456369108, "loss": 2.9367, "theoretical_loss": 3.512477716383638, "tokens_seen": 1517563904 }, { "epoch": 4.05, "learning_rate": 0.0002728385155466399, "loss": 2.9686, "theoretical_loss": 3.5124643724916753, "tokens_seen": 1517629440 }, { "epoch": 4.05, "learning_rate": 0.00027282848545636914, "loss": 2.684, "theoretical_loss": 3.512451029337269, "tokens_seen": 1517694976 }, { "epoch": 4.05, "learning_rate": 0.00027281845536609827, "loss": 2.6915, "theoretical_loss": 3.512437686920345, "tokens_seen": 1517760512 }, { "epoch": 4.05, "learning_rate": 0.0002728084252758275, "loss": 2.7653, "theoretical_loss": 3.512424345240831, "tokens_seen": 1517826048 }, { "epoch": 4.05, "learning_rate": 0.0002727983951855567, "loss": 2.6943, "theoretical_loss": 3.512411004298655, "tokens_seen": 1517891584 }, { "epoch": 4.05, "learning_rate": 0.00027278836509528586, "loss": 2.7951, "theoretical_loss": 3.5123976640937435, "tokens_seen": 1517957120 }, { "epoch": 4.05, "learning_rate": 0.00027277833500501504, "loss": 2.8417, "theoretical_loss": 3.512384324626025, "tokens_seen": 1518022656 }, { "epoch": 4.05, "learning_rate": 0.0002727683049147442, "loss": 2.8704, "theoretical_loss": 3.5123709858954264, "tokens_seen": 1518088192 }, { "epoch": 4.05, "learning_rate": 0.0002727582748244734, "loss": 2.811, "theoretical_loss": 3.512357647901875, "tokens_seen": 1518153728 }, { "epoch": 4.05, "learning_rate": 0.00027274824473420264, "loss": 2.7857, "theoretical_loss": 3.5123443106452985, "tokens_seen": 1518219264 }, { "epoch": 4.05, "learning_rate": 0.00027273821464393177, "loss": 2.8858, "theoretical_loss": 3.5123309741256246, "tokens_seen": 1518284800 }, { "epoch": 4.05, "learning_rate": 0.000272728184553661, "loss": 2.7782, "theoretical_loss": 3.51231763834278, "tokens_seen": 1518350336 }, { "epoch": 4.05, "learning_rate": 0.00027271815446339013, "loss": 2.7581, "theoretical_loss": 3.512304303296693, "tokens_seen": 1518415872 }, { "epoch": 4.05, "learning_rate": 0.00027270812437311937, "loss": 2.899, "theoretical_loss": 3.5122909689872905, "tokens_seen": 1518481408 }, { "epoch": 4.05, "learning_rate": 0.00027269809428284855, "loss": 2.8056, "theoretical_loss": 3.5122776354145007, "tokens_seen": 1518546944 }, { "epoch": 4.05, "learning_rate": 0.00027268806419257773, "loss": 2.8946, "theoretical_loss": 3.51226430257825, "tokens_seen": 1518612480 }, { "epoch": 4.05, "learning_rate": 0.0002726780341023069, "loss": 2.8676, "theoretical_loss": 3.5122509704784672, "tokens_seen": 1518678016 }, { "epoch": 4.05, "objective/train/docs_used": 2423290, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7047128677368164, "objective/train/theoretical_loss": 3.512237639115079, "objective/train/tokens_used": 1539203552, "theoretical_loss": 3.512237639115079, "tokens_seen": 1518743552 }, { "epoch": 4.05, "learning_rate": 0.00027266800401203614, "loss": 2.771, "theoretical_loss": 3.512237639115079, "tokens_seen": 1518743552 }, { "epoch": 4.05, "learning_rate": 0.00027265797392176527, "loss": 2.7664, "theoretical_loss": 3.5122243084880136, "tokens_seen": 1518809088 }, { "epoch": 4.05, "learning_rate": 0.0002726479438314945, "loss": 2.8784, "theoretical_loss": 3.5122109785971976, "tokens_seen": 1518874624 }, { "epoch": 4.05, "learning_rate": 0.00027263791374122363, "loss": 2.8505, "theoretical_loss": 3.5121976494425593, "tokens_seen": 1518940160 }, { "epoch": 4.05, "learning_rate": 0.00027262788365095287, "loss": 2.8351, "theoretical_loss": 3.5121843210240264, "tokens_seen": 1519005696 }, { "epoch": 4.05, "learning_rate": 0.00027261785356068205, "loss": 2.7682, "theoretical_loss": 3.512170993341526, "tokens_seen": 1519071232 }, { "epoch": 4.05, "learning_rate": 0.00027260782347041123, "loss": 2.8023, "theoretical_loss": 3.5121576663949865, "tokens_seen": 1519136768 }, { "epoch": 4.05, "learning_rate": 0.0002725977933801404, "loss": 2.8489, "theoretical_loss": 3.512144340184334, "tokens_seen": 1519202304 }, { "epoch": 4.05, "learning_rate": 0.0002725877632898696, "loss": 2.8382, "theoretical_loss": 3.5121310147094977, "tokens_seen": 1519267840 }, { "epoch": 4.05, "learning_rate": 0.0002725777331995988, "loss": 2.786, "theoretical_loss": 3.5121176899704047, "tokens_seen": 1519333376 }, { "epoch": 4.05, "learning_rate": 0.000272567703109328, "loss": 2.7942, "theoretical_loss": 3.512104365966982, "tokens_seen": 1519398912 }, { "epoch": 4.05, "learning_rate": 0.00027255767301905714, "loss": 2.9402, "theoretical_loss": 3.5120910426991583, "tokens_seen": 1519464448 }, { "epoch": 4.05, "learning_rate": 0.00027254764292878637, "loss": 2.8538, "theoretical_loss": 3.512077720166861, "tokens_seen": 1519529984 }, { "epoch": 4.05, "learning_rate": 0.0002725376128385155, "loss": 2.7273, "theoretical_loss": 3.5120643983700166, "tokens_seen": 1519595520 }, { "epoch": 4.05, "learning_rate": 0.00027252758274824473, "loss": 2.6596, "theoretical_loss": 3.5120510773085547, "tokens_seen": 1519661056 }, { "epoch": 4.05, "learning_rate": 0.00027251755265797397, "loss": 2.8024, "theoretical_loss": 3.5120377569824015, "tokens_seen": 1519726592 }, { "epoch": 4.05, "learning_rate": 0.0002725075225677031, "loss": 2.783, "theoretical_loss": 3.512024437391485, "tokens_seen": 1519792128 }, { "epoch": 4.05, "learning_rate": 0.00027249749247743233, "loss": 2.7285, "theoretical_loss": 3.5120111185357334, "tokens_seen": 1519857664 }, { "epoch": 4.05, "learning_rate": 0.0002724874623871615, "loss": 2.7929, "theoretical_loss": 3.5119978004150747, "tokens_seen": 1519923200 }, { "epoch": 4.05, "learning_rate": 0.0002724774322968907, "loss": 2.8362, "theoretical_loss": 3.511984483029435, "tokens_seen": 1519988736 }, { "epoch": 4.05, "learning_rate": 0.0002724674022066199, "loss": 2.8065, "theoretical_loss": 3.511971166378744, "tokens_seen": 1520054272 }, { "epoch": 4.05, "learning_rate": 0.00027245737211634906, "loss": 2.7956, "theoretical_loss": 3.5119578504629287, "tokens_seen": 1520119808 }, { "epoch": 4.05, "learning_rate": 0.00027244734202607824, "loss": 2.9018, "theoretical_loss": 3.5119445352819163, "tokens_seen": 1520185344 }, { "epoch": 4.05, "learning_rate": 0.0002724373119358075, "loss": 2.7346, "theoretical_loss": 3.5119312208356352, "tokens_seen": 1520250880 }, { "epoch": 4.05, "learning_rate": 0.0002724272818455366, "loss": 2.9638, "theoretical_loss": 3.5119179071240127, "tokens_seen": 1520316416 }, { "epoch": 4.05, "objective/train/docs_used": 2426119, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7228758335113525, "objective/train/theoretical_loss": 3.5119045941469778, "objective/train/tokens_used": 1540841952, "theoretical_loss": 3.5119045941469778, "tokens_seen": 1520381952 }, { "epoch": 4.05, "learning_rate": 0.00027241725175526584, "loss": 2.8837, "theoretical_loss": 3.5119045941469778, "tokens_seen": 1520381952 }, { "epoch": 4.05, "learning_rate": 0.00027240722166499496, "loss": 2.8164, "theoretical_loss": 3.5118912819044565, "tokens_seen": 1520447488 }, { "epoch": 4.05, "learning_rate": 0.0002723971915747242, "loss": 2.743, "theoretical_loss": 3.5118779703963785, "tokens_seen": 1520513024 }, { "epoch": 4.05, "learning_rate": 0.0002723871614844534, "loss": 2.7448, "theoretical_loss": 3.5118646596226695, "tokens_seen": 1520578560 }, { "epoch": 4.05, "learning_rate": 0.00027237713139418256, "loss": 2.8381, "theoretical_loss": 3.5118513495832593, "tokens_seen": 1520644096 }, { "epoch": 4.05, "learning_rate": 0.00027236710130391174, "loss": 2.8667, "theoretical_loss": 3.511838040278075, "tokens_seen": 1520709632 }, { "epoch": 4.05, "learning_rate": 0.000272357071213641, "loss": 2.8074, "theoretical_loss": 3.511824731707044, "tokens_seen": 1520775168 }, { "epoch": 4.05, "learning_rate": 0.0002723470411233701, "loss": 2.8169, "theoretical_loss": 3.5118114238700953, "tokens_seen": 1520840704 }, { "epoch": 4.05, "learning_rate": 0.00027233701103309934, "loss": 2.8774, "theoretical_loss": 3.5117981167671557, "tokens_seen": 1520906240 }, { "epoch": 4.05, "learning_rate": 0.00027232698094282847, "loss": 2.8763, "theoretical_loss": 3.5117848103981535, "tokens_seen": 1520971776 }, { "epoch": 4.05, "learning_rate": 0.0002723169508525577, "loss": 2.8201, "theoretical_loss": 3.5117715047630162, "tokens_seen": 1521037312 }, { "epoch": 4.05, "learning_rate": 0.0002723069207622869, "loss": 2.8736, "theoretical_loss": 3.5117581998616725, "tokens_seen": 1521102848 }, { "epoch": 4.05, "learning_rate": 0.00027229689067201606, "loss": 2.7564, "theoretical_loss": 3.51174489569405, "tokens_seen": 1521168384 }, { "epoch": 4.05, "learning_rate": 0.00027228686058174524, "loss": 2.909, "theoretical_loss": 3.511731592260076, "tokens_seen": 1521233920 }, { "epoch": 4.05, "learning_rate": 0.0002722768304914744, "loss": 2.6919, "theoretical_loss": 3.511718289559679, "tokens_seen": 1521299456 }, { "epoch": 4.05, "learning_rate": 0.0002722668004012036, "loss": 2.9493, "theoretical_loss": 3.511704987592787, "tokens_seen": 1521364992 }, { "epoch": 4.05, "learning_rate": 0.00027225677031093284, "loss": 2.8498, "theoretical_loss": 3.5116916863593284, "tokens_seen": 1521430528 }, { "epoch": 4.05, "learning_rate": 0.00027224674022066197, "loss": 2.7912, "theoretical_loss": 3.5116783858592298, "tokens_seen": 1521496064 }, { "epoch": 4.05, "learning_rate": 0.0002722367101303912, "loss": 2.727, "theoretical_loss": 3.511665086092421, "tokens_seen": 1521561600 }, { "epoch": 4.05, "learning_rate": 0.00027222668004012033, "loss": 2.872, "theoretical_loss": 3.5116517870588284, "tokens_seen": 1521627136 }, { "epoch": 4.05, "learning_rate": 0.00027221664994984957, "loss": 2.7759, "theoretical_loss": 3.511638488758381, "tokens_seen": 1521692672 }, { "epoch": 4.05, "learning_rate": 0.00027220661985957875, "loss": 2.8957, "theoretical_loss": 3.5116251911910057, "tokens_seen": 1521758208 }, { "epoch": 4.05, "learning_rate": 0.00027219658976930793, "loss": 2.9721, "theoretical_loss": 3.5116118943566317, "tokens_seen": 1521823744 }, { "epoch": 4.05, "learning_rate": 0.0002721865596790371, "loss": 2.9157, "theoretical_loss": 3.511598598255187, "tokens_seen": 1521889280 }, { "epoch": 4.05, "learning_rate": 0.00027217652958876635, "loss": 2.826, "theoretical_loss": 3.5115853028865986, "tokens_seen": 1521954816 }, { "epoch": 4.05, "objective/train/docs_used": 2428664, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.459770441055298, "objective/train/theoretical_loss": 3.511572008250795, "objective/train/tokens_used": 1542480352, "theoretical_loss": 3.511572008250795, "tokens_seen": 1522020352 }, { "epoch": 4.05, "learning_rate": 0.00027216649949849547, "loss": 2.8252, "theoretical_loss": 3.511572008250795, "tokens_seen": 1522020352 }, { "epoch": 4.05, "learning_rate": 0.0002721564694082247, "loss": 2.8138, "theoretical_loss": 3.511558714347705, "tokens_seen": 1522085888 }, { "epoch": 4.05, "learning_rate": 0.00027214643931795383, "loss": 2.9228, "theoretical_loss": 3.5115454211772557, "tokens_seen": 1522151424 }, { "epoch": 4.05, "learning_rate": 0.00027213640922768307, "loss": 2.7383, "theoretical_loss": 3.511532128739376, "tokens_seen": 1522216960 }, { "epoch": 4.05, "learning_rate": 0.00027212637913741225, "loss": 2.9386, "theoretical_loss": 3.5115188370339934, "tokens_seen": 1522282496 }, { "epoch": 4.05, "learning_rate": 0.00027211634904714143, "loss": 2.9203, "theoretical_loss": 3.511505546061036, "tokens_seen": 1522348032 }, { "epoch": 4.05, "learning_rate": 0.0002721063189568706, "loss": 2.8754, "theoretical_loss": 3.511492255820432, "tokens_seen": 1522413568 }, { "epoch": 4.05, "learning_rate": 0.0002720962888665998, "loss": 2.8048, "theoretical_loss": 3.5114789663121098, "tokens_seen": 1522479104 }, { "epoch": 4.05, "learning_rate": 0.000272086258776329, "loss": 2.7691, "theoretical_loss": 3.511465677535998, "tokens_seen": 1522544640 }, { "epoch": 4.05, "learning_rate": 0.0002720762286860582, "loss": 2.8898, "theoretical_loss": 3.511452389492023, "tokens_seen": 1522610176 }, { "epoch": 4.05, "learning_rate": 0.00027206619859578734, "loss": 2.8756, "theoretical_loss": 3.511439102180115, "tokens_seen": 1522675712 }, { "epoch": 4.05, "learning_rate": 0.00027205616850551657, "loss": 2.671, "theoretical_loss": 3.5114258156002007, "tokens_seen": 1522741248 }, { "epoch": 4.05, "learning_rate": 0.0002720461384152457, "loss": 2.9199, "theoretical_loss": 3.5114125297522087, "tokens_seen": 1522806784 }, { "epoch": 4.05, "learning_rate": 0.00027203610832497493, "loss": 2.8588, "theoretical_loss": 3.511399244636068, "tokens_seen": 1522872320 }, { "epoch": 4.05, "learning_rate": 0.0002720260782347041, "loss": 2.9966, "theoretical_loss": 3.511385960251705, "tokens_seen": 1522937856 }, { "epoch": 4.05, "learning_rate": 0.0002720160481444333, "loss": 2.7211, "theoretical_loss": 3.5113726765990494, "tokens_seen": 1523003392 }, { "epoch": 4.05, "learning_rate": 0.0002720060180541625, "loss": 3.032, "theoretical_loss": 3.511359393678029, "tokens_seen": 1523068928 }, { "epoch": 4.05, "learning_rate": 0.0002719959879638917, "loss": 2.8299, "theoretical_loss": 3.511346111488572, "tokens_seen": 1523134464 }, { "epoch": 4.05, "learning_rate": 0.00027198595787362084, "loss": 2.7394, "theoretical_loss": 3.5113328300306064, "tokens_seen": 1523200000 }, { "epoch": 4.05, "learning_rate": 0.0002719759277833501, "loss": 2.8519, "theoretical_loss": 3.511319549304061, "tokens_seen": 1523265536 }, { "epoch": 4.05, "learning_rate": 0.0002719658976930792, "loss": 2.8472, "theoretical_loss": 3.5113062693088644, "tokens_seen": 1523331072 }, { "epoch": 4.05, "learning_rate": 0.00027195586760280844, "loss": 2.767, "theoretical_loss": 3.511292990044943, "tokens_seen": 1523396608 }, { "epoch": 4.05, "learning_rate": 0.0002719458375125376, "loss": 2.8719, "theoretical_loss": 3.511279711512227, "tokens_seen": 1523462144 }, { "epoch": 4.05, "learning_rate": 0.0002719358074222668, "loss": 2.8146, "theoretical_loss": 3.5112664337106434, "tokens_seen": 1523527680 }, { "epoch": 4.05, "learning_rate": 0.000271925777331996, "loss": 2.9158, "theoretical_loss": 3.5112531566401217, "tokens_seen": 1523593216 }, { "epoch": 4.05, "objective/train/docs_used": 2431550, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.969774007797241, "objective/train/theoretical_loss": 3.5112398803005895, "objective/train/tokens_used": 1544118752, "theoretical_loss": 3.5112398803005895, "tokens_seen": 1523658752 }, { "epoch": 4.05, "learning_rate": 0.00027191574724172516, "loss": 2.6812, "theoretical_loss": 3.5112398803005895, "tokens_seen": 1523658752 }, { "epoch": 4.05, "learning_rate": 0.00027190571715145434, "loss": 2.8946, "theoretical_loss": 3.5112266046919745, "tokens_seen": 1523724288 }, { "epoch": 4.05, "learning_rate": 0.0002718956870611836, "loss": 2.768, "theoretical_loss": 3.5112133298142068, "tokens_seen": 1523789824 }, { "epoch": 4.05, "learning_rate": 0.0002718856569709127, "loss": 2.8113, "theoretical_loss": 3.5112000556672127, "tokens_seen": 1523855360 }, { "epoch": 4.05, "learning_rate": 0.00027187562688064194, "loss": 2.8279, "theoretical_loss": 3.5111867822509217, "tokens_seen": 1523920896 }, { "epoch": 4.05, "learning_rate": 0.00027186559679037107, "loss": 2.7404, "theoretical_loss": 3.511173509565262, "tokens_seen": 1523986432 }, { "epoch": 4.05, "learning_rate": 0.0002718555667001003, "loss": 2.7469, "theoretical_loss": 3.511160237610162, "tokens_seen": 1524051968 }, { "epoch": 4.05, "learning_rate": 0.0002718455366098295, "loss": 2.709, "theoretical_loss": 3.51114696638555, "tokens_seen": 1524117504 }, { "epoch": 4.05, "learning_rate": 0.00027183550651955867, "loss": 2.7595, "theoretical_loss": 3.5111336958913544, "tokens_seen": 1524183040 }, { "epoch": 4.05, "learning_rate": 0.00027182547642928785, "loss": 2.9084, "theoretical_loss": 3.511120426127503, "tokens_seen": 1524248576 }, { "epoch": 4.05, "learning_rate": 0.0002718154463390171, "loss": 2.6604, "theoretical_loss": 3.5111071570939254, "tokens_seen": 1524314112 }, { "epoch": 4.05, "learning_rate": 0.0002718054162487462, "loss": 2.7893, "theoretical_loss": 3.5110938887905494, "tokens_seen": 1524379648 }, { "epoch": 4.05, "learning_rate": 0.00027179538615847544, "loss": 2.8576, "theoretical_loss": 3.511080621217303, "tokens_seen": 1524445184 }, { "epoch": 4.05, "learning_rate": 0.00027178535606820457, "loss": 2.6565, "theoretical_loss": 3.5110673543741155, "tokens_seen": 1524510720 }, { "epoch": 4.05, "learning_rate": 0.0002717753259779338, "loss": 2.8439, "theoretical_loss": 3.5110540882609147, "tokens_seen": 1524576256 }, { "epoch": 4.05, "learning_rate": 0.00027176529588766304, "loss": 2.8883, "theoretical_loss": 3.511040822877629, "tokens_seen": 1524641792 }, { "epoch": 4.05, "learning_rate": 0.00027175526579739217, "loss": 2.9675, "theoretical_loss": 3.511027558224187, "tokens_seen": 1524707328 }, { "epoch": 4.05, "learning_rate": 0.0002717452357071214, "loss": 2.7831, "theoretical_loss": 3.511014294300518, "tokens_seen": 1524772864 }, { "epoch": 4.05, "learning_rate": 0.00027173520561685053, "loss": 2.9439, "theoretical_loss": 3.5110010311065496, "tokens_seen": 1524838400 }, { "epoch": 4.05, "learning_rate": 0.00027172517552657977, "loss": 2.5797, "theoretical_loss": 3.5109877686422104, "tokens_seen": 1524903936 }, { "epoch": 4.05, "learning_rate": 0.00027171514543630895, "loss": 2.8925, "theoretical_loss": 3.510974506907429, "tokens_seen": 1524969472 }, { "epoch": 4.05, "learning_rate": 0.00027170511534603813, "loss": 2.7514, "theoretical_loss": 3.5109612459021333, "tokens_seen": 1525035008 }, { "epoch": 4.05, "learning_rate": 0.0002716950852557673, "loss": 2.7167, "theoretical_loss": 3.5109479856262533, "tokens_seen": 1525100544 }, { "epoch": 4.05, "learning_rate": 0.00027168505516549655, "loss": 2.7658, "theoretical_loss": 3.510934726079716, "tokens_seen": 1525166080 }, { "epoch": 4.05, "learning_rate": 0.00027167502507522567, "loss": 2.9438, "theoretical_loss": 3.510921467262451, "tokens_seen": 1525231616 }, { "epoch": 4.05, "objective/train/docs_used": 2434422, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.86464262008667, "objective/train/theoretical_loss": 3.510908209174387, "objective/train/tokens_used": 1545757152, "theoretical_loss": 3.510908209174387, "tokens_seen": 1525297152 }, { "epoch": 4.05, "learning_rate": 0.0002716649949849549, "loss": 2.8221, "theoretical_loss": 3.510908209174387, "tokens_seen": 1525297152 }, { "epoch": 4.05, "learning_rate": 0.00027165496489468403, "loss": 2.855, "theoretical_loss": 3.510894951815451, "tokens_seen": 1525362688 }, { "epoch": 4.05, "learning_rate": 0.00027164493480441327, "loss": 2.8067, "theoretical_loss": 3.510881695185573, "tokens_seen": 1525428224 }, { "epoch": 4.05, "learning_rate": 0.00027163490471414245, "loss": 2.7776, "theoretical_loss": 3.5108684392846814, "tokens_seen": 1525493760 }, { "epoch": 4.05, "learning_rate": 0.00027162487462387163, "loss": 2.618, "theoretical_loss": 3.5108551841127045, "tokens_seen": 1525559296 }, { "epoch": 4.05, "learning_rate": 0.0002716148445336008, "loss": 2.7599, "theoretical_loss": 3.5108419296695708, "tokens_seen": 1525624832 }, { "epoch": 4.05, "learning_rate": 0.00027160481444333, "loss": 2.7757, "theoretical_loss": 3.5108286759552096, "tokens_seen": 1525690368 }, { "epoch": 4.05, "learning_rate": 0.0002715947843530592, "loss": 2.6996, "theoretical_loss": 3.5108154229695487, "tokens_seen": 1525755904 }, { "epoch": 4.05, "learning_rate": 0.0002715847542627884, "loss": 2.8239, "theoretical_loss": 3.510802170712517, "tokens_seen": 1525821440 }, { "epoch": 4.05, "learning_rate": 0.00027157472417251754, "loss": 2.8044, "theoretical_loss": 3.510788919184044, "tokens_seen": 1525886976 }, { "epoch": 4.05, "learning_rate": 0.0002715646940822468, "loss": 2.8872, "theoretical_loss": 3.5107756683840567, "tokens_seen": 1525952512 }, { "epoch": 4.05, "learning_rate": 0.0002715546639919759, "loss": 2.9146, "theoretical_loss": 3.510762418312485, "tokens_seen": 1526018048 }, { "epoch": 4.05, "learning_rate": 0.00027154463390170514, "loss": 2.8178, "theoretical_loss": 3.5107491689692574, "tokens_seen": 1526083584 }, { "epoch": 4.05, "learning_rate": 0.0002715346038114343, "loss": 2.9249, "theoretical_loss": 3.5107359203543025, "tokens_seen": 1526149120 }, { "epoch": 4.05, "learning_rate": 0.0002715245737211635, "loss": 2.6735, "theoretical_loss": 3.5107226724675487, "tokens_seen": 1526214656 }, { "epoch": 4.05, "learning_rate": 0.0002715145436308927, "loss": 2.7701, "theoretical_loss": 3.510709425308925, "tokens_seen": 1526280192 }, { "epoch": 4.05, "learning_rate": 0.0002715045135406219, "loss": 2.8126, "theoretical_loss": 3.51069617887836, "tokens_seen": 1526345728 }, { "epoch": 4.05, "learning_rate": 0.00027149448345035104, "loss": 2.8608, "theoretical_loss": 3.5106829331757825, "tokens_seen": 1526411264 }, { "epoch": 4.05, "learning_rate": 0.0002714844533600803, "loss": 2.6643, "theoretical_loss": 3.510669688201121, "tokens_seen": 1526476800 }, { "epoch": 4.05, "learning_rate": 0.0002714744232698094, "loss": 2.7798, "theoretical_loss": 3.5106564439543053, "tokens_seen": 1526542336 }, { "epoch": 4.05, "learning_rate": 0.00027146439317953864, "loss": 2.8118, "theoretical_loss": 3.510643200435263, "tokens_seen": 1526607872 }, { "epoch": 4.05, "learning_rate": 0.0002714543630892678, "loss": 2.806, "theoretical_loss": 3.5106299576439226, "tokens_seen": 1526673408 }, { "epoch": 4.05, "learning_rate": 0.000271444332998997, "loss": 2.786, "theoretical_loss": 3.510616715580214, "tokens_seen": 1526738944 }, { "epoch": 4.05, "learning_rate": 0.0002714343029087262, "loss": 2.8916, "theoretical_loss": 3.510603474244065, "tokens_seen": 1526804480 }, { "epoch": 4.05, "learning_rate": 0.00027142427281845536, "loss": 2.8626, "theoretical_loss": 3.510590233635406, "tokens_seen": 1526870016 }, { "epoch": 4.05, "objective/train/docs_used": 2437563, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.956270694732666, "objective/train/theoretical_loss": 3.5105769937541638, "objective/train/tokens_used": 1547395552, "theoretical_loss": 3.5105769937541638, "tokens_seen": 1526935552 }, { "epoch": 4.05, "learning_rate": 0.00027141424272818454, "loss": 2.8957, "theoretical_loss": 3.5105769937541638, "tokens_seen": 1526935552 }, { "epoch": 4.05, "learning_rate": 0.0002714042126379138, "loss": 2.8901, "theoretical_loss": 3.510563754600268, "tokens_seen": 1527001088 }, { "epoch": 4.05, "learning_rate": 0.0002713941825476429, "loss": 2.7651, "theoretical_loss": 3.5105505161736477, "tokens_seen": 1527066624 }, { "epoch": 4.05, "learning_rate": 0.00027138415245737214, "loss": 2.6946, "theoretical_loss": 3.5105372784742315, "tokens_seen": 1527132160 }, { "epoch": 4.05, "learning_rate": 0.00027137412236710127, "loss": 2.83, "theoretical_loss": 3.5105240415019487, "tokens_seen": 1527197696 }, { "epoch": 4.05, "learning_rate": 0.0002713640922768305, "loss": 2.7816, "theoretical_loss": 3.5105108052567275, "tokens_seen": 1527263232 }, { "epoch": 4.05, "learning_rate": 0.0002713540621865597, "loss": 2.8426, "theoretical_loss": 3.5104975697384972, "tokens_seen": 1527328768 }, { "epoch": 4.05, "learning_rate": 0.00027134403209628887, "loss": 2.7451, "theoretical_loss": 3.5104843349471864, "tokens_seen": 1527394304 }, { "epoch": 4.05, "learning_rate": 0.00027133400200601805, "loss": 2.9257, "theoretical_loss": 3.510471100882724, "tokens_seen": 1527459840 }, { "epoch": 4.05, "learning_rate": 0.0002713239719157473, "loss": 2.7101, "theoretical_loss": 3.510457867545039, "tokens_seen": 1527525376 }, { "epoch": 4.05, "learning_rate": 0.0002713139418254764, "loss": 2.9673, "theoretical_loss": 3.5104446349340606, "tokens_seen": 1527590912 }, { "epoch": 4.05, "learning_rate": 0.00027130391173520564, "loss": 2.9754, "theoretical_loss": 3.5104314030497172, "tokens_seen": 1527656448 }, { "epoch": 4.05, "learning_rate": 0.00027129388164493477, "loss": 2.7488, "theoretical_loss": 3.510418171891938, "tokens_seen": 1527721984 }, { "epoch": 4.05, "learning_rate": 0.000271283851554664, "loss": 2.8541, "theoretical_loss": 3.510404941460652, "tokens_seen": 1527787520 }, { "epoch": 4.05, "learning_rate": 0.0002712738214643932, "loss": 2.9656, "theoretical_loss": 3.510391711755788, "tokens_seen": 1527853056 }, { "epoch": 4.05, "learning_rate": 0.00027126379137412237, "loss": 2.9078, "theoretical_loss": 3.5103784827772753, "tokens_seen": 1527918592 }, { "epoch": 4.05, "learning_rate": 0.00027125376128385155, "loss": 2.9281, "theoretical_loss": 3.510365254525042, "tokens_seen": 1527984128 }, { "epoch": 4.05, "learning_rate": 0.00027124373119358073, "loss": 2.8428, "theoretical_loss": 3.5103520269990183, "tokens_seen": 1528049664 }, { "epoch": 4.05, "learning_rate": 0.0002712337011033099, "loss": 2.7153, "theoretical_loss": 3.510338800199132, "tokens_seen": 1528115200 }, { "epoch": 4.05, "learning_rate": 0.00027122367101303915, "loss": 2.6774, "theoretical_loss": 3.5103255741253125, "tokens_seen": 1528180736 }, { "epoch": 4.05, "learning_rate": 0.0002712136409227683, "loss": 2.7508, "theoretical_loss": 3.5103123487774894, "tokens_seen": 1528246272 }, { "epoch": 4.05, "learning_rate": 0.0002712036108324975, "loss": 2.6602, "theoretical_loss": 3.5102991241555914, "tokens_seen": 1528311808 }, { "epoch": 4.05, "learning_rate": 0.0002711935807422267, "loss": 2.8548, "theoretical_loss": 3.5102859002595475, "tokens_seen": 1528377344 }, { "epoch": 4.05, "learning_rate": 0.00027118355065195587, "loss": 2.8752, "theoretical_loss": 3.510272677089286, "tokens_seen": 1528442880 }, { "epoch": 4.05, "learning_rate": 0.00027117352056168505, "loss": 2.7319, "theoretical_loss": 3.510259454644737, "tokens_seen": 1528508416 }, { "epoch": 4.05, "objective/train/docs_used": 2440329, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6965980529785156, "objective/train/theoretical_loss": 3.5102462329258293, "objective/train/tokens_used": 1549033952, "theoretical_loss": 3.5102462329258293, "tokens_seen": 1528573952 }, { "epoch": 4.05, "learning_rate": 0.00027116349047141423, "loss": 2.8124, "theoretical_loss": 3.5102462329258293, "tokens_seen": 1528573952 }, { "epoch": 4.05, "learning_rate": 0.0002711534603811434, "loss": 2.7317, "theoretical_loss": 3.5102330119324914, "tokens_seen": 1528639488 }, { "epoch": 4.05, "learning_rate": 0.00027114343029087265, "loss": 2.9107, "theoretical_loss": 3.510219791664653, "tokens_seen": 1528705024 }, { "epoch": 4.05, "learning_rate": 0.0002711334002006018, "loss": 2.9258, "theoretical_loss": 3.510206572122243, "tokens_seen": 1528770560 }, { "epoch": 4.05, "learning_rate": 0.000271123370110331, "loss": 2.7839, "theoretical_loss": 3.5101933533051906, "tokens_seen": 1528836096 }, { "epoch": 4.05, "learning_rate": 0.00027111334002006014, "loss": 2.8072, "theoretical_loss": 3.5101801352134245, "tokens_seen": 1528901632 }, { "epoch": 4.05, "learning_rate": 0.0002711033099297894, "loss": 2.7117, "theoretical_loss": 3.510166917846874, "tokens_seen": 1528967168 }, { "epoch": 4.05, "learning_rate": 0.00027109327983951856, "loss": 2.7057, "theoretical_loss": 3.5101537012054687, "tokens_seen": 1529032704 }, { "epoch": 4.05, "learning_rate": 0.00027108324974924774, "loss": 2.8979, "theoretical_loss": 3.5101404852891376, "tokens_seen": 1529098240 }, { "epoch": 4.05, "learning_rate": 0.0002710732196589769, "loss": 2.676, "theoretical_loss": 3.510127270097809, "tokens_seen": 1529163776 }, { "epoch": 4.05, "learning_rate": 0.0002710631895687061, "loss": 2.6688, "theoretical_loss": 3.510114055631413, "tokens_seen": 1529229312 }, { "epoch": 4.05, "learning_rate": 0.0002710531594784353, "loss": 2.7131, "theoretical_loss": 3.5101008418898787, "tokens_seen": 1529294848 }, { "epoch": 4.05, "learning_rate": 0.0002710431293881645, "loss": 2.7958, "theoretical_loss": 3.510087628873135, "tokens_seen": 1529360384 }, { "epoch": 4.05, "learning_rate": 0.00027103309929789364, "loss": 2.8098, "theoretical_loss": 3.5100744165811104, "tokens_seen": 1529425920 }, { "epoch": 4.05, "learning_rate": 0.0002710230692076229, "loss": 2.9212, "theoretical_loss": 3.5100612050137356, "tokens_seen": 1529491456 }, { "epoch": 4.05, "learning_rate": 0.0002710130391173521, "loss": 2.6754, "theoretical_loss": 3.5100479941709386, "tokens_seen": 1529556992 }, { "epoch": 4.05, "learning_rate": 0.00027100300902708124, "loss": 2.9029, "theoretical_loss": 3.5100347840526496, "tokens_seen": 1529622528 }, { "epoch": 4.05, "learning_rate": 0.0002709929789368105, "loss": 2.8767, "theoretical_loss": 3.5100215746587966, "tokens_seen": 1529688064 }, { "epoch": 4.05, "learning_rate": 0.0002709829488465396, "loss": 2.9277, "theoretical_loss": 3.51000836598931, "tokens_seen": 1529753600 }, { "epoch": 4.05, "learning_rate": 0.00027097291875626884, "loss": 2.7648, "theoretical_loss": 3.509995158044118, "tokens_seen": 1529819136 }, { "epoch": 4.05, "learning_rate": 0.000270962888665998, "loss": 2.8418, "theoretical_loss": 3.509981950823151, "tokens_seen": 1529884672 }, { "epoch": 4.05, "learning_rate": 0.0002709528585757272, "loss": 2.8932, "theoretical_loss": 3.509968744326337, "tokens_seen": 1529950208 }, { "epoch": 4.05, "learning_rate": 0.0002709428284854564, "loss": 2.7859, "theoretical_loss": 3.5099555385536068, "tokens_seen": 1530015744 }, { "epoch": 4.05, "learning_rate": 0.00027093279839518556, "loss": 2.7579, "theoretical_loss": 3.5099423335048883, "tokens_seen": 1530081280 }, { "epoch": 4.05, "learning_rate": 0.00027092276830491474, "loss": 2.7548, "theoretical_loss": 3.5099291291801116, "tokens_seen": 1530146816 }, { "epoch": 4.05, "objective/train/docs_used": 2442464, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.784806489944458, "objective/train/theoretical_loss": 3.5099159255792056, "objective/train/tokens_used": 1550672352, "theoretical_loss": 3.5099159255792056, "tokens_seen": 1530212352 }, { "epoch": 4.05, "learning_rate": 0.000270912738214644, "loss": 2.7892, "theoretical_loss": 3.5099159255792056, "tokens_seen": 1530212352 }, { "epoch": 4.05, "learning_rate": 0.0002709027081243731, "loss": 2.8047, "theoretical_loss": 3.5099027227021002, "tokens_seen": 1530277888 }, { "epoch": 4.05, "learning_rate": 0.00027089267803410234, "loss": 2.8155, "theoretical_loss": 3.5098895205487235, "tokens_seen": 1530343424 }, { "epoch": 4.05, "learning_rate": 0.00027088264794383147, "loss": 2.6909, "theoretical_loss": 3.509876319119006, "tokens_seen": 1530408960 }, { "epoch": 4.05, "learning_rate": 0.0002708726178535607, "loss": 2.7567, "theoretical_loss": 3.509863118412877, "tokens_seen": 1530474496 }, { "epoch": 4.05, "learning_rate": 0.0002708625877632899, "loss": 2.7136, "theoretical_loss": 3.509849918430265, "tokens_seen": 1530540032 }, { "epoch": 4.05, "learning_rate": 0.00027085255767301907, "loss": 2.5673, "theoretical_loss": 3.5098367191711004, "tokens_seen": 1530605568 }, { "epoch": 4.05, "learning_rate": 0.00027084252758274825, "loss": 2.7925, "theoretical_loss": 3.5098235206353117, "tokens_seen": 1530671104 }, { "epoch": 4.05, "learning_rate": 0.0002708324974924775, "loss": 2.89, "theoretical_loss": 3.5098103228228292, "tokens_seen": 1530736640 }, { "epoch": 4.05, "learning_rate": 0.0002708224674022066, "loss": 3.0302, "theoretical_loss": 3.5097971257335816, "tokens_seen": 1530802176 }, { "epoch": 4.05, "learning_rate": 0.00027081243731193584, "loss": 2.6734, "theoretical_loss": 3.509783929367498, "tokens_seen": 1530867712 }, { "epoch": 4.05, "learning_rate": 0.00027080240722166497, "loss": 2.8472, "theoretical_loss": 3.5097707337245088, "tokens_seen": 1530933248 }, { "epoch": 4.05, "learning_rate": 0.0002707923771313942, "loss": 2.7927, "theoretical_loss": 3.5097575388045428, "tokens_seen": 1530998784 }, { "epoch": 4.05, "learning_rate": 0.0002707823470411234, "loss": 2.8293, "theoretical_loss": 3.5097443446075296, "tokens_seen": 1531064320 }, { "epoch": 4.05, "learning_rate": 0.00027077231695085257, "loss": 2.8688, "theoretical_loss": 3.509731151133398, "tokens_seen": 1531129856 }, { "epoch": 4.05, "learning_rate": 0.00027076228686058175, "loss": 2.8233, "theoretical_loss": 3.509717958382079, "tokens_seen": 1531195392 }, { "epoch": 4.05, "learning_rate": 0.00027075225677031093, "loss": 2.9019, "theoretical_loss": 3.5097047663535004, "tokens_seen": 1531260928 }, { "epoch": 4.05, "learning_rate": 0.0002707422266800401, "loss": 2.9954, "theoretical_loss": 3.5096915750475928, "tokens_seen": 1531326464 }, { "epoch": 4.05, "learning_rate": 0.00027073219658976935, "loss": 2.7799, "theoretical_loss": 3.509678384464285, "tokens_seen": 1531392000 }, { "epoch": 4.05, "learning_rate": 0.0002707221664994985, "loss": 2.7726, "theoretical_loss": 3.5096651946035067, "tokens_seen": 1531457536 }, { "epoch": 4.05, "learning_rate": 0.0002707121364092277, "loss": 2.7174, "theoretical_loss": 3.5096520054651883, "tokens_seen": 1531523072 }, { "epoch": 4.05, "learning_rate": 0.0002707021063189569, "loss": 2.6243, "theoretical_loss": 3.5096388170492574, "tokens_seen": 1531588608 }, { "epoch": 4.05, "learning_rate": 0.00027069207622868607, "loss": 2.8907, "theoretical_loss": 3.5096256293556456, "tokens_seen": 1531654144 }, { "epoch": 4.05, "learning_rate": 0.00027068204613841525, "loss": 2.7862, "theoretical_loss": 3.509612442384281, "tokens_seen": 1531719680 }, { "epoch": 4.05, "learning_rate": 0.00027067201604814443, "loss": 2.8219, "theoretical_loss": 3.5095992561350933, "tokens_seen": 1531785216 }, { "epoch": 4.05, "objective/train/docs_used": 2445369, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6086065769195557, "objective/train/theoretical_loss": 3.5095860706080124, "objective/train/tokens_used": 1552310752, "theoretical_loss": 3.5095860706080124, "tokens_seen": 1531850752 }, { "epoch": 4.05, "learning_rate": 0.0002706619859578736, "loss": 2.7172, "theoretical_loss": 3.5095860706080124, "tokens_seen": 1531850752 }, { "epoch": 4.05, "learning_rate": 0.00027065195586760285, "loss": 2.9291, "theoretical_loss": 3.5095728858029682, "tokens_seen": 1531916288 }, { "epoch": 4.05, "learning_rate": 0.000270641925777332, "loss": 2.8549, "theoretical_loss": 3.5095597017198905, "tokens_seen": 1531981824 }, { "epoch": 4.05, "learning_rate": 0.0002706318956870612, "loss": 2.8103, "theoretical_loss": 3.5095465183587073, "tokens_seen": 1532047360 }, { "epoch": 4.05, "learning_rate": 0.00027062186559679034, "loss": 2.7805, "theoretical_loss": 3.5095333357193494, "tokens_seen": 1532112896 }, { "epoch": 4.05, "learning_rate": 0.0002706118355065196, "loss": 2.8292, "theoretical_loss": 3.5095201538017466, "tokens_seen": 1532178432 }, { "epoch": 4.05, "learning_rate": 0.00027060180541624876, "loss": 2.8741, "theoretical_loss": 3.5095069726058274, "tokens_seen": 1532243968 }, { "epoch": 4.05, "learning_rate": 0.00027059177532597794, "loss": 2.7259, "theoretical_loss": 3.5094937921315226, "tokens_seen": 1532309504 }, { "epoch": 4.05, "learning_rate": 0.0002705817452357071, "loss": 2.8214, "theoretical_loss": 3.5094806123787614, "tokens_seen": 1532375040 }, { "epoch": 4.05, "learning_rate": 0.0002705717151454363, "loss": 2.8445, "theoretical_loss": 3.5094674333474734, "tokens_seen": 1532440576 }, { "epoch": 4.05, "learning_rate": 0.0002705616850551655, "loss": 2.7127, "theoretical_loss": 3.5094542550375882, "tokens_seen": 1532506112 }, { "epoch": 4.05, "learning_rate": 0.0002705516549648947, "loss": 2.895, "theoretical_loss": 3.5094410774490354, "tokens_seen": 1532571648 }, { "epoch": 4.05, "learning_rate": 0.00027054162487462384, "loss": 2.7963, "theoretical_loss": 3.509427900581745, "tokens_seen": 1532637184 }, { "epoch": 4.05, "learning_rate": 0.0002705315947843531, "loss": 2.8576, "theoretical_loss": 3.509414724435646, "tokens_seen": 1532702720 }, { "epoch": 4.05, "learning_rate": 0.00027052156469408226, "loss": 2.8209, "theoretical_loss": 3.509401549010669, "tokens_seen": 1532768256 }, { "epoch": 4.05, "learning_rate": 0.00027051153460381144, "loss": 2.7619, "theoretical_loss": 3.5093883743067433, "tokens_seen": 1532833792 }, { "epoch": 4.05, "learning_rate": 0.0002705015045135406, "loss": 2.7711, "theoretical_loss": 3.5093752003237983, "tokens_seen": 1532899328 }, { "epoch": 4.05, "learning_rate": 0.0002704914744232698, "loss": 2.7938, "theoretical_loss": 3.5093620270617643, "tokens_seen": 1532964864 }, { "epoch": 4.05, "learning_rate": 0.000270481444332999, "loss": 2.828, "theoretical_loss": 3.509348854520571, "tokens_seen": 1533030400 }, { "epoch": 4.05, "learning_rate": 0.0002704714142427282, "loss": 2.7402, "theoretical_loss": 3.509335682700147, "tokens_seen": 1533095936 }, { "epoch": 4.05, "learning_rate": 0.00027046138415245735, "loss": 2.765, "theoretical_loss": 3.5093225116004234, "tokens_seen": 1533161472 }, { "epoch": 4.05, "learning_rate": 0.0002704513540621866, "loss": 2.5838, "theoretical_loss": 3.50930934122133, "tokens_seen": 1533227008 }, { "epoch": 4.05, "learning_rate": 0.0002704413239719157, "loss": 2.6986, "theoretical_loss": 3.5092961715627955, "tokens_seen": 1533292544 }, { "epoch": 4.05, "learning_rate": 0.00027043129388164494, "loss": 2.871, "theoretical_loss": 3.5092830026247506, "tokens_seen": 1533358080 }, { "epoch": 4.05, "learning_rate": 0.0002704212637913741, "loss": 2.8801, "theoretical_loss": 3.5092698344071245, "tokens_seen": 1533423616 }, { "epoch": 4.05, "objective/train/docs_used": 2448031, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8136913776397705, "objective/train/theoretical_loss": 3.5092566669098475, "objective/train/tokens_used": 1553949152, "theoretical_loss": 3.5092566669098475, "tokens_seen": 1533489152 }, { "epoch": 4.05, "learning_rate": 0.0002704112337011033, "loss": 2.7588, "theoretical_loss": 3.5092566669098475, "tokens_seen": 1533489152 }, { "epoch": 4.05, "learning_rate": 0.0002704012036108325, "loss": 2.6193, "theoretical_loss": 3.5092435001328486, "tokens_seen": 1533554688 }, { "epoch": 4.05, "learning_rate": 0.00027039117352056167, "loss": 2.8189, "theoretical_loss": 3.509230334076059, "tokens_seen": 1533620224 }, { "epoch": 4.05, "learning_rate": 0.00027038114343029085, "loss": 2.7257, "theoretical_loss": 3.509217168739407, "tokens_seen": 1533685760 }, { "epoch": 4.05, "learning_rate": 0.0002703711133400201, "loss": 2.7753, "theoretical_loss": 3.5092040041228234, "tokens_seen": 1533751296 }, { "epoch": 4.05, "learning_rate": 0.0002703610832497492, "loss": 2.8845, "theoretical_loss": 3.509190840226238, "tokens_seen": 1533816832 }, { "epoch": 4.05, "learning_rate": 0.00027035105315947845, "loss": 2.6648, "theoretical_loss": 3.50917767704958, "tokens_seen": 1533882368 }, { "epoch": 4.05, "learning_rate": 0.00027034102306920763, "loss": 2.7751, "theoretical_loss": 3.5091645145927806, "tokens_seen": 1533947904 }, { "epoch": 4.05, "learning_rate": 0.0002703309929789368, "loss": 2.78, "theoretical_loss": 3.5091513528557683, "tokens_seen": 1534013440 }, { "epoch": 4.05, "learning_rate": 0.000270320962888666, "loss": 2.8869, "theoretical_loss": 3.5091381918384736, "tokens_seen": 1534078976 }, { "epoch": 4.05, "learning_rate": 0.00027031093279839517, "loss": 2.8601, "theoretical_loss": 3.509125031540826, "tokens_seen": 1534144512 }, { "epoch": 4.05, "learning_rate": 0.00027030090270812435, "loss": 2.7309, "theoretical_loss": 3.509111871962756, "tokens_seen": 1534210048 }, { "epoch": 4.05, "learning_rate": 0.0002702908726178536, "loss": 2.9004, "theoretical_loss": 3.5090987131041933, "tokens_seen": 1534275584 }, { "epoch": 4.05, "learning_rate": 0.0002702808425275827, "loss": 2.6354, "theoretical_loss": 3.5090855549650675, "tokens_seen": 1534341120 }, { "epoch": 4.05, "learning_rate": 0.00027027081243731195, "loss": 2.8822, "theoretical_loss": 3.5090723975453093, "tokens_seen": 1534406656 }, { "epoch": 4.05, "learning_rate": 0.00027026078234704113, "loss": 2.9225, "theoretical_loss": 3.5090592408448478, "tokens_seen": 1534472192 }, { "epoch": 4.05, "learning_rate": 0.0002702507522567703, "loss": 2.7752, "theoretical_loss": 3.5090460848636136, "tokens_seen": 1534537728 }, { "epoch": 4.05, "learning_rate": 0.00027024072216649955, "loss": 2.7443, "theoretical_loss": 3.5090329296015357, "tokens_seen": 1534603264 }, { "epoch": 4.05, "learning_rate": 0.0002702306920762287, "loss": 2.7231, "theoretical_loss": 3.5090197750585457, "tokens_seen": 1534668800 }, { "epoch": 4.05, "learning_rate": 0.0002702206619859579, "loss": 2.8783, "theoretical_loss": 3.5090066212345716, "tokens_seen": 1534734336 }, { "epoch": 4.05, "learning_rate": 0.0002702106318956871, "loss": 2.6702, "theoretical_loss": 3.508993468129545, "tokens_seen": 1534799872 }, { "epoch": 4.05, "learning_rate": 0.00027020060180541627, "loss": 2.7783, "theoretical_loss": 3.5089803157433956, "tokens_seen": 1534865408 }, { "epoch": 4.05, "learning_rate": 0.00027019057171514545, "loss": 2.9106, "theoretical_loss": 3.5089671640760534, "tokens_seen": 1534930944 }, { "epoch": 4.05, "learning_rate": 0.00027018054162487463, "loss": 2.8031, "theoretical_loss": 3.5089540131274477, "tokens_seen": 1534996480 }, { "epoch": 4.05, "learning_rate": 0.0002701705115346038, "loss": 2.8768, "theoretical_loss": 3.508940862897509, "tokens_seen": 1535062016 }, { "epoch": 4.05, "objective/train/docs_used": 2449490, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8923065662384033, "objective/train/theoretical_loss": 3.5089277133861674, "objective/train/tokens_used": 1555587552, "theoretical_loss": 3.5089277133861674, "tokens_seen": 1535127552 }, { "epoch": 4.05, "learning_rate": 0.00027016048144433305, "loss": 2.8961, "theoretical_loss": 3.5089277133861674, "tokens_seen": 1535127552 }, { "epoch": 4.05, "learning_rate": 0.0002701504513540622, "loss": 2.9517, "theoretical_loss": 3.5089145645933533, "tokens_seen": 1535193088 }, { "epoch": 4.05, "learning_rate": 0.0002701404212637914, "loss": 2.8533, "theoretical_loss": 3.508901416518996, "tokens_seen": 1535258624 }, { "epoch": 4.05, "learning_rate": 0.00027013039117352054, "loss": 2.7911, "theoretical_loss": 3.5088882691630263, "tokens_seen": 1535324160 }, { "epoch": 4.05, "learning_rate": 0.0002701203610832498, "loss": 2.7251, "theoretical_loss": 3.5088751225253736, "tokens_seen": 1535389696 }, { "epoch": 4.05, "learning_rate": 0.00027011033099297896, "loss": 2.8619, "theoretical_loss": 3.508861976605969, "tokens_seen": 1535455232 }, { "epoch": 4.05, "learning_rate": 0.00027010030090270814, "loss": 2.8218, "theoretical_loss": 3.5088488314047415, "tokens_seen": 1535520768 }, { "epoch": 4.05, "learning_rate": 0.0002700902708124373, "loss": 2.7223, "theoretical_loss": 3.5088356869216217, "tokens_seen": 1535586304 }, { "epoch": 4.05, "learning_rate": 0.0002700802407221665, "loss": 2.8812, "theoretical_loss": 3.5088225431565396, "tokens_seen": 1535651840 }, { "epoch": 4.05, "learning_rate": 0.0002700702106318957, "loss": 2.7464, "theoretical_loss": 3.508809400109426, "tokens_seen": 1535717376 }, { "epoch": 4.05, "learning_rate": 0.0002700601805416249, "loss": 2.8914, "theoretical_loss": 3.50879625778021, "tokens_seen": 1535782912 }, { "epoch": 4.05, "learning_rate": 0.00027005015045135404, "loss": 2.8243, "theoretical_loss": 3.5087831161688223, "tokens_seen": 1535848448 }, { "epoch": 4.05, "learning_rate": 0.0002700401203610833, "loss": 2.7851, "theoretical_loss": 3.5087699752751935, "tokens_seen": 1535913984 }, { "epoch": 4.05, "learning_rate": 0.00027003009027081246, "loss": 2.88, "theoretical_loss": 3.508756835099253, "tokens_seen": 1535979520 }, { "epoch": 4.05, "learning_rate": 0.00027002006018054164, "loss": 2.8908, "theoretical_loss": 3.5087436956409315, "tokens_seen": 1536045056 }, { "epoch": 4.05, "learning_rate": 0.0002700100300902708, "loss": 2.9355, "theoretical_loss": 3.5087305569001583, "tokens_seen": 1536110592 }, { "epoch": 4.05, "learning_rate": 0.00027, "loss": 2.8003, "theoretical_loss": 3.5087174188768646, "tokens_seen": 1536176128 }, { "epoch": 4.05, "learning_rate": 0.0002699899699097292, "loss": 2.8684, "theoretical_loss": 3.5087042815709806, "tokens_seen": 1536241664 }, { "epoch": 4.05, "learning_rate": 0.0002699799398194584, "loss": 2.7957, "theoretical_loss": 3.508691144982436, "tokens_seen": 1536307200 }, { "epoch": 4.05, "learning_rate": 0.00026996990972918755, "loss": 2.8508, "theoretical_loss": 3.5086780091111613, "tokens_seen": 1536372736 }, { "epoch": 4.05, "learning_rate": 0.0002699598796389168, "loss": 2.5917, "theoretical_loss": 3.5086648739570867, "tokens_seen": 1536438272 }, { "epoch": 4.05, "learning_rate": 0.0002699498495486459, "loss": 2.6812, "theoretical_loss": 3.5086517395201424, "tokens_seen": 1536503808 }, { "epoch": 4.05, "learning_rate": 0.00026993981945837514, "loss": 2.8728, "theoretical_loss": 3.5086386058002588, "tokens_seen": 1536569344 }, { "epoch": 4.05, "learning_rate": 0.0002699297893681043, "loss": 2.7619, "theoretical_loss": 3.508625472797366, "tokens_seen": 1536634880 }, { "epoch": 4.05, "learning_rate": 0.0002699197592778335, "loss": 2.6797, "theoretical_loss": 3.508612340511394, "tokens_seen": 1536700416 }, { "epoch": 4.05, "objective/train/docs_used": 2452229, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7600290775299072, "objective/train/theoretical_loss": 3.508599208942274, "objective/train/tokens_used": 1557225952, "theoretical_loss": 3.508599208942274, "tokens_seen": 1536765952 }, { "epoch": 4.05, "learning_rate": 0.0002699097291875627, "loss": 2.9137, "theoretical_loss": 3.508599208942274, "tokens_seen": 1536765952 }, { "epoch": 4.05, "learning_rate": 0.00026989969909729187, "loss": 2.9302, "theoretical_loss": 3.5085860780899356, "tokens_seen": 1536831488 }, { "epoch": 4.05, "learning_rate": 0.00026988966900702105, "loss": 2.8322, "theoretical_loss": 3.5085729479543093, "tokens_seen": 1536897024 }, { "epoch": 4.05, "learning_rate": 0.0002698796389167503, "loss": 2.6932, "theoretical_loss": 3.508559818535325, "tokens_seen": 1536962560 }, { "epoch": 4.05, "learning_rate": 0.0002698696088264794, "loss": 2.4917, "theoretical_loss": 3.508546689832914, "tokens_seen": 1537028096 }, { "epoch": 4.05, "learning_rate": 0.00026985957873620865, "loss": 2.7585, "theoretical_loss": 3.5085335618470057, "tokens_seen": 1537093632 }, { "epoch": 4.05, "learning_rate": 0.00026984954864593783, "loss": 2.9206, "theoretical_loss": 3.508520434577531, "tokens_seen": 1537159168 }, { "epoch": 4.05, "learning_rate": 0.000269839518555667, "loss": 2.8887, "theoretical_loss": 3.50850730802442, "tokens_seen": 1537224704 }, { "epoch": 4.05, "learning_rate": 0.0002698294884653962, "loss": 2.8143, "theoretical_loss": 3.508494182187603, "tokens_seen": 1537290240 }, { "epoch": 4.05, "learning_rate": 0.00026981945837512537, "loss": 2.5782, "theoretical_loss": 3.5084810570670104, "tokens_seen": 1537355776 }, { "epoch": 4.05, "learning_rate": 0.00026980942828485455, "loss": 2.773, "theoretical_loss": 3.508467932662573, "tokens_seen": 1537421312 }, { "epoch": 4.05, "learning_rate": 0.0002697993981945838, "loss": 2.7156, "theoretical_loss": 3.5084548089742205, "tokens_seen": 1537486848 }, { "epoch": 4.05, "learning_rate": 0.0002697893681043129, "loss": 2.8167, "theoretical_loss": 3.508441686001884, "tokens_seen": 1537552384 }, { "epoch": 4.05, "learning_rate": 0.00026977933801404215, "loss": 2.899, "theoretical_loss": 3.508428563745494, "tokens_seen": 1537617920 }, { "epoch": 4.05, "learning_rate": 0.0002697693079237713, "loss": 2.7119, "theoretical_loss": 3.5084154422049805, "tokens_seen": 1537683456 }, { "epoch": 4.05, "learning_rate": 0.0002697592778335005, "loss": 2.8797, "theoretical_loss": 3.5084023213802737, "tokens_seen": 1537748992 }, { "epoch": 4.05, "learning_rate": 0.0002697492477432297, "loss": 2.8118, "theoretical_loss": 3.508389201271304, "tokens_seen": 1537814528 }, { "epoch": 4.05, "learning_rate": 0.0002697392176529589, "loss": 2.8411, "theoretical_loss": 3.5083760818780023, "tokens_seen": 1537880064 }, { "epoch": 4.05, "learning_rate": 0.00026972918756268806, "loss": 2.7823, "theoretical_loss": 3.5083629632002995, "tokens_seen": 1537945600 }, { "epoch": 4.05, "learning_rate": 0.0002697191574724173, "loss": 2.7481, "theoretical_loss": 3.5083498452381257, "tokens_seen": 1538011136 }, { "epoch": 4.05, "learning_rate": 0.0002697091273821464, "loss": 2.842, "theoretical_loss": 3.50833672799141, "tokens_seen": 1538076672 }, { "epoch": 4.05, "learning_rate": 0.00026969909729187565, "loss": 2.8595, "theoretical_loss": 3.508323611460085, "tokens_seen": 1538142208 }, { "epoch": 4.05, "learning_rate": 0.0002696890672016048, "loss": 2.8394, "theoretical_loss": 3.5083104956440803, "tokens_seen": 1538207744 }, { "epoch": 4.05, "learning_rate": 0.000269679037111334, "loss": 2.8594, "theoretical_loss": 3.5082973805433264, "tokens_seen": 1538273280 }, { "epoch": 4.05, "learning_rate": 0.0002696690070210632, "loss": 2.7806, "theoretical_loss": 3.5082842661577542, "tokens_seen": 1538338816 }, { "epoch": 4.05, "objective/train/docs_used": 2454703, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.94954776763916, "objective/train/theoretical_loss": 3.5082711524872936, "objective/train/tokens_used": 1558864352, "theoretical_loss": 3.5082711524872936, "tokens_seen": 1538404352 }, { "epoch": 4.05, "learning_rate": 0.0002696589769307924, "loss": 2.8211, "theoretical_loss": 3.5082711524872936, "tokens_seen": 1538404352 }, { "epoch": 4.05, "learning_rate": 0.00026964894684052156, "loss": 2.9533, "theoretical_loss": 3.508258039531875, "tokens_seen": 1538469888 }, { "epoch": 4.05, "learning_rate": 0.00026963891675025074, "loss": 2.5825, "theoretical_loss": 3.50824492729143, "tokens_seen": 1538535424 }, { "epoch": 4.05, "learning_rate": 0.0002696288866599799, "loss": 2.9815, "theoretical_loss": 3.5082318157658885, "tokens_seen": 1538600960 }, { "epoch": 4.05, "learning_rate": 0.00026961885656970916, "loss": 2.8827, "theoretical_loss": 3.5082187049551807, "tokens_seen": 1538666496 }, { "epoch": 4.05, "learning_rate": 0.0002696088264794383, "loss": 2.8027, "theoretical_loss": 3.508205594859238, "tokens_seen": 1538732032 }, { "epoch": 4.05, "learning_rate": 0.0002695987963891675, "loss": 2.7725, "theoretical_loss": 3.508192485477991, "tokens_seen": 1538797568 }, { "epoch": 4.05, "learning_rate": 0.00026958876629889665, "loss": 2.9628, "theoretical_loss": 3.5081793768113694, "tokens_seen": 1538863104 }, { "epoch": 4.05, "learning_rate": 0.0002695787362086259, "loss": 2.9483, "theoretical_loss": 3.5081662688593047, "tokens_seen": 1538928640 }, { "epoch": 4.05, "learning_rate": 0.00026956870611835506, "loss": 2.868, "theoretical_loss": 3.5081531616217267, "tokens_seen": 1538994176 }, { "epoch": 4.05, "learning_rate": 0.00026955867602808424, "loss": 2.7485, "theoretical_loss": 3.5081400550985666, "tokens_seen": 1539059712 }, { "epoch": 4.05, "learning_rate": 0.0002695486459378134, "loss": 2.8951, "theoretical_loss": 3.508126949289755, "tokens_seen": 1539125248 }, { "epoch": 4.05, "learning_rate": 0.00026953861584754266, "loss": 2.8543, "theoretical_loss": 3.5081138441952224, "tokens_seen": 1539190784 }, { "epoch": 4.05, "learning_rate": 0.0002695285857572718, "loss": 2.8545, "theoretical_loss": 3.5081007398149, "tokens_seen": 1539256320 }, { "epoch": 4.05, "learning_rate": 0.000269518555667001, "loss": 2.6647, "theoretical_loss": 3.5080876361487174, "tokens_seen": 1539321856 }, { "epoch": 4.05, "learning_rate": 0.0002695085255767302, "loss": 2.8284, "theoretical_loss": 3.508074533196606, "tokens_seen": 1539387392 }, { "epoch": 4.05, "learning_rate": 0.0002694984954864594, "loss": 2.7609, "theoretical_loss": 3.5080614309584965, "tokens_seen": 1539452928 }, { "epoch": 4.05, "learning_rate": 0.0002694884653961886, "loss": 2.7919, "theoretical_loss": 3.5080483294343194, "tokens_seen": 1539518464 }, { "epoch": 4.05, "learning_rate": 0.00026947843530591775, "loss": 2.8463, "theoretical_loss": 3.5080352286240055, "tokens_seen": 1539584000 }, { "epoch": 4.05, "learning_rate": 0.000269468405215647, "loss": 2.7303, "theoretical_loss": 3.508022128527486, "tokens_seen": 1539649536 }, { "epoch": 4.05, "learning_rate": 0.0002694583751253761, "loss": 2.8612, "theoretical_loss": 3.5080090291446906, "tokens_seen": 1539715072 }, { "epoch": 4.05, "learning_rate": 0.00026944834503510534, "loss": 2.7348, "theoretical_loss": 3.507995930475551, "tokens_seen": 1539780608 }, { "epoch": 4.05, "learning_rate": 0.0002694383149448345, "loss": 2.7789, "theoretical_loss": 3.507982832519997, "tokens_seen": 1539846144 }, { "epoch": 4.05, "learning_rate": 0.0002694282848545637, "loss": 2.7743, "theoretical_loss": 3.5079697352779604, "tokens_seen": 1539911680 }, { "epoch": 4.05, "learning_rate": 0.0002694182547642929, "loss": 2.8246, "theoretical_loss": 3.507956638749371, "tokens_seen": 1539977216 }, { "epoch": 4.05, "objective/train/docs_used": 2457638, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9191513061523438, "objective/train/theoretical_loss": 3.50794354293416, "objective/train/tokens_used": 1560502752, "theoretical_loss": 3.50794354293416, "tokens_seen": 1540042752 }, { "epoch": 4.05, "learning_rate": 0.00026940822467402207, "loss": 2.9533, "theoretical_loss": 3.50794354293416, "tokens_seen": 1540042752 }, { "epoch": 4.05, "learning_rate": 0.00026939819458375125, "loss": 2.8354, "theoretical_loss": 3.5079304478322584, "tokens_seen": 1540108288 }, { "epoch": 4.05, "learning_rate": 0.0002693881644934805, "loss": 2.7654, "theoretical_loss": 3.507917353443597, "tokens_seen": 1540173824 }, { "epoch": 4.05, "learning_rate": 0.0002693781344032096, "loss": 2.9778, "theoretical_loss": 3.5079042597681065, "tokens_seen": 1540239360 }, { "epoch": 4.05, "learning_rate": 0.00026936810431293885, "loss": 2.8304, "theoretical_loss": 3.507891166805717, "tokens_seen": 1540304896 }, { "epoch": 4.05, "learning_rate": 0.00026935807422266803, "loss": 2.7023, "theoretical_loss": 3.5078780745563605, "tokens_seen": 1540370432 }, { "epoch": 4.05, "learning_rate": 0.0002693480441323972, "loss": 2.765, "theoretical_loss": 3.5078649830199673, "tokens_seen": 1540435968 }, { "epoch": 4.05, "learning_rate": 0.0002693380140421264, "loss": 2.861, "theoretical_loss": 3.507851892196468, "tokens_seen": 1540501504 }, { "epoch": 4.05, "learning_rate": 0.00026932798395185557, "loss": 2.7606, "theoretical_loss": 3.507838802085794, "tokens_seen": 1540567040 }, { "epoch": 4.05, "learning_rate": 0.00026931795386158475, "loss": 2.8701, "theoretical_loss": 3.5078257126878754, "tokens_seen": 1540632576 }, { "epoch": 4.05, "learning_rate": 0.000269307923771314, "loss": 2.7646, "theoretical_loss": 3.5078126240026437, "tokens_seen": 1540698112 }, { "epoch": 4.05, "learning_rate": 0.0002692978936810431, "loss": 2.7567, "theoretical_loss": 3.5077995360300296, "tokens_seen": 1540763648 }, { "epoch": 4.05, "learning_rate": 0.00026928786359077235, "loss": 2.6634, "theoretical_loss": 3.5077864487699637, "tokens_seen": 1540829184 }, { "epoch": 4.05, "learning_rate": 0.0002692778335005015, "loss": 2.7015, "theoretical_loss": 3.5077733622223777, "tokens_seen": 1540894720 }, { "epoch": 4.05, "learning_rate": 0.0002692678034102307, "loss": 2.7716, "theoretical_loss": 3.5077602763872022, "tokens_seen": 1540960256 }, { "epoch": 4.05, "learning_rate": 0.0002692577733199599, "loss": 2.6907, "theoretical_loss": 3.507747191264367, "tokens_seen": 1541025792 }, { "epoch": 4.05, "learning_rate": 0.0002692477432296891, "loss": 2.7858, "theoretical_loss": 3.507734106853805, "tokens_seen": 1541091328 }, { "epoch": 4.05, "learning_rate": 0.00026923771313941826, "loss": 2.9145, "theoretical_loss": 3.5077210231554456, "tokens_seen": 1541156864 }, { "epoch": 4.05, "learning_rate": 0.0002692276830491475, "loss": 2.9223, "theoretical_loss": 3.5077079401692206, "tokens_seen": 1541222400 }, { "epoch": 4.05, "learning_rate": 0.0002692176529588766, "loss": 2.8026, "theoretical_loss": 3.50769485789506, "tokens_seen": 1541287936 }, { "epoch": 4.05, "learning_rate": 0.00026920762286860585, "loss": 2.7741, "theoretical_loss": 3.507681776332896, "tokens_seen": 1541353472 }, { "epoch": 4.05, "learning_rate": 0.000269197592778335, "loss": 2.815, "theoretical_loss": 3.507668695482659, "tokens_seen": 1541419008 }, { "epoch": 4.05, "learning_rate": 0.0002691875626880642, "loss": 2.8752, "theoretical_loss": 3.5076556153442793, "tokens_seen": 1541484544 }, { "epoch": 4.05, "learning_rate": 0.0002691775325977934, "loss": 2.7747, "theoretical_loss": 3.507642535917689, "tokens_seen": 1541550080 }, { "epoch": 4.05, "learning_rate": 0.0002691675025075226, "loss": 2.6785, "theoretical_loss": 3.5076294572028184, "tokens_seen": 1541615616 }, { "epoch": 4.05, "objective/train/docs_used": 2460385, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.015993118286133, "objective/train/theoretical_loss": 3.5076163791995993, "objective/train/tokens_used": 1562141152, "theoretical_loss": 3.5076163791995993, "tokens_seen": 1541681152 }, { "epoch": 4.05, "learning_rate": 0.00026915747241725176, "loss": 2.9732, "theoretical_loss": 3.5076163791995993, "tokens_seen": 1541681152 }, { "epoch": 4.05, "learning_rate": 0.00026914744232698094, "loss": 2.7608, "theoretical_loss": 3.507603301907962, "tokens_seen": 1541746688 }, { "epoch": 4.05, "learning_rate": 0.0002691374122367101, "loss": 2.8566, "theoretical_loss": 3.5075902253278377, "tokens_seen": 1541812224 }, { "epoch": 4.05, "learning_rate": 0.00026912738214643936, "loss": 2.8746, "theoretical_loss": 3.5075771494591574, "tokens_seen": 1541877760 }, { "epoch": 4.05, "learning_rate": 0.0002691173520561685, "loss": 2.8203, "theoretical_loss": 3.507564074301852, "tokens_seen": 1541943296 }, { "epoch": 4.05, "learning_rate": 0.0002691073219658977, "loss": 2.8809, "theoretical_loss": 3.5075509998558534, "tokens_seen": 1542008832 }, { "epoch": 4.05, "learning_rate": 0.00026909729187562685, "loss": 2.7802, "theoretical_loss": 3.507537926121092, "tokens_seen": 1542074368 }, { "epoch": 4.05, "learning_rate": 0.0002690872617853561, "loss": 2.8086, "theoretical_loss": 3.5075248530974985, "tokens_seen": 1542139904 }, { "epoch": 4.05, "learning_rate": 0.00026907723169508526, "loss": 2.6424, "theoretical_loss": 3.507511780785005, "tokens_seen": 1542205440 }, { "epoch": 4.05, "learning_rate": 0.00026906720160481444, "loss": 2.9204, "theoretical_loss": 3.507498709183542, "tokens_seen": 1542270976 }, { "epoch": 4.05, "learning_rate": 0.0002690571715145436, "loss": 2.9734, "theoretical_loss": 3.5074856382930406, "tokens_seen": 1542336512 }, { "epoch": 4.05, "learning_rate": 0.00026904714142427286, "loss": 2.7792, "theoretical_loss": 3.5074725681134318, "tokens_seen": 1542402048 }, { "epoch": 4.05, "learning_rate": 0.000269037111334002, "loss": 2.7854, "theoretical_loss": 3.5074594986446472, "tokens_seen": 1542467584 }, { "epoch": 4.05, "learning_rate": 0.0002690270812437312, "loss": 2.9134, "theoretical_loss": 3.5074464298866177, "tokens_seen": 1542533120 }, { "epoch": 4.05, "learning_rate": 0.00026901705115346035, "loss": 2.7299, "theoretical_loss": 3.507433361839275, "tokens_seen": 1542598656 }, { "epoch": 4.05, "learning_rate": 0.0002690070210631896, "loss": 2.7825, "theoretical_loss": 3.5074202945025483, "tokens_seen": 1542664192 }, { "epoch": 4.05, "learning_rate": 0.00026899699097291877, "loss": 2.7265, "theoretical_loss": 3.5074072278763713, "tokens_seen": 1542729728 }, { "epoch": 4.05, "learning_rate": 0.00026898696088264795, "loss": 2.5912, "theoretical_loss": 3.507394161960674, "tokens_seen": 1542795264 }, { "epoch": 4.05, "learning_rate": 0.00026897693079237713, "loss": 2.7771, "theoretical_loss": 3.5073810967553873, "tokens_seen": 1542860800 }, { "epoch": 4.05, "learning_rate": 0.0002689669007021063, "loss": 2.8539, "theoretical_loss": 3.507368032260443, "tokens_seen": 1542926336 }, { "epoch": 4.05, "learning_rate": 0.0002689568706118355, "loss": 2.8032, "theoretical_loss": 3.5073549684757714, "tokens_seen": 1542991872 }, { "epoch": 4.05, "learning_rate": 0.0002689468405215647, "loss": 2.7461, "theoretical_loss": 3.507341905401305, "tokens_seen": 1543057408 }, { "epoch": 4.05, "learning_rate": 0.00026893681043129385, "loss": 2.8009, "theoretical_loss": 3.5073288430369747, "tokens_seen": 1543122944 }, { "epoch": 4.05, "learning_rate": 0.0002689267803410231, "loss": 2.8962, "theoretical_loss": 3.5073157813827107, "tokens_seen": 1543188480 }, { "epoch": 4.05, "learning_rate": 0.0002689167502507522, "loss": 2.8707, "theoretical_loss": 3.507302720438445, "tokens_seen": 1543254016 }, { "epoch": 4.05, "objective/train/docs_used": 2463209, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.786871910095215, "objective/train/theoretical_loss": 3.5072896602041093, "objective/train/tokens_used": 1563779552, "theoretical_loss": 3.5072896602041093, "tokens_seen": 1543319552 }, { "epoch": 4.05, "learning_rate": 0.00026890672016048145, "loss": 2.7724, "theoretical_loss": 3.5072896602041093, "tokens_seen": 1543319552 }, { "epoch": 4.05, "learning_rate": 0.00026889669007021063, "loss": 2.8587, "theoretical_loss": 3.5072766006796345, "tokens_seen": 1543385088 }, { "epoch": 4.05, "learning_rate": 0.0002688866599799398, "loss": 2.7438, "theoretical_loss": 3.507263541864952, "tokens_seen": 1543450624 }, { "epoch": 4.05, "learning_rate": 0.000268876629889669, "loss": 2.7865, "theoretical_loss": 3.507250483759992, "tokens_seen": 1543516160 }, { "epoch": 4.05, "learning_rate": 0.00026886659979939823, "loss": 2.928, "theoretical_loss": 3.5072374263646875, "tokens_seen": 1543581696 }, { "epoch": 4.05, "learning_rate": 0.00026885656970912736, "loss": 2.9002, "theoretical_loss": 3.5072243696789687, "tokens_seen": 1543647232 }, { "epoch": 4.05, "learning_rate": 0.0002688465396188566, "loss": 2.7397, "theoretical_loss": 3.507211313702767, "tokens_seen": 1543712768 }, { "epoch": 4.05, "learning_rate": 0.0002688365095285857, "loss": 2.7774, "theoretical_loss": 3.5071982584360146, "tokens_seen": 1543778304 }, { "epoch": 4.05, "learning_rate": 0.00026882647943831495, "loss": 2.7875, "theoretical_loss": 3.5071852038786417, "tokens_seen": 1543843840 }, { "epoch": 4.05, "learning_rate": 0.00026881644934804413, "loss": 2.6694, "theoretical_loss": 3.5071721500305797, "tokens_seen": 1543909376 }, { "epoch": 4.05, "learning_rate": 0.0002688064192577733, "loss": 2.8815, "theoretical_loss": 3.507159096891761, "tokens_seen": 1543974912 }, { "epoch": 4.05, "learning_rate": 0.0002687963891675025, "loss": 2.9577, "theoretical_loss": 3.5071460444621163, "tokens_seen": 1544040448 }, { "epoch": 4.05, "learning_rate": 0.0002687863590772317, "loss": 2.9701, "theoretical_loss": 3.5071329927415764, "tokens_seen": 1544105984 }, { "epoch": 4.05, "learning_rate": 0.00026877632898696086, "loss": 2.7849, "theoretical_loss": 3.507119941730074, "tokens_seen": 1544171520 }, { "epoch": 4.05, "learning_rate": 0.0002687662988966901, "loss": 2.8741, "theoretical_loss": 3.507106891427539, "tokens_seen": 1544237056 }, { "epoch": 4.05, "learning_rate": 0.0002687562688064193, "loss": 2.7645, "theoretical_loss": 3.507093841833904, "tokens_seen": 1544302592 }, { "epoch": 4.05, "learning_rate": 0.00026874623871614846, "loss": 2.8623, "theoretical_loss": 3.5070807929491004, "tokens_seen": 1544368128 }, { "epoch": 4.05, "learning_rate": 0.0002687362086258777, "loss": 2.756, "theoretical_loss": 3.5070677447730585, "tokens_seen": 1544433664 }, { "epoch": 4.05, "learning_rate": 0.0002687261785356068, "loss": 2.7692, "theoretical_loss": 3.507054697305711, "tokens_seen": 1544499200 }, { "epoch": 4.05, "learning_rate": 0.00026871614844533605, "loss": 2.6848, "theoretical_loss": 3.5070416505469884, "tokens_seen": 1544564736 }, { "epoch": 4.05, "learning_rate": 0.0002687061183550652, "loss": 2.8359, "theoretical_loss": 3.5070286044968224, "tokens_seen": 1544630272 }, { "epoch": 4.05, "learning_rate": 0.0002686960882647944, "loss": 2.9132, "theoretical_loss": 3.5070155591551453, "tokens_seen": 1544695808 }, { "epoch": 4.05, "learning_rate": 0.0002686860581745236, "loss": 2.6448, "theoretical_loss": 3.5070025145218873, "tokens_seen": 1544761344 }, { "epoch": 4.05, "learning_rate": 0.0002686760280842528, "loss": 2.9294, "theoretical_loss": 3.50698947059698, "tokens_seen": 1544826880 }, { "epoch": 4.05, "learning_rate": 0.00026866599799398196, "loss": 2.8339, "theoretical_loss": 3.506976427380356, "tokens_seen": 1544892416 }, { "epoch": 4.05, "objective/train/docs_used": 2466092, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8338515758514404, "objective/train/theoretical_loss": 3.5069633848719457, "objective/train/tokens_used": 1565417952, "theoretical_loss": 3.5069633848719457, "tokens_seen": 1544957952 }, { "epoch": 4.05, "learning_rate": 0.00026865596790371114, "loss": 2.8595, "theoretical_loss": 3.5069633848719457, "tokens_seen": 1544957952 }, { "epoch": 4.05, "learning_rate": 0.0002686459378134403, "loss": 2.8719, "theoretical_loss": 3.5069503430716815, "tokens_seen": 1545023488 }, { "epoch": 4.05, "learning_rate": 0.00026863590772316956, "loss": 2.7649, "theoretical_loss": 3.506937301979494, "tokens_seen": 1545089024 }, { "epoch": 4.05, "learning_rate": 0.0002686258776328987, "loss": 2.6658, "theoretical_loss": 3.5069242615953153, "tokens_seen": 1545154560 }, { "epoch": 4.05, "learning_rate": 0.0002686158475426279, "loss": 2.7814, "theoretical_loss": 3.506911221919077, "tokens_seen": 1545220096 }, { "epoch": 4.05, "learning_rate": 0.00026860581745235705, "loss": 2.7355, "theoretical_loss": 3.50689818295071, "tokens_seen": 1545285632 }, { "epoch": 4.05, "learning_rate": 0.0002685957873620863, "loss": 2.8399, "theoretical_loss": 3.5068851446901466, "tokens_seen": 1545351168 }, { "epoch": 4.05, "learning_rate": 0.00026858575727181546, "loss": 2.8878, "theoretical_loss": 3.506872107137318, "tokens_seen": 1545416704 }, { "epoch": 4.05, "learning_rate": 0.00026857572718154464, "loss": 2.7935, "theoretical_loss": 3.506859070292156, "tokens_seen": 1545482240 }, { "epoch": 4.05, "learning_rate": 0.0002685656970912738, "loss": 2.8905, "theoretical_loss": 3.506846034154592, "tokens_seen": 1545547776 }, { "epoch": 4.05, "learning_rate": 0.00026855566700100306, "loss": 2.7979, "theoretical_loss": 3.5068329987245574, "tokens_seen": 1545613312 }, { "epoch": 4.05, "learning_rate": 0.0002685456369107322, "loss": 2.8944, "theoretical_loss": 3.506819964001984, "tokens_seen": 1545678848 }, { "epoch": 4.05, "learning_rate": 0.0002685356068204614, "loss": 2.8997, "theoretical_loss": 3.506806929986803, "tokens_seen": 1545744384 }, { "epoch": 4.05, "learning_rate": 0.00026852557673019055, "loss": 2.767, "theoretical_loss": 3.506793896678947, "tokens_seen": 1545809920 }, { "epoch": 4.05, "learning_rate": 0.0002685155466399198, "loss": 2.6712, "theoretical_loss": 3.5067808640783475, "tokens_seen": 1545875456 }, { "epoch": 4.05, "learning_rate": 0.00026850551654964897, "loss": 2.676, "theoretical_loss": 3.506767832184935, "tokens_seen": 1545940992 }, { "epoch": 4.05, "learning_rate": 0.00026849548645937815, "loss": 2.9309, "theoretical_loss": 3.506754800998642, "tokens_seen": 1546006528 }, { "epoch": 4.05, "learning_rate": 0.00026848545636910733, "loss": 2.7838, "theoretical_loss": 3.5067417705194, "tokens_seen": 1546072064 }, { "epoch": 4.05, "learning_rate": 0.0002684754262788365, "loss": 2.923, "theoretical_loss": 3.506728740747141, "tokens_seen": 1546137600 }, { "epoch": 4.05, "learning_rate": 0.0002684653961885657, "loss": 2.8396, "theoretical_loss": 3.506715711681796, "tokens_seen": 1546203136 }, { "epoch": 4.05, "learning_rate": 0.0002684553660982949, "loss": 2.822, "theoretical_loss": 3.506702683323297, "tokens_seen": 1546268672 }, { "epoch": 4.05, "learning_rate": 0.00026844533600802405, "loss": 2.6776, "theoretical_loss": 3.5066896556715754, "tokens_seen": 1546334208 }, { "epoch": 4.05, "learning_rate": 0.0002684353059177533, "loss": 2.7224, "theoretical_loss": 3.506676628726564, "tokens_seen": 1546399744 }, { "epoch": 4.05, "learning_rate": 0.0002684252758274824, "loss": 2.8707, "theoretical_loss": 3.506663602488193, "tokens_seen": 1546465280 }, { "epoch": 4.05, "learning_rate": 0.00026841524573721165, "loss": 2.7887, "theoretical_loss": 3.506650576956396, "tokens_seen": 1546530816 }, { "epoch": 4.05, "objective/train/docs_used": 2467587, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0348122119903564, "objective/train/theoretical_loss": 3.5066375521311026, "objective/train/tokens_used": 1567056352, "theoretical_loss": 3.5066375521311026, "tokens_seen": 1546596352 }, { "epoch": 4.05, "learning_rate": 0.00026840521564694083, "loss": 2.848, "theoretical_loss": 3.5066375521311026, "tokens_seen": 1546596352 }, { "epoch": 4.05, "learning_rate": 0.00026839518555667, "loss": 2.9464, "theoretical_loss": 3.506624528012246, "tokens_seen": 1546661888 }, { "epoch": 4.05, "learning_rate": 0.0002683851554663992, "loss": 2.7989, "theoretical_loss": 3.5066115045997575, "tokens_seen": 1546727424 }, { "epoch": 4.05, "learning_rate": 0.00026837512537612843, "loss": 2.8521, "theoretical_loss": 3.506598481893569, "tokens_seen": 1546792960 }, { "epoch": 4.05, "learning_rate": 0.00026836509528585756, "loss": 2.7749, "theoretical_loss": 3.506585459893612, "tokens_seen": 1546858496 }, { "epoch": 4.05, "learning_rate": 0.0002683550651955868, "loss": 2.8035, "theoretical_loss": 3.506572438599818, "tokens_seen": 1546924032 }, { "epoch": 4.05, "learning_rate": 0.0002683450351053159, "loss": 2.6799, "theoretical_loss": 3.5065594180121202, "tokens_seen": 1546989568 }, { "epoch": 4.05, "learning_rate": 0.00026833500501504515, "loss": 2.7719, "theoretical_loss": 3.5065463981304488, "tokens_seen": 1547055104 }, { "epoch": 4.05, "learning_rate": 0.00026832497492477433, "loss": 2.895, "theoretical_loss": 3.506533378954736, "tokens_seen": 1547120640 }, { "epoch": 4.05, "learning_rate": 0.0002683149448345035, "loss": 2.8403, "theoretical_loss": 3.506520360484914, "tokens_seen": 1547186176 }, { "epoch": 4.05, "learning_rate": 0.0002683049147442327, "loss": 2.76, "theoretical_loss": 3.506507342720915, "tokens_seen": 1547251712 }, { "epoch": 4.05, "learning_rate": 0.0002682948846539619, "loss": 2.7831, "theoretical_loss": 3.50649432566267, "tokens_seen": 1547317248 }, { "epoch": 4.05, "learning_rate": 0.00026828485456369106, "loss": 2.6713, "theoretical_loss": 3.506481309310111, "tokens_seen": 1547382784 }, { "epoch": 4.05, "learning_rate": 0.0002682748244734203, "loss": 2.9049, "theoretical_loss": 3.50646829366317, "tokens_seen": 1547448320 }, { "epoch": 4.05, "learning_rate": 0.0002682647943831494, "loss": 2.8394, "theoretical_loss": 3.5064552787217798, "tokens_seen": 1547513856 }, { "epoch": 4.05, "learning_rate": 0.00026825476429287866, "loss": 2.7503, "theoretical_loss": 3.5064422644858704, "tokens_seen": 1547579392 }, { "epoch": 4.05, "learning_rate": 0.0002682447342026078, "loss": 2.8104, "theoretical_loss": 3.506429250955375, "tokens_seen": 1547644928 }, { "epoch": 4.05, "learning_rate": 0.000268234704112337, "loss": 2.8978, "theoretical_loss": 3.506416238130225, "tokens_seen": 1547710464 }, { "epoch": 4.05, "learning_rate": 0.0002682246740220662, "loss": 2.9151, "theoretical_loss": 3.5064032260103524, "tokens_seen": 1547776000 }, { "epoch": 4.05, "learning_rate": 0.0002682146439317954, "loss": 2.7231, "theoretical_loss": 3.50639021459569, "tokens_seen": 1547841536 }, { "epoch": 4.05, "learning_rate": 0.00026820461384152456, "loss": 2.7896, "theoretical_loss": 3.506377203886168, "tokens_seen": 1547907072 }, { "epoch": 4.05, "learning_rate": 0.0002681945837512538, "loss": 2.8728, "theoretical_loss": 3.506364193881719, "tokens_seen": 1547972608 }, { "epoch": 4.05, "learning_rate": 0.0002681845536609829, "loss": 2.8821, "theoretical_loss": 3.506351184582276, "tokens_seen": 1548038144 }, { "epoch": 4.05, "learning_rate": 0.00026817452357071216, "loss": 2.6331, "theoretical_loss": 3.5063381759877696, "tokens_seen": 1548103680 }, { "epoch": 4.05, "learning_rate": 0.0002681644934804413, "loss": 2.7972, "theoretical_loss": 3.5063251680981327, "tokens_seen": 1548169216 }, { "epoch": 4.05, "objective/train/docs_used": 2471324, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.74920654296875, "objective/train/theoretical_loss": 3.5063121609132963, "objective/train/tokens_used": 1568694752, "theoretical_loss": 3.5063121609132963, "tokens_seen": 1548234752 }, { "epoch": 4.05, "learning_rate": 0.0002681544633901705, "loss": 2.7903, "theoretical_loss": 3.5063121609132963, "tokens_seen": 1548234752 }, { "epoch": 4.05, "learning_rate": 0.0002681444332998997, "loss": 2.8008, "theoretical_loss": 3.5062991544331936, "tokens_seen": 1548300288 }, { "epoch": 4.06, "learning_rate": 0.0002681344032096289, "loss": 2.7542, "theoretical_loss": 3.5062861486577557, "tokens_seen": 1548365824 }, { "epoch": 4.06, "learning_rate": 0.00026812437311935807, "loss": 2.762, "theoretical_loss": 3.5062731435869146, "tokens_seen": 1548431360 }, { "epoch": 4.06, "learning_rate": 0.00026811434302908725, "loss": 2.9471, "theoretical_loss": 3.5062601392206028, "tokens_seen": 1548496896 }, { "epoch": 4.06, "learning_rate": 0.00026810431293881643, "loss": 2.8033, "theoretical_loss": 3.506247135558752, "tokens_seen": 1548562432 }, { "epoch": 4.06, "learning_rate": 0.00026809428284854566, "loss": 2.8069, "theoretical_loss": 3.5062341326012945, "tokens_seen": 1548627968 }, { "epoch": 4.06, "learning_rate": 0.0002680842527582748, "loss": 2.6765, "theoretical_loss": 3.5062211303481616, "tokens_seen": 1548693504 }, { "epoch": 4.06, "learning_rate": 0.000268074222668004, "loss": 2.7398, "theoretical_loss": 3.506208128799286, "tokens_seen": 1548759040 }, { "epoch": 4.06, "learning_rate": 0.00026806419257773315, "loss": 2.7068, "theoretical_loss": 3.5061951279545998, "tokens_seen": 1548824576 }, { "epoch": 4.06, "learning_rate": 0.0002680541624874624, "loss": 2.7947, "theoretical_loss": 3.506182127814035, "tokens_seen": 1548890112 }, { "epoch": 4.06, "learning_rate": 0.00026804413239719157, "loss": 2.7617, "theoretical_loss": 3.5061691283775236, "tokens_seen": 1548955648 }, { "epoch": 4.06, "learning_rate": 0.00026803410230692075, "loss": 2.8322, "theoretical_loss": 3.5061561296449972, "tokens_seen": 1549021184 }, { "epoch": 4.06, "learning_rate": 0.00026802407221664993, "loss": 2.7196, "theoretical_loss": 3.5061431316163887, "tokens_seen": 1549086720 }, { "epoch": 4.06, "learning_rate": 0.00026801404212637917, "loss": 2.7955, "theoretical_loss": 3.50613013429163, "tokens_seen": 1549152256 }, { "epoch": 4.06, "learning_rate": 0.00026800401203610835, "loss": 2.8704, "theoretical_loss": 3.506117137670653, "tokens_seen": 1549217792 }, { "epoch": 4.06, "learning_rate": 0.00026799398194583753, "loss": 2.7715, "theoretical_loss": 3.506104141753389, "tokens_seen": 1549283328 }, { "epoch": 4.06, "learning_rate": 0.0002679839518555667, "loss": 2.854, "theoretical_loss": 3.506091146539772, "tokens_seen": 1549348864 }, { "epoch": 4.06, "learning_rate": 0.0002679739217652959, "loss": 2.8423, "theoretical_loss": 3.5060781520297333, "tokens_seen": 1549414400 }, { "epoch": 4.06, "learning_rate": 0.0002679638916750251, "loss": 2.8087, "theoretical_loss": 3.5060651582232047, "tokens_seen": 1549479936 }, { "epoch": 4.06, "learning_rate": 0.00026795386158475425, "loss": 2.7386, "theoretical_loss": 3.506052165120118, "tokens_seen": 1549545472 }, { "epoch": 4.06, "learning_rate": 0.0002679438314944835, "loss": 2.973, "theoretical_loss": 3.5060391727204063, "tokens_seen": 1549611008 }, { "epoch": 4.06, "learning_rate": 0.0002679338014042126, "loss": 2.8184, "theoretical_loss": 3.5060261810240014, "tokens_seen": 1549676544 }, { "epoch": 4.06, "learning_rate": 0.00026792377131394185, "loss": 2.9227, "theoretical_loss": 3.506013190030836, "tokens_seen": 1549742080 }, { "epoch": 4.06, "learning_rate": 0.00026791374122367103, "loss": 2.8499, "theoretical_loss": 3.506000199740841, "tokens_seen": 1549807616 }, { "epoch": 4.06, "objective/train/docs_used": 2474176, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6381454467773438, "objective/train/theoretical_loss": 3.5059872101539495, "objective/train/tokens_used": 1570333152, "theoretical_loss": 3.5059872101539495, "tokens_seen": 1549873152 }, { "epoch": 4.06, "learning_rate": 0.0002679037111334002, "loss": 2.7274, "theoretical_loss": 3.5059872101539495, "tokens_seen": 1549873152 }, { "epoch": 4.06, "learning_rate": 0.0002678936810431294, "loss": 2.9695, "theoretical_loss": 3.505974221270094, "tokens_seen": 1549938688 }, { "epoch": 4.06, "learning_rate": 0.00026788365095285863, "loss": 2.8306, "theoretical_loss": 3.5059612330892067, "tokens_seen": 1550004224 }, { "epoch": 4.06, "learning_rate": 0.00026787362086258776, "loss": 2.7053, "theoretical_loss": 3.5059482456112185, "tokens_seen": 1550069760 }, { "epoch": 4.06, "learning_rate": 0.000267863590772317, "loss": 2.7917, "theoretical_loss": 3.5059352588360637, "tokens_seen": 1550135296 }, { "epoch": 4.06, "learning_rate": 0.0002678535606820461, "loss": 2.7431, "theoretical_loss": 3.5059222727636725, "tokens_seen": 1550200832 }, { "epoch": 4.06, "learning_rate": 0.00026784353059177535, "loss": 2.8721, "theoretical_loss": 3.505909287393979, "tokens_seen": 1550266368 }, { "epoch": 4.06, "learning_rate": 0.00026783350050150453, "loss": 2.791, "theoretical_loss": 3.5058963027269137, "tokens_seen": 1550331904 }, { "epoch": 4.06, "learning_rate": 0.0002678234704112337, "loss": 2.8919, "theoretical_loss": 3.5058833187624105, "tokens_seen": 1550397440 }, { "epoch": 4.06, "learning_rate": 0.0002678134403209629, "loss": 2.7886, "theoretical_loss": 3.5058703355004006, "tokens_seen": 1550462976 }, { "epoch": 4.06, "learning_rate": 0.0002678034102306921, "loss": 2.9203, "theoretical_loss": 3.505857352940817, "tokens_seen": 1550528512 }, { "epoch": 4.06, "learning_rate": 0.00026779338014042126, "loss": 2.6999, "theoretical_loss": 3.5058443710835916, "tokens_seen": 1550594048 }, { "epoch": 4.06, "learning_rate": 0.0002677833500501505, "loss": 2.8706, "theoretical_loss": 3.505831389928657, "tokens_seen": 1550659584 }, { "epoch": 4.06, "learning_rate": 0.0002677733199598796, "loss": 2.8688, "theoretical_loss": 3.5058184094759453, "tokens_seen": 1550725120 }, { "epoch": 4.06, "learning_rate": 0.00026776328986960886, "loss": 2.5748, "theoretical_loss": 3.5058054297253882, "tokens_seen": 1550790656 }, { "epoch": 4.06, "learning_rate": 0.000267753259779338, "loss": 2.8204, "theoretical_loss": 3.5057924506769194, "tokens_seen": 1550856192 }, { "epoch": 4.06, "learning_rate": 0.0002677432296890672, "loss": 2.7825, "theoretical_loss": 3.5057794723304703, "tokens_seen": 1550921728 }, { "epoch": 4.06, "learning_rate": 0.0002677331995987964, "loss": 2.8972, "theoretical_loss": 3.505766494685974, "tokens_seen": 1550987264 }, { "epoch": 4.06, "learning_rate": 0.0002677231695085256, "loss": 2.7915, "theoretical_loss": 3.5057535177433623, "tokens_seen": 1551052800 }, { "epoch": 4.06, "learning_rate": 0.00026771313941825476, "loss": 2.7049, "theoretical_loss": 3.5057405415025675, "tokens_seen": 1551118336 }, { "epoch": 4.06, "learning_rate": 0.000267703109327984, "loss": 2.8593, "theoretical_loss": 3.505727565963522, "tokens_seen": 1551183872 }, { "epoch": 4.06, "learning_rate": 0.0002676930792377131, "loss": 2.7937, "theoretical_loss": 3.505714591126159, "tokens_seen": 1551249408 }, { "epoch": 4.06, "learning_rate": 0.00026768304914744236, "loss": 2.787, "theoretical_loss": 3.50570161699041, "tokens_seen": 1551314944 }, { "epoch": 4.06, "learning_rate": 0.0002676730190571715, "loss": 2.9586, "theoretical_loss": 3.5056886435562076, "tokens_seen": 1551380480 }, { "epoch": 4.06, "learning_rate": 0.0002676629889669007, "loss": 2.8987, "theoretical_loss": 3.505675670823485, "tokens_seen": 1551446016 }, { "epoch": 4.06, "objective/train/docs_used": 2475586, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.84519100189209, "objective/train/theoretical_loss": 3.5056626987921735, "objective/train/tokens_used": 1571971552, "theoretical_loss": 3.5056626987921735, "tokens_seen": 1551511552 }, { "epoch": 4.06, "learning_rate": 0.0002676529588766299, "loss": 2.8869, "theoretical_loss": 3.5056626987921735, "tokens_seen": 1551511552 }, { "epoch": 4.06, "learning_rate": 0.0002676429287863591, "loss": 2.9325, "theoretical_loss": 3.505649727462206, "tokens_seen": 1551577088 }, { "epoch": 4.06, "learning_rate": 0.00026763289869608827, "loss": 2.9277, "theoretical_loss": 3.505636756833515, "tokens_seen": 1551642624 }, { "epoch": 4.06, "learning_rate": 0.00026762286860581745, "loss": 2.9098, "theoretical_loss": 3.5056237869060336, "tokens_seen": 1551708160 }, { "epoch": 4.06, "learning_rate": 0.00026761283851554663, "loss": 2.7553, "theoretical_loss": 3.505610817679693, "tokens_seen": 1551773696 }, { "epoch": 4.06, "learning_rate": 0.00026760280842527586, "loss": 2.7181, "theoretical_loss": 3.5055978491544266, "tokens_seen": 1551839232 }, { "epoch": 4.06, "learning_rate": 0.000267592778335005, "loss": 2.785, "theoretical_loss": 3.505584881330167, "tokens_seen": 1551904768 }, { "epoch": 4.06, "learning_rate": 0.0002675827482447342, "loss": 2.8266, "theoretical_loss": 3.505571914206846, "tokens_seen": 1551970304 }, { "epoch": 4.06, "learning_rate": 0.00026757271815446335, "loss": 2.8196, "theoretical_loss": 3.505558947784397, "tokens_seen": 1552035840 }, { "epoch": 4.06, "learning_rate": 0.0002675626880641926, "loss": 2.8172, "theoretical_loss": 3.5055459820627517, "tokens_seen": 1552101376 }, { "epoch": 4.06, "learning_rate": 0.00026755265797392177, "loss": 2.8678, "theoretical_loss": 3.505533017041843, "tokens_seen": 1552166912 }, { "epoch": 4.06, "learning_rate": 0.00026754262788365095, "loss": 2.8324, "theoretical_loss": 3.505520052721603, "tokens_seen": 1552232448 }, { "epoch": 4.06, "learning_rate": 0.00026753259779338013, "loss": 2.8001, "theoretical_loss": 3.505507089101965, "tokens_seen": 1552297984 }, { "epoch": 4.06, "learning_rate": 0.00026752256770310937, "loss": 2.8885, "theoretical_loss": 3.505494126182861, "tokens_seen": 1552363520 }, { "epoch": 4.06, "learning_rate": 0.0002675125376128385, "loss": 2.6754, "theoretical_loss": 3.505481163964224, "tokens_seen": 1552429056 }, { "epoch": 4.06, "learning_rate": 0.00026750250752256773, "loss": 2.9385, "theoretical_loss": 3.5054682024459867, "tokens_seen": 1552494592 }, { "epoch": 4.06, "learning_rate": 0.00026749247743229686, "loss": 2.7698, "theoretical_loss": 3.5054552416280806, "tokens_seen": 1552560128 }, { "epoch": 4.06, "learning_rate": 0.0002674824473420261, "loss": 2.7963, "theoretical_loss": 3.50544228151044, "tokens_seen": 1552625664 }, { "epoch": 4.06, "learning_rate": 0.00026747241725175527, "loss": 2.7603, "theoretical_loss": 3.5054293220929957, "tokens_seen": 1552691200 }, { "epoch": 4.06, "learning_rate": 0.00026746238716148445, "loss": 2.7877, "theoretical_loss": 3.505416363375682, "tokens_seen": 1552756736 }, { "epoch": 4.06, "learning_rate": 0.00026745235707121363, "loss": 2.7465, "theoretical_loss": 3.5054034053584298, "tokens_seen": 1552822272 }, { "epoch": 4.06, "learning_rate": 0.0002674423269809428, "loss": 2.8187, "theoretical_loss": 3.5053904480411733, "tokens_seen": 1552887808 }, { "epoch": 4.06, "learning_rate": 0.000267432296890672, "loss": 2.954, "theoretical_loss": 3.505377491423844, "tokens_seen": 1552953344 }, { "epoch": 4.06, "learning_rate": 0.00026742226680040123, "loss": 2.8053, "theoretical_loss": 3.5053645355063754, "tokens_seen": 1553018880 }, { "epoch": 4.06, "learning_rate": 0.00026741223671013036, "loss": 2.7967, "theoretical_loss": 3.5053515802887, "tokens_seen": 1553084416 }, { "epoch": 4.06, "objective/train/docs_used": 2478252, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.899306535720825, "objective/train/theoretical_loss": 3.50533862577075, "objective/train/tokens_used": 1573609952, "theoretical_loss": 3.50533862577075, "tokens_seen": 1553149952 }, { "epoch": 4.06, "learning_rate": 0.0002674022066198596, "loss": 2.8182, "theoretical_loss": 3.50533862577075, "tokens_seen": 1553149952 }, { "epoch": 4.06, "learning_rate": 0.0002673921765295888, "loss": 2.9833, "theoretical_loss": 3.5053256719524586, "tokens_seen": 1553215488 }, { "epoch": 4.06, "learning_rate": 0.00026738214643931796, "loss": 2.7378, "theoretical_loss": 3.5053127188337583, "tokens_seen": 1553281024 }, { "epoch": 4.06, "learning_rate": 0.00026737211634904714, "loss": 2.8165, "theoretical_loss": 3.5052997664145815, "tokens_seen": 1553346560 }, { "epoch": 4.06, "learning_rate": 0.0002673620862587763, "loss": 2.8267, "theoretical_loss": 3.5052868146948613, "tokens_seen": 1553412096 }, { "epoch": 4.06, "learning_rate": 0.0002673520561685055, "loss": 2.8454, "theoretical_loss": 3.50527386367453, "tokens_seen": 1553477632 }, { "epoch": 4.06, "learning_rate": 0.00026734202607823474, "loss": 2.7356, "theoretical_loss": 3.5052609133535215, "tokens_seen": 1553543168 }, { "epoch": 4.06, "learning_rate": 0.00026733199598796386, "loss": 2.7854, "theoretical_loss": 3.505247963731767, "tokens_seen": 1553608704 }, { "epoch": 4.06, "learning_rate": 0.0002673219658976931, "loss": 2.8693, "theoretical_loss": 3.5052350148092, "tokens_seen": 1553674240 }, { "epoch": 4.06, "learning_rate": 0.0002673119358074222, "loss": 2.8381, "theoretical_loss": 3.5052220665857536, "tokens_seen": 1553739776 }, { "epoch": 4.06, "learning_rate": 0.00026730190571715146, "loss": 2.8834, "theoretical_loss": 3.5052091190613597, "tokens_seen": 1553805312 }, { "epoch": 4.06, "learning_rate": 0.00026729187562688064, "loss": 2.8204, "theoretical_loss": 3.5051961722359515, "tokens_seen": 1553870848 }, { "epoch": 4.06, "learning_rate": 0.0002672818455366098, "loss": 2.7118, "theoretical_loss": 3.505183226109462, "tokens_seen": 1553936384 }, { "epoch": 4.06, "learning_rate": 0.000267271815446339, "loss": 2.7365, "theoretical_loss": 3.505170280681824, "tokens_seen": 1554001920 }, { "epoch": 4.06, "learning_rate": 0.0002672617853560682, "loss": 2.8633, "theoretical_loss": 3.50515733595297, "tokens_seen": 1554067456 }, { "epoch": 4.06, "learning_rate": 0.0002672517552657974, "loss": 2.8147, "theoretical_loss": 3.5051443919228324, "tokens_seen": 1554132992 }, { "epoch": 4.06, "learning_rate": 0.0002672417251755266, "loss": 2.5305, "theoretical_loss": 3.5051314485913454, "tokens_seen": 1554198528 }, { "epoch": 4.06, "learning_rate": 0.0002672316950852558, "loss": 2.8315, "theoretical_loss": 3.505118505958441, "tokens_seen": 1554264064 }, { "epoch": 4.06, "learning_rate": 0.00026722166499498496, "loss": 2.9975, "theoretical_loss": 3.505105564024051, "tokens_seen": 1554329600 }, { "epoch": 4.06, "learning_rate": 0.0002672116349047142, "loss": 2.8382, "theoretical_loss": 3.5050926227881103, "tokens_seen": 1554395136 }, { "epoch": 4.06, "learning_rate": 0.0002672016048144433, "loss": 2.9253, "theoretical_loss": 3.5050796822505506, "tokens_seen": 1554460672 }, { "epoch": 4.06, "learning_rate": 0.00026719157472417256, "loss": 2.8637, "theoretical_loss": 3.5050667424113042, "tokens_seen": 1554526208 }, { "epoch": 4.06, "learning_rate": 0.0002671815446339017, "loss": 2.8331, "theoretical_loss": 3.5050538032703056, "tokens_seen": 1554591744 }, { "epoch": 4.06, "learning_rate": 0.0002671715145436309, "loss": 2.7935, "theoretical_loss": 3.5050408648274862, "tokens_seen": 1554657280 }, { "epoch": 4.06, "learning_rate": 0.0002671614844533601, "loss": 2.8674, "theoretical_loss": 3.50502792708278, "tokens_seen": 1554722816 }, { "epoch": 4.06, "objective/train/docs_used": 2481099, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.00907564163208, "objective/train/theoretical_loss": 3.505014990036119, "objective/train/tokens_used": 1575248352, "theoretical_loss": 3.505014990036119, "tokens_seen": 1554788352 }, { "epoch": 4.06, "learning_rate": 0.0002671514543630893, "loss": 2.9125, "theoretical_loss": 3.505014990036119, "tokens_seen": 1554788352 }, { "epoch": 4.06, "learning_rate": 0.00026714142427281847, "loss": 2.6684, "theoretical_loss": 3.5050020536874364, "tokens_seen": 1554853888 }, { "epoch": 4.06, "learning_rate": 0.00026713139418254765, "loss": 2.7928, "theoretical_loss": 3.504989118036666, "tokens_seen": 1554919424 }, { "epoch": 4.06, "learning_rate": 0.00026712136409227683, "loss": 2.8119, "theoretical_loss": 3.5049761830837394, "tokens_seen": 1554984960 }, { "epoch": 4.06, "learning_rate": 0.00026711133400200606, "loss": 2.7362, "theoretical_loss": 3.50496324882859, "tokens_seen": 1555050496 }, { "epoch": 4.06, "learning_rate": 0.0002671013039117352, "loss": 2.8518, "theoretical_loss": 3.5049503152711505, "tokens_seen": 1555116032 }, { "epoch": 4.06, "learning_rate": 0.0002670912738214644, "loss": 2.6682, "theoretical_loss": 3.504937382411355, "tokens_seen": 1555181568 }, { "epoch": 4.06, "learning_rate": 0.00026708124373119355, "loss": 2.801, "theoretical_loss": 3.5049244502491352, "tokens_seen": 1555247104 }, { "epoch": 4.06, "learning_rate": 0.0002670712136409228, "loss": 2.7386, "theoretical_loss": 3.504911518784425, "tokens_seen": 1555312640 }, { "epoch": 4.06, "learning_rate": 0.00026706118355065197, "loss": 2.8439, "theoretical_loss": 3.5048985880171566, "tokens_seen": 1555378176 }, { "epoch": 4.06, "learning_rate": 0.00026705115346038115, "loss": 2.8455, "theoretical_loss": 3.5048856579472636, "tokens_seen": 1555443712 }, { "epoch": 4.06, "learning_rate": 0.00026704112337011033, "loss": 2.6411, "theoretical_loss": 3.5048727285746786, "tokens_seen": 1555509248 }, { "epoch": 4.06, "learning_rate": 0.00026703109327983957, "loss": 2.8288, "theoretical_loss": 3.504859799899335, "tokens_seen": 1555574784 }, { "epoch": 4.06, "learning_rate": 0.0002670210631895687, "loss": 2.8799, "theoretical_loss": 3.5048468719211656, "tokens_seen": 1555640320 }, { "epoch": 4.06, "learning_rate": 0.00026701103309929793, "loss": 2.9752, "theoretical_loss": 3.5048339446401036, "tokens_seen": 1555705856 }, { "epoch": 4.06, "learning_rate": 0.00026700100300902706, "loss": 2.8093, "theoretical_loss": 3.5048210180560817, "tokens_seen": 1555771392 }, { "epoch": 4.06, "learning_rate": 0.0002669909729187563, "loss": 2.8134, "theoretical_loss": 3.5048080921690334, "tokens_seen": 1555836928 }, { "epoch": 4.06, "learning_rate": 0.00026698094282848547, "loss": 2.5981, "theoretical_loss": 3.5047951669788913, "tokens_seen": 1555902464 }, { "epoch": 4.06, "learning_rate": 0.00026697091273821465, "loss": 2.7638, "theoretical_loss": 3.5047822424855886, "tokens_seen": 1555968000 }, { "epoch": 4.06, "learning_rate": 0.00026696088264794383, "loss": 2.8355, "theoretical_loss": 3.5047693186890587, "tokens_seen": 1556033536 }, { "epoch": 4.06, "learning_rate": 0.000266950852557673, "loss": 2.7107, "theoretical_loss": 3.5047563955892347, "tokens_seen": 1556099072 }, { "epoch": 4.06, "learning_rate": 0.0002669408224674022, "loss": 2.8366, "theoretical_loss": 3.504743473186049, "tokens_seen": 1556164608 }, { "epoch": 4.06, "learning_rate": 0.00026693079237713143, "loss": 2.7856, "theoretical_loss": 3.504730551479436, "tokens_seen": 1556230144 }, { "epoch": 4.06, "learning_rate": 0.00026692076228686056, "loss": 2.8203, "theoretical_loss": 3.504717630469327, "tokens_seen": 1556295680 }, { "epoch": 4.06, "learning_rate": 0.0002669107321965898, "loss": 2.8178, "theoretical_loss": 3.5047047101556563, "tokens_seen": 1556361216 }, { "epoch": 4.06, "objective/train/docs_used": 2483723, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6913602352142334, "objective/train/theoretical_loss": 3.5046917905383577, "objective/train/tokens_used": 1576886752, "theoretical_loss": 3.5046917905383577, "tokens_seen": 1556426752 }, { "epoch": 4.06, "learning_rate": 0.000266900702106319, "loss": 2.7652, "theoretical_loss": 3.5046917905383577, "tokens_seen": 1556426752 }, { "epoch": 4.06, "learning_rate": 0.00026689067201604816, "loss": 2.7841, "theoretical_loss": 3.5046788716173625, "tokens_seen": 1556492288 }, { "epoch": 4.06, "learning_rate": 0.00026688064192577734, "loss": 2.9638, "theoretical_loss": 3.504665953392606, "tokens_seen": 1556557824 }, { "epoch": 4.06, "learning_rate": 0.0002668706118355065, "loss": 2.957, "theoretical_loss": 3.504653035864019, "tokens_seen": 1556623360 }, { "epoch": 4.06, "learning_rate": 0.0002668605817452357, "loss": 2.7675, "theoretical_loss": 3.504640119031537, "tokens_seen": 1556688896 }, { "epoch": 4.06, "learning_rate": 0.00026685055165496494, "loss": 2.8793, "theoretical_loss": 3.5046272028950916, "tokens_seen": 1556754432 }, { "epoch": 4.06, "learning_rate": 0.00026684052156469406, "loss": 2.8086, "theoretical_loss": 3.5046142874546167, "tokens_seen": 1556819968 }, { "epoch": 4.06, "learning_rate": 0.0002668304914744233, "loss": 2.8838, "theoretical_loss": 3.504601372710045, "tokens_seen": 1556885504 }, { "epoch": 4.06, "learning_rate": 0.0002668204613841524, "loss": 2.7969, "theoretical_loss": 3.5045884586613103, "tokens_seen": 1556951040 }, { "epoch": 4.06, "learning_rate": 0.00026681043129388166, "loss": 2.9017, "theoretical_loss": 3.5045755453083456, "tokens_seen": 1557016576 }, { "epoch": 4.06, "learning_rate": 0.00026680040120361084, "loss": 2.8812, "theoretical_loss": 3.504562632651084, "tokens_seen": 1557082112 }, { "epoch": 4.06, "learning_rate": 0.00026679037111334, "loss": 2.7373, "theoretical_loss": 3.5045497206894582, "tokens_seen": 1557147648 }, { "epoch": 4.06, "learning_rate": 0.0002667803410230692, "loss": 2.7967, "theoretical_loss": 3.504536809423403, "tokens_seen": 1557213184 }, { "epoch": 4.06, "learning_rate": 0.0002667703109327984, "loss": 2.7661, "theoretical_loss": 3.5045238988528506, "tokens_seen": 1557278720 }, { "epoch": 4.06, "learning_rate": 0.00026676028084252757, "loss": 2.8077, "theoretical_loss": 3.504510988977734, "tokens_seen": 1557344256 }, { "epoch": 4.06, "learning_rate": 0.0002667502507522568, "loss": 2.7409, "theoretical_loss": 3.504498079797987, "tokens_seen": 1557409792 }, { "epoch": 4.06, "learning_rate": 0.00026674022066198593, "loss": 2.7416, "theoretical_loss": 3.5044851713135428, "tokens_seen": 1557475328 }, { "epoch": 4.06, "learning_rate": 0.00026673019057171516, "loss": 2.9094, "theoretical_loss": 3.5044722635243346, "tokens_seen": 1557540864 }, { "epoch": 4.06, "learning_rate": 0.00026672016048144434, "loss": 2.7667, "theoretical_loss": 3.5044593564302957, "tokens_seen": 1557606400 }, { "epoch": 4.06, "learning_rate": 0.0002667101303911735, "loss": 2.7911, "theoretical_loss": 3.5044464500313595, "tokens_seen": 1557671936 }, { "epoch": 4.06, "learning_rate": 0.0002667001003009027, "loss": 2.9005, "theoretical_loss": 3.5044335443274597, "tokens_seen": 1557737472 }, { "epoch": 4.06, "learning_rate": 0.0002666900702106319, "loss": 2.792, "theoretical_loss": 3.504420639318529, "tokens_seen": 1557803008 }, { "epoch": 4.06, "learning_rate": 0.00026668004012036107, "loss": 2.7658, "theoretical_loss": 3.5044077350045004, "tokens_seen": 1557868544 }, { "epoch": 4.06, "learning_rate": 0.0002666700100300903, "loss": 2.716, "theoretical_loss": 3.5043948313853086, "tokens_seen": 1557934080 }, { "epoch": 4.06, "learning_rate": 0.00026665997993981943, "loss": 2.7088, "theoretical_loss": 3.504381928460886, "tokens_seen": 1557999616 }, { "epoch": 4.06, "objective/train/docs_used": 2486426, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9915366172790527, "objective/train/theoretical_loss": 3.504369026231166, "objective/train/tokens_used": 1578525152, "theoretical_loss": 3.504369026231166, "tokens_seen": 1558065152 }, { "epoch": 4.06, "learning_rate": 0.00026664994984954867, "loss": 2.9164, "theoretical_loss": 3.504369026231166, "tokens_seen": 1558065152 }, { "epoch": 4.06, "learning_rate": 0.0002666399197592778, "loss": 2.7955, "theoretical_loss": 3.5043561246960824, "tokens_seen": 1558130688 }, { "epoch": 4.06, "learning_rate": 0.00026662988966900703, "loss": 2.7756, "theoretical_loss": 3.504343223855568, "tokens_seen": 1558196224 }, { "epoch": 4.06, "learning_rate": 0.0002666198595787362, "loss": 2.8262, "theoretical_loss": 3.5043303237095564, "tokens_seen": 1558261760 }, { "epoch": 4.06, "learning_rate": 0.0002666098294884654, "loss": 2.7781, "theoretical_loss": 3.504317424257981, "tokens_seen": 1558327296 }, { "epoch": 4.06, "learning_rate": 0.00026659979939819457, "loss": 2.7608, "theoretical_loss": 3.5043045255007756, "tokens_seen": 1558392832 }, { "epoch": 4.06, "learning_rate": 0.00026658976930792375, "loss": 2.6326, "theoretical_loss": 3.5042916274378735, "tokens_seen": 1558458368 }, { "epoch": 4.06, "learning_rate": 0.00026657973921765293, "loss": 2.8367, "theoretical_loss": 3.504278730069208, "tokens_seen": 1558523904 }, { "epoch": 4.06, "learning_rate": 0.00026656970912738217, "loss": 2.7329, "theoretical_loss": 3.504265833394712, "tokens_seen": 1558589440 }, { "epoch": 4.06, "learning_rate": 0.0002665596790371113, "loss": 2.8143, "theoretical_loss": 3.50425293741432, "tokens_seen": 1558654976 }, { "epoch": 4.06, "learning_rate": 0.00026654964894684053, "loss": 2.9435, "theoretical_loss": 3.504240042127965, "tokens_seen": 1558720512 }, { "epoch": 4.06, "learning_rate": 0.0002665396188565697, "loss": 2.808, "theoretical_loss": 3.50422714753558, "tokens_seen": 1558786048 }, { "epoch": 4.06, "learning_rate": 0.0002665295887662989, "loss": 2.623, "theoretical_loss": 3.504214253637099, "tokens_seen": 1558851584 }, { "epoch": 4.06, "learning_rate": 0.0002665195586760281, "loss": 2.7156, "theoretical_loss": 3.5042013604324556, "tokens_seen": 1558917120 }, { "epoch": 4.06, "learning_rate": 0.00026650952858575726, "loss": 2.9012, "theoretical_loss": 3.5041884679215833, "tokens_seen": 1558982656 }, { "epoch": 4.06, "learning_rate": 0.0002664994984954865, "loss": 2.8552, "theoretical_loss": 3.504175576104415, "tokens_seen": 1559048192 }, { "epoch": 4.06, "learning_rate": 0.00026648946840521567, "loss": 2.8038, "theoretical_loss": 3.5041626849808845, "tokens_seen": 1559113728 }, { "epoch": 4.06, "learning_rate": 0.00026647943831494485, "loss": 2.9084, "theoretical_loss": 3.5041497945509255, "tokens_seen": 1559179264 }, { "epoch": 4.06, "learning_rate": 0.00026646940822467403, "loss": 2.7608, "theoretical_loss": 3.5041369048144713, "tokens_seen": 1559244800 }, { "epoch": 4.06, "learning_rate": 0.0002664593781344032, "loss": 2.7555, "theoretical_loss": 3.504124015771456, "tokens_seen": 1559310336 }, { "epoch": 4.06, "learning_rate": 0.0002664493480441324, "loss": 2.7809, "theoretical_loss": 3.5041111274218126, "tokens_seen": 1559375872 }, { "epoch": 4.06, "learning_rate": 0.00026643931795386163, "loss": 2.7522, "theoretical_loss": 3.504098239765475, "tokens_seen": 1559441408 }, { "epoch": 4.06, "learning_rate": 0.00026642928786359076, "loss": 2.6823, "theoretical_loss": 3.504085352802376, "tokens_seen": 1559506944 }, { "epoch": 4.06, "learning_rate": 0.00026641925777332, "loss": 2.8153, "theoretical_loss": 3.50407246653245, "tokens_seen": 1559572480 }, { "epoch": 4.06, "learning_rate": 0.0002664092276830492, "loss": 2.7988, "theoretical_loss": 3.504059580955631, "tokens_seen": 1559638016 }, { "epoch": 4.06, "objective/train/docs_used": 2489031, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.805288553237915, "objective/train/theoretical_loss": 3.5040466960718515, "objective/train/tokens_used": 1580163552, "theoretical_loss": 3.5040466960718515, "tokens_seen": 1559703552 }, { "epoch": 4.06, "learning_rate": 0.00026639919759277836, "loss": 2.7138, "theoretical_loss": 3.5040466960718515, "tokens_seen": 1559703552 }, { "epoch": 4.06, "learning_rate": 0.00026638916750250754, "loss": 2.9111, "theoretical_loss": 3.5040338118810452, "tokens_seen": 1559769088 }, { "epoch": 4.06, "learning_rate": 0.0002663791374122367, "loss": 2.8637, "theoretical_loss": 3.5040209283831465, "tokens_seen": 1559834624 }, { "epoch": 4.06, "learning_rate": 0.0002663691073219659, "loss": 2.7847, "theoretical_loss": 3.5040080455780886, "tokens_seen": 1559900160 }, { "epoch": 4.06, "learning_rate": 0.00026635907723169514, "loss": 2.9114, "theoretical_loss": 3.503995163465805, "tokens_seen": 1559965696 }, { "epoch": 4.06, "learning_rate": 0.00026634904714142426, "loss": 2.8852, "theoretical_loss": 3.5039822820462296, "tokens_seen": 1560031232 }, { "epoch": 4.06, "learning_rate": 0.0002663390170511535, "loss": 2.9204, "theoretical_loss": 3.5039694013192957, "tokens_seen": 1560096768 }, { "epoch": 4.06, "learning_rate": 0.0002663289869608826, "loss": 2.7681, "theoretical_loss": 3.5039565212849375, "tokens_seen": 1560162304 }, { "epoch": 4.06, "learning_rate": 0.00026631895687061186, "loss": 2.8161, "theoretical_loss": 3.503943641943088, "tokens_seen": 1560227840 }, { "epoch": 4.06, "learning_rate": 0.00026630892678034104, "loss": 2.6419, "theoretical_loss": 3.5039307632936816, "tokens_seen": 1560293376 }, { "epoch": 4.06, "learning_rate": 0.0002662988966900702, "loss": 2.667, "theoretical_loss": 3.5039178853366515, "tokens_seen": 1560358912 }, { "epoch": 4.06, "learning_rate": 0.0002662888665997994, "loss": 2.7263, "theoretical_loss": 3.5039050080719316, "tokens_seen": 1560424448 }, { "epoch": 4.06, "learning_rate": 0.0002662788365095286, "loss": 2.7429, "theoretical_loss": 3.5038921314994553, "tokens_seen": 1560489984 }, { "epoch": 4.06, "learning_rate": 0.00026626880641925777, "loss": 2.6944, "theoretical_loss": 3.503879255619157, "tokens_seen": 1560555520 }, { "epoch": 4.06, "learning_rate": 0.000266258776328987, "loss": 2.8734, "theoretical_loss": 3.5038663804309698, "tokens_seen": 1560621056 }, { "epoch": 4.06, "learning_rate": 0.00026624874623871613, "loss": 2.7966, "theoretical_loss": 3.5038535059348277, "tokens_seen": 1560686592 }, { "epoch": 4.06, "learning_rate": 0.00026623871614844536, "loss": 2.6636, "theoretical_loss": 3.503840632130664, "tokens_seen": 1560752128 }, { "epoch": 4.06, "learning_rate": 0.00026622868605817454, "loss": 2.8304, "theoretical_loss": 3.5038277590184133, "tokens_seen": 1560817664 }, { "epoch": 4.06, "learning_rate": 0.0002662186559679037, "loss": 2.8718, "theoretical_loss": 3.5038148865980085, "tokens_seen": 1560883200 }, { "epoch": 4.06, "learning_rate": 0.0002662086258776329, "loss": 2.809, "theoretical_loss": 3.5038020148693843, "tokens_seen": 1560948736 }, { "epoch": 4.06, "learning_rate": 0.0002661985957873621, "loss": 2.72, "theoretical_loss": 3.5037891438324733, "tokens_seen": 1561014272 }, { "epoch": 4.06, "learning_rate": 0.00026618856569709127, "loss": 2.8358, "theoretical_loss": 3.50377627348721, "tokens_seen": 1561079808 }, { "epoch": 4.06, "learning_rate": 0.0002661785356068205, "loss": 2.869, "theoretical_loss": 3.5037634038335286, "tokens_seen": 1561145344 }, { "epoch": 4.06, "learning_rate": 0.00026616850551654963, "loss": 2.7699, "theoretical_loss": 3.503750534871362, "tokens_seen": 1561210880 }, { "epoch": 4.06, "learning_rate": 0.00026615847542627887, "loss": 2.7713, "theoretical_loss": 3.503737666600645, "tokens_seen": 1561276416 }, { "epoch": 4.06, "objective/train/docs_used": 2490587, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7203080654144287, "objective/train/theoretical_loss": 3.5037247990213105, "objective/train/tokens_used": 1581801952, "theoretical_loss": 3.5037247990213105, "tokens_seen": 1561341952 }, { "epoch": 4.06, "learning_rate": 0.000266148445336008, "loss": 2.7894, "theoretical_loss": 3.5037247990213105, "tokens_seen": 1561341952 }, { "epoch": 4.06, "learning_rate": 0.00026613841524573723, "loss": 2.8714, "theoretical_loss": 3.5037119321332924, "tokens_seen": 1561407488 }, { "epoch": 4.06, "learning_rate": 0.0002661283851554664, "loss": 2.9401, "theoretical_loss": 3.5036990659365252, "tokens_seen": 1561473024 }, { "epoch": 4.06, "learning_rate": 0.0002661183550651956, "loss": 2.7851, "theoretical_loss": 3.5036862004309426, "tokens_seen": 1561538560 }, { "epoch": 4.06, "learning_rate": 0.00026610832497492477, "loss": 2.7785, "theoretical_loss": 3.5036733356164778, "tokens_seen": 1561604096 }, { "epoch": 4.06, "learning_rate": 0.00026609829488465395, "loss": 2.8395, "theoretical_loss": 3.503660471493066, "tokens_seen": 1561669632 }, { "epoch": 4.06, "learning_rate": 0.00026608826479438313, "loss": 2.7387, "theoretical_loss": 3.503647608060639, "tokens_seen": 1561735168 }, { "epoch": 4.06, "learning_rate": 0.00026607823470411237, "loss": 2.7896, "theoretical_loss": 3.503634745319133, "tokens_seen": 1561800704 }, { "epoch": 4.06, "learning_rate": 0.0002660682046138415, "loss": 2.7698, "theoretical_loss": 3.5036218832684805, "tokens_seen": 1561866240 }, { "epoch": 4.06, "learning_rate": 0.00026605817452357073, "loss": 2.7532, "theoretical_loss": 3.5036090219086153, "tokens_seen": 1561931776 }, { "epoch": 4.06, "learning_rate": 0.0002660481444332999, "loss": 2.6512, "theoretical_loss": 3.5035961612394724, "tokens_seen": 1561997312 }, { "epoch": 4.06, "learning_rate": 0.0002660481444332999, "loss": 2.7053, "theoretical_loss": 3.503583301260985, "tokens_seen": 1562062848 }, { "epoch": 4.06, "learning_rate": 0.0002660381143430291, "loss": 2.7713, "theoretical_loss": 3.5035704419730864, "tokens_seen": 1562128384 }, { "epoch": 4.06, "learning_rate": 0.0002660280842527583, "loss": 2.8611, "theoretical_loss": 3.503557583375712, "tokens_seen": 1562193920 }, { "epoch": 4.06, "learning_rate": 0.00026601805416248746, "loss": 2.714, "theoretical_loss": 3.5035447254687946, "tokens_seen": 1562259456 }, { "epoch": 4.06, "learning_rate": 0.00026600802407221664, "loss": 2.8299, "theoretical_loss": 3.503531868252269, "tokens_seen": 1562324992 }, { "epoch": 4.06, "learning_rate": 0.00026599799398194587, "loss": 2.781, "theoretical_loss": 3.503519011726068, "tokens_seen": 1562390528 }, { "epoch": 4.06, "learning_rate": 0.000265987963891675, "loss": 2.7591, "theoretical_loss": 3.5035061558901273, "tokens_seen": 1562456064 }, { "epoch": 4.06, "learning_rate": 0.00026597793380140423, "loss": 2.8415, "theoretical_loss": 3.5034933007443794, "tokens_seen": 1562521600 }, { "epoch": 4.06, "learning_rate": 0.00026596790371113336, "loss": 2.7243, "theoretical_loss": 3.5034804462887585, "tokens_seen": 1562587136 }, { "epoch": 4.06, "learning_rate": 0.0002659578736208626, "loss": 2.8866, "theoretical_loss": 3.503467592523199, "tokens_seen": 1562652672 }, { "epoch": 4.06, "learning_rate": 0.0002659478435305918, "loss": 2.694, "theoretical_loss": 3.503454739447635, "tokens_seen": 1562718208 }, { "epoch": 4.06, "learning_rate": 0.00026593781344032096, "loss": 2.8078, "theoretical_loss": 3.503441887062001, "tokens_seen": 1562783744 }, { "epoch": 4.06, "learning_rate": 0.00026592778335005014, "loss": 2.8109, "theoretical_loss": 3.5034290353662296, "tokens_seen": 1562849280 }, { "epoch": 4.06, "learning_rate": 0.0002659177532597794, "loss": 2.8179, "theoretical_loss": 3.5034161843602565, "tokens_seen": 1562914816 }, { "epoch": 4.06, "objective/train/docs_used": 2493331, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8963370323181152, "objective/train/theoretical_loss": 3.5034033340440143, "objective/train/tokens_used": 1583440352, "theoretical_loss": 3.5034033340440143, "tokens_seen": 1562980352 }, { "epoch": 4.06, "learning_rate": 0.0002659077231695085, "loss": 2.7916, "theoretical_loss": 3.5034033340440143, "tokens_seen": 1562980352 }, { "epoch": 4.06, "learning_rate": 0.00026589769307923774, "loss": 2.7087, "theoretical_loss": 3.5033904844174373, "tokens_seen": 1563045888 }, { "epoch": 4.06, "learning_rate": 0.00026588766298896686, "loss": 2.9769, "theoretical_loss": 3.5033776354804607, "tokens_seen": 1563111424 }, { "epoch": 4.06, "learning_rate": 0.0002658776328986961, "loss": 2.9657, "theoretical_loss": 3.503364787233017, "tokens_seen": 1563176960 }, { "epoch": 4.06, "learning_rate": 0.0002658676028084253, "loss": 2.9634, "theoretical_loss": 3.503351939675042, "tokens_seen": 1563242496 }, { "epoch": 4.06, "learning_rate": 0.00026585757271815446, "loss": 2.7202, "theoretical_loss": 3.5033390928064683, "tokens_seen": 1563308032 }, { "epoch": 4.06, "learning_rate": 0.00026584754262788364, "loss": 2.6102, "theoretical_loss": 3.5033262466272306, "tokens_seen": 1563373568 }, { "epoch": 4.06, "learning_rate": 0.0002658375125376128, "loss": 2.7398, "theoretical_loss": 3.5033134011372633, "tokens_seen": 1563439104 }, { "epoch": 4.06, "learning_rate": 0.000265827482447342, "loss": 2.8744, "theoretical_loss": 3.5033005563365003, "tokens_seen": 1563504640 }, { "epoch": 4.06, "learning_rate": 0.00026581745235707124, "loss": 2.9156, "theoretical_loss": 3.5032877122248753, "tokens_seen": 1563570176 }, { "epoch": 4.06, "learning_rate": 0.00026580742226680037, "loss": 2.8533, "theoretical_loss": 3.5032748688023236, "tokens_seen": 1563635712 }, { "epoch": 4.06, "learning_rate": 0.0002657973921765296, "loss": 2.8122, "theoretical_loss": 3.5032620260687777, "tokens_seen": 1563701248 }, { "epoch": 4.06, "learning_rate": 0.00026578736208625873, "loss": 2.7978, "theoretical_loss": 3.503249184024173, "tokens_seen": 1563766784 }, { "epoch": 4.06, "learning_rate": 0.00026577733199598797, "loss": 2.8632, "theoretical_loss": 3.5032363426684436, "tokens_seen": 1563832320 }, { "epoch": 4.06, "learning_rate": 0.00026576730190571715, "loss": 2.7906, "theoretical_loss": 3.503223502001523, "tokens_seen": 1563897856 }, { "epoch": 4.06, "learning_rate": 0.00026575727181544633, "loss": 2.8049, "theoretical_loss": 3.5032106620233465, "tokens_seen": 1563963392 }, { "epoch": 4.06, "learning_rate": 0.00026574724172517556, "loss": 2.636, "theoretical_loss": 3.5031978227338465, "tokens_seen": 1564028928 }, { "epoch": 4.06, "learning_rate": 0.00026573721163490474, "loss": 2.7839, "theoretical_loss": 3.503184984132959, "tokens_seen": 1564094464 }, { "epoch": 4.06, "learning_rate": 0.0002657271815446339, "loss": 2.9746, "theoretical_loss": 3.5031721462206176, "tokens_seen": 1564160000 }, { "epoch": 4.06, "learning_rate": 0.0002657171514543631, "loss": 2.6422, "theoretical_loss": 3.5031593089967563, "tokens_seen": 1564225536 }, { "epoch": 4.06, "learning_rate": 0.0002657071213640923, "loss": 2.5388, "theoretical_loss": 3.5031464724613093, "tokens_seen": 1564291072 }, { "epoch": 4.06, "learning_rate": 0.00026569709127382147, "loss": 2.8307, "theoretical_loss": 3.5031336366142116, "tokens_seen": 1564356608 }, { "epoch": 4.06, "learning_rate": 0.0002656870611835507, "loss": 2.851, "theoretical_loss": 3.5031208014553963, "tokens_seen": 1564422144 }, { "epoch": 4.06, "learning_rate": 0.00026567703109327983, "loss": 2.7459, "theoretical_loss": 3.5031079669847984, "tokens_seen": 1564487680 }, { "epoch": 4.06, "learning_rate": 0.00026566700100300907, "loss": 2.7261, "theoretical_loss": 3.5030951332023523, "tokens_seen": 1564553216 }, { "epoch": 4.06, "objective/train/docs_used": 2496156, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.800706148147583, "objective/train/theoretical_loss": 3.503082300107992, "objective/train/tokens_used": 1585078752, "theoretical_loss": 3.503082300107992, "tokens_seen": 1564618752 }, { "epoch": 4.06, "learning_rate": 0.0002656569709127382, "loss": 2.8627, "theoretical_loss": 3.503082300107992, "tokens_seen": 1564618752 }, { "epoch": 4.06, "learning_rate": 0.00026564694082246743, "loss": 2.7585, "theoretical_loss": 3.5030694677016507, "tokens_seen": 1564684288 }, { "epoch": 4.06, "learning_rate": 0.0002656369107321966, "loss": 2.8114, "theoretical_loss": 3.503056635983265, "tokens_seen": 1564749824 }, { "epoch": 4.06, "learning_rate": 0.0002656268806419258, "loss": 2.7525, "theoretical_loss": 3.5030438049527675, "tokens_seen": 1564815360 }, { "epoch": 4.06, "learning_rate": 0.00026561685055165497, "loss": 2.9207, "theoretical_loss": 3.503030974610093, "tokens_seen": 1564880896 }, { "epoch": 4.06, "learning_rate": 0.00026560682046138415, "loss": 2.8125, "theoretical_loss": 3.503018144955176, "tokens_seen": 1564946432 }, { "epoch": 4.06, "learning_rate": 0.00026559679037111333, "loss": 2.8898, "theoretical_loss": 3.5030053159879504, "tokens_seen": 1565011968 }, { "epoch": 4.06, "learning_rate": 0.00026558676028084257, "loss": 2.8283, "theoretical_loss": 3.502992487708351, "tokens_seen": 1565077504 }, { "epoch": 4.06, "learning_rate": 0.0002655767301905717, "loss": 2.7286, "theoretical_loss": 3.502979660116312, "tokens_seen": 1565143040 }, { "epoch": 4.06, "learning_rate": 0.00026556670010030093, "loss": 2.9253, "theoretical_loss": 3.502966833211768, "tokens_seen": 1565208576 }, { "epoch": 4.06, "learning_rate": 0.0002655566700100301, "loss": 2.7127, "theoretical_loss": 3.5029540069946528, "tokens_seen": 1565274112 }, { "epoch": 4.06, "learning_rate": 0.0002655466399197593, "loss": 2.6172, "theoretical_loss": 3.5029411814649007, "tokens_seen": 1565339648 }, { "epoch": 4.06, "learning_rate": 0.0002655366098294885, "loss": 2.8513, "theoretical_loss": 3.502928356622447, "tokens_seen": 1565405184 }, { "epoch": 4.06, "learning_rate": 0.00026552657973921766, "loss": 2.6793, "theoretical_loss": 3.502915532467225, "tokens_seen": 1565470720 }, { "epoch": 4.06, "learning_rate": 0.00026551654964894684, "loss": 2.7468, "theoretical_loss": 3.5029027089991702, "tokens_seen": 1565536256 }, { "epoch": 4.06, "learning_rate": 0.00026550651955867607, "loss": 2.8131, "theoretical_loss": 3.502889886218216, "tokens_seen": 1565601792 }, { "epoch": 4.06, "learning_rate": 0.0002654964894684052, "loss": 2.8883, "theoretical_loss": 3.5028770641242977, "tokens_seen": 1565667328 }, { "epoch": 4.06, "learning_rate": 0.00026548645937813443, "loss": 2.8219, "theoretical_loss": 3.502864242717349, "tokens_seen": 1565732864 }, { "epoch": 4.06, "learning_rate": 0.00026547642928786356, "loss": 2.884, "theoretical_loss": 3.5028514219973053, "tokens_seen": 1565798400 }, { "epoch": 4.06, "learning_rate": 0.0002654663991975928, "loss": 2.6794, "theoretical_loss": 3.5028386019640996, "tokens_seen": 1565863936 }, { "epoch": 4.06, "learning_rate": 0.000265456369107322, "loss": 2.8733, "theoretical_loss": 3.502825782617668, "tokens_seen": 1565929472 }, { "epoch": 4.06, "learning_rate": 0.00026544633901705116, "loss": 2.8091, "theoretical_loss": 3.5028129639579433, "tokens_seen": 1565995008 }, { "epoch": 4.06, "learning_rate": 0.00026543630892678034, "loss": 2.776, "theoretical_loss": 3.5028001459848617, "tokens_seen": 1566060544 }, { "epoch": 4.06, "learning_rate": 0.0002654262788365096, "loss": 2.8213, "theoretical_loss": 3.502787328698356, "tokens_seen": 1566126080 }, { "epoch": 4.06, "learning_rate": 0.0002654162487462387, "loss": 2.6281, "theoretical_loss": 3.5027745120983624, "tokens_seen": 1566191616 }, { "epoch": 4.06, "objective/train/docs_used": 2498947, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.993497848510742, "objective/train/theoretical_loss": 3.5027616961848143, "objective/train/tokens_used": 1586717152, "theoretical_loss": 3.5027616961848143, "tokens_seen": 1566257152 }, { "epoch": 4.06, "learning_rate": 0.00026540621865596794, "loss": 2.8296, "theoretical_loss": 3.5027616961848143, "tokens_seen": 1566257152 }, { "epoch": 4.06, "learning_rate": 0.00026539618856569707, "loss": 2.7515, "theoretical_loss": 3.502748880957646, "tokens_seen": 1566322688 }, { "epoch": 4.06, "learning_rate": 0.0002653861584754263, "loss": 2.8287, "theoretical_loss": 3.5027360664167926, "tokens_seen": 1566388224 }, { "epoch": 4.06, "learning_rate": 0.0002653761283851555, "loss": 2.7331, "theoretical_loss": 3.5027232525621885, "tokens_seen": 1566453760 }, { "epoch": 4.06, "learning_rate": 0.00026536609829488466, "loss": 2.7535, "theoretical_loss": 3.5027104393937685, "tokens_seen": 1566519296 }, { "epoch": 4.06, "learning_rate": 0.00026535606820461384, "loss": 2.6242, "theoretical_loss": 3.5026976269114662, "tokens_seen": 1566584832 }, { "epoch": 4.06, "learning_rate": 0.000265346038114343, "loss": 2.7438, "theoretical_loss": 3.5026848151152175, "tokens_seen": 1566650368 }, { "epoch": 4.06, "learning_rate": 0.0002653360080240722, "loss": 2.8149, "theoretical_loss": 3.5026720040049564, "tokens_seen": 1566715904 }, { "epoch": 4.06, "learning_rate": 0.00026532597793380144, "loss": 2.8187, "theoretical_loss": 3.502659193580617, "tokens_seen": 1566781440 }, { "epoch": 4.06, "learning_rate": 0.00026531594784353057, "loss": 2.9805, "theoretical_loss": 3.5026463838421344, "tokens_seen": 1566846976 }, { "epoch": 4.06, "learning_rate": 0.0002653059177532598, "loss": 2.6243, "theoretical_loss": 3.502633574789443, "tokens_seen": 1566912512 }, { "epoch": 4.06, "learning_rate": 0.00026529588766298893, "loss": 2.8421, "theoretical_loss": 3.5026207664224778, "tokens_seen": 1566978048 }, { "epoch": 4.06, "learning_rate": 0.00026528585757271817, "loss": 2.8715, "theoretical_loss": 3.502607958741173, "tokens_seen": 1567043584 }, { "epoch": 4.06, "learning_rate": 0.00026527582748244735, "loss": 2.8655, "theoretical_loss": 3.5025951517454628, "tokens_seen": 1567109120 }, { "epoch": 4.06, "learning_rate": 0.00026526579739217653, "loss": 2.7572, "theoretical_loss": 3.502582345435283, "tokens_seen": 1567174656 }, { "epoch": 4.06, "learning_rate": 0.0002652557673019057, "loss": 2.6732, "theoretical_loss": 3.502569539810567, "tokens_seen": 1567240192 }, { "epoch": 4.06, "learning_rate": 0.00026524573721163494, "loss": 2.8604, "theoretical_loss": 3.5025567348712507, "tokens_seen": 1567305728 }, { "epoch": 4.06, "learning_rate": 0.00026523570712136407, "loss": 2.8829, "theoretical_loss": 3.502543930617268, "tokens_seen": 1567371264 }, { "epoch": 4.06, "learning_rate": 0.0002652256770310933, "loss": 2.8483, "theoretical_loss": 3.5025311270485533, "tokens_seen": 1567436800 }, { "epoch": 4.06, "learning_rate": 0.00026521564694082243, "loss": 2.758, "theoretical_loss": 3.5025183241650417, "tokens_seen": 1567502336 }, { "epoch": 4.06, "learning_rate": 0.00026520561685055167, "loss": 2.785, "theoretical_loss": 3.5025055219666674, "tokens_seen": 1567567872 }, { "epoch": 4.06, "learning_rate": 0.00026519558676028085, "loss": 2.7104, "theoretical_loss": 3.5024927204533665, "tokens_seen": 1567633408 }, { "epoch": 4.06, "learning_rate": 0.00026518555667001003, "loss": 2.7126, "theoretical_loss": 3.5024799196250718, "tokens_seen": 1567698944 }, { "epoch": 4.06, "learning_rate": 0.0002651755265797392, "loss": 2.6655, "theoretical_loss": 3.5024671194817194, "tokens_seen": 1567764480 }, { "epoch": 4.06, "learning_rate": 0.0002651654964894684, "loss": 2.8181, "theoretical_loss": 3.5024543200232436, "tokens_seen": 1567830016 }, { "epoch": 4.06, "objective/train/docs_used": 2501833, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7367615699768066, "objective/train/theoretical_loss": 3.5024415212495787, "objective/train/tokens_used": 1588355552, "theoretical_loss": 3.5024415212495787, "tokens_seen": 1567895552 }, { "epoch": 4.06, "learning_rate": 0.0002651554663991976, "loss": 2.7664, "theoretical_loss": 3.5024415212495787, "tokens_seen": 1567895552 }, { "epoch": 4.06, "learning_rate": 0.0002651454363089268, "loss": 2.9091, "theoretical_loss": 3.5024287231606603, "tokens_seen": 1567961088 }, { "epoch": 4.06, "learning_rate": 0.00026513540621865594, "loss": 2.8975, "theoretical_loss": 3.5024159257564222, "tokens_seen": 1568026624 }, { "epoch": 4.06, "learning_rate": 0.00026512537612838517, "loss": 2.6881, "theoretical_loss": 3.5024031290368, "tokens_seen": 1568092160 }, { "epoch": 4.06, "learning_rate": 0.0002651153460381143, "loss": 2.8575, "theoretical_loss": 3.5023903330017276, "tokens_seen": 1568157696 }, { "epoch": 4.06, "learning_rate": 0.00026510531594784353, "loss": 2.8975, "theoretical_loss": 3.5023775376511406, "tokens_seen": 1568223232 }, { "epoch": 4.06, "learning_rate": 0.0002650952858575727, "loss": 2.916, "theoretical_loss": 3.5023647429849736, "tokens_seen": 1568288768 }, { "epoch": 4.06, "learning_rate": 0.0002650852557673019, "loss": 2.7826, "theoretical_loss": 3.502351949003161, "tokens_seen": 1568354304 }, { "epoch": 4.06, "learning_rate": 0.0002650752256770311, "loss": 2.6471, "theoretical_loss": 3.502339155705638, "tokens_seen": 1568419840 }, { "epoch": 4.06, "learning_rate": 0.0002650651955867603, "loss": 2.7344, "theoretical_loss": 3.502326363092339, "tokens_seen": 1568485376 }, { "epoch": 4.06, "learning_rate": 0.00026505516549648944, "loss": 2.763, "theoretical_loss": 3.5023135711631994, "tokens_seen": 1568550912 }, { "epoch": 4.06, "learning_rate": 0.0002650451354062187, "loss": 2.9773, "theoretical_loss": 3.502300779918153, "tokens_seen": 1568616448 }, { "epoch": 4.06, "learning_rate": 0.0002650351053159478, "loss": 2.8283, "theoretical_loss": 3.5022879893571357, "tokens_seen": 1568681984 }, { "epoch": 4.06, "learning_rate": 0.00026502507522567704, "loss": 2.7814, "theoretical_loss": 3.5022751994800823, "tokens_seen": 1568747520 }, { "epoch": 4.06, "learning_rate": 0.0002650150451354062, "loss": 2.7769, "theoretical_loss": 3.502262410286927, "tokens_seen": 1568813056 }, { "epoch": 4.06, "learning_rate": 0.0002650050150451354, "loss": 2.8043, "theoretical_loss": 3.5022496217776053, "tokens_seen": 1568878592 }, { "epoch": 4.06, "learning_rate": 0.00026499498495486464, "loss": 2.5854, "theoretical_loss": 3.5022368339520513, "tokens_seen": 1568944128 }, { "epoch": 4.06, "learning_rate": 0.00026498495486459376, "loss": 2.811, "theoretical_loss": 3.502224046810201, "tokens_seen": 1569009664 }, { "epoch": 4.06, "learning_rate": 0.000264974924774323, "loss": 2.8183, "theoretical_loss": 3.502211260351988, "tokens_seen": 1569075200 }, { "epoch": 4.06, "learning_rate": 0.0002649648946840522, "loss": 2.8462, "theoretical_loss": 3.502198474577348, "tokens_seen": 1569140736 }, { "epoch": 4.06, "learning_rate": 0.00026495486459378136, "loss": 2.8719, "theoretical_loss": 3.5021856894862164, "tokens_seen": 1569206272 }, { "epoch": 4.06, "learning_rate": 0.00026494483450351054, "loss": 2.8779, "theoretical_loss": 3.5021729050785266, "tokens_seen": 1569271808 }, { "epoch": 4.06, "learning_rate": 0.0002649348044132398, "loss": 2.6775, "theoretical_loss": 3.5021601213542146, "tokens_seen": 1569337344 }, { "epoch": 4.06, "learning_rate": 0.0002649247743229689, "loss": 2.7106, "theoretical_loss": 3.502147338313215, "tokens_seen": 1569402880 }, { "epoch": 4.06, "learning_rate": 0.00026491474423269814, "loss": 2.6868, "theoretical_loss": 3.5021345559554633, "tokens_seen": 1569468416 }, { "epoch": 4.06, "objective/train/docs_used": 2504634, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.525237560272217, "objective/train/theoretical_loss": 3.502121774280894, "objective/train/tokens_used": 1589993952, "theoretical_loss": 3.502121774280894, "tokens_seen": 1569533952 }, { "epoch": 4.06, "learning_rate": 0.00026490471414242727, "loss": 2.75, "theoretical_loss": 3.502121774280894, "tokens_seen": 1569533952 }, { "epoch": 4.06, "learning_rate": 0.0002648946840521565, "loss": 2.6886, "theoretical_loss": 3.5021089932894416, "tokens_seen": 1569599488 }, { "epoch": 4.06, "learning_rate": 0.0002648846539618857, "loss": 2.7856, "theoretical_loss": 3.502096212981042, "tokens_seen": 1569665024 }, { "epoch": 4.06, "learning_rate": 0.00026487462387161486, "loss": 2.9317, "theoretical_loss": 3.5020834333556294, "tokens_seen": 1569730560 }, { "epoch": 4.06, "learning_rate": 0.00026486459378134404, "loss": 2.6901, "theoretical_loss": 3.5020706544131395, "tokens_seen": 1569796096 }, { "epoch": 4.06, "learning_rate": 0.0002648545636910732, "loss": 2.8109, "theoretical_loss": 3.502057876153507, "tokens_seen": 1569861632 }, { "epoch": 4.06, "learning_rate": 0.0002648445336008024, "loss": 2.8369, "theoretical_loss": 3.502045098576666, "tokens_seen": 1569927168 }, { "epoch": 4.06, "learning_rate": 0.00026483450351053164, "loss": 2.6351, "theoretical_loss": 3.5020323216825533, "tokens_seen": 1569992704 }, { "epoch": 4.06, "learning_rate": 0.00026482447342026077, "loss": 2.7065, "theoretical_loss": 3.502019545471102, "tokens_seen": 1570058240 }, { "epoch": 4.06, "learning_rate": 0.00026481444332999, "loss": 2.8503, "theoretical_loss": 3.502006769942249, "tokens_seen": 1570123776 }, { "epoch": 4.06, "learning_rate": 0.00026480441323971913, "loss": 2.8745, "theoretical_loss": 3.501993995095928, "tokens_seen": 1570189312 }, { "epoch": 4.06, "learning_rate": 0.00026479438314944837, "loss": 2.6794, "theoretical_loss": 3.501981220932074, "tokens_seen": 1570254848 }, { "epoch": 4.06, "learning_rate": 0.00026478435305917755, "loss": 2.7932, "theoretical_loss": 3.5019684474506234, "tokens_seen": 1570320384 }, { "epoch": 4.06, "learning_rate": 0.00026477432296890673, "loss": 2.7202, "theoretical_loss": 3.5019556746515104, "tokens_seen": 1570385920 }, { "epoch": 4.06, "learning_rate": 0.0002647642928786359, "loss": 2.8404, "theoretical_loss": 3.5019429025346698, "tokens_seen": 1570451456 }, { "epoch": 4.06, "learning_rate": 0.00026475426278836514, "loss": 2.9148, "theoretical_loss": 3.501930131100037, "tokens_seen": 1570516992 }, { "epoch": 4.06, "learning_rate": 0.00026474423269809427, "loss": 2.7649, "theoretical_loss": 3.501917360347547, "tokens_seen": 1570582528 }, { "epoch": 4.06, "learning_rate": 0.0002647342026078235, "loss": 2.8115, "theoretical_loss": 3.501904590277135, "tokens_seen": 1570648064 }, { "epoch": 4.06, "learning_rate": 0.00026472417251755263, "loss": 2.5593, "theoretical_loss": 3.501891820888736, "tokens_seen": 1570713600 }, { "epoch": 4.06, "learning_rate": 0.00026471414242728187, "loss": 2.8495, "theoretical_loss": 3.5018790521822853, "tokens_seen": 1570779136 }, { "epoch": 4.06, "learning_rate": 0.00026470411233701105, "loss": 2.7081, "theoretical_loss": 3.501866284157718, "tokens_seen": 1570844672 }, { "epoch": 4.06, "learning_rate": 0.00026469408224674023, "loss": 2.8727, "theoretical_loss": 3.501853516814969, "tokens_seen": 1570910208 }, { "epoch": 4.06, "learning_rate": 0.0002646840521564694, "loss": 2.6568, "theoretical_loss": 3.501840750153974, "tokens_seen": 1570975744 }, { "epoch": 4.06, "learning_rate": 0.0002646740220661986, "loss": 2.7175, "theoretical_loss": 3.5018279841746676, "tokens_seen": 1571041280 }, { "epoch": 4.06, "learning_rate": 0.0002646639919759278, "loss": 2.8162, "theoretical_loss": 3.501815218876985, "tokens_seen": 1571106816 }, { "epoch": 4.06, "objective/train/docs_used": 2507445, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.663092613220215, "objective/train/theoretical_loss": 3.501802454260862, "objective/train/tokens_used": 1591632352, "theoretical_loss": 3.501802454260862, "tokens_seen": 1571172352 }, { "epoch": 4.06, "learning_rate": 0.000264653961885657, "loss": 2.6784, "theoretical_loss": 3.501802454260862, "tokens_seen": 1571172352 }, { "epoch": 4.06, "learning_rate": 0.00026464393179538614, "loss": 2.8073, "theoretical_loss": 3.501789690326233, "tokens_seen": 1571237888 }, { "epoch": 4.06, "learning_rate": 0.00026463390170511537, "loss": 2.7737, "theoretical_loss": 3.501776927073033, "tokens_seen": 1571303424 }, { "epoch": 4.06, "learning_rate": 0.0002646238716148445, "loss": 2.8274, "theoretical_loss": 3.5017641645011985, "tokens_seen": 1571368960 }, { "epoch": 4.06, "learning_rate": 0.00026461384152457373, "loss": 2.8801, "theoretical_loss": 3.5017514026106635, "tokens_seen": 1571434496 }, { "epoch": 4.06, "learning_rate": 0.0002646038114343029, "loss": 2.7773, "theoretical_loss": 3.5017386414013636, "tokens_seen": 1571500032 }, { "epoch": 4.06, "learning_rate": 0.0002645937813440321, "loss": 2.7748, "theoretical_loss": 3.5017258808732343, "tokens_seen": 1571565568 }, { "epoch": 4.06, "learning_rate": 0.0002645837512537613, "loss": 2.7166, "theoretical_loss": 3.501713121026211, "tokens_seen": 1571631104 }, { "epoch": 4.06, "learning_rate": 0.0002645737211634905, "loss": 2.7795, "theoretical_loss": 3.501700361860228, "tokens_seen": 1571696640 }, { "epoch": 4.06, "learning_rate": 0.00026456369107321964, "loss": 2.9204, "theoretical_loss": 3.5016876033752213, "tokens_seen": 1571762176 }, { "epoch": 4.06, "learning_rate": 0.0002645536609829489, "loss": 2.662, "theoretical_loss": 3.5016748455711255, "tokens_seen": 1571827712 }, { "epoch": 4.06, "learning_rate": 0.000264543630892678, "loss": 2.8956, "theoretical_loss": 3.501662088447877, "tokens_seen": 1571893248 }, { "epoch": 4.06, "learning_rate": 0.00026453360080240724, "loss": 2.8258, "theoretical_loss": 3.5016493320054103, "tokens_seen": 1571958784 }, { "epoch": 4.06, "learning_rate": 0.0002645235707121364, "loss": 2.945, "theoretical_loss": 3.5016365762436603, "tokens_seen": 1572024320 }, { "epoch": 4.06, "learning_rate": 0.0002645135406218656, "loss": 2.7954, "theoretical_loss": 3.501623821162563, "tokens_seen": 1572089856 }, { "epoch": 4.06, "learning_rate": 0.0002645035105315948, "loss": 2.8426, "theoretical_loss": 3.501611066762054, "tokens_seen": 1572155392 }, { "epoch": 4.06, "learning_rate": 0.00026449348044132396, "loss": 2.7741, "theoretical_loss": 3.501598313042068, "tokens_seen": 1572220928 }, { "epoch": 4.06, "learning_rate": 0.00026448345035105314, "loss": 2.8928, "theoretical_loss": 3.50158556000254, "tokens_seen": 1572286464 }, { "epoch": 4.06, "learning_rate": 0.0002644734202607824, "loss": 2.6463, "theoretical_loss": 3.5015728076434063, "tokens_seen": 1572352000 }, { "epoch": 4.06, "learning_rate": 0.0002644633901705115, "loss": 2.7951, "theoretical_loss": 3.5015600559646014, "tokens_seen": 1572417536 }, { "epoch": 4.06, "learning_rate": 0.00026445336008024074, "loss": 2.6684, "theoretical_loss": 3.501547304966061, "tokens_seen": 1572483072 }, { "epoch": 4.06, "learning_rate": 0.00026444332998996987, "loss": 2.6428, "theoretical_loss": 3.5015345546477206, "tokens_seen": 1572548608 }, { "epoch": 4.06, "learning_rate": 0.0002644332998996991, "loss": 2.9201, "theoretical_loss": 3.5015218050095154, "tokens_seen": 1572614144 }, { "epoch": 4.06, "learning_rate": 0.0002644232698094283, "loss": 2.6921, "theoretical_loss": 3.501509056051381, "tokens_seen": 1572679680 }, { "epoch": 4.06, "learning_rate": 0.00026441323971915747, "loss": 2.8257, "theoretical_loss": 3.501496307773252, "tokens_seen": 1572745216 }, { "epoch": 4.06, "objective/train/docs_used": 2508906, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8796980381011963, "objective/train/theoretical_loss": 3.501483560175065, "objective/train/tokens_used": 1593270752, "theoretical_loss": 3.501483560175065, "tokens_seen": 1572810752 }, { "epoch": 4.06, "learning_rate": 0.00026440320962888665, "loss": 2.8878, "theoretical_loss": 3.501483560175065, "tokens_seen": 1572810752 }, { "epoch": 4.06, "learning_rate": 0.0002643931795386159, "loss": 2.8654, "theoretical_loss": 3.5014708132567547, "tokens_seen": 1572876288 }, { "epoch": 4.06, "learning_rate": 0.000264383149448345, "loss": 2.7811, "theoretical_loss": 3.5014580670182562, "tokens_seen": 1572941824 }, { "epoch": 4.06, "learning_rate": 0.00026437311935807424, "loss": 2.902, "theoretical_loss": 3.5014453214595056, "tokens_seen": 1573007360 }, { "epoch": 4.06, "learning_rate": 0.00026436308926780337, "loss": 2.8129, "theoretical_loss": 3.501432576580438, "tokens_seen": 1573072896 }, { "epoch": 4.06, "learning_rate": 0.0002643530591775326, "loss": 2.8019, "theoretical_loss": 3.5014198323809893, "tokens_seen": 1573138432 }, { "epoch": 4.06, "learning_rate": 0.0002643430290872618, "loss": 2.6173, "theoretical_loss": 3.501407088861094, "tokens_seen": 1573203968 }, { "epoch": 4.06, "learning_rate": 0.00026433299899699097, "loss": 2.7722, "theoretical_loss": 3.501394346020688, "tokens_seen": 1573269504 }, { "epoch": 4.06, "learning_rate": 0.00026432296890672015, "loss": 2.782, "theoretical_loss": 3.5013816038597074, "tokens_seen": 1573335040 }, { "epoch": 4.06, "learning_rate": 0.00026431293881644933, "loss": 2.7094, "theoretical_loss": 3.5013688623780865, "tokens_seen": 1573400576 }, { "epoch": 4.06, "learning_rate": 0.0002643029087261785, "loss": 2.6864, "theoretical_loss": 3.5013561215757623, "tokens_seen": 1573466112 }, { "epoch": 4.06, "learning_rate": 0.00026429287863590775, "loss": 2.8888, "theoretical_loss": 3.501343381452669, "tokens_seen": 1573531648 }, { "epoch": 4.06, "learning_rate": 0.0002642828485456369, "loss": 2.8101, "theoretical_loss": 3.5013306420087424, "tokens_seen": 1573597184 }, { "epoch": 4.06, "learning_rate": 0.0002642728184553661, "loss": 2.7902, "theoretical_loss": 3.501317903243918, "tokens_seen": 1573662720 }, { "epoch": 4.06, "learning_rate": 0.00026426278836509534, "loss": 2.7672, "theoretical_loss": 3.501305165158132, "tokens_seen": 1573728256 }, { "epoch": 4.06, "learning_rate": 0.00026425275827482447, "loss": 2.9251, "theoretical_loss": 3.501292427751319, "tokens_seen": 1573793792 }, { "epoch": 4.06, "learning_rate": 0.0002642427281845537, "loss": 2.7763, "theoretical_loss": 3.5012796910234147, "tokens_seen": 1573859328 }, { "epoch": 4.06, "learning_rate": 0.00026423269809428283, "loss": 2.8182, "theoretical_loss": 3.5012669549743554, "tokens_seen": 1573924864 }, { "epoch": 4.06, "learning_rate": 0.00026422266800401207, "loss": 2.8677, "theoretical_loss": 3.5012542196040757, "tokens_seen": 1573990400 }, { "epoch": 4.06, "learning_rate": 0.00026421263791374125, "loss": 2.7401, "theoretical_loss": 3.501241484912512, "tokens_seen": 1574055936 }, { "epoch": 4.06, "learning_rate": 0.00026420260782347043, "loss": 2.8165, "theoretical_loss": 3.501228750899599, "tokens_seen": 1574121472 }, { "epoch": 4.06, "learning_rate": 0.0002641925777331996, "loss": 2.8186, "theoretical_loss": 3.501216017565273, "tokens_seen": 1574187008 }, { "epoch": 4.06, "learning_rate": 0.0002641825476429288, "loss": 2.5902, "theoretical_loss": 3.5012032849094696, "tokens_seen": 1574252544 }, { "epoch": 4.06, "learning_rate": 0.000264172517552658, "loss": 2.8164, "theoretical_loss": 3.501190552932124, "tokens_seen": 1574318080 }, { "epoch": 4.06, "learning_rate": 0.0002641624874623872, "loss": 2.7965, "theoretical_loss": 3.5011778216331715, "tokens_seen": 1574383616 }, { "epoch": 4.06, "objective/train/docs_used": 2511371, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8975937366485596, "objective/train/theoretical_loss": 3.501165091012549, "objective/train/tokens_used": 1594909152, "theoretical_loss": 3.501165091012549, "tokens_seen": 1574449152 }, { "epoch": 4.06, "learning_rate": 0.00026415245737211634, "loss": 2.865, "theoretical_loss": 3.501165091012549, "tokens_seen": 1574449152 }, { "epoch": 4.06, "learning_rate": 0.00026414242728184557, "loss": 2.6059, "theoretical_loss": 3.5011523610701905, "tokens_seen": 1574514688 }, { "epoch": 4.06, "learning_rate": 0.0002641323971915747, "loss": 2.7668, "theoretical_loss": 3.5011396318060326, "tokens_seen": 1574580224 }, { "epoch": 4.06, "learning_rate": 0.00026412236710130393, "loss": 2.7609, "theoretical_loss": 3.501126903220011, "tokens_seen": 1574645760 }, { "epoch": 4.06, "learning_rate": 0.0002641123370110331, "loss": 2.6646, "theoretical_loss": 3.5011141753120607, "tokens_seen": 1574711296 }, { "epoch": 4.06, "learning_rate": 0.0002641023069207623, "loss": 2.8494, "theoretical_loss": 3.5011014480821183, "tokens_seen": 1574776832 }, { "epoch": 4.06, "learning_rate": 0.0002640922768304915, "loss": 2.9463, "theoretical_loss": 3.5010887215301185, "tokens_seen": 1574842368 }, { "epoch": 4.06, "learning_rate": 0.0002640822467402207, "loss": 2.7621, "theoretical_loss": 3.501075995655998, "tokens_seen": 1574907904 }, { "epoch": 4.06, "learning_rate": 0.00026407221664994984, "loss": 2.989, "theoretical_loss": 3.501063270459691, "tokens_seen": 1574973440 }, { "epoch": 4.06, "learning_rate": 0.0002640621865596791, "loss": 2.8515, "theoretical_loss": 3.5010505459411343, "tokens_seen": 1575038976 }, { "epoch": 4.06, "learning_rate": 0.0002640521564694082, "loss": 2.655, "theoretical_loss": 3.5010378221002636, "tokens_seen": 1575104512 }, { "epoch": 4.06, "learning_rate": 0.00026404212637913744, "loss": 2.933, "theoretical_loss": 3.501025098937015, "tokens_seen": 1575170048 }, { "epoch": 4.06, "learning_rate": 0.0002640320962888666, "loss": 2.92, "theoretical_loss": 3.501012376451323, "tokens_seen": 1575235584 }, { "epoch": 4.06, "learning_rate": 0.0002640220661985958, "loss": 2.8554, "theoretical_loss": 3.500999654643124, "tokens_seen": 1575301120 }, { "epoch": 4.06, "learning_rate": 0.000264012036108325, "loss": 2.7702, "theoretical_loss": 3.500986933512354, "tokens_seen": 1575366656 }, { "epoch": 4.06, "learning_rate": 0.00026400200601805416, "loss": 2.9654, "theoretical_loss": 3.5009742130589485, "tokens_seen": 1575432192 }, { "epoch": 4.06, "learning_rate": 0.00026399197592778334, "loss": 2.8165, "theoretical_loss": 3.500961493282843, "tokens_seen": 1575497728 }, { "epoch": 4.06, "learning_rate": 0.0002639819458375126, "loss": 2.7741, "theoretical_loss": 3.500948774183973, "tokens_seen": 1575563264 }, { "epoch": 4.06, "learning_rate": 0.0002639719157472417, "loss": 2.9574, "theoretical_loss": 3.500936055762275, "tokens_seen": 1575628800 }, { "epoch": 4.06, "learning_rate": 0.00026396188565697094, "loss": 2.7267, "theoretical_loss": 3.500923338017685, "tokens_seen": 1575694336 }, { "epoch": 4.06, "learning_rate": 0.00026395185556670007, "loss": 2.7808, "theoretical_loss": 3.500910620950138, "tokens_seen": 1575759872 }, { "epoch": 4.06, "learning_rate": 0.0002639418254764293, "loss": 2.7061, "theoretical_loss": 3.5008979045595705, "tokens_seen": 1575825408 }, { "epoch": 4.06, "learning_rate": 0.0002639317953861585, "loss": 2.821, "theoretical_loss": 3.500885188845918, "tokens_seen": 1575890944 }, { "epoch": 4.06, "learning_rate": 0.00026392176529588767, "loss": 2.7796, "theoretical_loss": 3.5008724738091157, "tokens_seen": 1575956480 }, { "epoch": 4.06, "learning_rate": 0.00026391173520561685, "loss": 2.644, "theoretical_loss": 3.5008597594491, "tokens_seen": 1576022016 }, { "epoch": 4.06, "objective/train/docs_used": 2514179, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6059908866882324, "objective/train/theoretical_loss": 3.5008470457658074, "objective/train/tokens_used": 1596547552, "theoretical_loss": 3.5008470457658074, "tokens_seen": 1576087552 }, { "epoch": 4.06, "learning_rate": 0.0002639017051153461, "loss": 2.8047, "theoretical_loss": 3.5008470457658074, "tokens_seen": 1576087552 }, { "epoch": 4.06, "learning_rate": 0.0002638916750250752, "loss": 2.7043, "theoretical_loss": 3.5008343327591724, "tokens_seen": 1576153088 }, { "epoch": 4.06, "learning_rate": 0.00026388164493480444, "loss": 2.6549, "theoretical_loss": 3.5008216204291314, "tokens_seen": 1576218624 }, { "epoch": 4.06, "learning_rate": 0.00026387161484453357, "loss": 2.8756, "theoretical_loss": 3.500808908775621, "tokens_seen": 1576284160 }, { "epoch": 4.06, "learning_rate": 0.0002638615847542628, "loss": 2.8774, "theoretical_loss": 3.500796197798576, "tokens_seen": 1576349696 }, { "epoch": 4.06, "learning_rate": 0.000263851554663992, "loss": 2.7542, "theoretical_loss": 3.500783487497933, "tokens_seen": 1576415232 }, { "epoch": 4.06, "learning_rate": 0.00026384152457372117, "loss": 2.8839, "theoretical_loss": 3.500770777873628, "tokens_seen": 1576480768 }, { "epoch": 4.06, "learning_rate": 0.00026383149448345035, "loss": 2.72, "theoretical_loss": 3.5007580689255957, "tokens_seen": 1576546304 }, { "epoch": 4.06, "learning_rate": 0.00026382146439317953, "loss": 2.8264, "theoretical_loss": 3.5007453606537733, "tokens_seen": 1576611840 }, { "epoch": 4.06, "learning_rate": 0.0002638114343029087, "loss": 2.7289, "theoretical_loss": 3.500732653058096, "tokens_seen": 1576677376 }, { "epoch": 4.06, "learning_rate": 0.00026380140421263795, "loss": 2.8864, "theoretical_loss": 3.5007199461385, "tokens_seen": 1576742912 }, { "epoch": 4.06, "learning_rate": 0.0002637913741223671, "loss": 2.7102, "theoretical_loss": 3.5007072398949215, "tokens_seen": 1576808448 }, { "epoch": 4.06, "learning_rate": 0.0002637813440320963, "loss": 2.7375, "theoretical_loss": 3.500694534327296, "tokens_seen": 1576873984 }, { "epoch": 4.06, "learning_rate": 0.0002637713139418255, "loss": 2.9986, "theoretical_loss": 3.5006818294355595, "tokens_seen": 1576939520 }, { "epoch": 4.06, "learning_rate": 0.00026376128385155467, "loss": 2.8754, "theoretical_loss": 3.500669125219648, "tokens_seen": 1577005056 }, { "epoch": 4.06, "learning_rate": 0.00026375125376128385, "loss": 2.8449, "theoretical_loss": 3.5006564216794978, "tokens_seen": 1577070592 }, { "epoch": 4.06, "learning_rate": 0.00026374122367101303, "loss": 2.7767, "theoretical_loss": 3.500643718815044, "tokens_seen": 1577136128 }, { "epoch": 4.06, "learning_rate": 0.0002637311935807422, "loss": 2.7695, "theoretical_loss": 3.5006310166262233, "tokens_seen": 1577201664 }, { "epoch": 4.06, "learning_rate": 0.00026372116349047145, "loss": 2.8546, "theoretical_loss": 3.500618315112972, "tokens_seen": 1577267200 }, { "epoch": 4.06, "learning_rate": 0.0002637111334002006, "loss": 2.8352, "theoretical_loss": 3.5006056142752255, "tokens_seen": 1577332736 }, { "epoch": 4.06, "learning_rate": 0.0002637011033099298, "loss": 2.7555, "theoretical_loss": 3.5005929141129197, "tokens_seen": 1577398272 }, { "epoch": 4.06, "learning_rate": 0.00026369107321965894, "loss": 2.6095, "theoretical_loss": 3.500580214625991, "tokens_seen": 1577463808 }, { "epoch": 4.06, "learning_rate": 0.0002636810431293882, "loss": 2.7694, "theoretical_loss": 3.5005675158143754, "tokens_seen": 1577529344 }, { "epoch": 4.06, "learning_rate": 0.00026367101303911736, "loss": 2.892, "theoretical_loss": 3.500554817678009, "tokens_seen": 1577594880 }, { "epoch": 4.06, "learning_rate": 0.00026366098294884654, "loss": 2.6924, "theoretical_loss": 3.500542120216828, "tokens_seen": 1577660416 }, { "epoch": 4.06, "objective/train/docs_used": 2517085, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8722169399261475, "objective/train/theoretical_loss": 3.5005294234307676, "objective/train/tokens_used": 1598185952, "theoretical_loss": 3.5005294234307676, "tokens_seen": 1577725952 }, { "epoch": 4.06, "learning_rate": 0.0002636509528585757, "loss": 2.808, "theoretical_loss": 3.5005294234307676, "tokens_seen": 1577725952 }, { "epoch": 4.06, "learning_rate": 0.0002636409227683049, "loss": 2.7467, "theoretical_loss": 3.5005167273197646, "tokens_seen": 1577791488 }, { "epoch": 4.06, "learning_rate": 0.0002636308926780341, "loss": 2.6945, "theoretical_loss": 3.5005040318837546, "tokens_seen": 1577857024 }, { "epoch": 4.06, "learning_rate": 0.0002636208625877633, "loss": 2.7759, "theoretical_loss": 3.5004913371226745, "tokens_seen": 1577922560 }, { "epoch": 4.06, "learning_rate": 0.00026361083249749244, "loss": 2.9137, "theoretical_loss": 3.5004786430364594, "tokens_seen": 1577988096 }, { "epoch": 4.06, "learning_rate": 0.0002636008024072217, "loss": 2.787, "theoretical_loss": 3.5004659496250463, "tokens_seen": 1578053632 }, { "epoch": 4.06, "learning_rate": 0.00026359077231695086, "loss": 2.8659, "theoretical_loss": 3.5004532568883704, "tokens_seen": 1578119168 }, { "epoch": 4.06, "learning_rate": 0.00026358074222668004, "loss": 2.8359, "theoretical_loss": 3.5004405648263686, "tokens_seen": 1578184704 }, { "epoch": 4.06, "learning_rate": 0.0002635707121364092, "loss": 2.7166, "theoretical_loss": 3.5004278734389764, "tokens_seen": 1578250240 }, { "epoch": 4.06, "learning_rate": 0.0002635606820461384, "loss": 2.7769, "theoretical_loss": 3.5004151827261305, "tokens_seen": 1578315776 }, { "epoch": 4.06, "learning_rate": 0.0002635506519558676, "loss": 2.8659, "theoretical_loss": 3.500402492687767, "tokens_seen": 1578381312 }, { "epoch": 4.06, "learning_rate": 0.0002635406218655968, "loss": 2.5946, "theoretical_loss": 3.500389803323821, "tokens_seen": 1578446848 }, { "epoch": 4.06, "learning_rate": 0.00026353059177532595, "loss": 2.8308, "theoretical_loss": 3.5003771146342304, "tokens_seen": 1578512384 }, { "epoch": 4.06, "learning_rate": 0.0002635205616850552, "loss": 2.9744, "theoretical_loss": 3.50036442661893, "tokens_seen": 1578577920 }, { "epoch": 4.06, "learning_rate": 0.00026351053159478436, "loss": 2.6041, "theoretical_loss": 3.500351739277857, "tokens_seen": 1578643456 }, { "epoch": 4.06, "learning_rate": 0.00026350050150451354, "loss": 2.6611, "theoretical_loss": 3.500339052610946, "tokens_seen": 1578708992 }, { "epoch": 4.06, "learning_rate": 0.0002634904714142428, "loss": 2.7534, "theoretical_loss": 3.5003263666181352, "tokens_seen": 1578774528 }, { "epoch": 4.06, "learning_rate": 0.0002634804413239719, "loss": 2.7181, "theoretical_loss": 3.5003136812993594, "tokens_seen": 1578840064 }, { "epoch": 4.06, "learning_rate": 0.00026347041123370114, "loss": 2.9039, "theoretical_loss": 3.5003009966545546, "tokens_seen": 1578905600 }, { "epoch": 4.06, "learning_rate": 0.00026346038114343027, "loss": 2.9452, "theoretical_loss": 3.500288312683659, "tokens_seen": 1578971136 }, { "epoch": 4.06, "learning_rate": 0.0002634503510531595, "loss": 2.8619, "theoretical_loss": 3.500275629386606, "tokens_seen": 1579036672 }, { "epoch": 4.06, "learning_rate": 0.0002634403209628887, "loss": 2.8681, "theoretical_loss": 3.500262946763334, "tokens_seen": 1579102208 }, { "epoch": 4.06, "learning_rate": 0.00026343029087261787, "loss": 2.8443, "theoretical_loss": 3.5002502648137788, "tokens_seen": 1579167744 }, { "epoch": 4.06, "learning_rate": 0.00026342026078234705, "loss": 2.9463, "theoretical_loss": 3.500237583537876, "tokens_seen": 1579233280 }, { "epoch": 4.06, "learning_rate": 0.0002634102306920763, "loss": 2.8692, "theoretical_loss": 3.5002249029355625, "tokens_seen": 1579298816 }, { "epoch": 4.06, "objective/train/docs_used": 2519888, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9177846908569336, "objective/train/theoretical_loss": 3.5002122230067743, "objective/train/tokens_used": 1599824352, "theoretical_loss": 3.5002122230067743, "tokens_seen": 1579364352 }, { "epoch": 4.06, "learning_rate": 0.0002634002006018054, "loss": 2.8571, "theoretical_loss": 3.5002122230067743, "tokens_seen": 1579364352 }, { "epoch": 4.06, "learning_rate": 0.00026339017051153464, "loss": 2.7605, "theoretical_loss": 3.5001995437514477, "tokens_seen": 1579429888 }, { "epoch": 4.06, "learning_rate": 0.00026338014042126377, "loss": 2.8638, "theoretical_loss": 3.500186865169519, "tokens_seen": 1579495424 }, { "epoch": 4.06, "learning_rate": 0.000263370110330993, "loss": 2.7705, "theoretical_loss": 3.5001741872609244, "tokens_seen": 1579560960 }, { "epoch": 4.06, "learning_rate": 0.0002633600802407222, "loss": 2.8799, "theoretical_loss": 3.5001615100256007, "tokens_seen": 1579626496 }, { "epoch": 4.06, "learning_rate": 0.00026335005015045137, "loss": 2.7805, "theoretical_loss": 3.5001488334634834, "tokens_seen": 1579692032 }, { "epoch": 4.06, "learning_rate": 0.00026334002006018055, "loss": 2.9034, "theoretical_loss": 3.5001361575745094, "tokens_seen": 1579757568 }, { "epoch": 4.06, "learning_rate": 0.00026332998996990973, "loss": 2.958, "theoretical_loss": 3.5001234823586147, "tokens_seen": 1579823104 }, { "epoch": 4.06, "learning_rate": 0.0002633199598796389, "loss": 2.74, "theoretical_loss": 3.500110807815736, "tokens_seen": 1579888640 }, { "epoch": 4.06, "learning_rate": 0.00026330992978936815, "loss": 2.8079, "theoretical_loss": 3.5000981339458095, "tokens_seen": 1579954176 }, { "epoch": 4.06, "learning_rate": 0.0002632998996990973, "loss": 2.856, "theoretical_loss": 3.5000854607487715, "tokens_seen": 1580019712 }, { "epoch": 4.06, "learning_rate": 0.0002632898696088265, "loss": 2.8974, "theoretical_loss": 3.5000727882245584, "tokens_seen": 1580085248 }, { "epoch": 4.06, "learning_rate": 0.0002632798395185557, "loss": 2.7823, "theoretical_loss": 3.500060116373106, "tokens_seen": 1580150784 }, { "epoch": 4.06, "learning_rate": 0.00026326980942828487, "loss": 2.8216, "theoretical_loss": 3.5000474451943524, "tokens_seen": 1580216320 }, { "epoch": 4.06, "learning_rate": 0.00026325977933801405, "loss": 2.7007, "theoretical_loss": 3.500034774688232, "tokens_seen": 1580281856 }, { "epoch": 4.06, "learning_rate": 0.00026324974924774323, "loss": 2.8624, "theoretical_loss": 3.5000221048546827, "tokens_seen": 1580347392 }, { "epoch": 4.06, "learning_rate": 0.0002632397191574724, "loss": 2.7506, "theoretical_loss": 3.5000094356936398, "tokens_seen": 1580412928 }, { "epoch": 4.06, "learning_rate": 0.00026322968906720165, "loss": 2.732, "theoretical_loss": 3.4999967672050403, "tokens_seen": 1580478464 }, { "epoch": 4.06, "learning_rate": 0.0002632196589769308, "loss": 2.8169, "theoretical_loss": 3.4999840993888203, "tokens_seen": 1580544000 }, { "epoch": 4.06, "learning_rate": 0.00026320962888666, "loss": 2.8857, "theoretical_loss": 3.4999714322449167, "tokens_seen": 1580609536 }, { "epoch": 4.06, "learning_rate": 0.00026319959879638914, "loss": 2.6696, "theoretical_loss": 3.4999587657732656, "tokens_seen": 1580675072 }, { "epoch": 4.06, "learning_rate": 0.0002631895687061184, "loss": 2.9362, "theoretical_loss": 3.4999460999738035, "tokens_seen": 1580740608 }, { "epoch": 4.06, "learning_rate": 0.00026317953861584756, "loss": 2.8173, "theoretical_loss": 3.499933434846467, "tokens_seen": 1580806144 }, { "epoch": 4.06, "learning_rate": 0.00026316950852557674, "loss": 2.6503, "theoretical_loss": 3.499920770391192, "tokens_seen": 1580871680 }, { "epoch": 4.06, "learning_rate": 0.0002631594784353059, "loss": 2.6558, "theoretical_loss": 3.499908106607916, "tokens_seen": 1580937216 }, { "epoch": 4.06, "objective/train/docs_used": 2522805, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7653751373291016, "objective/train/theoretical_loss": 3.499895443496575, "objective/train/tokens_used": 1601462752, "theoretical_loss": 3.499895443496575, "tokens_seen": 1581002752 }, { "epoch": 4.06, "learning_rate": 0.0002631494483450351, "loss": 2.729, "theoretical_loss": 3.499895443496575, "tokens_seen": 1581002752 }, { "epoch": 4.06, "learning_rate": 0.0002631394182547643, "loss": 2.9083, "theoretical_loss": 3.499882781057105, "tokens_seen": 1581068288 }, { "epoch": 4.06, "learning_rate": 0.0002631293881644935, "loss": 2.888, "theoretical_loss": 3.4998701192894437, "tokens_seen": 1581133824 }, { "epoch": 4.06, "learning_rate": 0.00026311935807422264, "loss": 2.9569, "theoretical_loss": 3.4998574581935262, "tokens_seen": 1581199360 }, { "epoch": 4.06, "learning_rate": 0.0002631093279839519, "loss": 2.8017, "theoretical_loss": 3.4998447977692897, "tokens_seen": 1581264896 }, { "epoch": 4.06, "learning_rate": 0.00026309929789368106, "loss": 2.684, "theoretical_loss": 3.4998321380166706, "tokens_seen": 1581330432 }, { "epoch": 4.07, "learning_rate": 0.00026308926780341024, "loss": 2.8231, "theoretical_loss": 3.4998194789356063, "tokens_seen": 1581395968 }, { "epoch": 4.07, "learning_rate": 0.0002630792377131394, "loss": 2.6808, "theoretical_loss": 3.4998068205260315, "tokens_seen": 1581461504 }, { "epoch": 4.07, "learning_rate": 0.0002630692076228686, "loss": 2.8281, "theoretical_loss": 3.499794162787885, "tokens_seen": 1581527040 }, { "epoch": 4.07, "learning_rate": 0.0002630591775325978, "loss": 2.7832, "theoretical_loss": 3.4997815057211015, "tokens_seen": 1581592576 }, { "epoch": 4.07, "learning_rate": 0.000263049147442327, "loss": 2.8391, "theoretical_loss": 3.4997688493256183, "tokens_seen": 1581658112 }, { "epoch": 4.07, "learning_rate": 0.00026303911735205615, "loss": 3.0013, "theoretical_loss": 3.499756193601372, "tokens_seen": 1581723648 }, { "epoch": 4.07, "learning_rate": 0.0002630290872617854, "loss": 2.9971, "theoretical_loss": 3.4997435385483, "tokens_seen": 1581789184 }, { "epoch": 4.07, "learning_rate": 0.0002630190571715145, "loss": 2.8255, "theoretical_loss": 3.499730884166337, "tokens_seen": 1581854720 }, { "epoch": 4.07, "learning_rate": 0.00026300902708124374, "loss": 2.895, "theoretical_loss": 3.4997182304554206, "tokens_seen": 1581920256 }, { "epoch": 4.07, "learning_rate": 0.0002629989969909729, "loss": 2.8882, "theoretical_loss": 3.4997055774154884, "tokens_seen": 1581985792 }, { "epoch": 4.07, "learning_rate": 0.0002629889669007021, "loss": 2.8686, "theoretical_loss": 3.4996929250464754, "tokens_seen": 1582051328 }, { "epoch": 4.07, "learning_rate": 0.0002629789368104313, "loss": 2.723, "theoretical_loss": 3.499680273348319, "tokens_seen": 1582116864 }, { "epoch": 4.07, "learning_rate": 0.00026296890672016047, "loss": 2.7029, "theoretical_loss": 3.499667622320956, "tokens_seen": 1582182400 }, { "epoch": 4.07, "learning_rate": 0.00026295887662988965, "loss": 2.7879, "theoretical_loss": 3.499654971964323, "tokens_seen": 1582247936 }, { "epoch": 4.07, "learning_rate": 0.0002629488465396189, "loss": 2.6963, "theoretical_loss": 3.4996423222783566, "tokens_seen": 1582313472 }, { "epoch": 4.07, "learning_rate": 0.000262938816449348, "loss": 2.7866, "theoretical_loss": 3.4996296732629926, "tokens_seen": 1582379008 }, { "epoch": 4.07, "learning_rate": 0.00026292878635907725, "loss": 2.8255, "theoretical_loss": 3.4996170249181695, "tokens_seen": 1582444544 }, { "epoch": 4.07, "learning_rate": 0.00026291875626880643, "loss": 2.8285, "theoretical_loss": 3.499604377243822, "tokens_seen": 1582510080 }, { "epoch": 4.07, "learning_rate": 0.0002629087261785356, "loss": 2.9496, "theoretical_loss": 3.4995917302398882, "tokens_seen": 1582575616 }, { "epoch": 4.07, "objective/train/docs_used": 2525316, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9854226112365723, "objective/train/theoretical_loss": 3.499579083906305, "objective/train/tokens_used": 1603101152, "theoretical_loss": 3.499579083906305, "tokens_seen": 1582641152 }, { "epoch": 4.07, "learning_rate": 0.0002628986960882648, "loss": 2.8323, "theoretical_loss": 3.499579083906305, "tokens_seen": 1582641152 }, { "epoch": 4.07, "learning_rate": 0.00026288866599799397, "loss": 2.6216, "theoretical_loss": 3.499566438243007, "tokens_seen": 1582706688 }, { "epoch": 4.07, "learning_rate": 0.00026287863590772315, "loss": 2.6857, "theoretical_loss": 3.4995537932499334, "tokens_seen": 1582772224 }, { "epoch": 4.07, "learning_rate": 0.0002628686058174524, "loss": 2.8197, "theoretical_loss": 3.49954114892702, "tokens_seen": 1582837760 }, { "epoch": 4.07, "learning_rate": 0.0002628585757271815, "loss": 2.6561, "theoretical_loss": 3.4995285052742027, "tokens_seen": 1582903296 }, { "epoch": 4.07, "learning_rate": 0.00026284854563691075, "loss": 2.8458, "theoretical_loss": 3.4995158622914193, "tokens_seen": 1582968832 }, { "epoch": 4.07, "learning_rate": 0.0002628385155466399, "loss": 2.7215, "theoretical_loss": 3.4995032199786062, "tokens_seen": 1583034368 }, { "epoch": 4.07, "learning_rate": 0.0002628284854563691, "loss": 2.8373, "theoretical_loss": 3.4994905783357004, "tokens_seen": 1583099904 }, { "epoch": 4.07, "learning_rate": 0.0002628184553660983, "loss": 2.7527, "theoretical_loss": 3.4994779373626383, "tokens_seen": 1583165440 }, { "epoch": 4.07, "learning_rate": 0.0002628084252758275, "loss": 2.6878, "theoretical_loss": 3.499465297059357, "tokens_seen": 1583230976 }, { "epoch": 4.07, "learning_rate": 0.00026279839518555666, "loss": 2.8464, "theoretical_loss": 3.4994526574257927, "tokens_seen": 1583296512 }, { "epoch": 4.07, "learning_rate": 0.0002627883650952859, "loss": 2.8827, "theoretical_loss": 3.4994400184618835, "tokens_seen": 1583362048 }, { "epoch": 4.07, "learning_rate": 0.000262778335005015, "loss": 2.6565, "theoretical_loss": 3.499427380167565, "tokens_seen": 1583427584 }, { "epoch": 4.07, "learning_rate": 0.00026276830491474425, "loss": 2.6964, "theoretical_loss": 3.4994147425427737, "tokens_seen": 1583493120 }, { "epoch": 4.07, "learning_rate": 0.00026275827482447343, "loss": 2.6934, "theoretical_loss": 3.499402105587448, "tokens_seen": 1583558656 }, { "epoch": 4.07, "learning_rate": 0.0002627482447342026, "loss": 2.8422, "theoretical_loss": 3.499389469301523, "tokens_seen": 1583624192 }, { "epoch": 4.07, "learning_rate": 0.00026273821464393185, "loss": 2.8456, "theoretical_loss": 3.499376833684937, "tokens_seen": 1583689728 }, { "epoch": 4.07, "learning_rate": 0.000262728184553661, "loss": 2.7791, "theoretical_loss": 3.499364198737626, "tokens_seen": 1583755264 }, { "epoch": 4.07, "learning_rate": 0.0002627181544633902, "loss": 2.9081, "theoretical_loss": 3.499351564459527, "tokens_seen": 1583820800 }, { "epoch": 4.07, "learning_rate": 0.00026270812437311934, "loss": 2.9134, "theoretical_loss": 3.499338930850577, "tokens_seen": 1583886336 }, { "epoch": 4.07, "learning_rate": 0.0002626980942828486, "loss": 2.8652, "theoretical_loss": 3.499326297910713, "tokens_seen": 1583951872 }, { "epoch": 4.07, "learning_rate": 0.00026268806419257776, "loss": 2.8433, "theoretical_loss": 3.499313665639871, "tokens_seen": 1584017408 }, { "epoch": 4.07, "learning_rate": 0.00026267803410230694, "loss": 2.8823, "theoretical_loss": 3.4993010340379893, "tokens_seen": 1584082944 }, { "epoch": 4.07, "learning_rate": 0.0002626680040120361, "loss": 2.9539, "theoretical_loss": 3.4992884031050036, "tokens_seen": 1584148480 }, { "epoch": 4.07, "learning_rate": 0.0002626579739217653, "loss": 2.8248, "theoretical_loss": 3.4992757728408517, "tokens_seen": 1584214016 }, { "epoch": 4.07, "objective/train/docs_used": 2528122, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9905638694763184, "objective/train/theoretical_loss": 3.4992631432454697, "objective/train/tokens_used": 1604739552, "theoretical_loss": 3.4992631432454697, "tokens_seen": 1584279552 }, { "epoch": 4.07, "learning_rate": 0.0002626479438314945, "loss": 2.9598, "theoretical_loss": 3.4992631432454697, "tokens_seen": 1584279552 }, { "epoch": 4.07, "learning_rate": 0.0002626379137412237, "loss": 2.756, "theoretical_loss": 3.4992505143187955, "tokens_seen": 1584345088 }, { "epoch": 4.07, "learning_rate": 0.00026262788365095284, "loss": 2.9008, "theoretical_loss": 3.499237886060765, "tokens_seen": 1584410624 }, { "epoch": 4.07, "learning_rate": 0.0002626178535606821, "loss": 2.8346, "theoretical_loss": 3.499225258471316, "tokens_seen": 1584476160 }, { "epoch": 4.07, "learning_rate": 0.00026260782347041126, "loss": 2.8013, "theoretical_loss": 3.4992126315503844, "tokens_seen": 1584541696 }, { "epoch": 4.07, "learning_rate": 0.00026259779338014044, "loss": 2.8069, "theoretical_loss": 3.499200005297909, "tokens_seen": 1584607232 }, { "epoch": 4.07, "learning_rate": 0.0002625877632898696, "loss": 2.7587, "theoretical_loss": 3.4991873797138244, "tokens_seen": 1584672768 }, { "epoch": 4.07, "learning_rate": 0.0002625777331995988, "loss": 2.6986, "theoretical_loss": 3.4991747547980694, "tokens_seen": 1584738304 }, { "epoch": 4.07, "learning_rate": 0.000262567703109328, "loss": 2.6785, "theoretical_loss": 3.4991621305505802, "tokens_seen": 1584803840 }, { "epoch": 4.07, "learning_rate": 0.0002625576730190572, "loss": 2.9783, "theoretical_loss": 3.499149506971294, "tokens_seen": 1584869376 }, { "epoch": 4.07, "learning_rate": 0.00026254764292878635, "loss": 2.8849, "theoretical_loss": 3.499136884060148, "tokens_seen": 1584934912 }, { "epoch": 4.07, "learning_rate": 0.0002625376128385156, "loss": 2.8065, "theoretical_loss": 3.4991242618170784, "tokens_seen": 1585000448 }, { "epoch": 4.07, "learning_rate": 0.0002625275827482447, "loss": 2.9249, "theoretical_loss": 3.499111640242023, "tokens_seen": 1585065984 }, { "epoch": 4.07, "learning_rate": 0.00026251755265797394, "loss": 2.8391, "theoretical_loss": 3.4990990193349187, "tokens_seen": 1585131520 }, { "epoch": 4.07, "learning_rate": 0.0002625075225677031, "loss": 2.7016, "theoretical_loss": 3.4990863990957024, "tokens_seen": 1585197056 }, { "epoch": 4.07, "learning_rate": 0.0002624974924774323, "loss": 2.8311, "theoretical_loss": 3.499073779524311, "tokens_seen": 1585262592 }, { "epoch": 4.07, "learning_rate": 0.0002624874623871615, "loss": 2.9082, "theoretical_loss": 3.4990611606206823, "tokens_seen": 1585328128 }, { "epoch": 4.07, "learning_rate": 0.00026247743229689067, "loss": 2.689, "theoretical_loss": 3.4990485423847524, "tokens_seen": 1585393664 }, { "epoch": 4.07, "learning_rate": 0.00026246740220661985, "loss": 2.8843, "theoretical_loss": 3.499035924816459, "tokens_seen": 1585459200 }, { "epoch": 4.07, "learning_rate": 0.0002624573721163491, "loss": 2.8463, "theoretical_loss": 3.499023307915739, "tokens_seen": 1585524736 }, { "epoch": 4.07, "learning_rate": 0.0002624473420260782, "loss": 2.7105, "theoretical_loss": 3.499010691682529, "tokens_seen": 1585590272 }, { "epoch": 4.07, "learning_rate": 0.00026243731193580745, "loss": 2.7629, "theoretical_loss": 3.498998076116767, "tokens_seen": 1585655808 }, { "epoch": 4.07, "learning_rate": 0.00026242728184553663, "loss": 2.807, "theoretical_loss": 3.49898546121839, "tokens_seen": 1585721344 }, { "epoch": 4.07, "learning_rate": 0.0002624172517552658, "loss": 2.7732, "theoretical_loss": 3.498972846987334, "tokens_seen": 1585786880 }, { "epoch": 4.07, "learning_rate": 0.000262407221664995, "loss": 2.7193, "theoretical_loss": 3.4989602334235372, "tokens_seen": 1585852416 }, { "epoch": 4.07, "objective/train/docs_used": 2530888, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.008103370666504, "objective/train/theoretical_loss": 3.4989476205269368, "objective/train/tokens_used": 1606377952, "theoretical_loss": 3.4989476205269368, "tokens_seen": 1585917952 }, { "epoch": 4.07, "learning_rate": 0.00026239719157472417, "loss": 2.828, "theoretical_loss": 3.4989476205269368, "tokens_seen": 1585917952 }, { "epoch": 4.07, "learning_rate": 0.00026238716148445335, "loss": 2.7021, "theoretical_loss": 3.498935008297469, "tokens_seen": 1585983488 }, { "epoch": 4.07, "learning_rate": 0.0002623771313941826, "loss": 2.9176, "theoretical_loss": 3.4989223967350718, "tokens_seen": 1586049024 }, { "epoch": 4.07, "learning_rate": 0.0002623671013039117, "loss": 2.784, "theoretical_loss": 3.4989097858396816, "tokens_seen": 1586114560 }, { "epoch": 4.07, "learning_rate": 0.00026235707121364095, "loss": 2.8816, "theoretical_loss": 3.498897175611237, "tokens_seen": 1586180096 }, { "epoch": 4.07, "learning_rate": 0.0002623470411233701, "loss": 2.8642, "theoretical_loss": 3.4988845660496732, "tokens_seen": 1586245632 }, { "epoch": 4.07, "learning_rate": 0.0002623370110330993, "loss": 2.8956, "theoretical_loss": 3.498871957154929, "tokens_seen": 1586311168 }, { "epoch": 4.07, "learning_rate": 0.0002623269809428285, "loss": 2.8071, "theoretical_loss": 3.498859348926941, "tokens_seen": 1586376704 }, { "epoch": 4.07, "learning_rate": 0.0002623169508525577, "loss": 2.889, "theoretical_loss": 3.4988467413656466, "tokens_seen": 1586442240 }, { "epoch": 4.07, "learning_rate": 0.00026230692076228686, "loss": 2.7201, "theoretical_loss": 3.4988341344709823, "tokens_seen": 1586507776 }, { "epoch": 4.07, "learning_rate": 0.0002622968906720161, "loss": 2.8197, "theoretical_loss": 3.498821528242886, "tokens_seen": 1586573312 }, { "epoch": 4.07, "learning_rate": 0.0002622868605817452, "loss": 2.7659, "theoretical_loss": 3.4988089226812944, "tokens_seen": 1586638848 }, { "epoch": 4.07, "learning_rate": 0.00026227683049147445, "loss": 2.7812, "theoretical_loss": 3.4987963177861454, "tokens_seen": 1586704384 }, { "epoch": 4.07, "learning_rate": 0.0002622668004012036, "loss": 2.8376, "theoretical_loss": 3.4987837135573763, "tokens_seen": 1586769920 }, { "epoch": 4.07, "learning_rate": 0.0002622567703109328, "loss": 2.721, "theoretical_loss": 3.498771109994923, "tokens_seen": 1586835456 }, { "epoch": 4.07, "learning_rate": 0.000262246740220662, "loss": 2.7731, "theoretical_loss": 3.4987585070987244, "tokens_seen": 1586900992 }, { "epoch": 4.07, "learning_rate": 0.0002622367101303912, "loss": 2.7498, "theoretical_loss": 3.4987459048687173, "tokens_seen": 1586966528 }, { "epoch": 4.07, "learning_rate": 0.00026222668004012036, "loss": 2.7981, "theoretical_loss": 3.4987333033048382, "tokens_seen": 1587032064 }, { "epoch": 4.07, "learning_rate": 0.00026221664994984954, "loss": 2.8322, "theoretical_loss": 3.4987207024070255, "tokens_seen": 1587097600 }, { "epoch": 4.07, "learning_rate": 0.0002622066198595787, "loss": 2.8782, "theoretical_loss": 3.4987081021752156, "tokens_seen": 1587163136 }, { "epoch": 4.07, "learning_rate": 0.00026219658976930796, "loss": 2.9338, "theoretical_loss": 3.498695502609346, "tokens_seen": 1587228672 }, { "epoch": 4.07, "learning_rate": 0.0002621865596790371, "loss": 2.8397, "theoretical_loss": 3.498682903709354, "tokens_seen": 1587294208 }, { "epoch": 4.07, "learning_rate": 0.0002621765295887663, "loss": 2.7177, "theoretical_loss": 3.4986703054751778, "tokens_seen": 1587359744 }, { "epoch": 4.07, "learning_rate": 0.00026216649949849545, "loss": 2.8348, "theoretical_loss": 3.4986577079067533, "tokens_seen": 1587425280 }, { "epoch": 4.07, "learning_rate": 0.0002621564694082247, "loss": 2.8737, "theoretical_loss": 3.498645111004019, "tokens_seen": 1587490816 }, { "epoch": 4.07, "objective/train/docs_used": 2532371, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7981715202331543, "objective/train/theoretical_loss": 3.4986325147669115, "objective/train/tokens_used": 1608016352, "theoretical_loss": 3.4986325147669115, "tokens_seen": 1587556352 }, { "epoch": 4.07, "learning_rate": 0.00026214643931795386, "loss": 2.9171, "theoretical_loss": 3.4986325147669115, "tokens_seen": 1587556352 }, { "epoch": 4.07, "learning_rate": 0.00026213640922768304, "loss": 2.7848, "theoretical_loss": 3.4986199191953684, "tokens_seen": 1587621888 }, { "epoch": 4.07, "learning_rate": 0.0002621263791374122, "loss": 2.6658, "theoretical_loss": 3.4986073242893276, "tokens_seen": 1587687424 }, { "epoch": 4.07, "learning_rate": 0.00026211634904714146, "loss": 2.6687, "theoretical_loss": 3.4985947300487252, "tokens_seen": 1587752960 }, { "epoch": 4.07, "learning_rate": 0.0002621063189568706, "loss": 2.8589, "theoretical_loss": 3.4985821364735, "tokens_seen": 1587818496 }, { "epoch": 4.07, "learning_rate": 0.0002620962888665998, "loss": 2.7379, "theoretical_loss": 3.4985695435635886, "tokens_seen": 1587884032 }, { "epoch": 4.07, "learning_rate": 0.00026208625877632895, "loss": 2.7995, "theoretical_loss": 3.4985569513189283, "tokens_seen": 1587949568 }, { "epoch": 4.07, "learning_rate": 0.0002620762286860582, "loss": 2.7733, "theoretical_loss": 3.4985443597394568, "tokens_seen": 1588015104 }, { "epoch": 4.07, "learning_rate": 0.00026206619859578737, "loss": 2.7386, "theoretical_loss": 3.4985317688251114, "tokens_seen": 1588080640 }, { "epoch": 4.07, "learning_rate": 0.00026205616850551655, "loss": 2.7635, "theoretical_loss": 3.4985191785758296, "tokens_seen": 1588146176 }, { "epoch": 4.07, "learning_rate": 0.00026204613841524573, "loss": 2.7499, "theoretical_loss": 3.4985065889915488, "tokens_seen": 1588211712 }, { "epoch": 4.07, "learning_rate": 0.0002620361083249749, "loss": 2.6279, "theoretical_loss": 3.4984940000722062, "tokens_seen": 1588277248 }, { "epoch": 4.07, "learning_rate": 0.0002620260782347041, "loss": 2.6884, "theoretical_loss": 3.49848141181774, "tokens_seen": 1588342784 }, { "epoch": 4.07, "learning_rate": 0.0002620160481444333, "loss": 2.6973, "theoretical_loss": 3.4984688242280866, "tokens_seen": 1588408320 }, { "epoch": 4.07, "learning_rate": 0.0002620060180541625, "loss": 2.8485, "theoretical_loss": 3.498456237303184, "tokens_seen": 1588473856 }, { "epoch": 4.07, "learning_rate": 0.0002619959879638917, "loss": 2.8299, "theoretical_loss": 3.4984436510429697, "tokens_seen": 1588539392 }, { "epoch": 4.07, "learning_rate": 0.00026198595787362087, "loss": 2.8647, "theoretical_loss": 3.498431065447381, "tokens_seen": 1588604928 }, { "epoch": 4.07, "learning_rate": 0.00026197592778335005, "loss": 2.6688, "theoretical_loss": 3.4984184805163556, "tokens_seen": 1588670464 }, { "epoch": 4.07, "learning_rate": 0.0002619658976930793, "loss": 2.8012, "theoretical_loss": 3.4984058962498317, "tokens_seen": 1588736000 }, { "epoch": 4.07, "learning_rate": 0.0002619558676028084, "loss": 2.6863, "theoretical_loss": 3.498393312647745, "tokens_seen": 1588801536 }, { "epoch": 4.07, "learning_rate": 0.00026194583751253765, "loss": 2.652, "theoretical_loss": 3.4983807297100347, "tokens_seen": 1588867072 }, { "epoch": 4.07, "learning_rate": 0.00026193580742226683, "loss": 2.7619, "theoretical_loss": 3.4983681474366373, "tokens_seen": 1588932608 }, { "epoch": 4.07, "learning_rate": 0.000261925777331996, "loss": 2.8112, "theoretical_loss": 3.4983555658274907, "tokens_seen": 1588998144 }, { "epoch": 4.07, "learning_rate": 0.0002619157472417252, "loss": 2.6769, "theoretical_loss": 3.498342984882532, "tokens_seen": 1589063680 }, { "epoch": 4.07, "learning_rate": 0.00026190571715145437, "loss": 2.907, "theoretical_loss": 3.4983304046017, "tokens_seen": 1589129216 }, { "epoch": 4.07, "objective/train/docs_used": 2535204, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8760440349578857, "objective/train/theoretical_loss": 3.498317824984931, "objective/train/tokens_used": 1609654752, "theoretical_loss": 3.498317824984931, "tokens_seen": 1589194752 }, { "epoch": 4.07, "learning_rate": 0.00026189568706118355, "loss": 2.8736, "theoretical_loss": 3.498317824984931, "tokens_seen": 1589194752 }, { "epoch": 4.07, "learning_rate": 0.0002618856569709128, "loss": 2.7044, "theoretical_loss": 3.498305246032163, "tokens_seen": 1589260288 }, { "epoch": 4.07, "learning_rate": 0.0002618756268806419, "loss": 2.8686, "theoretical_loss": 3.4982926677433337, "tokens_seen": 1589325824 }, { "epoch": 4.07, "learning_rate": 0.00026186559679037115, "loss": 2.8483, "theoretical_loss": 3.4982800901183806, "tokens_seen": 1589391360 }, { "epoch": 4.07, "learning_rate": 0.0002618555667001003, "loss": 2.7771, "theoretical_loss": 3.498267513157241, "tokens_seen": 1589456896 }, { "epoch": 4.07, "learning_rate": 0.0002618455366098295, "loss": 2.694, "theoretical_loss": 3.498254936859853, "tokens_seen": 1589522432 }, { "epoch": 4.07, "learning_rate": 0.0002618355065195587, "loss": 2.6107, "theoretical_loss": 3.4982423612261537, "tokens_seen": 1589587968 }, { "epoch": 4.07, "learning_rate": 0.0002618254764292879, "loss": 2.7504, "theoretical_loss": 3.498229786256081, "tokens_seen": 1589653504 }, { "epoch": 4.07, "learning_rate": 0.00026181544633901706, "loss": 2.7509, "theoretical_loss": 3.4982172119495725, "tokens_seen": 1589719040 }, { "epoch": 4.07, "learning_rate": 0.0002618054162487463, "loss": 2.7179, "theoretical_loss": 3.4982046383065657, "tokens_seen": 1589784576 }, { "epoch": 4.07, "learning_rate": 0.0002617953861584754, "loss": 2.8148, "theoretical_loss": 3.4981920653269984, "tokens_seen": 1589850112 }, { "epoch": 4.07, "learning_rate": 0.00026178535606820465, "loss": 2.9311, "theoretical_loss": 3.4981794930108085, "tokens_seen": 1589915648 }, { "epoch": 4.07, "learning_rate": 0.0002617753259779338, "loss": 3.0216, "theoretical_loss": 3.4981669213579325, "tokens_seen": 1589981184 }, { "epoch": 4.07, "learning_rate": 0.000261765295887663, "loss": 2.8497, "theoretical_loss": 3.49815435036831, "tokens_seen": 1590046720 }, { "epoch": 4.07, "learning_rate": 0.0002617552657973922, "loss": 2.8423, "theoretical_loss": 3.498141780041877, "tokens_seen": 1590112256 }, { "epoch": 4.07, "learning_rate": 0.0002617452357071214, "loss": 2.6774, "theoretical_loss": 3.498129210378572, "tokens_seen": 1590177792 }, { "epoch": 4.07, "learning_rate": 0.00026173520561685056, "loss": 2.7442, "theoretical_loss": 3.4981166413783322, "tokens_seen": 1590243328 }, { "epoch": 4.07, "learning_rate": 0.00026172517552657974, "loss": 2.8496, "theoretical_loss": 3.4981040730410955, "tokens_seen": 1590308864 }, { "epoch": 4.07, "learning_rate": 0.0002617151454363089, "loss": 2.7418, "theoretical_loss": 3.4980915053667996, "tokens_seen": 1590374400 }, { "epoch": 4.07, "learning_rate": 0.00026170511534603816, "loss": 2.7405, "theoretical_loss": 3.4980789383553827, "tokens_seen": 1590439936 }, { "epoch": 4.07, "learning_rate": 0.0002616950852557673, "loss": 2.5855, "theoretical_loss": 3.4980663720067824, "tokens_seen": 1590505472 }, { "epoch": 4.07, "learning_rate": 0.0002616850551654965, "loss": 2.7794, "theoretical_loss": 3.4980538063209354, "tokens_seen": 1590571008 }, { "epoch": 4.07, "learning_rate": 0.00026167502507522565, "loss": 2.8249, "theoretical_loss": 3.4980412412977806, "tokens_seen": 1590636544 }, { "epoch": 4.07, "learning_rate": 0.0002616649949849549, "loss": 2.9051, "theoretical_loss": 3.498028676937255, "tokens_seen": 1590702080 }, { "epoch": 4.07, "learning_rate": 0.00026165496489468406, "loss": 2.7182, "theoretical_loss": 3.498016113239297, "tokens_seen": 1590767616 }, { "epoch": 4.07, "objective/train/docs_used": 2537594, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.848442316055298, "objective/train/theoretical_loss": 3.4980035502038436, "objective/train/tokens_used": 1611293152, "theoretical_loss": 3.4980035502038436, "tokens_seen": 1590833152 }, { "epoch": 4.07, "learning_rate": 0.00026164493480441324, "loss": 2.7978, "theoretical_loss": 3.4980035502038436, "tokens_seen": 1590833152 }, { "epoch": 4.07, "learning_rate": 0.0002616349047141424, "loss": 2.7627, "theoretical_loss": 3.4979909878308333, "tokens_seen": 1590898688 }, { "epoch": 4.07, "learning_rate": 0.00026162487462387166, "loss": 2.6104, "theoretical_loss": 3.497978426120204, "tokens_seen": 1590964224 }, { "epoch": 4.07, "learning_rate": 0.0002616148445336008, "loss": 2.8329, "theoretical_loss": 3.4979658650718926, "tokens_seen": 1591029760 }, { "epoch": 4.07, "learning_rate": 0.00026160481444333, "loss": 2.8554, "theoretical_loss": 3.4979533046858373, "tokens_seen": 1591095296 }, { "epoch": 4.07, "learning_rate": 0.00026159478435305915, "loss": 2.8415, "theoretical_loss": 3.4979407449619764, "tokens_seen": 1591160832 }, { "epoch": 4.07, "learning_rate": 0.0002615847542627884, "loss": 2.8215, "theoretical_loss": 3.497928185900247, "tokens_seen": 1591226368 }, { "epoch": 4.07, "learning_rate": 0.00026157472417251757, "loss": 2.6139, "theoretical_loss": 3.497915627500588, "tokens_seen": 1591291904 }, { "epoch": 4.07, "learning_rate": 0.00026156469408224675, "loss": 2.7489, "theoretical_loss": 3.497903069762936, "tokens_seen": 1591357440 }, { "epoch": 4.07, "learning_rate": 0.00026155466399197593, "loss": 2.9093, "theoretical_loss": 3.4978905126872286, "tokens_seen": 1591422976 }, { "epoch": 4.07, "learning_rate": 0.0002615446339017051, "loss": 2.7605, "theoretical_loss": 3.4978779562734053, "tokens_seen": 1591488512 }, { "epoch": 4.07, "learning_rate": 0.0002615346038114343, "loss": 2.8053, "theoretical_loss": 3.4978654005214027, "tokens_seen": 1591554048 }, { "epoch": 4.07, "learning_rate": 0.0002615245737211635, "loss": 2.6993, "theoretical_loss": 3.4978528454311593, "tokens_seen": 1591619584 }, { "epoch": 4.07, "learning_rate": 0.00026151454363089265, "loss": 2.7413, "theoretical_loss": 3.4978402910026123, "tokens_seen": 1591685120 }, { "epoch": 4.07, "learning_rate": 0.0002615045135406219, "loss": 2.7909, "theoretical_loss": 3.4978277372357, "tokens_seen": 1591750656 }, { "epoch": 4.07, "learning_rate": 0.000261494483450351, "loss": 2.7806, "theoretical_loss": 3.497815184130361, "tokens_seen": 1591816192 }, { "epoch": 4.07, "learning_rate": 0.00026148445336008025, "loss": 2.7664, "theoretical_loss": 3.4978026316865316, "tokens_seen": 1591881728 }, { "epoch": 4.07, "learning_rate": 0.00026147442326980943, "loss": 2.7291, "theoretical_loss": 3.497790079904151, "tokens_seen": 1591947264 }, { "epoch": 4.07, "learning_rate": 0.0002614643931795386, "loss": 2.9108, "theoretical_loss": 3.497777528783156, "tokens_seen": 1592012800 }, { "epoch": 4.07, "learning_rate": 0.0002614543630892678, "loss": 2.8008, "theoretical_loss": 3.497764978323486, "tokens_seen": 1592078336 }, { "epoch": 4.07, "learning_rate": 0.00026144433299899703, "loss": 2.8021, "theoretical_loss": 3.497752428525078, "tokens_seen": 1592143872 }, { "epoch": 4.07, "learning_rate": 0.00026143430290872616, "loss": 2.7266, "theoretical_loss": 3.4977398793878693, "tokens_seen": 1592209408 }, { "epoch": 4.07, "learning_rate": 0.0002614242728184554, "loss": 2.6637, "theoretical_loss": 3.4977273309117995, "tokens_seen": 1592274944 }, { "epoch": 4.07, "learning_rate": 0.0002614142427281845, "loss": 2.8072, "theoretical_loss": 3.4977147830968054, "tokens_seen": 1592340480 }, { "epoch": 4.07, "learning_rate": 0.00026140421263791375, "loss": 2.7803, "theoretical_loss": 3.4977022359428256, "tokens_seen": 1592406016 }, { "epoch": 4.07, "objective/train/docs_used": 2540556, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.629351854324341, "objective/train/theoretical_loss": 3.497689689449797, "objective/train/tokens_used": 1612931552, "theoretical_loss": 3.497689689449797, "tokens_seen": 1592471552 }, { "epoch": 4.07, "learning_rate": 0.00026139418254764293, "loss": 2.7081, "theoretical_loss": 3.497689689449797, "tokens_seen": 1592471552 }, { "epoch": 4.07, "learning_rate": 0.0002613841524573721, "loss": 2.7624, "theoretical_loss": 3.497677143617659, "tokens_seen": 1592537088 }, { "epoch": 4.07, "learning_rate": 0.0002613741223671013, "loss": 2.6812, "theoretical_loss": 3.497664598446349, "tokens_seen": 1592602624 }, { "epoch": 4.07, "learning_rate": 0.0002613640922768305, "loss": 2.8682, "theoretical_loss": 3.4976520539358043, "tokens_seen": 1592668160 }, { "epoch": 4.07, "learning_rate": 0.00026135406218655966, "loss": 2.8825, "theoretical_loss": 3.4976395100859636, "tokens_seen": 1592733696 }, { "epoch": 4.07, "learning_rate": 0.0002613440320962889, "loss": 2.7597, "theoretical_loss": 3.497626966896765, "tokens_seen": 1592799232 }, { "epoch": 4.07, "learning_rate": 0.000261334002006018, "loss": 2.7213, "theoretical_loss": 3.4976144243681464, "tokens_seen": 1592864768 }, { "epoch": 4.07, "learning_rate": 0.00026132397191574726, "loss": 2.8455, "theoretical_loss": 3.497601882500046, "tokens_seen": 1592930304 }, { "epoch": 4.07, "learning_rate": 0.0002613139418254764, "loss": 2.8156, "theoretical_loss": 3.4975893412924015, "tokens_seen": 1592995840 }, { "epoch": 4.07, "learning_rate": 0.0002613039117352056, "loss": 2.7588, "theoretical_loss": 3.497576800745151, "tokens_seen": 1593061376 }, { "epoch": 4.07, "learning_rate": 0.0002612938816449348, "loss": 2.751, "theoretical_loss": 3.4975642608582325, "tokens_seen": 1593126912 }, { "epoch": 4.07, "learning_rate": 0.000261283851554664, "loss": 2.6735, "theoretical_loss": 3.497551721631585, "tokens_seen": 1593192448 }, { "epoch": 4.07, "learning_rate": 0.00026127382146439316, "loss": 2.7161, "theoretical_loss": 3.497539183065145, "tokens_seen": 1593257984 }, { "epoch": 4.07, "learning_rate": 0.0002612637913741224, "loss": 2.9342, "theoretical_loss": 3.4975266451588514, "tokens_seen": 1593323520 }, { "epoch": 4.07, "learning_rate": 0.0002612537612838516, "loss": 2.723, "theoretical_loss": 3.4975141079126426, "tokens_seen": 1593389056 }, { "epoch": 4.07, "learning_rate": 0.00026124373119358076, "loss": 2.8229, "theoretical_loss": 3.497501571326456, "tokens_seen": 1593454592 }, { "epoch": 4.07, "learning_rate": 0.00026123370110330994, "loss": 2.8573, "theoretical_loss": 3.497489035400231, "tokens_seen": 1593520128 }, { "epoch": 4.07, "learning_rate": 0.0002612236710130391, "loss": 2.8809, "theoretical_loss": 3.497476500133904, "tokens_seen": 1593585664 }, { "epoch": 4.07, "learning_rate": 0.00026121364092276836, "loss": 2.7498, "theoretical_loss": 3.4974639655274142, "tokens_seen": 1593651200 }, { "epoch": 4.07, "learning_rate": 0.0002612036108324975, "loss": 2.7618, "theoretical_loss": 3.497451431580699, "tokens_seen": 1593716736 }, { "epoch": 4.07, "learning_rate": 0.0002611935807422267, "loss": 2.8061, "theoretical_loss": 3.497438898293698, "tokens_seen": 1593782272 }, { "epoch": 4.07, "learning_rate": 0.00026118355065195585, "loss": 2.902, "theoretical_loss": 3.4974263656663473, "tokens_seen": 1593847808 }, { "epoch": 4.07, "learning_rate": 0.0002611735205616851, "loss": 2.8769, "theoretical_loss": 3.4974138336985865, "tokens_seen": 1593913344 }, { "epoch": 4.07, "learning_rate": 0.00026116349047141426, "loss": 2.703, "theoretical_loss": 3.4974013023903536, "tokens_seen": 1593978880 }, { "epoch": 4.07, "learning_rate": 0.00026115346038114344, "loss": 2.8223, "theoretical_loss": 3.4973887717415866, "tokens_seen": 1594044416 }, { "epoch": 4.07, "objective/train/docs_used": 2543356, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8976242542266846, "objective/train/theoretical_loss": 3.4973762417522236, "objective/train/tokens_used": 1614569952, "theoretical_loss": 3.4973762417522236, "tokens_seen": 1594109952 }, { "epoch": 4.07, "learning_rate": 0.0002611434302908726, "loss": 2.7962, "theoretical_loss": 3.4973762417522236, "tokens_seen": 1594109952 }, { "epoch": 4.07, "learning_rate": 0.00026113340020060186, "loss": 2.8871, "theoretical_loss": 3.4973637124222026, "tokens_seen": 1594175488 }, { "epoch": 4.07, "learning_rate": 0.000261123370110331, "loss": 2.8038, "theoretical_loss": 3.497351183751462, "tokens_seen": 1594241024 }, { "epoch": 4.07, "learning_rate": 0.0002611133400200602, "loss": 2.8061, "theoretical_loss": 3.4973386557399406, "tokens_seen": 1594306560 }, { "epoch": 4.07, "learning_rate": 0.00026110330992978935, "loss": 2.8444, "theoretical_loss": 3.4973261283875754, "tokens_seen": 1594372096 }, { "epoch": 4.07, "learning_rate": 0.0002610932798395186, "loss": 2.8258, "theoretical_loss": 3.497313601694306, "tokens_seen": 1594437632 }, { "epoch": 4.07, "learning_rate": 0.00026108324974924777, "loss": 2.8768, "theoretical_loss": 3.4973010756600695, "tokens_seen": 1594503168 }, { "epoch": 4.07, "learning_rate": 0.00026107321965897695, "loss": 2.8407, "theoretical_loss": 3.4972885502848046, "tokens_seen": 1594568704 }, { "epoch": 4.07, "learning_rate": 0.00026106318956870613, "loss": 2.8422, "theoretical_loss": 3.49727602556845, "tokens_seen": 1594634240 }, { "epoch": 4.07, "learning_rate": 0.0002610531594784353, "loss": 2.5367, "theoretical_loss": 3.4972635015109423, "tokens_seen": 1594699776 }, { "epoch": 4.07, "learning_rate": 0.0002610431293881645, "loss": 2.8536, "theoretical_loss": 3.497250978112222, "tokens_seen": 1594765312 }, { "epoch": 4.07, "learning_rate": 0.0002610330992978937, "loss": 2.6478, "theoretical_loss": 3.497238455372226, "tokens_seen": 1594830848 }, { "epoch": 4.07, "learning_rate": 0.00026102306920762285, "loss": 2.8304, "theoretical_loss": 3.497225933290893, "tokens_seen": 1594896384 }, { "epoch": 4.07, "learning_rate": 0.0002610130391173521, "loss": 2.8668, "theoretical_loss": 3.4972134118681613, "tokens_seen": 1594961920 }, { "epoch": 4.07, "learning_rate": 0.0002610030090270812, "loss": 2.8607, "theoretical_loss": 3.4972008911039687, "tokens_seen": 1595027456 }, { "epoch": 4.07, "learning_rate": 0.00026099297893681045, "loss": 2.8723, "theoretical_loss": 3.4971883709982543, "tokens_seen": 1595092992 }, { "epoch": 4.07, "learning_rate": 0.00026098294884653963, "loss": 2.6349, "theoretical_loss": 3.4971758515509563, "tokens_seen": 1595158528 }, { "epoch": 4.07, "learning_rate": 0.0002609729187562688, "loss": 2.7029, "theoretical_loss": 3.497163332762012, "tokens_seen": 1595224064 }, { "epoch": 4.07, "learning_rate": 0.000260962888665998, "loss": 2.7173, "theoretical_loss": 3.497150814631361, "tokens_seen": 1595289600 }, { "epoch": 4.07, "learning_rate": 0.00026095285857572723, "loss": 2.6151, "theoretical_loss": 3.4971382971589415, "tokens_seen": 1595355136 }, { "epoch": 4.07, "learning_rate": 0.00026094282848545636, "loss": 2.7461, "theoretical_loss": 3.4971257803446907, "tokens_seen": 1595420672 }, { "epoch": 4.07, "learning_rate": 0.0002609327983951856, "loss": 2.7207, "theoretical_loss": 3.497113264188548, "tokens_seen": 1595486208 }, { "epoch": 4.07, "learning_rate": 0.0002609227683049147, "loss": 2.7459, "theoretical_loss": 3.497100748690452, "tokens_seen": 1595551744 }, { "epoch": 4.07, "learning_rate": 0.00026091273821464395, "loss": 2.8965, "theoretical_loss": 3.49708823385034, "tokens_seen": 1595617280 }, { "epoch": 4.07, "learning_rate": 0.00026090270812437313, "loss": 2.911, "theoretical_loss": 3.4970757196681515, "tokens_seen": 1595682816 }, { "epoch": 4.07, "objective/train/docs_used": 2546171, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.969170570373535, "objective/train/theoretical_loss": 3.497063206143824, "objective/train/tokens_used": 1616208352, "theoretical_loss": 3.497063206143824, "tokens_seen": 1595748352 }, { "epoch": 4.07, "learning_rate": 0.0002608926780341023, "loss": 2.9829, "theoretical_loss": 3.497063206143824, "tokens_seen": 1595748352 }, { "epoch": 4.07, "learning_rate": 0.0002608826479438315, "loss": 2.8301, "theoretical_loss": 3.4970506932772967, "tokens_seen": 1595813888 }, { "epoch": 4.07, "learning_rate": 0.0002608726178535607, "loss": 2.6239, "theoretical_loss": 3.4970381810685067, "tokens_seen": 1595879424 }, { "epoch": 4.07, "learning_rate": 0.00026086258776328986, "loss": 2.7874, "theoretical_loss": 3.4970256695173942, "tokens_seen": 1595944960 }, { "epoch": 4.07, "learning_rate": 0.0002608525576730191, "loss": 2.738, "theoretical_loss": 3.4970131586238966, "tokens_seen": 1596010496 }, { "epoch": 4.07, "learning_rate": 0.0002608425275827482, "loss": 2.7086, "theoretical_loss": 3.497000648387952, "tokens_seen": 1596076032 }, { "epoch": 4.07, "learning_rate": 0.00026083249749247746, "loss": 2.7999, "theoretical_loss": 3.4969881388095, "tokens_seen": 1596141568 }, { "epoch": 4.07, "learning_rate": 0.0002608224674022066, "loss": 2.8755, "theoretical_loss": 3.4969756298884778, "tokens_seen": 1596207104 }, { "epoch": 4.07, "learning_rate": 0.0002608124373119358, "loss": 2.8592, "theoretical_loss": 3.4969631216248245, "tokens_seen": 1596272640 }, { "epoch": 4.07, "learning_rate": 0.000260802407221665, "loss": 2.6315, "theoretical_loss": 3.4969506140184787, "tokens_seen": 1596338176 }, { "epoch": 4.07, "learning_rate": 0.0002607923771313942, "loss": 2.8927, "theoretical_loss": 3.4969381070693784, "tokens_seen": 1596403712 }, { "epoch": 4.07, "learning_rate": 0.00026078234704112336, "loss": 2.7485, "theoretical_loss": 3.4969256007774625, "tokens_seen": 1596469248 }, { "epoch": 4.07, "learning_rate": 0.0002607723169508526, "loss": 2.7183, "theoretical_loss": 3.496913095142669, "tokens_seen": 1596534784 }, { "epoch": 4.07, "learning_rate": 0.0002607622868605817, "loss": 2.9297, "theoretical_loss": 3.4969005901649375, "tokens_seen": 1596600320 }, { "epoch": 4.07, "learning_rate": 0.00026075225677031096, "loss": 2.829, "theoretical_loss": 3.4968880858442053, "tokens_seen": 1596665856 }, { "epoch": 4.07, "learning_rate": 0.0002607422266800401, "loss": 2.7342, "theoretical_loss": 3.4968755821804116, "tokens_seen": 1596731392 }, { "epoch": 4.07, "learning_rate": 0.0002607321965897693, "loss": 2.8953, "theoretical_loss": 3.4968630791734943, "tokens_seen": 1596796928 }, { "epoch": 4.07, "learning_rate": 0.0002607221664994985, "loss": 2.764, "theoretical_loss": 3.496850576823392, "tokens_seen": 1596862464 }, { "epoch": 4.07, "learning_rate": 0.0002607121364092277, "loss": 2.8347, "theoretical_loss": 3.4968380751300443, "tokens_seen": 1596928000 }, { "epoch": 4.07, "learning_rate": 0.00026070210631895687, "loss": 2.8935, "theoretical_loss": 3.4968255740933887, "tokens_seen": 1596993536 }, { "epoch": 4.07, "learning_rate": 0.00026069207622868605, "loss": 2.7704, "theoretical_loss": 3.4968130737133643, "tokens_seen": 1597059072 }, { "epoch": 4.07, "learning_rate": 0.00026068204613841523, "loss": 2.5682, "theoretical_loss": 3.4968005739899093, "tokens_seen": 1597124608 }, { "epoch": 4.07, "learning_rate": 0.00026067201604814446, "loss": 2.7091, "theoretical_loss": 3.4967880749229625, "tokens_seen": 1597190144 }, { "epoch": 4.07, "learning_rate": 0.0002606619859578736, "loss": 2.8096, "theoretical_loss": 3.496775576512462, "tokens_seen": 1597255680 }, { "epoch": 4.07, "learning_rate": 0.0002606519558676028, "loss": 2.7663, "theoretical_loss": 3.496763078758347, "tokens_seen": 1597321216 }, { "epoch": 4.07, "objective/train/docs_used": 2547507, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.069441556930542, "objective/train/theoretical_loss": 3.4967505816605557, "objective/train/tokens_used": 1617846752, "theoretical_loss": 3.4967505816605557, "tokens_seen": 1597386752 }, { "epoch": 4.07, "learning_rate": 0.00026064192577733195, "loss": 2.8627, "theoretical_loss": 3.4967505816605557, "tokens_seen": 1597386752 }, { "epoch": 4.07, "learning_rate": 0.0002606318956870612, "loss": 2.8848, "theoretical_loss": 3.4967380852190275, "tokens_seen": 1597452288 }, { "epoch": 4.07, "learning_rate": 0.00026062186559679037, "loss": 2.7409, "theoretical_loss": 3.4967255894337, "tokens_seen": 1597517824 }, { "epoch": 4.07, "learning_rate": 0.00026061183550651955, "loss": 2.7589, "theoretical_loss": 3.4967130943045124, "tokens_seen": 1597583360 }, { "epoch": 4.07, "learning_rate": 0.00026060180541624873, "loss": 2.8473, "theoretical_loss": 3.496700599831403, "tokens_seen": 1597648896 }, { "epoch": 4.07, "learning_rate": 0.00026059177532597797, "loss": 2.7446, "theoretical_loss": 3.496688106014311, "tokens_seen": 1597714432 }, { "epoch": 4.07, "learning_rate": 0.0002605817452357071, "loss": 2.7905, "theoretical_loss": 3.496675612853174, "tokens_seen": 1597779968 }, { "epoch": 4.07, "learning_rate": 0.00026057171514543633, "loss": 2.6615, "theoretical_loss": 3.496663120347932, "tokens_seen": 1597845504 }, { "epoch": 4.07, "learning_rate": 0.00026056168505516546, "loss": 2.9255, "theoretical_loss": 3.4966506284985224, "tokens_seen": 1597911040 }, { "epoch": 4.07, "learning_rate": 0.0002605516549648947, "loss": 2.8484, "theoretical_loss": 3.4966381373048847, "tokens_seen": 1597976576 }, { "epoch": 4.07, "learning_rate": 0.00026054162487462387, "loss": 2.7297, "theoretical_loss": 3.4966256467669576, "tokens_seen": 1598042112 }, { "epoch": 4.07, "learning_rate": 0.00026053159478435305, "loss": 2.6614, "theoretical_loss": 3.4966131568846794, "tokens_seen": 1598107648 }, { "epoch": 4.07, "learning_rate": 0.00026052156469408223, "loss": 2.7458, "theoretical_loss": 3.4966006676579884, "tokens_seen": 1598173184 }, { "epoch": 4.07, "learning_rate": 0.0002605115346038114, "loss": 2.6983, "theoretical_loss": 3.4965881790868245, "tokens_seen": 1598238720 }, { "epoch": 4.07, "learning_rate": 0.00026050150451354065, "loss": 2.789, "theoretical_loss": 3.4965756911711257, "tokens_seen": 1598304256 }, { "epoch": 4.07, "learning_rate": 0.00026049147442326983, "loss": 2.6961, "theoretical_loss": 3.4965632039108305, "tokens_seen": 1598369792 }, { "epoch": 4.07, "learning_rate": 0.000260481444332999, "loss": 2.7251, "theoretical_loss": 3.496550717305878, "tokens_seen": 1598435328 }, { "epoch": 4.07, "learning_rate": 0.0002604714142427282, "loss": 2.8334, "theoretical_loss": 3.496538231356207, "tokens_seen": 1598500864 }, { "epoch": 4.07, "learning_rate": 0.00026046138415245743, "loss": 2.9499, "theoretical_loss": 3.4965257460617556, "tokens_seen": 1598566400 }, { "epoch": 4.07, "learning_rate": 0.00026045135406218656, "loss": 2.6486, "theoretical_loss": 3.4965132614224634, "tokens_seen": 1598631936 }, { "epoch": 4.07, "learning_rate": 0.0002604413239719158, "loss": 2.864, "theoretical_loss": 3.4965007774382686, "tokens_seen": 1598697472 }, { "epoch": 4.07, "learning_rate": 0.0002604312938816449, "loss": 2.8578, "theoretical_loss": 3.4964882941091107, "tokens_seen": 1598763008 }, { "epoch": 4.07, "learning_rate": 0.00026042126379137415, "loss": 2.7437, "theoretical_loss": 3.4964758114349275, "tokens_seen": 1598828544 }, { "epoch": 4.07, "learning_rate": 0.00026041123370110333, "loss": 2.9244, "theoretical_loss": 3.4964633294156586, "tokens_seen": 1598894080 }, { "epoch": 4.07, "learning_rate": 0.0002604012036108325, "loss": 2.8368, "theoretical_loss": 3.4964508480512424, "tokens_seen": 1598959616 }, { "epoch": 4.07, "objective/train/docs_used": 2550296, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.720492124557495, "objective/train/theoretical_loss": 3.4964383673416175, "objective/train/tokens_used": 1619485152, "theoretical_loss": 3.4964383673416175, "tokens_seen": 1599025152 }, { "epoch": 4.07, "learning_rate": 0.0002603911735205617, "loss": 2.8639, "theoretical_loss": 3.4964383673416175, "tokens_seen": 1599025152 }, { "epoch": 4.07, "learning_rate": 0.0002603811434302909, "loss": 2.7793, "theoretical_loss": 3.4964258872867235, "tokens_seen": 1599090688 }, { "epoch": 4.07, "learning_rate": 0.00026037111334002006, "loss": 2.7223, "theoretical_loss": 3.4964134078864983, "tokens_seen": 1599156224 }, { "epoch": 4.07, "learning_rate": 0.0002603610832497493, "loss": 2.7565, "theoretical_loss": 3.496400929140881, "tokens_seen": 1599221760 }, { "epoch": 4.07, "learning_rate": 0.0002603510531594784, "loss": 2.888, "theoretical_loss": 3.496388451049811, "tokens_seen": 1599287296 }, { "epoch": 4.07, "learning_rate": 0.00026034102306920766, "loss": 2.6472, "theoretical_loss": 3.496375973613226, "tokens_seen": 1599352832 }, { "epoch": 4.07, "learning_rate": 0.0002603309929789368, "loss": 2.6992, "theoretical_loss": 3.4963634968310666, "tokens_seen": 1599418368 }, { "epoch": 4.07, "learning_rate": 0.000260320962888666, "loss": 2.8132, "theoretical_loss": 3.49635102070327, "tokens_seen": 1599483904 }, { "epoch": 4.07, "learning_rate": 0.0002603109327983952, "loss": 2.8405, "theoretical_loss": 3.4963385452297757, "tokens_seen": 1599549440 }, { "epoch": 4.07, "learning_rate": 0.0002603009027081244, "loss": 2.7776, "theoretical_loss": 3.4963260704105226, "tokens_seen": 1599614976 }, { "epoch": 4.07, "learning_rate": 0.00026029087261785356, "loss": 2.8086, "theoretical_loss": 3.4963135962454497, "tokens_seen": 1599680512 }, { "epoch": 4.07, "learning_rate": 0.0002602808425275828, "loss": 2.6623, "theoretical_loss": 3.4963011227344953, "tokens_seen": 1599746048 }, { "epoch": 4.07, "learning_rate": 0.0002602708124373119, "loss": 2.7661, "theoretical_loss": 3.4962886498775996, "tokens_seen": 1599811584 }, { "epoch": 4.07, "learning_rate": 0.00026026078234704116, "loss": 2.8732, "theoretical_loss": 3.4962761776747002, "tokens_seen": 1599877120 }, { "epoch": 4.07, "learning_rate": 0.0002602507522567703, "loss": 2.9504, "theoretical_loss": 3.4962637061257364, "tokens_seen": 1599942656 }, { "epoch": 4.07, "learning_rate": 0.0002602407221664995, "loss": 2.8733, "theoretical_loss": 3.496251235230648, "tokens_seen": 1600008192 }, { "epoch": 4.07, "learning_rate": 0.0002602306920762287, "loss": 3.0463, "theoretical_loss": 3.4962387649893722, "tokens_seen": 1600073728 }, { "epoch": 4.07, "learning_rate": 0.0002602206619859579, "loss": 2.8164, "theoretical_loss": 3.4962262954018497, "tokens_seen": 1600139264 }, { "epoch": 4.07, "learning_rate": 0.00026021063189568707, "loss": 2.5429, "theoretical_loss": 3.4962138264680185, "tokens_seen": 1600204800 }, { "epoch": 4.07, "learning_rate": 0.00026020060180541625, "loss": 2.942, "theoretical_loss": 3.4962013581878173, "tokens_seen": 1600270336 }, { "epoch": 4.07, "learning_rate": 0.00026019057171514543, "loss": 2.9309, "theoretical_loss": 3.4961888905611858, "tokens_seen": 1600335872 }, { "epoch": 4.07, "learning_rate": 0.00026018054162487466, "loss": 2.8139, "theoretical_loss": 3.4961764235880626, "tokens_seen": 1600401408 }, { "epoch": 4.07, "learning_rate": 0.0002601705115346038, "loss": 2.7862, "theoretical_loss": 3.496163957268387, "tokens_seen": 1600466944 }, { "epoch": 4.07, "learning_rate": 0.000260160481444333, "loss": 2.7544, "theoretical_loss": 3.4961514916020975, "tokens_seen": 1600532480 }, { "epoch": 4.07, "learning_rate": 0.00026015045135406215, "loss": 2.8149, "theoretical_loss": 3.4961390265891334, "tokens_seen": 1600598016 }, { "epoch": 4.07, "objective/train/docs_used": 2552663, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7522876262664795, "objective/train/theoretical_loss": 3.496126562229434, "objective/train/tokens_used": 1621123552, "theoretical_loss": 3.496126562229434, "tokens_seen": 1600663552 }, { "epoch": 4.07, "learning_rate": 0.0002601404212637914, "loss": 2.7903, "theoretical_loss": 3.496126562229434, "tokens_seen": 1600663552 }, { "epoch": 4.07, "learning_rate": 0.00026013039117352057, "loss": 2.7339, "theoretical_loss": 3.4961140985229378, "tokens_seen": 1600729088 }, { "epoch": 4.07, "learning_rate": 0.00026012036108324975, "loss": 2.7146, "theoretical_loss": 3.496101635469584, "tokens_seen": 1600794624 }, { "epoch": 4.07, "learning_rate": 0.00026011033099297893, "loss": 2.8172, "theoretical_loss": 3.4960891730693113, "tokens_seen": 1600860160 }, { "epoch": 4.07, "learning_rate": 0.00026010030090270817, "loss": 2.7866, "theoretical_loss": 3.4960767113220594, "tokens_seen": 1600925696 }, { "epoch": 4.07, "learning_rate": 0.0002600902708124373, "loss": 2.7309, "theoretical_loss": 3.4960642502277675, "tokens_seen": 1600991232 }, { "epoch": 4.07, "learning_rate": 0.00026008024072216653, "loss": 2.653, "theoretical_loss": 3.4960517897863737, "tokens_seen": 1601056768 }, { "epoch": 4.07, "learning_rate": 0.00026007021063189566, "loss": 2.9097, "theoretical_loss": 3.496039329997818, "tokens_seen": 1601122304 }, { "epoch": 4.07, "learning_rate": 0.0002600601805416249, "loss": 2.7403, "theoretical_loss": 3.4960268708620386, "tokens_seen": 1601187840 }, { "epoch": 4.07, "learning_rate": 0.00026005015045135407, "loss": 2.7192, "theoretical_loss": 3.496014412378975, "tokens_seen": 1601253376 }, { "epoch": 4.07, "learning_rate": 0.00026004012036108325, "loss": 2.7732, "theoretical_loss": 3.496001954548567, "tokens_seen": 1601318912 }, { "epoch": 4.07, "learning_rate": 0.00026003009027081243, "loss": 2.9272, "theoretical_loss": 3.4959894973707524, "tokens_seen": 1601384448 }, { "epoch": 4.07, "learning_rate": 0.0002600200601805416, "loss": 2.7231, "theoretical_loss": 3.4959770408454713, "tokens_seen": 1601449984 }, { "epoch": 4.07, "learning_rate": 0.0002600100300902708, "loss": 2.8103, "theoretical_loss": 3.495964584972662, "tokens_seen": 1601515520 }, { "epoch": 4.07, "learning_rate": 0.00026000000000000003, "loss": 2.8673, "theoretical_loss": 3.495952129752265, "tokens_seen": 1601581056 }, { "epoch": 4.07, "learning_rate": 0.00025998996990972916, "loss": 2.816, "theoretical_loss": 3.4959396751842178, "tokens_seen": 1601646592 }, { "epoch": 4.07, "learning_rate": 0.0002599799398194584, "loss": 2.8153, "theoretical_loss": 3.4959272212684604, "tokens_seen": 1601712128 }, { "epoch": 4.07, "learning_rate": 0.0002599699097291876, "loss": 2.7974, "theoretical_loss": 3.495914768004932, "tokens_seen": 1601777664 }, { "epoch": 4.07, "learning_rate": 0.00025995987963891676, "loss": 2.7865, "theoretical_loss": 3.4959023153935718, "tokens_seen": 1601843200 }, { "epoch": 4.07, "learning_rate": 0.00025994984954864594, "loss": 2.7716, "theoretical_loss": 3.4958898634343183, "tokens_seen": 1601908736 }, { "epoch": 4.07, "learning_rate": 0.0002599398194583751, "loss": 2.9264, "theoretical_loss": 3.4958774121271112, "tokens_seen": 1601974272 }, { "epoch": 4.07, "learning_rate": 0.0002599297893681043, "loss": 2.8584, "theoretical_loss": 3.4958649614718897, "tokens_seen": 1602039808 }, { "epoch": 4.07, "learning_rate": 0.00025991975927783353, "loss": 2.7913, "theoretical_loss": 3.495852511468593, "tokens_seen": 1602105344 }, { "epoch": 4.07, "learning_rate": 0.00025990972918756266, "loss": 2.7841, "theoretical_loss": 3.4958400621171597, "tokens_seen": 1602170880 }, { "epoch": 4.07, "learning_rate": 0.0002598996990972919, "loss": 2.6024, "theoretical_loss": 3.49582761341753, "tokens_seen": 1602236416 }, { "epoch": 4.07, "objective/train/docs_used": 2555521, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.673626661300659, "objective/train/theoretical_loss": 3.4958151653696428, "objective/train/tokens_used": 1622761952, "theoretical_loss": 3.4958151653696428, "tokens_seen": 1602301952 }, { "epoch": 4.07, "learning_rate": 0.000259889669007021, "loss": 2.6047, "theoretical_loss": 3.4958151653696428, "tokens_seen": 1602301952 }, { "epoch": 4.07, "learning_rate": 0.00025987963891675026, "loss": 2.8946, "theoretical_loss": 3.4958027179734366, "tokens_seen": 1602367488 }, { "epoch": 4.07, "learning_rate": 0.00025986960882647944, "loss": 2.718, "theoretical_loss": 3.4957902712288513, "tokens_seen": 1602433024 }, { "epoch": 4.07, "learning_rate": 0.0002598595787362086, "loss": 2.9667, "theoretical_loss": 3.495777825135826, "tokens_seen": 1602498560 }, { "epoch": 4.07, "learning_rate": 0.0002598495486459378, "loss": 2.6845, "theoretical_loss": 3.4957653796943005, "tokens_seen": 1602564096 }, { "epoch": 4.07, "learning_rate": 0.000259839518555667, "loss": 2.5827, "theoretical_loss": 3.4957529349042127, "tokens_seen": 1602629632 }, { "epoch": 4.07, "learning_rate": 0.00025982948846539616, "loss": 2.8139, "theoretical_loss": 3.4957404907655034, "tokens_seen": 1602695168 }, { "epoch": 4.07, "learning_rate": 0.0002598194583751254, "loss": 2.7457, "theoretical_loss": 3.4957280472781105, "tokens_seen": 1602760704 }, { "epoch": 4.07, "learning_rate": 0.0002598094282848545, "loss": 2.525, "theoretical_loss": 3.4957156044419744, "tokens_seen": 1602826240 }, { "epoch": 4.07, "learning_rate": 0.00025979939819458376, "loss": 2.8497, "theoretical_loss": 3.4957031622570334, "tokens_seen": 1602891776 }, { "epoch": 4.07, "learning_rate": 0.00025978936810431294, "loss": 2.6935, "theoretical_loss": 3.4956907207232284, "tokens_seen": 1602957312 }, { "epoch": 4.07, "learning_rate": 0.0002597793380140421, "loss": 2.8605, "theoretical_loss": 3.4956782798404964, "tokens_seen": 1603022848 }, { "epoch": 4.07, "learning_rate": 0.0002597693079237713, "loss": 2.6323, "theoretical_loss": 3.4956658396087787, "tokens_seen": 1603088384 }, { "epoch": 4.07, "learning_rate": 0.0002597592778335005, "loss": 2.7798, "theoretical_loss": 3.4956534000280137, "tokens_seen": 1603153920 }, { "epoch": 4.07, "learning_rate": 0.0002597492477432297, "loss": 2.8273, "theoretical_loss": 3.4956409610981405, "tokens_seen": 1603219456 }, { "epoch": 4.07, "learning_rate": 0.0002597392176529589, "loss": 2.6481, "theoretical_loss": 3.4956285228190995, "tokens_seen": 1603284992 }, { "epoch": 4.07, "learning_rate": 0.0002597291875626881, "loss": 2.9425, "theoretical_loss": 3.4956160851908287, "tokens_seen": 1603350528 }, { "epoch": 4.07, "learning_rate": 0.00025971915747241727, "loss": 2.7484, "theoretical_loss": 3.495603648213269, "tokens_seen": 1603416064 }, { "epoch": 4.07, "learning_rate": 0.00025970912738214645, "loss": 2.7188, "theoretical_loss": 3.495591211886358, "tokens_seen": 1603481600 }, { "epoch": 4.07, "learning_rate": 0.00025969909729187563, "loss": 2.6788, "theoretical_loss": 3.4955787762100363, "tokens_seen": 1603547136 }, { "epoch": 4.07, "learning_rate": 0.00025968906720160486, "loss": 2.9058, "theoretical_loss": 3.495566341184243, "tokens_seen": 1603612672 }, { "epoch": 4.07, "learning_rate": 0.000259679037111334, "loss": 2.6793, "theoretical_loss": 3.4955539068089174, "tokens_seen": 1603678208 }, { "epoch": 4.07, "learning_rate": 0.0002596690070210632, "loss": 2.6895, "theoretical_loss": 3.495541473083999, "tokens_seen": 1603743744 }, { "epoch": 4.07, "learning_rate": 0.00025965897693079235, "loss": 2.679, "theoretical_loss": 3.495529040009427, "tokens_seen": 1603809280 }, { "epoch": 4.07, "learning_rate": 0.0002596489468405216, "loss": 2.7134, "theoretical_loss": 3.4955166075851407, "tokens_seen": 1603874816 }, { "epoch": 4.07, "objective/train/docs_used": 2558144, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.695502996444702, "objective/train/theoretical_loss": 3.4955041758110803, "objective/train/tokens_used": 1624400352, "theoretical_loss": 3.4955041758110803, "tokens_seen": 1603940352 }, { "epoch": 4.07, "learning_rate": 0.00025963891675025077, "loss": 2.6919, "theoretical_loss": 3.4955041758110803, "tokens_seen": 1603940352 }, { "epoch": 4.07, "learning_rate": 0.00025962888665997995, "loss": 2.6863, "theoretical_loss": 3.4954917446871843, "tokens_seen": 1604005888 }, { "epoch": 4.07, "learning_rate": 0.00025961885656970913, "loss": 2.6283, "theoretical_loss": 3.4954793142133926, "tokens_seen": 1604071424 }, { "epoch": 4.07, "learning_rate": 0.00025960882647943837, "loss": 2.6639, "theoretical_loss": 3.4954668843896446, "tokens_seen": 1604136960 }, { "epoch": 4.07, "learning_rate": 0.0002595987963891675, "loss": 2.8449, "theoretical_loss": 3.4954544552158797, "tokens_seen": 1604202496 }, { "epoch": 4.07, "learning_rate": 0.00025958876629889673, "loss": 2.8003, "theoretical_loss": 3.495442026692037, "tokens_seen": 1604268032 }, { "epoch": 4.07, "learning_rate": 0.00025957873620862586, "loss": 2.8227, "theoretical_loss": 3.495429598818057, "tokens_seen": 1604333568 }, { "epoch": 4.07, "learning_rate": 0.0002595687061183551, "loss": 2.6591, "theoretical_loss": 3.495417171593878, "tokens_seen": 1604399104 }, { "epoch": 4.07, "learning_rate": 0.00025955867602808427, "loss": 2.7177, "theoretical_loss": 3.49540474501944, "tokens_seen": 1604464640 }, { "epoch": 4.07, "learning_rate": 0.00025954864593781345, "loss": 2.722, "theoretical_loss": 3.495392319094683, "tokens_seen": 1604530176 }, { "epoch": 4.07, "learning_rate": 0.00025953861584754263, "loss": 2.9484, "theoretical_loss": 3.4953798938195453, "tokens_seen": 1604595712 }, { "epoch": 4.07, "learning_rate": 0.0002595285857572718, "loss": 2.7938, "theoretical_loss": 3.4953674691939676, "tokens_seen": 1604661248 }, { "epoch": 4.07, "learning_rate": 0.000259518555667001, "loss": 2.797, "theoretical_loss": 3.495355045217889, "tokens_seen": 1604726784 }, { "epoch": 4.07, "learning_rate": 0.00025950852557673023, "loss": 2.7798, "theoretical_loss": 3.4953426218912487, "tokens_seen": 1604792320 }, { "epoch": 4.07, "learning_rate": 0.00025949849548645936, "loss": 2.817, "theoretical_loss": 3.4953301992139862, "tokens_seen": 1604857856 }, { "epoch": 4.07, "learning_rate": 0.0002594884653961886, "loss": 2.8386, "theoretical_loss": 3.4953177771860418, "tokens_seen": 1604923392 }, { "epoch": 4.07, "learning_rate": 0.0002594784353059178, "loss": 2.7218, "theoretical_loss": 3.4953053558073544, "tokens_seen": 1604988928 }, { "epoch": 4.07, "learning_rate": 0.00025946840521564696, "loss": 2.8442, "theoretical_loss": 3.495292935077863, "tokens_seen": 1605054464 }, { "epoch": 4.07, "learning_rate": 0.00025945837512537614, "loss": 2.7423, "theoretical_loss": 3.495280514997509, "tokens_seen": 1605120000 }, { "epoch": 4.07, "learning_rate": 0.0002594483450351053, "loss": 2.6689, "theoretical_loss": 3.4952680955662303, "tokens_seen": 1605185536 }, { "epoch": 4.07, "learning_rate": 0.0002594383149448345, "loss": 2.7586, "theoretical_loss": 3.495255676783967, "tokens_seen": 1605251072 }, { "epoch": 4.07, "learning_rate": 0.00025942828485456373, "loss": 2.7187, "theoretical_loss": 3.4952432586506585, "tokens_seen": 1605316608 }, { "epoch": 4.07, "learning_rate": 0.00025941825476429286, "loss": 2.8395, "theoretical_loss": 3.4952308411662454, "tokens_seen": 1605382144 }, { "epoch": 4.07, "learning_rate": 0.0002594082246740221, "loss": 2.8476, "theoretical_loss": 3.495218424330666, "tokens_seen": 1605447680 }, { "epoch": 4.07, "learning_rate": 0.0002593981945837512, "loss": 2.6949, "theoretical_loss": 3.4952060081438603, "tokens_seen": 1605513216 }, { "epoch": 4.07, "objective/train/docs_used": 2560974, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.914144277572632, "objective/train/theoretical_loss": 3.4951935926057685, "objective/train/tokens_used": 1626038752, "theoretical_loss": 3.4951935926057685, "tokens_seen": 1605578752 }, { "epoch": 4.07, "learning_rate": 0.00025938816449348046, "loss": 2.8123, "theoretical_loss": 3.4951935926057685, "tokens_seen": 1605578752 }, { "epoch": 4.07, "learning_rate": 0.00025937813440320964, "loss": 2.7247, "theoretical_loss": 3.4951811777163293, "tokens_seen": 1605644288 }, { "epoch": 4.07, "learning_rate": 0.0002593681043129388, "loss": 2.8554, "theoretical_loss": 3.495168763475483, "tokens_seen": 1605709824 }, { "epoch": 4.07, "learning_rate": 0.000259358074222668, "loss": 2.6561, "theoretical_loss": 3.4951563498831693, "tokens_seen": 1605775360 }, { "epoch": 4.07, "learning_rate": 0.0002593480441323972, "loss": 2.8229, "theoretical_loss": 3.4951439369393276, "tokens_seen": 1605840896 }, { "epoch": 4.07, "learning_rate": 0.00025933801404212636, "loss": 2.7922, "theoretical_loss": 3.495131524643897, "tokens_seen": 1605906432 }, { "epoch": 4.07, "learning_rate": 0.0002593279839518556, "loss": 2.7514, "theoretical_loss": 3.4951191129968184, "tokens_seen": 1605971968 }, { "epoch": 4.07, "learning_rate": 0.0002593179538615847, "loss": 2.5531, "theoretical_loss": 3.4951067019980306, "tokens_seen": 1606037504 }, { "epoch": 4.07, "learning_rate": 0.00025930792377131396, "loss": 2.7237, "theoretical_loss": 3.4950942916474736, "tokens_seen": 1606103040 }, { "epoch": 4.07, "learning_rate": 0.00025929789368104314, "loss": 2.6952, "theoretical_loss": 3.495081881945087, "tokens_seen": 1606168576 }, { "epoch": 4.07, "learning_rate": 0.0002592878635907723, "loss": 2.9099, "theoretical_loss": 3.4950694728908105, "tokens_seen": 1606234112 }, { "epoch": 4.07, "learning_rate": 0.0002592778335005015, "loss": 2.6877, "theoretical_loss": 3.495057064484584, "tokens_seen": 1606299648 }, { "epoch": 4.07, "learning_rate": 0.0002592678034102307, "loss": 2.8238, "theoretical_loss": 3.4950446567263462, "tokens_seen": 1606365184 }, { "epoch": 4.07, "learning_rate": 0.00025925777331995987, "loss": 2.9104, "theoretical_loss": 3.495032249616039, "tokens_seen": 1606430720 }, { "epoch": 4.07, "learning_rate": 0.0002592477432296891, "loss": 2.9416, "theoretical_loss": 3.4950198431536, "tokens_seen": 1606496256 }, { "epoch": 4.07, "learning_rate": 0.00025923771313941823, "loss": 2.88, "theoretical_loss": 3.4950074373389697, "tokens_seen": 1606561792 }, { "epoch": 4.07, "learning_rate": 0.00025922768304914747, "loss": 2.7936, "theoretical_loss": 3.4949950321720884, "tokens_seen": 1606627328 }, { "epoch": 4.07, "learning_rate": 0.0002592176529588766, "loss": 2.7557, "theoretical_loss": 3.494982627652895, "tokens_seen": 1606692864 }, { "epoch": 4.07, "learning_rate": 0.00025920762286860583, "loss": 2.6418, "theoretical_loss": 3.4949702237813294, "tokens_seen": 1606758400 }, { "epoch": 4.07, "learning_rate": 0.000259197592778335, "loss": 2.7584, "theoretical_loss": 3.4949578205573317, "tokens_seen": 1606823936 }, { "epoch": 4.07, "learning_rate": 0.0002591875626880642, "loss": 2.9235, "theoretical_loss": 3.494945417980842, "tokens_seen": 1606889472 }, { "epoch": 4.07, "learning_rate": 0.00025917753259779337, "loss": 2.8089, "theoretical_loss": 3.4949330160517995, "tokens_seen": 1606955008 }, { "epoch": 4.07, "learning_rate": 0.00025916750250752255, "loss": 2.7596, "theoretical_loss": 3.4949206147701437, "tokens_seen": 1607020544 }, { "epoch": 4.07, "learning_rate": 0.00025915747241725173, "loss": 2.6746, "theoretical_loss": 3.4949082141358154, "tokens_seen": 1607086080 }, { "epoch": 4.07, "learning_rate": 0.00025914744232698097, "loss": 2.8439, "theoretical_loss": 3.494895814148754, "tokens_seen": 1607151616 }, { "epoch": 4.07, "objective/train/docs_used": 2563705, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0043528079986572, "objective/train/theoretical_loss": 3.494883414808899, "objective/train/tokens_used": 1627677152, "theoretical_loss": 3.494883414808899, "tokens_seen": 1607217152 }, { "epoch": 4.07, "learning_rate": 0.0002591374122367101, "loss": 3.0015, "theoretical_loss": 3.494883414808899, "tokens_seen": 1607217152 }, { "epoch": 4.07, "learning_rate": 0.00025912738214643933, "loss": 2.5105, "theoretical_loss": 3.4948710161161904, "tokens_seen": 1607282688 }, { "epoch": 4.07, "learning_rate": 0.0002591173520561685, "loss": 2.6322, "theoretical_loss": 3.494858618070568, "tokens_seen": 1607348224 }, { "epoch": 4.07, "learning_rate": 0.0002591073219658977, "loss": 2.7902, "theoretical_loss": 3.494846220671972, "tokens_seen": 1607413760 }, { "epoch": 4.07, "learning_rate": 0.0002590972918756269, "loss": 2.8131, "theoretical_loss": 3.4948338239203416, "tokens_seen": 1607479296 }, { "epoch": 4.07, "learning_rate": 0.00025908726178535606, "loss": 2.8969, "theoretical_loss": 3.4948214278156176, "tokens_seen": 1607544832 }, { "epoch": 4.07, "learning_rate": 0.00025907723169508524, "loss": 2.7064, "theoretical_loss": 3.4948090323577388, "tokens_seen": 1607610368 }, { "epoch": 4.07, "learning_rate": 0.00025906720160481447, "loss": 2.8495, "theoretical_loss": 3.4947966375466457, "tokens_seen": 1607675904 }, { "epoch": 4.07, "learning_rate": 0.0002590571715145436, "loss": 2.6864, "theoretical_loss": 3.4947842433822784, "tokens_seen": 1607741440 }, { "epoch": 4.07, "learning_rate": 0.00025904714142427283, "loss": 2.5787, "theoretical_loss": 3.4947718498645766, "tokens_seen": 1607806976 }, { "epoch": 4.07, "learning_rate": 0.00025903711133400196, "loss": 2.6457, "theoretical_loss": 3.4947594569934797, "tokens_seen": 1607872512 }, { "epoch": 4.07, "learning_rate": 0.0002590270812437312, "loss": 2.6028, "theoretical_loss": 3.4947470647689283, "tokens_seen": 1607938048 }, { "epoch": 4.07, "learning_rate": 0.0002590170511534604, "loss": 2.7626, "theoretical_loss": 3.4947346731908615, "tokens_seen": 1608003584 }, { "epoch": 4.07, "learning_rate": 0.00025900702106318956, "loss": 2.8055, "theoretical_loss": 3.49472228225922, "tokens_seen": 1608069120 }, { "epoch": 4.07, "learning_rate": 0.0002589969909729188, "loss": 2.7829, "theoretical_loss": 3.494709891973944, "tokens_seen": 1608134656 }, { "epoch": 4.07, "learning_rate": 0.000258986960882648, "loss": 2.8495, "theoretical_loss": 3.4946975023349722, "tokens_seen": 1608200192 }, { "epoch": 4.07, "learning_rate": 0.00025897693079237716, "loss": 2.5408, "theoretical_loss": 3.4946851133422454, "tokens_seen": 1608265728 }, { "epoch": 4.07, "learning_rate": 0.00025896690070210634, "loss": 2.8186, "theoretical_loss": 3.494672724995704, "tokens_seen": 1608331264 }, { "epoch": 4.07, "learning_rate": 0.0002589568706118355, "loss": 2.7345, "theoretical_loss": 3.4946603372952865, "tokens_seen": 1608396800 }, { "epoch": 4.07, "learning_rate": 0.0002589468405215647, "loss": 2.6536, "theoretical_loss": 3.4946479502409344, "tokens_seen": 1608462336 }, { "epoch": 4.07, "learning_rate": 0.00025893681043129393, "loss": 2.7286, "theoretical_loss": 3.4946355638325874, "tokens_seen": 1608527872 }, { "epoch": 4.07, "learning_rate": 0.00025892678034102306, "loss": 2.8428, "theoretical_loss": 3.4946231780701846, "tokens_seen": 1608593408 }, { "epoch": 4.07, "learning_rate": 0.0002589167502507523, "loss": 2.5465, "theoretical_loss": 3.494610792953667, "tokens_seen": 1608658944 }, { "epoch": 4.07, "learning_rate": 0.0002589067201604814, "loss": 2.7843, "theoretical_loss": 3.4945984084829735, "tokens_seen": 1608724480 }, { "epoch": 4.07, "learning_rate": 0.00025889669007021066, "loss": 2.6292, "theoretical_loss": 3.4945860246580454, "tokens_seen": 1608790016 }, { "epoch": 4.07, "objective/train/docs_used": 2564936, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7150800228118896, "objective/train/theoretical_loss": 3.4945736414788215, "objective/train/tokens_used": 1629315552, "theoretical_loss": 3.4945736414788215, "tokens_seen": 1608855552 }, { "epoch": 4.07, "learning_rate": 0.00025888665997993984, "loss": 2.764, "theoretical_loss": 3.4945736414788215, "tokens_seen": 1608855552 }, { "epoch": 4.07, "learning_rate": 0.000258876629889669, "loss": 2.8905, "theoretical_loss": 3.494561258945243, "tokens_seen": 1608921088 }, { "epoch": 4.07, "learning_rate": 0.0002588665997993982, "loss": 2.8878, "theoretical_loss": 3.494548877057249, "tokens_seen": 1608986624 }, { "epoch": 4.07, "learning_rate": 0.0002588565697091274, "loss": 2.719, "theoretical_loss": 3.49453649581478, "tokens_seen": 1609052160 }, { "epoch": 4.07, "learning_rate": 0.00025884653961885657, "loss": 2.8972, "theoretical_loss": 3.494524115217776, "tokens_seen": 1609117696 }, { "epoch": 4.07, "learning_rate": 0.0002588365095285858, "loss": 2.566, "theoretical_loss": 3.494511735266177, "tokens_seen": 1609183232 }, { "epoch": 4.07, "learning_rate": 0.00025882647943831493, "loss": 2.5402, "theoretical_loss": 3.494499355959923, "tokens_seen": 1609248768 }, { "epoch": 4.07, "learning_rate": 0.00025881644934804416, "loss": 2.7147, "theoretical_loss": 3.494486977298955, "tokens_seen": 1609314304 }, { "epoch": 4.07, "learning_rate": 0.00025880641925777334, "loss": 2.8891, "theoretical_loss": 3.494474599283212, "tokens_seen": 1609379840 }, { "epoch": 4.07, "learning_rate": 0.0002587963891675025, "loss": 2.661, "theoretical_loss": 3.4944622219126336, "tokens_seen": 1609445376 }, { "epoch": 4.07, "learning_rate": 0.0002587863590772317, "loss": 2.656, "theoretical_loss": 3.4944498451871615, "tokens_seen": 1609510912 }, { "epoch": 4.07, "learning_rate": 0.0002587763289869609, "loss": 2.8132, "theoretical_loss": 3.4944374691067344, "tokens_seen": 1609576448 }, { "epoch": 4.07, "learning_rate": 0.00025876629889669007, "loss": 2.8165, "theoretical_loss": 3.4944250936712935, "tokens_seen": 1609641984 }, { "epoch": 4.07, "learning_rate": 0.0002587562688064193, "loss": 2.7964, "theoretical_loss": 3.494412718880778, "tokens_seen": 1609707520 }, { "epoch": 4.07, "learning_rate": 0.00025874623871614843, "loss": 2.7716, "theoretical_loss": 3.494400344735129, "tokens_seen": 1609773056 }, { "epoch": 4.07, "learning_rate": 0.00025873620862587767, "loss": 2.8476, "theoretical_loss": 3.4943879712342856, "tokens_seen": 1609838592 }, { "epoch": 4.07, "learning_rate": 0.0002587261785356068, "loss": 2.54, "theoretical_loss": 3.4943755983781886, "tokens_seen": 1609904128 }, { "epoch": 4.07, "learning_rate": 0.00025871614844533603, "loss": 2.7695, "theoretical_loss": 3.4943632261667785, "tokens_seen": 1609969664 }, { "epoch": 4.07, "learning_rate": 0.0002587061183550652, "loss": 2.75, "theoretical_loss": 3.4943508545999946, "tokens_seen": 1610035200 }, { "epoch": 4.07, "learning_rate": 0.0002586960882647944, "loss": 2.8404, "theoretical_loss": 3.4943384836777778, "tokens_seen": 1610100736 }, { "epoch": 4.07, "learning_rate": 0.00025868605817452357, "loss": 2.6369, "theoretical_loss": 3.4943261134000676, "tokens_seen": 1610166272 }, { "epoch": 4.07, "learning_rate": 0.00025867602808425275, "loss": 2.7154, "theoretical_loss": 3.4943137437668046, "tokens_seen": 1610231808 }, { "epoch": 4.07, "learning_rate": 0.00025866599799398193, "loss": 2.686, "theoretical_loss": 3.494301374777929, "tokens_seen": 1610297344 }, { "epoch": 4.07, "learning_rate": 0.00025865596790371117, "loss": 2.8082, "theoretical_loss": 3.494289006433381, "tokens_seen": 1610362880 }, { "epoch": 4.07, "learning_rate": 0.0002586459378134403, "loss": 2.6395, "theoretical_loss": 3.4942766387331003, "tokens_seen": 1610428416 }, { "epoch": 4.07, "objective/train/docs_used": 2567938, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.750584840774536, "objective/train/theoretical_loss": 3.4942642716770287, "objective/train/tokens_used": 1630953952, "theoretical_loss": 3.4942642716770287, "tokens_seen": 1610493952 }, { "epoch": 4.07, "learning_rate": 0.00025863590772316953, "loss": 2.7508, "theoretical_loss": 3.4942642716770287, "tokens_seen": 1610493952 }, { "epoch": 4.07, "learning_rate": 0.0002586258776328987, "loss": 2.7335, "theoretical_loss": 3.4942519052651044, "tokens_seen": 1610559488 }, { "epoch": 4.07, "learning_rate": 0.0002586158475426279, "loss": 2.8503, "theoretical_loss": 3.494239539497269, "tokens_seen": 1610625024 }, { "epoch": 4.07, "learning_rate": 0.0002586058174523571, "loss": 2.7116, "theoretical_loss": 3.494227174373462, "tokens_seen": 1610690560 }, { "epoch": 4.07, "learning_rate": 0.00025859578736208626, "loss": 2.8448, "theoretical_loss": 3.4942148098936245, "tokens_seen": 1610756096 }, { "epoch": 4.07, "learning_rate": 0.00025858575727181544, "loss": 2.9028, "theoretical_loss": 3.494202446057696, "tokens_seen": 1610821632 }, { "epoch": 4.07, "learning_rate": 0.00025857572718154467, "loss": 2.5954, "theoretical_loss": 3.494190082865617, "tokens_seen": 1610887168 }, { "epoch": 4.07, "learning_rate": 0.0002585656970912738, "loss": 2.7479, "theoretical_loss": 3.494177720317327, "tokens_seen": 1610952704 }, { "epoch": 4.07, "learning_rate": 0.00025855566700100303, "loss": 2.8812, "theoretical_loss": 3.4941653584127685, "tokens_seen": 1611018240 }, { "epoch": 4.07, "learning_rate": 0.00025854563691073216, "loss": 2.4944, "theoretical_loss": 3.49415299715188, "tokens_seen": 1611083776 }, { "epoch": 4.07, "learning_rate": 0.0002585356068204614, "loss": 2.7613, "theoretical_loss": 3.4941406365346017, "tokens_seen": 1611149312 }, { "epoch": 4.07, "learning_rate": 0.0002585255767301906, "loss": 2.688, "theoretical_loss": 3.4941282765608745, "tokens_seen": 1611214848 }, { "epoch": 4.07, "learning_rate": 0.00025851554663991976, "loss": 2.7159, "theoretical_loss": 3.4941159172306384, "tokens_seen": 1611280384 }, { "epoch": 4.07, "learning_rate": 0.00025850551654964894, "loss": 2.9137, "theoretical_loss": 3.4941035585438343, "tokens_seen": 1611345920 }, { "epoch": 4.07, "learning_rate": 0.0002584954864593782, "loss": 2.9319, "theoretical_loss": 3.494091200500402, "tokens_seen": 1611411456 }, { "epoch": 4.07, "learning_rate": 0.0002584854563691073, "loss": 2.746, "theoretical_loss": 3.494078843100282, "tokens_seen": 1611476992 }, { "epoch": 4.07, "learning_rate": 0.00025847542627883654, "loss": 2.8553, "theoretical_loss": 3.494066486343415, "tokens_seen": 1611542528 }, { "epoch": 4.07, "learning_rate": 0.00025846539618856566, "loss": 2.5207, "theoretical_loss": 3.494054130229741, "tokens_seen": 1611608064 }, { "epoch": 4.07, "learning_rate": 0.0002584553660982949, "loss": 2.7975, "theoretical_loss": 3.4940417747592, "tokens_seen": 1611673600 }, { "epoch": 4.07, "learning_rate": 0.0002584453360080241, "loss": 2.7355, "theoretical_loss": 3.494029419931733, "tokens_seen": 1611739136 }, { "epoch": 4.07, "learning_rate": 0.00025843530591775326, "loss": 2.6999, "theoretical_loss": 3.49401706574728, "tokens_seen": 1611804672 }, { "epoch": 4.07, "learning_rate": 0.00025842527582748244, "loss": 2.7882, "theoretical_loss": 3.494004712205782, "tokens_seen": 1611870208 }, { "epoch": 4.07, "learning_rate": 0.0002584152457372116, "loss": 2.69, "theoretical_loss": 3.493992359307178, "tokens_seen": 1611935744 }, { "epoch": 4.07, "learning_rate": 0.0002584052156469408, "loss": 2.7722, "theoretical_loss": 3.4939800070514107, "tokens_seen": 1612001280 }, { "epoch": 4.07, "learning_rate": 0.00025839518555667004, "loss": 2.6927, "theoretical_loss": 3.493967655438418, "tokens_seen": 1612066816 }, { "epoch": 4.07, "objective/train/docs_used": 2570818, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7817633152008057, "objective/train/theoretical_loss": 3.493955304468142, "objective/train/tokens_used": 1632592352, "theoretical_loss": 3.493955304468142, "tokens_seen": 1612132352 }, { "epoch": 4.07, "learning_rate": 0.00025838515546639917, "loss": 2.7704, "theoretical_loss": 3.493955304468142, "tokens_seen": 1612132352 }, { "epoch": 4.07, "learning_rate": 0.0002583751253761284, "loss": 2.8385, "theoretical_loss": 3.4939429541405227, "tokens_seen": 1612197888 }, { "epoch": 4.07, "learning_rate": 0.00025836509528585753, "loss": 2.8993, "theoretical_loss": 3.4939306044555, "tokens_seen": 1612263424 }, { "epoch": 4.07, "learning_rate": 0.00025835506519558677, "loss": 2.8169, "theoretical_loss": 3.493918255413015, "tokens_seen": 1612328960 }, { "epoch": 4.07, "learning_rate": 0.00025834503510531595, "loss": 2.8724, "theoretical_loss": 3.4939059070130085, "tokens_seen": 1612394496 }, { "epoch": 4.07, "learning_rate": 0.00025833500501504513, "loss": 2.7622, "theoretical_loss": 3.4938935592554197, "tokens_seen": 1612460032 }, { "epoch": 4.07, "learning_rate": 0.0002583249749247743, "loss": 2.8452, "theoretical_loss": 3.49388121214019, "tokens_seen": 1612525568 }, { "epoch": 4.07, "learning_rate": 0.00025831494483450354, "loss": 2.9211, "theoretical_loss": 3.4938688656672596, "tokens_seen": 1612591104 }, { "epoch": 4.07, "learning_rate": 0.00025830491474423267, "loss": 2.7183, "theoretical_loss": 3.4938565198365694, "tokens_seen": 1612656640 }, { "epoch": 4.07, "learning_rate": 0.0002582948846539619, "loss": 2.7813, "theoretical_loss": 3.4938441746480597, "tokens_seen": 1612722176 }, { "epoch": 4.07, "learning_rate": 0.00025828485456369103, "loss": 2.9565, "theoretical_loss": 3.49383183010167, "tokens_seen": 1612787712 }, { "epoch": 4.07, "learning_rate": 0.00025827482447342027, "loss": 2.7292, "theoretical_loss": 3.493819486197342, "tokens_seen": 1612853248 }, { "epoch": 4.07, "learning_rate": 0.00025826479438314945, "loss": 2.7943, "theoretical_loss": 3.493807142935016, "tokens_seen": 1612918784 }, { "epoch": 4.07, "learning_rate": 0.00025825476429287863, "loss": 2.5199, "theoretical_loss": 3.4937948003146326, "tokens_seen": 1612984320 }, { "epoch": 4.07, "learning_rate": 0.00025824473420260787, "loss": 2.8418, "theoretical_loss": 3.493782458336132, "tokens_seen": 1613049856 }, { "epoch": 4.07, "learning_rate": 0.000258234704112337, "loss": 2.8003, "theoretical_loss": 3.493770116999455, "tokens_seen": 1613115392 }, { "epoch": 4.07, "learning_rate": 0.00025822467402206623, "loss": 2.7679, "theoretical_loss": 3.493757776304542, "tokens_seen": 1613180928 }, { "epoch": 4.07, "learning_rate": 0.0002582146439317954, "loss": 2.7183, "theoretical_loss": 3.4937454362513334, "tokens_seen": 1613246464 }, { "epoch": 4.07, "learning_rate": 0.0002582046138415246, "loss": 2.7404, "theoretical_loss": 3.49373309683977, "tokens_seen": 1613312000 }, { "epoch": 4.07, "learning_rate": 0.00025819458375125377, "loss": 2.7119, "theoretical_loss": 3.4937207580697924, "tokens_seen": 1613377536 }, { "epoch": 4.07, "learning_rate": 0.00025818455366098295, "loss": 2.6862, "theoretical_loss": 3.493708419941341, "tokens_seen": 1613443072 }, { "epoch": 4.07, "learning_rate": 0.00025817452357071213, "loss": 2.8558, "theoretical_loss": 3.493696082454357, "tokens_seen": 1613508608 }, { "epoch": 4.07, "learning_rate": 0.00025816449348044137, "loss": 2.7634, "theoretical_loss": 3.49368374560878, "tokens_seen": 1613574144 }, { "epoch": 4.07, "learning_rate": 0.0002581544633901705, "loss": 2.8876, "theoretical_loss": 3.493671409404551, "tokens_seen": 1613639680 }, { "epoch": 4.07, "learning_rate": 0.00025814443329989973, "loss": 2.8852, "theoretical_loss": 3.493659073841611, "tokens_seen": 1613705216 }, { "epoch": 4.07, "objective/train/docs_used": 2573693, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9040908813476562, "objective/train/theoretical_loss": 3.4936467389199004, "objective/train/tokens_used": 1634230752, "theoretical_loss": 3.4936467389199004, "tokens_seen": 1613770752 }, { "epoch": 4.07, "learning_rate": 0.0002581344032096289, "loss": 2.8925, "theoretical_loss": 3.4936467389199004, "tokens_seen": 1613770752 }, { "epoch": 4.07, "learning_rate": 0.0002581243731193581, "loss": 2.8633, "theoretical_loss": 3.4936344046393595, "tokens_seen": 1613836288 }, { "epoch": 4.07, "learning_rate": 0.0002581143430290873, "loss": 2.7066, "theoretical_loss": 3.4936220709999293, "tokens_seen": 1613901824 }, { "epoch": 4.07, "learning_rate": 0.00025810431293881646, "loss": 2.765, "theoretical_loss": 3.4936097380015503, "tokens_seen": 1613967360 }, { "epoch": 4.07, "learning_rate": 0.00025809428284854564, "loss": 2.5781, "theoretical_loss": 3.493597405644163, "tokens_seen": 1614032896 }, { "epoch": 4.07, "learning_rate": 0.00025808425275827487, "loss": 2.722, "theoretical_loss": 3.493585073927709, "tokens_seen": 1614098432 }, { "epoch": 4.07, "learning_rate": 0.000258074222668004, "loss": 2.8528, "theoretical_loss": 3.4935727428521277, "tokens_seen": 1614163968 }, { "epoch": 4.07, "learning_rate": 0.00025806419257773323, "loss": 2.858, "theoretical_loss": 3.4935604124173603, "tokens_seen": 1614229504 }, { "epoch": 4.07, "learning_rate": 0.00025805416248746236, "loss": 2.8583, "theoretical_loss": 3.4935480826233474, "tokens_seen": 1614295040 }, { "epoch": 4.08, "learning_rate": 0.0002580441323971916, "loss": 2.7356, "theoretical_loss": 3.49353575347003, "tokens_seen": 1614360576 }, { "epoch": 4.08, "learning_rate": 0.0002580341023069208, "loss": 2.6257, "theoretical_loss": 3.493523424957348, "tokens_seen": 1614426112 }, { "epoch": 4.08, "learning_rate": 0.00025802407221664996, "loss": 2.8355, "theoretical_loss": 3.4935110970852437, "tokens_seen": 1614491648 }, { "epoch": 4.08, "learning_rate": 0.00025801404212637914, "loss": 2.6662, "theoretical_loss": 3.493498769853656, "tokens_seen": 1614557184 }, { "epoch": 4.08, "learning_rate": 0.0002580040120361084, "loss": 2.89, "theoretical_loss": 3.493486443262527, "tokens_seen": 1614622720 }, { "epoch": 4.08, "learning_rate": 0.0002579939819458375, "loss": 2.4684, "theoretical_loss": 3.4934741173117967, "tokens_seen": 1614688256 }, { "epoch": 4.08, "learning_rate": 0.00025798395185556674, "loss": 2.6809, "theoretical_loss": 3.4934617920014057, "tokens_seen": 1614753792 }, { "epoch": 4.08, "learning_rate": 0.00025797392176529586, "loss": 2.8481, "theoretical_loss": 3.4934494673312955, "tokens_seen": 1614819328 }, { "epoch": 4.08, "learning_rate": 0.0002579638916750251, "loss": 2.8603, "theoretical_loss": 3.4934371433014055, "tokens_seen": 1614884864 }, { "epoch": 4.08, "learning_rate": 0.0002579538615847543, "loss": 2.75, "theoretical_loss": 3.493424819911678, "tokens_seen": 1614950400 }, { "epoch": 4.08, "learning_rate": 0.00025794383149448346, "loss": 2.8215, "theoretical_loss": 3.493412497162053, "tokens_seen": 1615015936 }, { "epoch": 4.08, "learning_rate": 0.00025793380140421264, "loss": 2.8627, "theoretical_loss": 3.493400175052472, "tokens_seen": 1615081472 }, { "epoch": 4.08, "learning_rate": 0.0002579237713139418, "loss": 2.5863, "theoretical_loss": 3.493387853582875, "tokens_seen": 1615147008 }, { "epoch": 4.08, "learning_rate": 0.000257913741223671, "loss": 2.9129, "theoretical_loss": 3.493375532753203, "tokens_seen": 1615212544 }, { "epoch": 4.08, "learning_rate": 0.00025790371113340024, "loss": 2.9665, "theoretical_loss": 3.4933632125633958, "tokens_seen": 1615278080 }, { "epoch": 4.08, "learning_rate": 0.00025789368104312937, "loss": 2.7603, "theoretical_loss": 3.493350893013396, "tokens_seen": 1615343616 }, { "epoch": 4.08, "objective/train/docs_used": 2576581, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8399438858032227, "objective/train/theoretical_loss": 3.4933385741031437, "objective/train/tokens_used": 1635869152, "theoretical_loss": 3.4933385741031437, "tokens_seen": 1615409152 }, { "epoch": 4.08, "learning_rate": 0.0002578836509528586, "loss": 2.8101, "theoretical_loss": 3.4933385741031437, "tokens_seen": 1615409152 }, { "epoch": 4.08, "learning_rate": 0.00025787362086258773, "loss": 2.8263, "theoretical_loss": 3.4933262558325797, "tokens_seen": 1615474688 }, { "epoch": 4.08, "learning_rate": 0.00025786359077231697, "loss": 2.8926, "theoretical_loss": 3.4933139382016445, "tokens_seen": 1615540224 }, { "epoch": 4.08, "learning_rate": 0.00025785356068204615, "loss": 2.6699, "theoretical_loss": 3.4933016212102794, "tokens_seen": 1615605760 }, { "epoch": 4.08, "learning_rate": 0.00025784353059177533, "loss": 2.9264, "theoretical_loss": 3.493289304858425, "tokens_seen": 1615671296 }, { "epoch": 4.08, "learning_rate": 0.0002578335005015045, "loss": 2.6084, "theoretical_loss": 3.4932769891460222, "tokens_seen": 1615736832 }, { "epoch": 4.08, "learning_rate": 0.00025782347041123374, "loss": 2.7854, "theoretical_loss": 3.493264674073012, "tokens_seen": 1615802368 }, { "epoch": 4.08, "learning_rate": 0.00025781344032096287, "loss": 2.8198, "theoretical_loss": 3.4932523596393352, "tokens_seen": 1615867904 }, { "epoch": 4.08, "learning_rate": 0.0002578034102306921, "loss": 2.725, "theoretical_loss": 3.4932400458449324, "tokens_seen": 1615933440 }, { "epoch": 4.08, "learning_rate": 0.00025779338014042123, "loss": 2.7468, "theoretical_loss": 3.4932277326897454, "tokens_seen": 1615998976 }, { "epoch": 4.08, "learning_rate": 0.00025778335005015047, "loss": 2.7756, "theoretical_loss": 3.493215420173714, "tokens_seen": 1616064512 }, { "epoch": 4.08, "learning_rate": 0.00025777331995987965, "loss": 2.835, "theoretical_loss": 3.4932031082967794, "tokens_seen": 1616130048 }, { "epoch": 4.08, "learning_rate": 0.00025776328986960883, "loss": 2.658, "theoretical_loss": 3.4931907970588827, "tokens_seen": 1616195584 }, { "epoch": 4.08, "learning_rate": 0.000257753259779338, "loss": 2.8131, "theoretical_loss": 3.493178486459965, "tokens_seen": 1616261120 }, { "epoch": 4.08, "learning_rate": 0.0002577432296890672, "loss": 2.7117, "theoretical_loss": 3.493166176499967, "tokens_seen": 1616326656 }, { "epoch": 4.08, "learning_rate": 0.0002577331995987964, "loss": 2.8885, "theoretical_loss": 3.4931538671788296, "tokens_seen": 1616392192 }, { "epoch": 4.08, "learning_rate": 0.0002577231695085256, "loss": 2.8049, "theoretical_loss": 3.4931415584964935, "tokens_seen": 1616457728 }, { "epoch": 4.08, "learning_rate": 0.00025771313941825474, "loss": 2.8097, "theoretical_loss": 3.4931292504529003, "tokens_seen": 1616523264 }, { "epoch": 4.08, "learning_rate": 0.00025770310932798397, "loss": 2.8125, "theoretical_loss": 3.4931169430479905, "tokens_seen": 1616588800 }, { "epoch": 4.08, "learning_rate": 0.0002576930792377131, "loss": 2.8234, "theoretical_loss": 3.4931046362817053, "tokens_seen": 1616654336 }, { "epoch": 4.08, "learning_rate": 0.00025768304914744233, "loss": 2.7805, "theoretical_loss": 3.4930923301539853, "tokens_seen": 1616719872 }, { "epoch": 4.08, "learning_rate": 0.0002576730190571715, "loss": 2.7013, "theoretical_loss": 3.4930800246647715, "tokens_seen": 1616785408 }, { "epoch": 4.08, "learning_rate": 0.0002576629889669007, "loss": 2.703, "theoretical_loss": 3.4930677198140057, "tokens_seen": 1616850944 }, { "epoch": 4.08, "learning_rate": 0.0002576529588766299, "loss": 2.8638, "theoretical_loss": 3.4930554156016282, "tokens_seen": 1616916480 }, { "epoch": 4.08, "learning_rate": 0.0002576429287863591, "loss": 2.7107, "theoretical_loss": 3.49304311202758, "tokens_seen": 1616982016 }, { "epoch": 4.08, "objective/train/docs_used": 2578753, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.852828025817871, "objective/train/theoretical_loss": 3.4930308090918025, "objective/train/tokens_used": 1637507552, "theoretical_loss": 3.4930308090918025, "tokens_seen": 1617047552 }, { "epoch": 4.08, "learning_rate": 0.00025763289869608824, "loss": 2.8666, "theoretical_loss": 3.4930308090918025, "tokens_seen": 1617047552 }, { "epoch": 4.08, "learning_rate": 0.0002576228686058175, "loss": 2.8534, "theoretical_loss": 3.493018506794236, "tokens_seen": 1617113088 }, { "epoch": 4.08, "learning_rate": 0.0002576128385155466, "loss": 2.4889, "theoretical_loss": 3.4930062051348223, "tokens_seen": 1617178624 }, { "epoch": 4.08, "learning_rate": 0.00025760280842527584, "loss": 2.8245, "theoretical_loss": 3.4929939041135016, "tokens_seen": 1617244160 }, { "epoch": 4.08, "learning_rate": 0.000257592778335005, "loss": 2.9281, "theoretical_loss": 3.4929816037302164, "tokens_seen": 1617309696 }, { "epoch": 4.08, "learning_rate": 0.0002575827482447342, "loss": 2.8453, "theoretical_loss": 3.492969303984906, "tokens_seen": 1617375232 }, { "epoch": 4.08, "learning_rate": 0.0002575727181544634, "loss": 2.6738, "theoretical_loss": 3.492957004877513, "tokens_seen": 1617440768 }, { "epoch": 4.08, "learning_rate": 0.00025756268806419256, "loss": 2.7964, "theoretical_loss": 3.492944706407977, "tokens_seen": 1617506304 }, { "epoch": 4.08, "learning_rate": 0.00025755265797392174, "loss": 2.8034, "theoretical_loss": 3.4929324085762405, "tokens_seen": 1617571840 }, { "epoch": 4.08, "learning_rate": 0.000257542627883651, "loss": 2.8315, "theoretical_loss": 3.4929201113822437, "tokens_seen": 1617637376 }, { "epoch": 4.08, "learning_rate": 0.0002575325977933801, "loss": 2.6917, "theoretical_loss": 3.492907814825928, "tokens_seen": 1617702912 }, { "epoch": 4.08, "learning_rate": 0.00025752256770310934, "loss": 2.7282, "theoretical_loss": 3.4928955189072344, "tokens_seen": 1617768448 }, { "epoch": 4.08, "learning_rate": 0.00025751253761283847, "loss": 2.7803, "theoretical_loss": 3.492883223626104, "tokens_seen": 1617833984 }, { "epoch": 4.08, "learning_rate": 0.0002575025075225677, "loss": 2.7062, "theoretical_loss": 3.4928709289824775, "tokens_seen": 1617899520 }, { "epoch": 4.08, "learning_rate": 0.00025749247743229694, "loss": 2.8722, "theoretical_loss": 3.4928586349762973, "tokens_seen": 1617965056 }, { "epoch": 4.08, "learning_rate": 0.00025748244734202606, "loss": 2.6293, "theoretical_loss": 3.4928463416075033, "tokens_seen": 1618030592 }, { "epoch": 4.08, "learning_rate": 0.0002574724172517553, "loss": 2.8876, "theoretical_loss": 3.4928340488760368, "tokens_seen": 1618096128 }, { "epoch": 4.08, "learning_rate": 0.0002574623871614845, "loss": 2.6673, "theoretical_loss": 3.492821756781839, "tokens_seen": 1618161664 }, { "epoch": 4.08, "learning_rate": 0.00025745235707121366, "loss": 2.8324, "theoretical_loss": 3.4928094653248523, "tokens_seen": 1618227200 }, { "epoch": 4.08, "learning_rate": 0.00025744232698094284, "loss": 2.8291, "theoretical_loss": 3.492797174505016, "tokens_seen": 1618292736 }, { "epoch": 4.08, "learning_rate": 0.000257432296890672, "loss": 2.7371, "theoretical_loss": 3.492784884322272, "tokens_seen": 1618358272 }, { "epoch": 4.08, "learning_rate": 0.0002574222668004012, "loss": 2.7954, "theoretical_loss": 3.4927725947765618, "tokens_seen": 1618423808 }, { "epoch": 4.08, "learning_rate": 0.00025741223671013044, "loss": 2.7376, "theoretical_loss": 3.4927603058678267, "tokens_seen": 1618489344 }, { "epoch": 4.08, "learning_rate": 0.00025740220661985957, "loss": 2.75, "theoretical_loss": 3.492748017596007, "tokens_seen": 1618554880 }, { "epoch": 4.08, "learning_rate": 0.0002573921765295888, "loss": 2.8033, "theoretical_loss": 3.492735729961045, "tokens_seen": 1618620416 }, { "epoch": 4.08, "objective/train/docs_used": 2581501, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5162672996520996, "objective/train/theoretical_loss": 3.492723442962881, "objective/train/tokens_used": 1639145952, "theoretical_loss": 3.492723442962881, "tokens_seen": 1618685952 }, { "epoch": 4.08, "learning_rate": 0.00025738214643931793, "loss": 2.7002, "theoretical_loss": 3.492723442962881, "tokens_seen": 1618685952 }, { "epoch": 4.08, "learning_rate": 0.00025737211634904717, "loss": 2.6783, "theoretical_loss": 3.492711156601456, "tokens_seen": 1618751488 }, { "epoch": 4.08, "learning_rate": 0.00025736208625877635, "loss": 2.7945, "theoretical_loss": 3.492698870876713, "tokens_seen": 1618817024 }, { "epoch": 4.08, "learning_rate": 0.00025735205616850553, "loss": 2.654, "theoretical_loss": 3.4926865857885914, "tokens_seen": 1618882560 }, { "epoch": 4.08, "learning_rate": 0.0002573420260782347, "loss": 2.6729, "theoretical_loss": 3.4926743013370327, "tokens_seen": 1618948096 }, { "epoch": 4.08, "learning_rate": 0.00025733199598796394, "loss": 2.6757, "theoretical_loss": 3.492662017521979, "tokens_seen": 1619013632 }, { "epoch": 4.08, "learning_rate": 0.00025732196589769307, "loss": 2.8387, "theoretical_loss": 3.492649734343371, "tokens_seen": 1619079168 }, { "epoch": 4.08, "learning_rate": 0.0002573119358074223, "loss": 2.7987, "theoretical_loss": 3.4926374518011505, "tokens_seen": 1619144704 }, { "epoch": 4.08, "learning_rate": 0.00025730190571715143, "loss": 2.6122, "theoretical_loss": 3.4926251698952577, "tokens_seen": 1619210240 }, { "epoch": 4.08, "learning_rate": 0.00025729187562688067, "loss": 2.7678, "theoretical_loss": 3.4926128886256347, "tokens_seen": 1619275776 }, { "epoch": 4.08, "learning_rate": 0.00025728184553660985, "loss": 2.6116, "theoretical_loss": 3.4926006079922223, "tokens_seen": 1619341312 }, { "epoch": 4.08, "learning_rate": 0.00025727181544633903, "loss": 2.6796, "theoretical_loss": 3.4925883279949623, "tokens_seen": 1619406848 }, { "epoch": 4.08, "learning_rate": 0.0002572617853560682, "loss": 2.8075, "theoretical_loss": 3.4925760486337962, "tokens_seen": 1619472384 }, { "epoch": 4.08, "learning_rate": 0.0002572517552657974, "loss": 2.869, "theoretical_loss": 3.4925637699086645, "tokens_seen": 1619537920 }, { "epoch": 4.08, "learning_rate": 0.0002572417251755266, "loss": 2.7448, "theoretical_loss": 3.492551491819509, "tokens_seen": 1619603456 }, { "epoch": 4.08, "learning_rate": 0.0002572316950852558, "loss": 2.7782, "theoretical_loss": 3.492539214366271, "tokens_seen": 1619668992 }, { "epoch": 4.08, "learning_rate": 0.00025722166499498494, "loss": 2.88, "theoretical_loss": 3.4925269375488917, "tokens_seen": 1619734528 }, { "epoch": 4.08, "learning_rate": 0.00025721163490471417, "loss": 2.7566, "theoretical_loss": 3.4925146613673124, "tokens_seen": 1619800064 }, { "epoch": 4.08, "learning_rate": 0.0002572016048144433, "loss": 2.8095, "theoretical_loss": 3.4925023858214743, "tokens_seen": 1619865600 }, { "epoch": 4.08, "learning_rate": 0.00025719157472417253, "loss": 2.8695, "theoretical_loss": 3.49249011091132, "tokens_seen": 1619931136 }, { "epoch": 4.08, "learning_rate": 0.0002571815446339017, "loss": 2.8496, "theoretical_loss": 3.4924778366367892, "tokens_seen": 1619996672 }, { "epoch": 4.08, "learning_rate": 0.0002571715145436309, "loss": 2.6596, "theoretical_loss": 3.492465562997824, "tokens_seen": 1620062208 }, { "epoch": 4.08, "learning_rate": 0.0002571614844533601, "loss": 2.6486, "theoretical_loss": 3.4924532899943657, "tokens_seen": 1620127744 }, { "epoch": 4.08, "learning_rate": 0.0002571514543630893, "loss": 2.8084, "theoretical_loss": 3.4924410176263563, "tokens_seen": 1620193280 }, { "epoch": 4.08, "learning_rate": 0.00025714142427281844, "loss": 2.8378, "theoretical_loss": 3.4924287458937364, "tokens_seen": 1620258816 }, { "epoch": 4.08, "objective/train/docs_used": 2584455, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.614379405975342, "objective/train/theoretical_loss": 3.492416474796447, "objective/train/tokens_used": 1640784352, "theoretical_loss": 3.492416474796447, "tokens_seen": 1620324352 }, { "epoch": 4.08, "learning_rate": 0.0002571313941825477, "loss": 2.6963, "theoretical_loss": 3.492416474796447, "tokens_seen": 1620324352 }, { "epoch": 4.08, "learning_rate": 0.0002571213640922768, "loss": 2.8252, "theoretical_loss": 3.4924042043344308, "tokens_seen": 1620389888 }, { "epoch": 4.08, "learning_rate": 0.00025711133400200604, "loss": 2.7481, "theoretical_loss": 3.4923919345076286, "tokens_seen": 1620455424 }, { "epoch": 4.08, "learning_rate": 0.0002571013039117352, "loss": 2.6066, "theoretical_loss": 3.4923796653159815, "tokens_seen": 1620520960 }, { "epoch": 4.08, "learning_rate": 0.0002570912738214644, "loss": 2.762, "theoretical_loss": 3.4923673967594313, "tokens_seen": 1620586496 }, { "epoch": 4.08, "learning_rate": 0.0002570812437311936, "loss": 2.7727, "theoretical_loss": 3.4923551288379198, "tokens_seen": 1620652032 }, { "epoch": 4.08, "learning_rate": 0.00025707121364092276, "loss": 2.783, "theoretical_loss": 3.492342861551388, "tokens_seen": 1620717568 }, { "epoch": 4.08, "learning_rate": 0.00025706118355065194, "loss": 2.7734, "theoretical_loss": 3.492330594899777, "tokens_seen": 1620783104 }, { "epoch": 4.08, "learning_rate": 0.0002570511534603812, "loss": 2.8045, "theoretical_loss": 3.4923183288830284, "tokens_seen": 1620848640 }, { "epoch": 4.08, "learning_rate": 0.0002570411233701103, "loss": 2.7798, "theoretical_loss": 3.4923060635010845, "tokens_seen": 1620914176 }, { "epoch": 4.08, "learning_rate": 0.00025703109327983954, "loss": 2.7906, "theoretical_loss": 3.4922937987538862, "tokens_seen": 1620979712 }, { "epoch": 4.08, "learning_rate": 0.00025702106318956867, "loss": 2.8752, "theoretical_loss": 3.492281534641375, "tokens_seen": 1621045248 }, { "epoch": 4.08, "learning_rate": 0.0002570110330992979, "loss": 2.6874, "theoretical_loss": 3.4922692711634924, "tokens_seen": 1621110784 }, { "epoch": 4.08, "learning_rate": 0.0002570010030090271, "loss": 2.6398, "theoretical_loss": 3.49225700832018, "tokens_seen": 1621176320 }, { "epoch": 4.08, "learning_rate": 0.00025699097291875626, "loss": 2.8234, "theoretical_loss": 3.4922447461113792, "tokens_seen": 1621241856 }, { "epoch": 4.08, "learning_rate": 0.00025698094282848545, "loss": 2.7922, "theoretical_loss": 3.4922324845370314, "tokens_seen": 1621307392 }, { "epoch": 4.08, "learning_rate": 0.0002569709127382147, "loss": 2.8182, "theoretical_loss": 3.4922202235970783, "tokens_seen": 1621372928 }, { "epoch": 4.08, "learning_rate": 0.0002569608826479438, "loss": 2.733, "theoretical_loss": 3.4922079632914613, "tokens_seen": 1621438464 }, { "epoch": 4.08, "learning_rate": 0.00025695085255767304, "loss": 2.6544, "theoretical_loss": 3.4921957036201223, "tokens_seen": 1621504000 }, { "epoch": 4.08, "learning_rate": 0.00025694082246740217, "loss": 2.8794, "theoretical_loss": 3.4921834445830027, "tokens_seen": 1621569536 }, { "epoch": 4.08, "learning_rate": 0.0002569307923771314, "loss": 2.7, "theoretical_loss": 3.4921711861800437, "tokens_seen": 1621635072 }, { "epoch": 4.08, "learning_rate": 0.0002569207622868606, "loss": 2.8246, "theoretical_loss": 3.4921589284111874, "tokens_seen": 1621700608 }, { "epoch": 4.08, "learning_rate": 0.00025691073219658977, "loss": 2.5352, "theoretical_loss": 3.492146671276375, "tokens_seen": 1621766144 }, { "epoch": 4.08, "learning_rate": 0.00025690070210631895, "loss": 2.758, "theoretical_loss": 3.492134414775548, "tokens_seen": 1621831680 }, { "epoch": 4.08, "learning_rate": 0.00025689067201604813, "loss": 2.728, "theoretical_loss": 3.492122158908649, "tokens_seen": 1621897216 }, { "epoch": 4.08, "objective/train/docs_used": 2587231, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.728855609893799, "objective/train/theoretical_loss": 3.492109903675618, "objective/train/tokens_used": 1642422752, "theoretical_loss": 3.492109903675618, "tokens_seen": 1621962752 }, { "epoch": 4.08, "learning_rate": 0.0002568806419257773, "loss": 2.583, "theoretical_loss": 3.492109903675618, "tokens_seen": 1621962752 }, { "epoch": 4.08, "learning_rate": 0.00025687061183550655, "loss": 2.7319, "theoretical_loss": 3.4920976490763977, "tokens_seen": 1622028288 }, { "epoch": 4.08, "learning_rate": 0.0002568605817452357, "loss": 2.8633, "theoretical_loss": 3.492085395110929, "tokens_seen": 1622093824 }, { "epoch": 4.08, "learning_rate": 0.0002568505516549649, "loss": 2.913, "theoretical_loss": 3.492073141779155, "tokens_seen": 1622159360 }, { "epoch": 4.08, "learning_rate": 0.00025684052156469404, "loss": 2.734, "theoretical_loss": 3.4920608890810154, "tokens_seen": 1622224896 }, { "epoch": 4.08, "learning_rate": 0.00025683049147442327, "loss": 2.8557, "theoretical_loss": 3.4920486370164525, "tokens_seen": 1622290432 }, { "epoch": 4.08, "learning_rate": 0.00025682046138415245, "loss": 2.718, "theoretical_loss": 3.492036385585409, "tokens_seen": 1622355968 }, { "epoch": 4.08, "learning_rate": 0.00025681043129388163, "loss": 2.7873, "theoretical_loss": 3.492024134787825, "tokens_seen": 1622421504 }, { "epoch": 4.08, "learning_rate": 0.0002568004012036108, "loss": 2.5766, "theoretical_loss": 3.4920118846236434, "tokens_seen": 1622487040 }, { "epoch": 4.08, "learning_rate": 0.00025679037111334005, "loss": 2.8716, "theoretical_loss": 3.491999635092805, "tokens_seen": 1622552576 }, { "epoch": 4.08, "learning_rate": 0.0002567803410230692, "loss": 2.7181, "theoretical_loss": 3.491987386195252, "tokens_seen": 1622618112 }, { "epoch": 4.08, "learning_rate": 0.0002567703109327984, "loss": 2.6875, "theoretical_loss": 3.491975137930926, "tokens_seen": 1622683648 }, { "epoch": 4.08, "learning_rate": 0.00025676028084252754, "loss": 2.8252, "theoretical_loss": 3.491962890299768, "tokens_seen": 1622749184 }, { "epoch": 4.08, "learning_rate": 0.0002567502507522568, "loss": 2.7877, "theoretical_loss": 3.491950643301721, "tokens_seen": 1622814720 }, { "epoch": 4.08, "learning_rate": 0.000256740220661986, "loss": 2.844, "theoretical_loss": 3.4919383969367255, "tokens_seen": 1622880256 }, { "epoch": 4.08, "learning_rate": 0.00025673019057171514, "loss": 2.5773, "theoretical_loss": 3.491926151204724, "tokens_seen": 1622945792 }, { "epoch": 4.08, "learning_rate": 0.00025672016048144437, "loss": 2.6088, "theoretical_loss": 3.4919139061056583, "tokens_seen": 1623011328 }, { "epoch": 4.08, "learning_rate": 0.0002567101303911735, "loss": 2.8573, "theoretical_loss": 3.491901661639469, "tokens_seen": 1623076864 }, { "epoch": 4.08, "learning_rate": 0.00025670010030090273, "loss": 2.8917, "theoretical_loss": 3.491889417806099, "tokens_seen": 1623142400 }, { "epoch": 4.08, "learning_rate": 0.0002566900702106319, "loss": 2.8595, "theoretical_loss": 3.4918771746054897, "tokens_seen": 1623207936 }, { "epoch": 4.08, "learning_rate": 0.0002566800401203611, "loss": 2.8175, "theoretical_loss": 3.4918649320375827, "tokens_seen": 1623273472 }, { "epoch": 4.08, "learning_rate": 0.0002566700100300903, "loss": 2.8745, "theoretical_loss": 3.49185269010232, "tokens_seen": 1623339008 }, { "epoch": 4.08, "learning_rate": 0.0002566599799398195, "loss": 2.7573, "theoretical_loss": 3.491840448799643, "tokens_seen": 1623404544 }, { "epoch": 4.08, "learning_rate": 0.00025664994984954864, "loss": 2.5204, "theoretical_loss": 3.491828208129494, "tokens_seen": 1623470080 }, { "epoch": 4.08, "learning_rate": 0.0002566399197592779, "loss": 2.5307, "theoretical_loss": 3.491815968091814, "tokens_seen": 1623535616 }, { "epoch": 4.08, "objective/train/docs_used": 2588581, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.55642032623291, "objective/train/theoretical_loss": 3.4918037286865458, "objective/train/tokens_used": 1644061152, "theoretical_loss": 3.4918037286865458, "tokens_seen": 1623601152 }, { "epoch": 4.08, "learning_rate": 0.000256629889669007, "loss": 2.7602, "theoretical_loss": 3.4918037286865458, "tokens_seen": 1623601152 }, { "epoch": 4.08, "learning_rate": 0.00025661985957873624, "loss": 2.9181, "theoretical_loss": 3.4917914899136306, "tokens_seen": 1623666688 }, { "epoch": 4.08, "learning_rate": 0.0002566098294884654, "loss": 2.7759, "theoretical_loss": 3.49177925177301, "tokens_seen": 1623732224 }, { "epoch": 4.08, "learning_rate": 0.0002565997993981946, "loss": 2.7004, "theoretical_loss": 3.491767014264626, "tokens_seen": 1623797760 }, { "epoch": 4.08, "learning_rate": 0.0002565897693079238, "loss": 2.8049, "theoretical_loss": 3.4917547773884214, "tokens_seen": 1623863296 }, { "epoch": 4.08, "learning_rate": 0.00025657973921765296, "loss": 2.7236, "theoretical_loss": 3.491742541144337, "tokens_seen": 1623928832 }, { "epoch": 4.08, "learning_rate": 0.00025656970912738214, "loss": 2.687, "theoretical_loss": 3.491730305532314, "tokens_seen": 1623994368 }, { "epoch": 4.08, "learning_rate": 0.0002565596790371114, "loss": 2.7869, "theoretical_loss": 3.491718070552295, "tokens_seen": 1624059904 }, { "epoch": 4.08, "learning_rate": 0.0002565496489468405, "loss": 2.8296, "theoretical_loss": 3.491705836204223, "tokens_seen": 1624125440 }, { "epoch": 4.08, "learning_rate": 0.00025653961885656974, "loss": 2.7225, "theoretical_loss": 3.491693602488038, "tokens_seen": 1624190976 }, { "epoch": 4.08, "learning_rate": 0.00025652958876629887, "loss": 2.7592, "theoretical_loss": 3.4916813694036826, "tokens_seen": 1624256512 }, { "epoch": 4.08, "learning_rate": 0.0002565195586760281, "loss": 2.8465, "theoretical_loss": 3.4916691369510993, "tokens_seen": 1624322048 }, { "epoch": 4.08, "learning_rate": 0.0002565095285857573, "loss": 2.7393, "theoretical_loss": 3.491656905130229, "tokens_seen": 1624387584 }, { "epoch": 4.08, "learning_rate": 0.00025649949849548647, "loss": 2.7109, "theoretical_loss": 3.491644673941014, "tokens_seen": 1624453120 }, { "epoch": 4.08, "learning_rate": 0.00025648946840521565, "loss": 2.8019, "theoretical_loss": 3.491632443383396, "tokens_seen": 1624518656 }, { "epoch": 4.08, "learning_rate": 0.0002564794383149449, "loss": 2.6433, "theoretical_loss": 3.4916202134573173, "tokens_seen": 1624584192 }, { "epoch": 4.08, "learning_rate": 0.000256469408224674, "loss": 2.6772, "theoretical_loss": 3.4916079841627194, "tokens_seen": 1624649728 }, { "epoch": 4.08, "learning_rate": 0.00025645937813440324, "loss": 2.7716, "theoretical_loss": 3.491595755499545, "tokens_seen": 1624715264 }, { "epoch": 4.08, "learning_rate": 0.00025644934804413237, "loss": 2.7846, "theoretical_loss": 3.4915835274677347, "tokens_seen": 1624780800 }, { "epoch": 4.08, "learning_rate": 0.0002564393179538616, "loss": 2.7725, "theoretical_loss": 3.491571300067232, "tokens_seen": 1624846336 }, { "epoch": 4.08, "learning_rate": 0.0002564292878635908, "loss": 2.9007, "theoretical_loss": 3.4915590732979775, "tokens_seen": 1624911872 }, { "epoch": 4.08, "learning_rate": 0.00025641925777331997, "loss": 2.7627, "theoretical_loss": 3.4915468471599143, "tokens_seen": 1624977408 }, { "epoch": 4.08, "learning_rate": 0.00025640922768304915, "loss": 2.924, "theoretical_loss": 3.491534621652983, "tokens_seen": 1625042944 }, { "epoch": 4.08, "learning_rate": 0.00025639919759277833, "loss": 2.6986, "theoretical_loss": 3.4915223967771265, "tokens_seen": 1625108480 }, { "epoch": 4.08, "learning_rate": 0.0002563891675025075, "loss": 2.8094, "theoretical_loss": 3.491510172532287, "tokens_seen": 1625174016 }, { "epoch": 4.08, "objective/train/docs_used": 2591456, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5245280265808105, "objective/train/theoretical_loss": 3.491497948918406, "objective/train/tokens_used": 1645699552, "theoretical_loss": 3.491497948918406, "tokens_seen": 1625239552 }, { "epoch": 4.08, "learning_rate": 0.00025637913741223675, "loss": 2.7632, "theoretical_loss": 3.491497948918406, "tokens_seen": 1625239552 }, { "epoch": 4.08, "learning_rate": 0.0002563691073219659, "loss": 2.8477, "theoretical_loss": 3.4914857259354255, "tokens_seen": 1625305088 }, { "epoch": 4.08, "learning_rate": 0.0002563590772316951, "loss": 2.8688, "theoretical_loss": 3.491473503583287, "tokens_seen": 1625370624 }, { "epoch": 4.08, "learning_rate": 0.00025634904714142424, "loss": 2.8481, "theoretical_loss": 3.491461281861934, "tokens_seen": 1625436160 }, { "epoch": 4.08, "learning_rate": 0.00025633901705115347, "loss": 2.8688, "theoretical_loss": 3.4914490607713073, "tokens_seen": 1625501696 }, { "epoch": 4.08, "learning_rate": 0.00025632898696088265, "loss": 2.7704, "theoretical_loss": 3.491436840311349, "tokens_seen": 1625567232 }, { "epoch": 4.08, "learning_rate": 0.00025631895687061183, "loss": 2.7723, "theoretical_loss": 3.4914246204820016, "tokens_seen": 1625632768 }, { "epoch": 4.08, "learning_rate": 0.000256308926780341, "loss": 2.8062, "theoretical_loss": 3.491412401283207, "tokens_seen": 1625698304 }, { "epoch": 4.08, "learning_rate": 0.00025629889669007025, "loss": 2.784, "theoretical_loss": 3.491400182714907, "tokens_seen": 1625763840 }, { "epoch": 4.08, "learning_rate": 0.0002562888665997994, "loss": 2.9185, "theoretical_loss": 3.4913879647770436, "tokens_seen": 1625829376 }, { "epoch": 4.08, "learning_rate": 0.0002562788365095286, "loss": 2.8234, "theoretical_loss": 3.4913757474695597, "tokens_seen": 1625894912 }, { "epoch": 4.08, "learning_rate": 0.00025626880641925774, "loss": 2.7664, "theoretical_loss": 3.491363530792396, "tokens_seen": 1625960448 }, { "epoch": 4.08, "learning_rate": 0.000256258776328987, "loss": 2.7966, "theoretical_loss": 3.4913513147454958, "tokens_seen": 1626025984 }, { "epoch": 4.08, "learning_rate": 0.00025624874623871616, "loss": 2.7252, "theoretical_loss": 3.4913390993288003, "tokens_seen": 1626091520 }, { "epoch": 4.08, "learning_rate": 0.00025623871614844534, "loss": 2.7626, "theoretical_loss": 3.4913268845422523, "tokens_seen": 1626157056 }, { "epoch": 4.08, "learning_rate": 0.0002562286860581745, "loss": 2.7084, "theoretical_loss": 3.4913146703857936, "tokens_seen": 1626222592 }, { "epoch": 4.08, "learning_rate": 0.0002562186559679037, "loss": 2.9099, "theoretical_loss": 3.491302456859366, "tokens_seen": 1626288128 }, { "epoch": 4.08, "learning_rate": 0.0002562086258776329, "loss": 2.7726, "theoretical_loss": 3.4912902439629123, "tokens_seen": 1626353664 }, { "epoch": 4.08, "learning_rate": 0.0002561985957873621, "loss": 2.6744, "theoretical_loss": 3.491278031696374, "tokens_seen": 1626419200 }, { "epoch": 4.08, "learning_rate": 0.00025618856569709124, "loss": 2.7556, "theoretical_loss": 3.4912658200596933, "tokens_seen": 1626484736 }, { "epoch": 4.08, "learning_rate": 0.0002561785356068205, "loss": 2.7721, "theoretical_loss": 3.4912536090528126, "tokens_seen": 1626550272 }, { "epoch": 4.08, "learning_rate": 0.00025616850551654966, "loss": 2.8652, "theoretical_loss": 3.4912413986756743, "tokens_seen": 1626615808 }, { "epoch": 4.08, "learning_rate": 0.00025615847542627884, "loss": 2.7922, "theoretical_loss": 3.4912291889282194, "tokens_seen": 1626681344 }, { "epoch": 4.08, "learning_rate": 0.000256148445336008, "loss": 2.7333, "theoretical_loss": 3.4912169798103916, "tokens_seen": 1626746880 }, { "epoch": 4.08, "learning_rate": 0.0002561384152457372, "loss": 2.8699, "theoretical_loss": 3.491204771322132, "tokens_seen": 1626812416 }, { "epoch": 4.08, "objective/train/docs_used": 2594092, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6574928760528564, "objective/train/theoretical_loss": 3.4911925634633834, "objective/train/tokens_used": 1647337952, "theoretical_loss": 3.4911925634633834, "tokens_seen": 1626877952 }, { "epoch": 4.08, "learning_rate": 0.0002561283851554664, "loss": 2.6996, "theoretical_loss": 3.4911925634633834, "tokens_seen": 1626877952 }, { "epoch": 4.08, "learning_rate": 0.0002561183550651956, "loss": 2.8962, "theoretical_loss": 3.491180356234087, "tokens_seen": 1626943488 }, { "epoch": 4.08, "learning_rate": 0.00025610832497492475, "loss": 2.6775, "theoretical_loss": 3.491168149634186, "tokens_seen": 1627009024 }, { "epoch": 4.08, "learning_rate": 0.000256098294884654, "loss": 2.7384, "theoretical_loss": 3.491155943663623, "tokens_seen": 1627074560 }, { "epoch": 4.08, "learning_rate": 0.0002560882647943831, "loss": 2.7727, "theoretical_loss": 3.4911437383223385, "tokens_seen": 1627140096 }, { "epoch": 4.08, "learning_rate": 0.00025607823470411234, "loss": 2.7524, "theoretical_loss": 3.491131533610276, "tokens_seen": 1627205632 }, { "epoch": 4.08, "learning_rate": 0.0002560682046138415, "loss": 2.7429, "theoretical_loss": 3.4911193295273772, "tokens_seen": 1627271168 }, { "epoch": 4.08, "learning_rate": 0.0002560581745235707, "loss": 2.6829, "theoretical_loss": 3.491107126073585, "tokens_seen": 1627336704 }, { "epoch": 4.08, "learning_rate": 0.0002560481444332999, "loss": 2.9195, "theoretical_loss": 3.4910949232488413, "tokens_seen": 1627402240 }, { "epoch": 4.08, "learning_rate": 0.00025603811434302907, "loss": 2.9492, "theoretical_loss": 3.491082721053088, "tokens_seen": 1627467776 }, { "epoch": 4.08, "learning_rate": 0.00025602808425275825, "loss": 2.8204, "theoretical_loss": 3.491070519486267, "tokens_seen": 1627533312 }, { "epoch": 4.08, "learning_rate": 0.0002560180541624875, "loss": 2.8528, "theoretical_loss": 3.4910583185483217, "tokens_seen": 1627598848 }, { "epoch": 4.08, "learning_rate": 0.0002560080240722166, "loss": 2.7827, "theoretical_loss": 3.491046118239194, "tokens_seen": 1627664384 }, { "epoch": 4.08, "learning_rate": 0.00025599799398194585, "loss": 2.9325, "theoretical_loss": 3.4910339185588257, "tokens_seen": 1627729920 }, { "epoch": 4.08, "learning_rate": 0.0002559879638916751, "loss": 2.8984, "theoretical_loss": 3.4910217195071596, "tokens_seen": 1627795456 }, { "epoch": 4.08, "learning_rate": 0.0002559779338014042, "loss": 2.6768, "theoretical_loss": 3.491009521084137, "tokens_seen": 1627860992 }, { "epoch": 4.08, "learning_rate": 0.00025596790371113344, "loss": 2.6763, "theoretical_loss": 3.490997323289702, "tokens_seen": 1627926528 }, { "epoch": 4.08, "learning_rate": 0.00025595787362086257, "loss": 2.6759, "theoretical_loss": 3.490985126123795, "tokens_seen": 1627992064 }, { "epoch": 4.08, "learning_rate": 0.0002559478435305918, "loss": 2.6392, "theoretical_loss": 3.4909729295863596, "tokens_seen": 1628057600 }, { "epoch": 4.08, "learning_rate": 0.000255937813440321, "loss": 2.736, "theoretical_loss": 3.490960733677338, "tokens_seen": 1628123136 }, { "epoch": 4.08, "learning_rate": 0.00025592778335005017, "loss": 2.8614, "theoretical_loss": 3.490948538396671, "tokens_seen": 1628188672 }, { "epoch": 4.08, "learning_rate": 0.00025591775325977935, "loss": 2.7254, "theoretical_loss": 3.4909363437443033, "tokens_seen": 1628254208 }, { "epoch": 4.08, "learning_rate": 0.00025590772316950853, "loss": 2.864, "theoretical_loss": 3.4909241497201755, "tokens_seen": 1628319744 }, { "epoch": 4.08, "learning_rate": 0.0002558976930792377, "loss": 2.6649, "theoretical_loss": 3.490911956324231, "tokens_seen": 1628385280 }, { "epoch": 4.08, "learning_rate": 0.00025588766298896695, "loss": 2.8738, "theoretical_loss": 3.4908997635564116, "tokens_seen": 1628450816 }, { "epoch": 4.08, "objective/train/docs_used": 2597038, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.631977081298828, "objective/train/theoretical_loss": 3.4908875714166596, "objective/train/tokens_used": 1648976352, "theoretical_loss": 3.4908875714166596, "tokens_seen": 1628516352 }, { "epoch": 4.08, "learning_rate": 0.0002558776328986961, "loss": 2.7313, "theoretical_loss": 3.4908875714166596, "tokens_seen": 1628516352 }, { "epoch": 4.08, "learning_rate": 0.0002558676028084253, "loss": 2.7707, "theoretical_loss": 3.4908753799049173, "tokens_seen": 1628581888 }, { "epoch": 4.08, "learning_rate": 0.00025585757271815444, "loss": 2.7051, "theoretical_loss": 3.4908631890211277, "tokens_seen": 1628647424 }, { "epoch": 4.08, "learning_rate": 0.00025584754262788367, "loss": 2.82, "theoretical_loss": 3.490850998765233, "tokens_seen": 1628712960 }, { "epoch": 4.08, "learning_rate": 0.00025583751253761285, "loss": 2.9308, "theoretical_loss": 3.490838809137175, "tokens_seen": 1628778496 }, { "epoch": 4.08, "learning_rate": 0.00025582748244734203, "loss": 2.7248, "theoretical_loss": 3.4908266201368967, "tokens_seen": 1628844032 }, { "epoch": 4.08, "learning_rate": 0.0002558174523570712, "loss": 2.7391, "theoretical_loss": 3.490814431764341, "tokens_seen": 1628909568 }, { "epoch": 4.08, "learning_rate": 0.00025580742226680045, "loss": 2.7444, "theoretical_loss": 3.4908022440194486, "tokens_seen": 1628975104 }, { "epoch": 4.08, "learning_rate": 0.0002557973921765296, "loss": 2.7509, "theoretical_loss": 3.4907900569021635, "tokens_seen": 1629040640 }, { "epoch": 4.08, "learning_rate": 0.0002557873620862588, "loss": 2.7574, "theoretical_loss": 3.4907778704124275, "tokens_seen": 1629106176 }, { "epoch": 4.08, "learning_rate": 0.00025577733199598794, "loss": 2.714, "theoretical_loss": 3.4907656845501833, "tokens_seen": 1629171712 }, { "epoch": 4.08, "learning_rate": 0.0002557673019057172, "loss": 2.7932, "theoretical_loss": 3.490753499315373, "tokens_seen": 1629237248 }, { "epoch": 4.08, "learning_rate": 0.00025575727181544636, "loss": 2.9446, "theoretical_loss": 3.490741314707939, "tokens_seen": 1629302784 }, { "epoch": 4.08, "learning_rate": 0.00025574724172517554, "loss": 2.752, "theoretical_loss": 3.4907291307278245, "tokens_seen": 1629368320 }, { "epoch": 4.08, "learning_rate": 0.0002557372116349047, "loss": 2.711, "theoretical_loss": 3.4907169473749713, "tokens_seen": 1629433856 }, { "epoch": 4.08, "learning_rate": 0.0002557271815446339, "loss": 2.7705, "theoretical_loss": 3.4907047646493226, "tokens_seen": 1629499392 }, { "epoch": 4.08, "learning_rate": 0.0002557171514543631, "loss": 2.7571, "theoretical_loss": 3.4906925825508197, "tokens_seen": 1629564928 }, { "epoch": 4.08, "learning_rate": 0.0002557071213640923, "loss": 2.8714, "theoretical_loss": 3.490680401079406, "tokens_seen": 1629630464 }, { "epoch": 4.08, "learning_rate": 0.00025569709127382144, "loss": 2.6761, "theoretical_loss": 3.4906682202350234, "tokens_seen": 1629696000 }, { "epoch": 4.08, "learning_rate": 0.0002556870611835507, "loss": 2.9333, "theoretical_loss": 3.4906560400176154, "tokens_seen": 1629761536 }, { "epoch": 4.08, "learning_rate": 0.00025567703109327986, "loss": 2.7266, "theoretical_loss": 3.4906438604271237, "tokens_seen": 1629827072 }, { "epoch": 4.08, "learning_rate": 0.00025566700100300904, "loss": 2.8719, "theoretical_loss": 3.490631681463491, "tokens_seen": 1629892608 }, { "epoch": 4.08, "learning_rate": 0.0002556569709127382, "loss": 2.8566, "theoretical_loss": 3.49061950312666, "tokens_seen": 1629958144 }, { "epoch": 4.08, "learning_rate": 0.0002556469408224674, "loss": 2.6173, "theoretical_loss": 3.4906073254165726, "tokens_seen": 1630023680 }, { "epoch": 4.08, "learning_rate": 0.0002556369107321966, "loss": 2.7273, "theoretical_loss": 3.4905951483331723, "tokens_seen": 1630089216 }, { "epoch": 4.08, "objective/train/docs_used": 2599785, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6422271728515625, "objective/train/theoretical_loss": 3.490582971876401, "objective/train/tokens_used": 1650614752, "theoretical_loss": 3.490582971876401, "tokens_seen": 1630154752 }, { "epoch": 4.08, "learning_rate": 0.0002556268806419258, "loss": 2.7782, "theoretical_loss": 3.490582971876401, "tokens_seen": 1630154752 }, { "epoch": 4.08, "learning_rate": 0.00025561685055165495, "loss": 2.7847, "theoretical_loss": 3.4905707960462014, "tokens_seen": 1630220288 }, { "epoch": 4.08, "learning_rate": 0.0002556068204613842, "loss": 2.83, "theoretical_loss": 3.490558620842516, "tokens_seen": 1630285824 }, { "epoch": 4.08, "learning_rate": 0.0002555967903711133, "loss": 2.9184, "theoretical_loss": 3.490546446265288, "tokens_seen": 1630351360 }, { "epoch": 4.08, "learning_rate": 0.00025558676028084254, "loss": 2.821, "theoretical_loss": 3.4905342723144597, "tokens_seen": 1630416896 }, { "epoch": 4.08, "learning_rate": 0.0002555767301905717, "loss": 2.8293, "theoretical_loss": 3.490522098989973, "tokens_seen": 1630482432 }, { "epoch": 4.08, "learning_rate": 0.0002555667001003009, "loss": 2.8804, "theoretical_loss": 3.4905099262917707, "tokens_seen": 1630547968 }, { "epoch": 4.08, "learning_rate": 0.0002555566700100301, "loss": 2.8685, "theoretical_loss": 3.4904977542197964, "tokens_seen": 1630613504 }, { "epoch": 4.08, "learning_rate": 0.00025554663991975927, "loss": 2.7546, "theoretical_loss": 3.490485582773992, "tokens_seen": 1630679040 }, { "epoch": 4.08, "learning_rate": 0.00025553660982948845, "loss": 2.9534, "theoretical_loss": 3.4904734119542997, "tokens_seen": 1630744576 }, { "epoch": 4.08, "learning_rate": 0.0002555265797392177, "loss": 2.8133, "theoretical_loss": 3.490461241760663, "tokens_seen": 1630810112 }, { "epoch": 4.08, "learning_rate": 0.0002555165496489468, "loss": 2.7592, "theoretical_loss": 3.490449072193024, "tokens_seen": 1630875648 }, { "epoch": 4.08, "learning_rate": 0.00025550651955867605, "loss": 2.7576, "theoretical_loss": 3.490436903251325, "tokens_seen": 1630941184 }, { "epoch": 4.08, "learning_rate": 0.00025549648946840523, "loss": 2.8296, "theoretical_loss": 3.4904247349355098, "tokens_seen": 1631006720 }, { "epoch": 4.08, "learning_rate": 0.0002554864593781344, "loss": 2.9522, "theoretical_loss": 3.49041256724552, "tokens_seen": 1631072256 }, { "epoch": 4.08, "learning_rate": 0.0002554764292878636, "loss": 2.7469, "theoretical_loss": 3.490400400181299, "tokens_seen": 1631137792 }, { "epoch": 4.08, "learning_rate": 0.00025546639919759277, "loss": 2.7294, "theoretical_loss": 3.490388233742789, "tokens_seen": 1631203328 }, { "epoch": 4.08, "learning_rate": 0.00025545636910732195, "loss": 2.751, "theoretical_loss": 3.4903760679299323, "tokens_seen": 1631268864 }, { "epoch": 4.08, "learning_rate": 0.0002554463390170512, "loss": 2.7825, "theoretical_loss": 3.4903639027426725, "tokens_seen": 1631334400 }, { "epoch": 4.08, "learning_rate": 0.0002554363089267803, "loss": 2.8315, "theoretical_loss": 3.4903517381809523, "tokens_seen": 1631399936 }, { "epoch": 4.08, "learning_rate": 0.00025542627883650955, "loss": 2.8527, "theoretical_loss": 3.4903395742447136, "tokens_seen": 1631465472 }, { "epoch": 4.08, "learning_rate": 0.0002554162487462387, "loss": 2.8034, "theoretical_loss": 3.490327410933899, "tokens_seen": 1631531008 }, { "epoch": 4.08, "learning_rate": 0.0002554062186559679, "loss": 2.6907, "theoretical_loss": 3.4903152482484527, "tokens_seen": 1631596544 }, { "epoch": 4.08, "learning_rate": 0.0002553961885656971, "loss": 2.7987, "theoretical_loss": 3.490303086188316, "tokens_seen": 1631662080 }, { "epoch": 4.08, "learning_rate": 0.0002553861584754263, "loss": 2.7471, "theoretical_loss": 3.4902909247534324, "tokens_seen": 1631727616 }, { "epoch": 4.08, "objective/train/docs_used": 2602598, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5774004459381104, "objective/train/theoretical_loss": 3.4902787639437443, "objective/train/tokens_used": 1652253152, "theoretical_loss": 3.4902787639437443, "tokens_seen": 1631793152 }, { "epoch": 4.08, "learning_rate": 0.00025537612838515546, "loss": 2.7681, "theoretical_loss": 3.4902787639437443, "tokens_seen": 1631793152 }, { "epoch": 4.08, "learning_rate": 0.00025536609829488464, "loss": 2.881, "theoretical_loss": 3.4902666037591947, "tokens_seen": 1631858688 }, { "epoch": 4.08, "learning_rate": 0.0002553560682046138, "loss": 2.6849, "theoretical_loss": 3.4902544441997256, "tokens_seen": 1631924224 }, { "epoch": 4.08, "learning_rate": 0.00025534603811434305, "loss": 2.6216, "theoretical_loss": 3.490242285265281, "tokens_seen": 1631989760 }, { "epoch": 4.08, "learning_rate": 0.0002553360080240722, "loss": 2.7678, "theoretical_loss": 3.4902301269558027, "tokens_seen": 1632055296 }, { "epoch": 4.08, "learning_rate": 0.0002553259779338014, "loss": 2.8641, "theoretical_loss": 3.4902179692712334, "tokens_seen": 1632120832 }, { "epoch": 4.08, "learning_rate": 0.0002553159478435306, "loss": 2.73, "theoretical_loss": 3.490205812211517, "tokens_seen": 1632186368 }, { "epoch": 4.08, "learning_rate": 0.0002553059177532598, "loss": 2.8654, "theoretical_loss": 3.4901936557765953, "tokens_seen": 1632251904 }, { "epoch": 4.08, "learning_rate": 0.00025529588766298896, "loss": 2.7857, "theoretical_loss": 3.490181499966411, "tokens_seen": 1632317440 }, { "epoch": 4.08, "learning_rate": 0.00025528585757271814, "loss": 2.7662, "theoretical_loss": 3.4901693447809077, "tokens_seen": 1632382976 }, { "epoch": 4.08, "learning_rate": 0.0002552758274824473, "loss": 2.7415, "theoretical_loss": 3.490157190220028, "tokens_seen": 1632448512 }, { "epoch": 4.08, "learning_rate": 0.00025526579739217656, "loss": 2.6557, "theoretical_loss": 3.4901450362837148, "tokens_seen": 1632514048 }, { "epoch": 4.08, "learning_rate": 0.0002552557673019057, "loss": 2.7919, "theoretical_loss": 3.49013288297191, "tokens_seen": 1632579584 }, { "epoch": 4.08, "learning_rate": 0.0002552457372116349, "loss": 2.7853, "theoretical_loss": 3.4901207302845574, "tokens_seen": 1632645120 }, { "epoch": 4.08, "learning_rate": 0.0002552357071213641, "loss": 2.9201, "theoretical_loss": 3.4901085782215997, "tokens_seen": 1632710656 }, { "epoch": 4.08, "learning_rate": 0.0002552256770310933, "loss": 2.8904, "theoretical_loss": 3.49009642678298, "tokens_seen": 1632776192 }, { "epoch": 4.08, "learning_rate": 0.0002552156469408225, "loss": 2.6984, "theoretical_loss": 3.4900842759686403, "tokens_seen": 1632841728 }, { "epoch": 4.08, "learning_rate": 0.00025520561685055164, "loss": 2.7684, "theoretical_loss": 3.490072125778524, "tokens_seen": 1632907264 }, { "epoch": 4.08, "learning_rate": 0.0002551955867602809, "loss": 2.8347, "theoretical_loss": 3.4900599762125744, "tokens_seen": 1632972800 }, { "epoch": 4.08, "learning_rate": 0.00025518555667001006, "loss": 2.8758, "theoretical_loss": 3.4900478272707334, "tokens_seen": 1633038336 }, { "epoch": 4.08, "learning_rate": 0.00025517552657973924, "loss": 2.8064, "theoretical_loss": 3.490035678952945, "tokens_seen": 1633103872 }, { "epoch": 4.08, "learning_rate": 0.0002551654964894684, "loss": 2.9542, "theoretical_loss": 3.490023531259151, "tokens_seen": 1633169408 }, { "epoch": 4.08, "learning_rate": 0.0002551554663991976, "loss": 2.8289, "theoretical_loss": 3.4900113841892955, "tokens_seen": 1633234944 }, { "epoch": 4.08, "learning_rate": 0.0002551454363089268, "loss": 2.8509, "theoretical_loss": 3.48999923774332, "tokens_seen": 1633300480 }, { "epoch": 4.08, "learning_rate": 0.000255135406218656, "loss": 2.8115, "theoretical_loss": 3.489987091921169, "tokens_seen": 1633366016 }, { "epoch": 4.08, "objective/train/docs_used": 2605386, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.9150686264038086, "objective/train/theoretical_loss": 3.4899749467227843, "objective/train/tokens_used": 1653891552, "theoretical_loss": 3.4899749467227843, "tokens_seen": 1633431552 }, { "epoch": 4.08, "learning_rate": 0.00025512537612838515, "loss": 2.8597, "theoretical_loss": 3.4899749467227843, "tokens_seen": 1633431552 }, { "epoch": 4.08, "learning_rate": 0.0002551153460381144, "loss": 2.7524, "theoretical_loss": 3.489962802148109, "tokens_seen": 1633497088 }, { "epoch": 4.08, "learning_rate": 0.0002551053159478435, "loss": 2.5582, "theoretical_loss": 3.4899506581970865, "tokens_seen": 1633562624 }, { "epoch": 4.08, "learning_rate": 0.00025509528585757274, "loss": 2.8556, "theoretical_loss": 3.4899385148696593, "tokens_seen": 1633628160 }, { "epoch": 4.08, "learning_rate": 0.0002550852557673019, "loss": 2.7122, "theoretical_loss": 3.4899263721657707, "tokens_seen": 1633693696 }, { "epoch": 4.08, "learning_rate": 0.0002550752256770311, "loss": 2.758, "theoretical_loss": 3.489914230085364, "tokens_seen": 1633759232 }, { "epoch": 4.08, "learning_rate": 0.0002550651955867603, "loss": 2.7558, "theoretical_loss": 3.489902088628381, "tokens_seen": 1633824768 }, { "epoch": 4.08, "learning_rate": 0.00025505516549648947, "loss": 2.909, "theoretical_loss": 3.489889947794766, "tokens_seen": 1633890304 }, { "epoch": 4.08, "learning_rate": 0.00025504513540621865, "loss": 2.6206, "theoretical_loss": 3.4898778075844605, "tokens_seen": 1633955840 }, { "epoch": 4.08, "learning_rate": 0.0002550351053159479, "loss": 2.7304, "theoretical_loss": 3.4898656679974094, "tokens_seen": 1634021376 }, { "epoch": 4.08, "learning_rate": 0.000255025075225677, "loss": 2.8142, "theoretical_loss": 3.489853529033554, "tokens_seen": 1634086912 }, { "epoch": 4.08, "learning_rate": 0.00025501504513540625, "loss": 2.8304, "theoretical_loss": 3.4898413906928383, "tokens_seen": 1634152448 }, { "epoch": 4.08, "learning_rate": 0.00025500501504513543, "loss": 2.6079, "theoretical_loss": 3.4898292529752046, "tokens_seen": 1634217984 }, { "epoch": 4.08, "learning_rate": 0.0002549949849548646, "loss": 2.6459, "theoretical_loss": 3.489817115880597, "tokens_seen": 1634283520 }, { "epoch": 4.08, "learning_rate": 0.0002549849548645938, "loss": 2.7579, "theoretical_loss": 3.489804979408958, "tokens_seen": 1634349056 }, { "epoch": 4.08, "learning_rate": 0.00025497492477432297, "loss": 2.7196, "theoretical_loss": 3.4897928435602297, "tokens_seen": 1634414592 }, { "epoch": 4.08, "learning_rate": 0.00025496489468405215, "loss": 2.7006, "theoretical_loss": 3.489780708334356, "tokens_seen": 1634480128 }, { "epoch": 4.08, "learning_rate": 0.0002549548645937814, "loss": 2.7967, "theoretical_loss": 3.489768573731281, "tokens_seen": 1634545664 }, { "epoch": 4.08, "learning_rate": 0.0002549448345035105, "loss": 2.7009, "theoretical_loss": 3.4897564397509457, "tokens_seen": 1634611200 }, { "epoch": 4.08, "learning_rate": 0.00025493480441323975, "loss": 2.7105, "theoretical_loss": 3.4897443063932947, "tokens_seen": 1634676736 }, { "epoch": 4.08, "learning_rate": 0.0002549247743229689, "loss": 2.7602, "theoretical_loss": 3.489732173658271, "tokens_seen": 1634742272 }, { "epoch": 4.08, "learning_rate": 0.0002549147442326981, "loss": 2.8815, "theoretical_loss": 3.4897200415458167, "tokens_seen": 1634807808 }, { "epoch": 4.08, "learning_rate": 0.0002549047141424273, "loss": 2.8483, "theoretical_loss": 3.4897079100558757, "tokens_seen": 1634873344 }, { "epoch": 4.08, "learning_rate": 0.0002548946840521565, "loss": 2.5705, "theoretical_loss": 3.489695779188391, "tokens_seen": 1634938880 }, { "epoch": 4.08, "learning_rate": 0.00025488465396188566, "loss": 2.9359, "theoretical_loss": 3.489683648943305, "tokens_seen": 1635004416 }, { "epoch": 4.08, "objective/train/docs_used": 2607959, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.931088924407959, "objective/train/theoretical_loss": 3.489671519320562, "objective/train/tokens_used": 1655529952, "theoretical_loss": 3.489671519320562, "tokens_seen": 1635069952 }, { "epoch": 4.08, "learning_rate": 0.00025487462387161484, "loss": 2.759, "theoretical_loss": 3.489671519320562, "tokens_seen": 1635069952 }, { "epoch": 4.08, "learning_rate": 0.000254864593781344, "loss": 2.7083, "theoretical_loss": 3.4896593903201047, "tokens_seen": 1635135488 }, { "epoch": 4.08, "learning_rate": 0.00025485456369107325, "loss": 2.5294, "theoretical_loss": 3.4896472619418755, "tokens_seen": 1635201024 }, { "epoch": 4.08, "learning_rate": 0.0002548445336008024, "loss": 2.7863, "theoretical_loss": 3.4896351341858187, "tokens_seen": 1635266560 }, { "epoch": 4.08, "learning_rate": 0.0002548345035105316, "loss": 2.814, "theoretical_loss": 3.4896230070518763, "tokens_seen": 1635332096 }, { "epoch": 4.08, "learning_rate": 0.0002548244734202608, "loss": 2.6755, "theoretical_loss": 3.4896108805399924, "tokens_seen": 1635397632 }, { "epoch": 4.08, "learning_rate": 0.00025481444332999, "loss": 2.8913, "theoretical_loss": 3.4895987546501095, "tokens_seen": 1635463168 }, { "epoch": 4.08, "learning_rate": 0.00025480441323971916, "loss": 2.7587, "theoretical_loss": 3.4895866293821713, "tokens_seen": 1635528704 }, { "epoch": 4.08, "learning_rate": 0.00025479438314944834, "loss": 2.7157, "theoretical_loss": 3.4895745047361206, "tokens_seen": 1635594240 }, { "epoch": 4.08, "learning_rate": 0.0002547843530591775, "loss": 2.7511, "theoretical_loss": 3.4895623807119014, "tokens_seen": 1635659776 }, { "epoch": 4.08, "learning_rate": 0.00025477432296890676, "loss": 2.6703, "theoretical_loss": 3.4895502573094554, "tokens_seen": 1635725312 }, { "epoch": 4.08, "learning_rate": 0.0002547642928786359, "loss": 2.8594, "theoretical_loss": 3.4895381345287273, "tokens_seen": 1635790848 }, { "epoch": 4.08, "learning_rate": 0.0002547542627883651, "loss": 2.8338, "theoretical_loss": 3.489526012369659, "tokens_seen": 1635856384 }, { "epoch": 4.08, "learning_rate": 0.00025474423269809425, "loss": 2.809, "theoretical_loss": 3.489513890832195, "tokens_seen": 1635921920 }, { "epoch": 4.08, "learning_rate": 0.0002547342026078235, "loss": 2.7915, "theoretical_loss": 3.489501769916277, "tokens_seen": 1635987456 }, { "epoch": 4.08, "learning_rate": 0.00025472417251755266, "loss": 2.8564, "theoretical_loss": 3.4894896496218504, "tokens_seen": 1636052992 }, { "epoch": 4.08, "learning_rate": 0.00025471414242728184, "loss": 2.8271, "theoretical_loss": 3.4894775299488563, "tokens_seen": 1636118528 }, { "epoch": 4.08, "learning_rate": 0.000254704112337011, "loss": 2.7527, "theoretical_loss": 3.489465410897239, "tokens_seen": 1636184064 }, { "epoch": 4.08, "learning_rate": 0.00025469408224674026, "loss": 2.8084, "theoretical_loss": 3.4894532924669415, "tokens_seen": 1636249600 }, { "epoch": 4.08, "learning_rate": 0.0002546840521564694, "loss": 2.7927, "theoretical_loss": 3.489441174657907, "tokens_seen": 1636315136 }, { "epoch": 4.08, "learning_rate": 0.0002546740220661986, "loss": 2.7232, "theoretical_loss": 3.4894290574700797, "tokens_seen": 1636380672 }, { "epoch": 4.08, "learning_rate": 0.00025466399197592775, "loss": 2.7516, "theoretical_loss": 3.4894169409034013, "tokens_seen": 1636446208 }, { "epoch": 4.08, "learning_rate": 0.000254653961885657, "loss": 2.7562, "theoretical_loss": 3.4894048249578162, "tokens_seen": 1636511744 }, { "epoch": 4.08, "learning_rate": 0.00025464393179538616, "loss": 2.8204, "theoretical_loss": 3.4893927096332673, "tokens_seen": 1636577280 }, { "epoch": 4.08, "learning_rate": 0.00025463390170511535, "loss": 2.8689, "theoretical_loss": 3.4893805949296977, "tokens_seen": 1636642816 }, { "epoch": 4.08, "objective/train/docs_used": 2609380, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6114394664764404, "objective/train/theoretical_loss": 3.489368480847051, "objective/train/tokens_used": 1657168352, "theoretical_loss": 3.489368480847051, "tokens_seen": 1636708352 }, { "epoch": 4.08, "learning_rate": 0.00025462387161484453, "loss": 2.6029, "theoretical_loss": 3.489368480847051, "tokens_seen": 1636708352 }, { "epoch": 4.08, "learning_rate": 0.0002546138415245737, "loss": 2.9326, "theoretical_loss": 3.4893563673852706, "tokens_seen": 1636773888 }, { "epoch": 4.08, "learning_rate": 0.0002546038114343029, "loss": 2.8709, "theoretical_loss": 3.4893442545443003, "tokens_seen": 1636839424 }, { "epoch": 4.08, "learning_rate": 0.0002545937813440321, "loss": 2.7093, "theoretical_loss": 3.489332142324082, "tokens_seen": 1636904960 }, { "epoch": 4.08, "learning_rate": 0.00025458375125376125, "loss": 2.7719, "theoretical_loss": 3.48932003072456, "tokens_seen": 1636970496 }, { "epoch": 4.08, "learning_rate": 0.0002545737211634905, "loss": 2.8526, "theoretical_loss": 3.489307919745678, "tokens_seen": 1637036032 }, { "epoch": 4.08, "learning_rate": 0.0002545636910732196, "loss": 2.7806, "theoretical_loss": 3.4892958093873783, "tokens_seen": 1637101568 }, { "epoch": 4.08, "learning_rate": 0.00025455366098294885, "loss": 2.7215, "theoretical_loss": 3.4892836996496053, "tokens_seen": 1637167104 }, { "epoch": 4.08, "learning_rate": 0.00025454363089267803, "loss": 2.5637, "theoretical_loss": 3.4892715905323017, "tokens_seen": 1637232640 }, { "epoch": 4.08, "learning_rate": 0.0002545336008024072, "loss": 2.642, "theoretical_loss": 3.489259482035411, "tokens_seen": 1637298176 }, { "epoch": 4.08, "learning_rate": 0.0002545235707121364, "loss": 2.7624, "theoretical_loss": 3.4892473741588765, "tokens_seen": 1637363712 }, { "epoch": 4.08, "learning_rate": 0.00025451354062186563, "loss": 2.5993, "theoretical_loss": 3.489235266902642, "tokens_seen": 1637429248 }, { "epoch": 4.08, "learning_rate": 0.0002545035105315948, "loss": 2.4653, "theoretical_loss": 3.4892231602666506, "tokens_seen": 1637494784 }, { "epoch": 4.08, "learning_rate": 0.000254493480441324, "loss": 2.6945, "theoretical_loss": 3.4892110542508457, "tokens_seen": 1637560320 }, { "epoch": 4.08, "learning_rate": 0.00025448345035105317, "loss": 2.8099, "theoretical_loss": 3.4891989488551705, "tokens_seen": 1637625856 }, { "epoch": 4.08, "learning_rate": 0.00025447342026078235, "loss": 2.8332, "theoretical_loss": 3.489186844079569, "tokens_seen": 1637691392 }, { "epoch": 4.08, "learning_rate": 0.0002544633901705116, "loss": 2.6077, "theoretical_loss": 3.4891747399239845, "tokens_seen": 1637756928 }, { "epoch": 4.08, "learning_rate": 0.0002544533600802407, "loss": 2.8262, "theoretical_loss": 3.4891626363883597, "tokens_seen": 1637822464 }, { "epoch": 4.08, "learning_rate": 0.00025444332998996995, "loss": 2.7921, "theoretical_loss": 3.489150533472639, "tokens_seen": 1637888000 }, { "epoch": 4.08, "learning_rate": 0.0002544332998996991, "loss": 2.7302, "theoretical_loss": 3.489138431176765, "tokens_seen": 1637953536 }, { "epoch": 4.08, "learning_rate": 0.0002544232698094283, "loss": 2.8422, "theoretical_loss": 3.4891263295006816, "tokens_seen": 1638019072 }, { "epoch": 4.08, "learning_rate": 0.0002544132397191575, "loss": 2.7942, "theoretical_loss": 3.4891142284443326, "tokens_seen": 1638084608 }, { "epoch": 4.08, "learning_rate": 0.0002544032096288867, "loss": 2.637, "theoretical_loss": 3.489102128007661, "tokens_seen": 1638150144 }, { "epoch": 4.08, "learning_rate": 0.00025439317953861586, "loss": 2.7439, "theoretical_loss": 3.4890900281906103, "tokens_seen": 1638215680 }, { "epoch": 4.08, "learning_rate": 0.00025438314944834504, "loss": 2.5978, "theoretical_loss": 3.489077928993124, "tokens_seen": 1638281216 }, { "debugging/Self-BLEU-5": 0.5697579522792299, "debugging/distinct-1-grams": 0.749053450330667, "debugging/distinct-2-grams": 0.937513441131761, "debugging/entropy-1-grams": 6.209534441389631, "debugging/entropy-2-grams": 7.305798640093626, "debugging/length": 533.1818181818181, "debugging/num_segments": 22, "epoch": 4.08, "objective/train/docs_used": 2612137, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7098546028137207, "objective/train/theoretical_loss": 3.4890658304151456, "objective/train/tokens_used": 1658806752, "theoretical_loss": 3.4890658304151456, "tokens_seen": 1638346752 }, { "epoch": 4.08, "learning_rate": 0.0002543731193580742, "loss": 2.5966, "theoretical_loss": 3.4890658304151456, "tokens_seen": 1638346752 }, { "epoch": 4.08, "learning_rate": 0.00025436308926780345, "loss": 2.791, "theoretical_loss": 3.489053732456619, "tokens_seen": 1638412288 }, { "epoch": 4.08, "learning_rate": 0.0002543530591775326, "loss": 2.6664, "theoretical_loss": 3.4890416351174873, "tokens_seen": 1638477824 }, { "epoch": 4.08, "learning_rate": 0.0002543430290872618, "loss": 2.7367, "theoretical_loss": 3.489029538397694, "tokens_seen": 1638543360 }, { "epoch": 4.08, "learning_rate": 0.000254332998996991, "loss": 2.6642, "theoretical_loss": 3.489017442297183, "tokens_seen": 1638608896 }, { "epoch": 4.08, "learning_rate": 0.0002543229689067202, "loss": 2.9218, "theoretical_loss": 3.489005346815897, "tokens_seen": 1638674432 }, { "epoch": 4.08, "learning_rate": 0.00025431293881644936, "loss": 2.8442, "theoretical_loss": 3.4889932519537803, "tokens_seen": 1638739968 }, { "epoch": 4.08, "learning_rate": 0.00025430290872617854, "loss": 2.9407, "theoretical_loss": 3.4889811577107763, "tokens_seen": 1638805504 }, { "epoch": 4.08, "learning_rate": 0.0002542928786359077, "loss": 2.7257, "theoretical_loss": 3.4889690640868283, "tokens_seen": 1638871040 }, { "epoch": 4.08, "learning_rate": 0.00025428284854563696, "loss": 2.8034, "theoretical_loss": 3.4889569710818806, "tokens_seen": 1638936576 }, { "epoch": 4.08, "learning_rate": 0.0002542728184553661, "loss": 2.7198, "theoretical_loss": 3.4889448786958757, "tokens_seen": 1639002112 }, { "epoch": 4.08, "learning_rate": 0.0002542627883650953, "loss": 2.8815, "theoretical_loss": 3.488932786928758, "tokens_seen": 1639067648 }, { "epoch": 4.08, "learning_rate": 0.00025425275827482445, "loss": 2.8355, "theoretical_loss": 3.48892069578047, "tokens_seen": 1639133184 }, { "epoch": 4.08, "learning_rate": 0.0002542427281845537, "loss": 2.8571, "theoretical_loss": 3.4889086052509564, "tokens_seen": 1639198720 }, { "epoch": 4.08, "learning_rate": 0.00025423269809428286, "loss": 2.6203, "theoretical_loss": 3.488896515340161, "tokens_seen": 1639264256 }, { "epoch": 4.08, "learning_rate": 0.00025422266800401204, "loss": 2.6399, "theoretical_loss": 3.4888844260480267, "tokens_seen": 1639329792 }, { "epoch": 4.08, "learning_rate": 0.0002542126379137412, "loss": 2.7011, "theoretical_loss": 3.488872337374497, "tokens_seen": 1639395328 }, { "epoch": 4.08, "learning_rate": 0.00025420260782347046, "loss": 2.6896, "theoretical_loss": 3.4888602493195155, "tokens_seen": 1639460864 }, { "epoch": 4.08, "learning_rate": 0.0002541925777331996, "loss": 2.7319, "theoretical_loss": 3.4888481618830265, "tokens_seen": 1639526400 }, { "epoch": 4.08, "learning_rate": 0.0002541825476429288, "loss": 2.8166, "theoretical_loss": 3.488836075064973, "tokens_seen": 1639591936 }, { "epoch": 4.08, "learning_rate": 0.00025417251755265795, "loss": 2.6624, "theoretical_loss": 3.488823988865299, "tokens_seen": 1639657472 }, { "epoch": 4.08, "learning_rate": 0.0002541624874623872, "loss": 2.7508, "theoretical_loss": 3.488811903283948, "tokens_seen": 1639723008 }, { "epoch": 4.08, "learning_rate": 0.00025415245737211637, "loss": 2.7671, "theoretical_loss": 3.4887998183208637, "tokens_seen": 1639788544 }, { "epoch": 4.08, "learning_rate": 0.00025414242728184555, "loss": 2.8818, "theoretical_loss": 3.4887877339759905, "tokens_seen": 1639854080 }, { "epoch": 4.08, "learning_rate": 0.00025413239719157473, "loss": 2.8359, "theoretical_loss": 3.4887756502492704, "tokens_seen": 1639919616 }, { "epoch": 4.08, "objective/train/docs_used": 2614983, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.684725284576416, "objective/train/theoretical_loss": 3.4887635671406483, "objective/train/tokens_used": 1660445152, "theoretical_loss": 3.4887635671406483, "tokens_seen": 1639985152 }, { "epoch": 4.08, "learning_rate": 0.0002541223671013039, "loss": 2.7433, "theoretical_loss": 3.4887635671406483, "tokens_seen": 1639985152 }, { "epoch": 4.08, "learning_rate": 0.0002541123370110331, "loss": 2.8891, "theoretical_loss": 3.4887514846500673, "tokens_seen": 1640050688 }, { "epoch": 4.08, "learning_rate": 0.0002541023069207623, "loss": 2.8569, "theoretical_loss": 3.4887394027774716, "tokens_seen": 1640116224 }, { "epoch": 4.08, "learning_rate": 0.00025409227683049145, "loss": 2.8096, "theoretical_loss": 3.4887273215228047, "tokens_seen": 1640181760 }, { "epoch": 4.08, "learning_rate": 0.0002540822467402207, "loss": 2.8169, "theoretical_loss": 3.4887152408860103, "tokens_seen": 1640247296 }, { "epoch": 4.08, "learning_rate": 0.0002540722166499498, "loss": 2.6007, "theoretical_loss": 3.4887031608670322, "tokens_seen": 1640312832 }, { "epoch": 4.08, "learning_rate": 0.00025406218655967905, "loss": 2.7806, "theoretical_loss": 3.488691081465814, "tokens_seen": 1640378368 }, { "epoch": 4.08, "learning_rate": 0.00025405215646940823, "loss": 2.8171, "theoretical_loss": 3.488679002682299, "tokens_seen": 1640443904 }, { "epoch": 4.08, "learning_rate": 0.0002540421263791374, "loss": 2.7395, "theoretical_loss": 3.4886669245164317, "tokens_seen": 1640509440 }, { "epoch": 4.08, "learning_rate": 0.0002540320962888666, "loss": 2.7607, "theoretical_loss": 3.4886548469681555, "tokens_seen": 1640574976 }, { "epoch": 4.08, "learning_rate": 0.00025402206619859583, "loss": 2.754, "theoretical_loss": 3.4886427700374147, "tokens_seen": 1640640512 }, { "epoch": 4.08, "learning_rate": 0.00025401203610832495, "loss": 2.8641, "theoretical_loss": 3.488630693724152, "tokens_seen": 1640706048 }, { "epoch": 4.08, "learning_rate": 0.0002540020060180542, "loss": 2.7735, "theoretical_loss": 3.488618618028312, "tokens_seen": 1640771584 }, { "epoch": 4.08, "learning_rate": 0.0002539919759277833, "loss": 2.8378, "theoretical_loss": 3.488606542949838, "tokens_seen": 1640837120 }, { "epoch": 4.08, "learning_rate": 0.00025398194583751255, "loss": 2.7735, "theoretical_loss": 3.4885944684886736, "tokens_seen": 1640902656 }, { "epoch": 4.08, "learning_rate": 0.00025397191574724173, "loss": 2.8135, "theoretical_loss": 3.4885823946447636, "tokens_seen": 1640968192 }, { "epoch": 4.08, "learning_rate": 0.0002539618856569709, "loss": 2.9005, "theoretical_loss": 3.4885703214180506, "tokens_seen": 1641033728 }, { "epoch": 4.08, "learning_rate": 0.0002539518555667001, "loss": 2.8534, "theoretical_loss": 3.488558248808479, "tokens_seen": 1641099264 }, { "epoch": 4.08, "learning_rate": 0.0002539418254764293, "loss": 2.9124, "theoretical_loss": 3.4885461768159933, "tokens_seen": 1641164800 }, { "epoch": 4.08, "learning_rate": 0.00025393179538615846, "loss": 2.7318, "theoretical_loss": 3.488534105440536, "tokens_seen": 1641230336 }, { "epoch": 4.08, "learning_rate": 0.0002539217652958877, "loss": 2.7012, "theoretical_loss": 3.4885220346820516, "tokens_seen": 1641295872 }, { "epoch": 4.08, "learning_rate": 0.0002539117352056168, "loss": 2.7833, "theoretical_loss": 3.4885099645404836, "tokens_seen": 1641361408 }, { "epoch": 4.08, "learning_rate": 0.00025390170511534606, "loss": 2.7399, "theoretical_loss": 3.488497895015777, "tokens_seen": 1641426944 }, { "epoch": 4.08, "learning_rate": 0.0002538916750250752, "loss": 2.8223, "theoretical_loss": 3.4884858261078735, "tokens_seen": 1641492480 }, { "epoch": 4.08, "learning_rate": 0.0002538816449348044, "loss": 2.849, "theoretical_loss": 3.4884737578167186, "tokens_seen": 1641558016 }, { "epoch": 4.08, "objective/train/docs_used": 2617888, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.78139591217041, "objective/train/theoretical_loss": 3.488461690142256, "objective/train/tokens_used": 1662083552, "theoretical_loss": 3.488461690142256, "tokens_seen": 1641623552 }, { "epoch": 4.08, "learning_rate": 0.0002538716148445336, "loss": 2.7893, "theoretical_loss": 3.488461690142256, "tokens_seen": 1641623552 }, { "epoch": 4.08, "learning_rate": 0.0002538615847542628, "loss": 2.8602, "theoretical_loss": 3.488449623084429, "tokens_seen": 1641689088 }, { "epoch": 4.08, "learning_rate": 0.00025385155466399196, "loss": 2.8878, "theoretical_loss": 3.488437556643182, "tokens_seen": 1641754624 }, { "epoch": 4.08, "learning_rate": 0.0002538415245737212, "loss": 2.6467, "theoretical_loss": 3.4884254908184587, "tokens_seen": 1641820160 }, { "epoch": 4.08, "learning_rate": 0.0002538314944834503, "loss": 2.6929, "theoretical_loss": 3.488413425610203, "tokens_seen": 1641885696 }, { "epoch": 4.08, "learning_rate": 0.00025382146439317956, "loss": 2.7147, "theoretical_loss": 3.488401361018359, "tokens_seen": 1641951232 }, { "epoch": 4.08, "learning_rate": 0.0002538114343029087, "loss": 2.7898, "theoretical_loss": 3.4883892970428696, "tokens_seen": 1642016768 }, { "epoch": 4.08, "learning_rate": 0.0002538014042126379, "loss": 2.8414, "theoretical_loss": 3.4883772336836802, "tokens_seen": 1642082304 }, { "epoch": 4.08, "learning_rate": 0.0002537913741223671, "loss": 2.731, "theoretical_loss": 3.4883651709407335, "tokens_seen": 1642147840 }, { "epoch": 4.08, "learning_rate": 0.0002537813440320963, "loss": 2.7549, "theoretical_loss": 3.4883531088139743, "tokens_seen": 1642213376 }, { "epoch": 4.08, "learning_rate": 0.00025377131394182546, "loss": 2.7579, "theoretical_loss": 3.4883410473033463, "tokens_seen": 1642278912 }, { "epoch": 4.08, "learning_rate": 0.00025376128385155465, "loss": 2.9294, "theoretical_loss": 3.488328986408793, "tokens_seen": 1642344448 }, { "epoch": 4.08, "learning_rate": 0.0002537512537612839, "loss": 2.6869, "theoretical_loss": 3.4883169261302585, "tokens_seen": 1642409984 }, { "epoch": 4.08, "learning_rate": 0.00025374122367101306, "loss": 2.768, "theoretical_loss": 3.4883048664676872, "tokens_seen": 1642475520 }, { "epoch": 4.08, "learning_rate": 0.00025373119358074224, "loss": 2.7375, "theoretical_loss": 3.488292807421023, "tokens_seen": 1642541056 }, { "epoch": 4.08, "learning_rate": 0.0002537211634904714, "loss": 2.5672, "theoretical_loss": 3.488280748990209, "tokens_seen": 1642606592 }, { "epoch": 4.08, "learning_rate": 0.00025371113340020066, "loss": 2.6283, "theoretical_loss": 3.4882686911751906, "tokens_seen": 1642672128 }, { "epoch": 4.08, "learning_rate": 0.0002537011033099298, "loss": 2.7047, "theoretical_loss": 3.4882566339759107, "tokens_seen": 1642737664 }, { "epoch": 4.08, "learning_rate": 0.000253691073219659, "loss": 2.8131, "theoretical_loss": 3.4882445773923134, "tokens_seen": 1642803200 }, { "epoch": 4.08, "learning_rate": 0.00025368104312938815, "loss": 2.767, "theoretical_loss": 3.488232521424343, "tokens_seen": 1642868736 }, { "epoch": 4.08, "learning_rate": 0.0002536710130391174, "loss": 2.6832, "theoretical_loss": 3.488220466071944, "tokens_seen": 1642934272 }, { "epoch": 4.08, "learning_rate": 0.00025366098294884657, "loss": 2.889, "theoretical_loss": 3.4882084113350587, "tokens_seen": 1642999808 }, { "epoch": 4.08, "learning_rate": 0.00025365095285857575, "loss": 2.8112, "theoretical_loss": 3.488196357213633, "tokens_seen": 1643065344 }, { "epoch": 4.08, "learning_rate": 0.00025364092276830493, "loss": 2.7954, "theoretical_loss": 3.4881843037076106, "tokens_seen": 1643130880 }, { "epoch": 4.08, "learning_rate": 0.0002536308926780341, "loss": 2.6045, "theoretical_loss": 3.488172250816935, "tokens_seen": 1643196416 }, { "epoch": 4.08, "objective/train/docs_used": 2620141, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8036177158355713, "objective/train/theoretical_loss": 3.4881601985415496, "objective/train/tokens_used": 1663721952, "theoretical_loss": 3.4881601985415496, "tokens_seen": 1643261952 }, { "epoch": 4.08, "learning_rate": 0.0002536208625877633, "loss": 2.7463, "theoretical_loss": 3.4881601985415496, "tokens_seen": 1643261952 }, { "epoch": 4.08, "learning_rate": 0.0002536108324974925, "loss": 2.6954, "theoretical_loss": 3.4881481468813997, "tokens_seen": 1643327488 }, { "epoch": 4.08, "learning_rate": 0.00025360080240722165, "loss": 2.6754, "theoretical_loss": 3.4881360958364294, "tokens_seen": 1643393024 }, { "epoch": 4.08, "learning_rate": 0.0002535907723169509, "loss": 2.6364, "theoretical_loss": 3.4881240454065816, "tokens_seen": 1643458560 }, { "epoch": 4.08, "learning_rate": 0.00025358074222668, "loss": 2.7606, "theoretical_loss": 3.488111995591801, "tokens_seen": 1643524096 }, { "epoch": 4.08, "learning_rate": 0.00025357071213640925, "loss": 2.9624, "theoretical_loss": 3.4880999463920324, "tokens_seen": 1643589632 }, { "epoch": 4.08, "learning_rate": 0.00025356068204613843, "loss": 2.805, "theoretical_loss": 3.4880878978072185, "tokens_seen": 1643655168 }, { "epoch": 4.08, "learning_rate": 0.0002535506519558676, "loss": 2.7897, "theoretical_loss": 3.4880758498373043, "tokens_seen": 1643720704 }, { "epoch": 4.08, "learning_rate": 0.0002535406218655968, "loss": 2.6847, "theoretical_loss": 3.488063802482234, "tokens_seen": 1643786240 }, { "epoch": 4.08, "learning_rate": 0.00025353059177532603, "loss": 2.6944, "theoretical_loss": 3.4880517557419513, "tokens_seen": 1643851776 }, { "epoch": 4.08, "learning_rate": 0.00025352056168505516, "loss": 2.566, "theoretical_loss": 3.4880397096164, "tokens_seen": 1643917312 }, { "epoch": 4.08, "learning_rate": 0.0002535105315947844, "loss": 2.8638, "theoretical_loss": 3.488027664105525, "tokens_seen": 1643982848 }, { "epoch": 4.08, "learning_rate": 0.0002535005015045135, "loss": 2.7445, "theoretical_loss": 3.4880156192092704, "tokens_seen": 1644048384 }, { "epoch": 4.08, "learning_rate": 0.00025349047141424275, "loss": 2.8387, "theoretical_loss": 3.48800357492758, "tokens_seen": 1644113920 }, { "epoch": 4.08, "learning_rate": 0.00025348044132397193, "loss": 2.7209, "theoretical_loss": 3.4879915312603975, "tokens_seen": 1644179456 }, { "epoch": 4.08, "learning_rate": 0.0002534704112337011, "loss": 2.8211, "theoretical_loss": 3.4879794882076682, "tokens_seen": 1644244992 }, { "epoch": 4.08, "learning_rate": 0.0002534603811434303, "loss": 2.845, "theoretical_loss": 3.487967445769335, "tokens_seen": 1644310528 }, { "epoch": 4.08, "learning_rate": 0.0002534503510531595, "loss": 2.7586, "theoretical_loss": 3.4879554039453433, "tokens_seen": 1644376064 }, { "epoch": 4.08, "learning_rate": 0.00025344032096288866, "loss": 2.6129, "theoretical_loss": 3.487943362735636, "tokens_seen": 1644441600 }, { "epoch": 4.08, "learning_rate": 0.0002534302908726179, "loss": 2.8394, "theoretical_loss": 3.487931322140158, "tokens_seen": 1644507136 }, { "epoch": 4.08, "learning_rate": 0.000253420260782347, "loss": 2.6657, "theoretical_loss": 3.4879192821588543, "tokens_seen": 1644572672 }, { "epoch": 4.08, "learning_rate": 0.00025341023069207626, "loss": 2.7167, "theoretical_loss": 3.4879072427916675, "tokens_seen": 1644638208 }, { "epoch": 4.08, "learning_rate": 0.0002534002006018054, "loss": 2.772, "theoretical_loss": 3.4878952040385425, "tokens_seen": 1644703744 }, { "epoch": 4.08, "learning_rate": 0.0002533901705115346, "loss": 2.7533, "theoretical_loss": 3.4878831658994236, "tokens_seen": 1644769280 }, { "epoch": 4.08, "learning_rate": 0.0002533801404212638, "loss": 2.839, "theoretical_loss": 3.4878711283742554, "tokens_seen": 1644834816 }, { "epoch": 4.08, "objective/train/docs_used": 2622781, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5734827518463135, "objective/train/theoretical_loss": 3.4878590914629815, "objective/train/tokens_used": 1665360352, "theoretical_loss": 3.4878590914629815, "tokens_seen": 1644900352 }, { "epoch": 4.08, "learning_rate": 0.000253370110330993, "loss": 2.6675, "theoretical_loss": 3.4878590914629815, "tokens_seen": 1644900352 }, { "epoch": 4.08, "learning_rate": 0.00025336008024072216, "loss": 2.8216, "theoretical_loss": 3.487847055165546, "tokens_seen": 1644965888 }, { "epoch": 4.08, "learning_rate": 0.0002533500501504514, "loss": 2.6316, "theoretical_loss": 3.487835019481894, "tokens_seen": 1645031424 }, { "epoch": 4.08, "learning_rate": 0.0002533400200601805, "loss": 2.7607, "theoretical_loss": 3.4878229844119684, "tokens_seen": 1645096960 }, { "epoch": 4.08, "learning_rate": 0.00025332998996990976, "loss": 2.6864, "theoretical_loss": 3.487810949955715, "tokens_seen": 1645162496 }, { "epoch": 4.08, "learning_rate": 0.0002533199598796389, "loss": 2.6075, "theoretical_loss": 3.487798916113077, "tokens_seen": 1645228032 }, { "epoch": 4.08, "learning_rate": 0.0002533099297893681, "loss": 2.7815, "theoretical_loss": 3.4877868828839995, "tokens_seen": 1645293568 }, { "epoch": 4.08, "learning_rate": 0.0002532998996990973, "loss": 2.7429, "theoretical_loss": 3.487774850268426, "tokens_seen": 1645359104 }, { "epoch": 4.08, "learning_rate": 0.0002532898696088265, "loss": 2.647, "theoretical_loss": 3.487762818266301, "tokens_seen": 1645424640 }, { "epoch": 4.08, "learning_rate": 0.00025327983951855566, "loss": 2.7554, "theoretical_loss": 3.4877507868775686, "tokens_seen": 1645490176 }, { "epoch": 4.08, "learning_rate": 0.00025326980942828485, "loss": 2.7243, "theoretical_loss": 3.487738756102174, "tokens_seen": 1645555712 }, { "epoch": 4.08, "learning_rate": 0.000253259779338014, "loss": 2.8225, "theoretical_loss": 3.487726725940061, "tokens_seen": 1645621248 }, { "epoch": 4.08, "learning_rate": 0.00025324974924774326, "loss": 2.5985, "theoretical_loss": 3.487714696391173, "tokens_seen": 1645686784 }, { "epoch": 4.08, "learning_rate": 0.0002532397191574724, "loss": 2.6997, "theoretical_loss": 3.487702667455456, "tokens_seen": 1645752320 }, { "epoch": 4.08, "learning_rate": 0.0002532296890672016, "loss": 2.8385, "theoretical_loss": 3.487690639132853, "tokens_seen": 1645817856 }, { "epoch": 4.08, "learning_rate": 0.00025321965897693075, "loss": 2.8116, "theoretical_loss": 3.4876786114233087, "tokens_seen": 1645883392 }, { "epoch": 4.08, "learning_rate": 0.00025320962888666, "loss": 2.8568, "theoretical_loss": 3.487666584326768, "tokens_seen": 1645948928 }, { "epoch": 4.08, "learning_rate": 0.00025319959879638917, "loss": 2.7051, "theoretical_loss": 3.4876545578431744, "tokens_seen": 1646014464 }, { "epoch": 4.08, "learning_rate": 0.00025318956870611835, "loss": 2.7884, "theoretical_loss": 3.4876425319724724, "tokens_seen": 1646080000 }, { "epoch": 4.08, "learning_rate": 0.00025317953861584753, "loss": 2.8222, "theoretical_loss": 3.4876305067146074, "tokens_seen": 1646145536 }, { "epoch": 4.08, "learning_rate": 0.00025316950852557677, "loss": 2.8007, "theoretical_loss": 3.4876184820695224, "tokens_seen": 1646211072 }, { "epoch": 4.08, "learning_rate": 0.0002531594784353059, "loss": 2.705, "theoretical_loss": 3.4876064580371624, "tokens_seen": 1646276608 }, { "epoch": 4.08, "learning_rate": 0.00025314944834503513, "loss": 2.7412, "theoretical_loss": 3.4875944346174723, "tokens_seen": 1646342144 }, { "epoch": 4.08, "learning_rate": 0.00025313941825476425, "loss": 2.8918, "theoretical_loss": 3.4875824118103953, "tokens_seen": 1646407680 }, { "epoch": 4.08, "learning_rate": 0.0002531293881644935, "loss": 2.7472, "theoretical_loss": 3.4875703896158767, "tokens_seen": 1646473216 }, { "epoch": 4.08, "objective/train/docs_used": 2625761, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.875891923904419, "objective/train/theoretical_loss": 3.4875583680338607, "objective/train/tokens_used": 1666998752, "theoretical_loss": 3.4875583680338607, "tokens_seen": 1646538752 }, { "epoch": 4.08, "learning_rate": 0.00025311935807422267, "loss": 2.9028, "theoretical_loss": 3.4875583680338607, "tokens_seen": 1646538752 }, { "epoch": 4.08, "learning_rate": 0.00025310932798395185, "loss": 2.8887, "theoretical_loss": 3.487546347064292, "tokens_seen": 1646604288 }, { "epoch": 4.08, "learning_rate": 0.00025309929789368103, "loss": 2.736, "theoretical_loss": 3.487534326707114, "tokens_seen": 1646669824 }, { "epoch": 4.08, "learning_rate": 0.0002530892678034102, "loss": 2.7098, "theoretical_loss": 3.4875223069622727, "tokens_seen": 1646735360 }, { "epoch": 4.08, "learning_rate": 0.0002530792377131394, "loss": 2.7504, "theoretical_loss": 3.4875102878297106, "tokens_seen": 1646800896 }, { "epoch": 4.08, "learning_rate": 0.00025306920762286863, "loss": 2.5946, "theoretical_loss": 3.4874982693093743, "tokens_seen": 1646866432 }, { "epoch": 4.08, "learning_rate": 0.00025305917753259776, "loss": 2.7383, "theoretical_loss": 3.4874862514012066, "tokens_seen": 1646931968 }, { "epoch": 4.08, "learning_rate": 0.000253049147442327, "loss": 2.8821, "theoretical_loss": 3.487474234105153, "tokens_seen": 1646997504 }, { "epoch": 4.08, "learning_rate": 0.0002530391173520562, "loss": 2.8219, "theoretical_loss": 3.4874622174211574, "tokens_seen": 1647063040 }, { "epoch": 4.08, "learning_rate": 0.00025302908726178536, "loss": 2.6456, "theoretical_loss": 3.487450201349164, "tokens_seen": 1647128576 }, { "epoch": 4.08, "learning_rate": 0.00025301905717151454, "loss": 2.8211, "theoretical_loss": 3.4874381858891175, "tokens_seen": 1647194112 }, { "epoch": 4.08, "learning_rate": 0.0002530090270812437, "loss": 2.7489, "theoretical_loss": 3.487426171040963, "tokens_seen": 1647259648 }, { "epoch": 4.08, "learning_rate": 0.00025299899699097295, "loss": 2.7236, "theoretical_loss": 3.4874141568046446, "tokens_seen": 1647325184 }, { "epoch": 4.09, "learning_rate": 0.00025298896690070213, "loss": 2.7618, "theoretical_loss": 3.4874021431801068, "tokens_seen": 1647390720 }, { "epoch": 4.09, "learning_rate": 0.0002529789368104313, "loss": 2.7856, "theoretical_loss": 3.487390130167294, "tokens_seen": 1647456256 }, { "epoch": 4.09, "learning_rate": 0.0002529689067201605, "loss": 2.8748, "theoretical_loss": 3.4873781177661507, "tokens_seen": 1647521792 }, { "epoch": 4.09, "learning_rate": 0.0002529588766298897, "loss": 2.8309, "theoretical_loss": 3.4873661059766214, "tokens_seen": 1647587328 }, { "epoch": 4.09, "learning_rate": 0.00025294884653961886, "loss": 2.7867, "theoretical_loss": 3.487354094798651, "tokens_seen": 1647652864 }, { "epoch": 4.09, "learning_rate": 0.0002529388164493481, "loss": 2.7624, "theoretical_loss": 3.487342084232184, "tokens_seen": 1647718400 }, { "epoch": 4.09, "learning_rate": 0.0002529287863590772, "loss": 2.612, "theoretical_loss": 3.487330074277164, "tokens_seen": 1647783936 }, { "epoch": 4.09, "learning_rate": 0.00025291875626880646, "loss": 2.7693, "theoretical_loss": 3.487318064933537, "tokens_seen": 1647849472 }, { "epoch": 4.09, "learning_rate": 0.0002529087261785356, "loss": 2.9141, "theoretical_loss": 3.4873060562012466, "tokens_seen": 1647915008 }, { "epoch": 4.09, "learning_rate": 0.0002528986960882648, "loss": 2.6761, "theoretical_loss": 3.4872940480802375, "tokens_seen": 1647980544 }, { "epoch": 4.09, "learning_rate": 0.000252888665997994, "loss": 2.8397, "theoretical_loss": 3.4872820405704545, "tokens_seen": 1648046080 }, { "epoch": 4.09, "learning_rate": 0.0002528786359077232, "loss": 2.6523, "theoretical_loss": 3.4872700336718423, "tokens_seen": 1648111616 }, { "epoch": 4.09, "objective/train/docs_used": 2627138, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8079400062561035, "objective/train/theoretical_loss": 3.487258027384345, "objective/train/tokens_used": 1668637152, "theoretical_loss": 3.487258027384345, "tokens_seen": 1648177152 }, { "epoch": 4.09, "learning_rate": 0.00025286860581745236, "loss": 2.7099, "theoretical_loss": 3.487258027384345, "tokens_seen": 1648177152 }, { "epoch": 4.09, "learning_rate": 0.0002528585757271816, "loss": 2.7084, "theoretical_loss": 3.4872460217079073, "tokens_seen": 1648242688 }, { "epoch": 4.09, "learning_rate": 0.0002528485456369107, "loss": 2.6934, "theoretical_loss": 3.4872340166424745, "tokens_seen": 1648308224 }, { "epoch": 4.09, "learning_rate": 0.00025283851554663996, "loss": 2.8334, "theoretical_loss": 3.48722201218799, "tokens_seen": 1648373760 }, { "epoch": 4.09, "learning_rate": 0.0002528284854563691, "loss": 2.6124, "theoretical_loss": 3.4872100083444, "tokens_seen": 1648439296 }, { "epoch": 4.09, "learning_rate": 0.0002528184553660983, "loss": 2.7243, "theoretical_loss": 3.4871980051116482, "tokens_seen": 1648504832 }, { "epoch": 4.09, "learning_rate": 0.0002528084252758275, "loss": 2.7336, "theoretical_loss": 3.4871860024896786, "tokens_seen": 1648570368 }, { "epoch": 4.09, "learning_rate": 0.0002527983951855567, "loss": 2.7752, "theoretical_loss": 3.487174000478437, "tokens_seen": 1648635904 }, { "epoch": 4.09, "learning_rate": 0.00025278836509528586, "loss": 2.7386, "theoretical_loss": 3.487161999077867, "tokens_seen": 1648701440 }, { "epoch": 4.09, "learning_rate": 0.00025277833500501505, "loss": 2.8894, "theoretical_loss": 3.487149998287914, "tokens_seen": 1648766976 }, { "epoch": 4.09, "learning_rate": 0.0002527683049147442, "loss": 2.8086, "theoretical_loss": 3.487137998108523, "tokens_seen": 1648832512 }, { "epoch": 4.09, "learning_rate": 0.00025275827482447346, "loss": 2.6281, "theoretical_loss": 3.487125998539638, "tokens_seen": 1648898048 }, { "epoch": 4.09, "learning_rate": 0.0002527482447342026, "loss": 2.6876, "theoretical_loss": 3.4871139995812035, "tokens_seen": 1648963584 }, { "epoch": 4.09, "learning_rate": 0.0002527382146439318, "loss": 2.8921, "theoretical_loss": 3.487102001233165, "tokens_seen": 1649029120 }, { "epoch": 4.09, "learning_rate": 0.00025272818455366095, "loss": 2.8205, "theoretical_loss": 3.4870900034954664, "tokens_seen": 1649094656 }, { "epoch": 4.09, "learning_rate": 0.0002527181544633902, "loss": 2.7447, "theoretical_loss": 3.4870780063680527, "tokens_seen": 1649160192 }, { "epoch": 4.09, "learning_rate": 0.00025270812437311937, "loss": 2.6921, "theoretical_loss": 3.4870660098508686, "tokens_seen": 1649225728 }, { "epoch": 4.09, "learning_rate": 0.00025269809428284855, "loss": 2.6661, "theoretical_loss": 3.487054013943859, "tokens_seen": 1649291264 }, { "epoch": 4.09, "learning_rate": 0.00025268806419257773, "loss": 2.8173, "theoretical_loss": 3.487042018646968, "tokens_seen": 1649356800 }, { "epoch": 4.09, "learning_rate": 0.00025267803410230697, "loss": 2.7566, "theoretical_loss": 3.4870300239601413, "tokens_seen": 1649422336 }, { "epoch": 4.09, "learning_rate": 0.0002526680040120361, "loss": 2.8775, "theoretical_loss": 3.487018029883323, "tokens_seen": 1649487872 }, { "epoch": 4.09, "learning_rate": 0.00025265797392176533, "loss": 2.839, "theoretical_loss": 3.487006036416458, "tokens_seen": 1649553408 }, { "epoch": 4.09, "learning_rate": 0.00025264794383149445, "loss": 2.6915, "theoretical_loss": 3.4869940435594904, "tokens_seen": 1649618944 }, { "epoch": 4.09, "learning_rate": 0.0002526379137412237, "loss": 2.6855, "theoretical_loss": 3.486982051312366, "tokens_seen": 1649684480 }, { "epoch": 4.09, "learning_rate": 0.00025262788365095287, "loss": 2.702, "theoretical_loss": 3.4869700596750297, "tokens_seen": 1649750016 }, { "epoch": 4.09, "objective/train/docs_used": 2629919, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8951516151428223, "objective/train/theoretical_loss": 3.486958068647425, "objective/train/tokens_used": 1670275552, "theoretical_loss": 3.486958068647425, "tokens_seen": 1649815552 }, { "epoch": 4.09, "learning_rate": 0.00025261785356068205, "loss": 2.8958, "theoretical_loss": 3.486958068647425, "tokens_seen": 1649815552 }, { "epoch": 4.09, "learning_rate": 0.00025260782347041123, "loss": 2.7975, "theoretical_loss": 3.4869460782294976, "tokens_seen": 1649881088 }, { "epoch": 4.09, "learning_rate": 0.0002525977933801404, "loss": 2.8682, "theoretical_loss": 3.486934088421192, "tokens_seen": 1649946624 }, { "epoch": 4.09, "learning_rate": 0.0002525877632898696, "loss": 2.7426, "theoretical_loss": 3.486922099222453, "tokens_seen": 1650012160 }, { "epoch": 4.09, "learning_rate": 0.00025257773319959883, "loss": 2.723, "theoretical_loss": 3.486910110633226, "tokens_seen": 1650077696 }, { "epoch": 4.09, "learning_rate": 0.00025256770310932796, "loss": 2.6837, "theoretical_loss": 3.4868981226534546, "tokens_seen": 1650143232 }, { "epoch": 4.09, "learning_rate": 0.0002525576730190572, "loss": 2.8447, "theoretical_loss": 3.4868861352830844, "tokens_seen": 1650208768 }, { "epoch": 4.09, "learning_rate": 0.0002525476429287864, "loss": 2.7956, "theoretical_loss": 3.4868741485220607, "tokens_seen": 1650274304 }, { "epoch": 4.09, "learning_rate": 0.00025253761283851556, "loss": 2.7878, "theoretical_loss": 3.486862162370327, "tokens_seen": 1650339840 }, { "epoch": 4.09, "learning_rate": 0.00025252758274824474, "loss": 2.74, "theoretical_loss": 3.4868501768278293, "tokens_seen": 1650405376 }, { "epoch": 4.09, "learning_rate": 0.0002525175526579739, "loss": 2.64, "theoretical_loss": 3.4868381918945115, "tokens_seen": 1650470912 }, { "epoch": 4.09, "learning_rate": 0.0002525075225677031, "loss": 2.8324, "theoretical_loss": 3.48682620757032, "tokens_seen": 1650536448 }, { "epoch": 4.09, "learning_rate": 0.00025249749247743233, "loss": 2.6181, "theoretical_loss": 3.4868142238551973, "tokens_seen": 1650601984 }, { "epoch": 4.09, "learning_rate": 0.00025248746238716146, "loss": 2.8265, "theoretical_loss": 3.48680224074909, "tokens_seen": 1650667520 }, { "epoch": 4.09, "learning_rate": 0.0002524774322968907, "loss": 2.7293, "theoretical_loss": 3.486790258251943, "tokens_seen": 1650733056 }, { "epoch": 4.09, "learning_rate": 0.0002524674022066198, "loss": 2.8242, "theoretical_loss": 3.4867782763637005, "tokens_seen": 1650798592 }, { "epoch": 4.09, "learning_rate": 0.00025245737211634906, "loss": 2.6591, "theoretical_loss": 3.4867662950843075, "tokens_seen": 1650864128 }, { "epoch": 4.09, "learning_rate": 0.00025244734202607824, "loss": 2.6954, "theoretical_loss": 3.486754314413709, "tokens_seen": 1650929664 }, { "epoch": 4.09, "learning_rate": 0.0002524373119358074, "loss": 2.6419, "theoretical_loss": 3.48674233435185, "tokens_seen": 1650995200 }, { "epoch": 4.09, "learning_rate": 0.0002524272818455366, "loss": 2.7023, "theoretical_loss": 3.4867303548986754, "tokens_seen": 1651060736 }, { "epoch": 4.09, "learning_rate": 0.0002524172517552658, "loss": 2.9635, "theoretical_loss": 3.4867183760541303, "tokens_seen": 1651126272 }, { "epoch": 4.09, "learning_rate": 0.00025240722166499496, "loss": 2.7026, "theoretical_loss": 3.4867063978181587, "tokens_seen": 1651191808 }, { "epoch": 4.09, "learning_rate": 0.0002523971915747242, "loss": 2.6762, "theoretical_loss": 3.486694420190707, "tokens_seen": 1651257344 }, { "epoch": 4.09, "learning_rate": 0.0002523871614844533, "loss": 2.7316, "theoretical_loss": 3.486682443171719, "tokens_seen": 1651322880 }, { "epoch": 4.09, "learning_rate": 0.00025237713139418256, "loss": 2.8078, "theoretical_loss": 3.4866704667611397, "tokens_seen": 1651388416 }, { "epoch": 4.09, "objective/train/docs_used": 2632634, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8880741596221924, "objective/train/theoretical_loss": 3.4866584909589147, "objective/train/tokens_used": 1671913952, "theoretical_loss": 3.4866584909589147, "tokens_seen": 1651453952 }, { "epoch": 4.09, "learning_rate": 0.00025236710130391174, "loss": 2.8257, "theoretical_loss": 3.4866584909589147, "tokens_seen": 1651453952 }, { "epoch": 4.09, "learning_rate": 0.0002523570712136409, "loss": 2.9399, "theoretical_loss": 3.4866465157649884, "tokens_seen": 1651519488 }, { "epoch": 4.09, "learning_rate": 0.0002523470411233701, "loss": 2.6075, "theoretical_loss": 3.4866345411793054, "tokens_seen": 1651585024 }, { "epoch": 4.09, "learning_rate": 0.0002523370110330993, "loss": 2.7949, "theoretical_loss": 3.486622567201812, "tokens_seen": 1651650560 }, { "epoch": 4.09, "learning_rate": 0.00025232698094282847, "loss": 2.9367, "theoretical_loss": 3.4866105938324523, "tokens_seen": 1651716096 }, { "epoch": 4.09, "learning_rate": 0.0002523169508525577, "loss": 2.7789, "theoretical_loss": 3.4865986210711712, "tokens_seen": 1651781632 }, { "epoch": 4.09, "learning_rate": 0.00025230692076228683, "loss": 2.8364, "theoretical_loss": 3.486586648917914, "tokens_seen": 1651847168 }, { "epoch": 4.09, "learning_rate": 0.00025229689067201606, "loss": 2.8498, "theoretical_loss": 3.486574677372625, "tokens_seen": 1651912704 }, { "epoch": 4.09, "learning_rate": 0.0002522868605817452, "loss": 2.7506, "theoretical_loss": 3.4865627064352505, "tokens_seen": 1651978240 }, { "epoch": 4.09, "learning_rate": 0.00025227683049147443, "loss": 2.7428, "theoretical_loss": 3.4865507361057344, "tokens_seen": 1652043776 }, { "epoch": 4.09, "learning_rate": 0.0002522668004012036, "loss": 2.8763, "theoretical_loss": 3.4865387663840224, "tokens_seen": 1652109312 }, { "epoch": 4.09, "learning_rate": 0.0002522567703109328, "loss": 2.7458, "theoretical_loss": 3.4865267972700593, "tokens_seen": 1652174848 }, { "epoch": 4.09, "learning_rate": 0.000252246740220662, "loss": 2.7883, "theoretical_loss": 3.4865148287637897, "tokens_seen": 1652240384 }, { "epoch": 4.09, "learning_rate": 0.00025223671013039115, "loss": 2.8207, "theoretical_loss": 3.4865028608651594, "tokens_seen": 1652305920 }, { "epoch": 4.09, "learning_rate": 0.0002522266800401204, "loss": 2.6928, "theoretical_loss": 3.486490893574113, "tokens_seen": 1652371456 }, { "epoch": 4.09, "learning_rate": 0.00025221664994984957, "loss": 2.6448, "theoretical_loss": 3.4864789268905954, "tokens_seen": 1652436992 }, { "epoch": 4.09, "learning_rate": 0.00025220661985957875, "loss": 2.773, "theoretical_loss": 3.4864669608145524, "tokens_seen": 1652502528 }, { "epoch": 4.09, "learning_rate": 0.00025219658976930793, "loss": 2.8839, "theoretical_loss": 3.486454995345928, "tokens_seen": 1652568064 }, { "epoch": 4.09, "learning_rate": 0.00025218655967903717, "loss": 2.7772, "theoretical_loss": 3.4864430304846685, "tokens_seen": 1652633600 }, { "epoch": 4.09, "learning_rate": 0.0002521765295887663, "loss": 2.8112, "theoretical_loss": 3.4864310662307174, "tokens_seen": 1652699136 }, { "epoch": 4.09, "learning_rate": 0.00025216649949849553, "loss": 2.7888, "theoretical_loss": 3.486419102584022, "tokens_seen": 1652764672 }, { "epoch": 4.09, "learning_rate": 0.00025215646940822465, "loss": 2.7513, "theoretical_loss": 3.486407139544525, "tokens_seen": 1652830208 }, { "epoch": 4.09, "learning_rate": 0.0002521464393179539, "loss": 2.9109, "theoretical_loss": 3.4863951771121733, "tokens_seen": 1652895744 }, { "epoch": 4.09, "learning_rate": 0.00025213640922768307, "loss": 2.9214, "theoretical_loss": 3.486383215286911, "tokens_seen": 1652961280 }, { "epoch": 4.09, "learning_rate": 0.00025212637913741225, "loss": 2.7134, "theoretical_loss": 3.486371254068684, "tokens_seen": 1653026816 }, { "epoch": 4.09, "objective/train/docs_used": 2635279, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6735401153564453, "objective/train/theoretical_loss": 3.486359293457437, "objective/train/tokens_used": 1673552352, "theoretical_loss": 3.486359293457437, "tokens_seen": 1653092352 }, { "epoch": 4.09, "learning_rate": 0.00025211634904714143, "loss": 2.7435, "theoretical_loss": 3.486359293457437, "tokens_seen": 1653092352 }, { "epoch": 4.09, "learning_rate": 0.0002521063189568706, "loss": 2.641, "theoretical_loss": 3.4863473334531148, "tokens_seen": 1653157888 }, { "epoch": 4.09, "learning_rate": 0.0002520962888665998, "loss": 2.7871, "theoretical_loss": 3.4863353740556633, "tokens_seen": 1653223424 }, { "epoch": 4.09, "learning_rate": 0.00025208625877632903, "loss": 2.6685, "theoretical_loss": 3.486323415265027, "tokens_seen": 1653288960 }, { "epoch": 4.09, "learning_rate": 0.00025207622868605816, "loss": 2.7948, "theoretical_loss": 3.4863114570811513, "tokens_seen": 1653354496 }, { "epoch": 4.09, "learning_rate": 0.0002520661985957874, "loss": 2.7349, "theoretical_loss": 3.486299499503981, "tokens_seen": 1653420032 }, { "epoch": 4.09, "learning_rate": 0.0002520561685055166, "loss": 2.6903, "theoretical_loss": 3.4862875425334625, "tokens_seen": 1653485568 }, { "epoch": 4.09, "learning_rate": 0.00025204613841524576, "loss": 2.8608, "theoretical_loss": 3.4862755861695396, "tokens_seen": 1653551104 }, { "epoch": 4.09, "learning_rate": 0.00025203610832497494, "loss": 2.8795, "theoretical_loss": 3.486263630412158, "tokens_seen": 1653616640 }, { "epoch": 4.09, "learning_rate": 0.0002520260782347041, "loss": 2.8432, "theoretical_loss": 3.486251675261263, "tokens_seen": 1653682176 }, { "epoch": 4.09, "learning_rate": 0.0002520160481444333, "loss": 2.7491, "theoretical_loss": 3.4862397207168, "tokens_seen": 1653747712 }, { "epoch": 4.09, "learning_rate": 0.00025200601805416253, "loss": 2.8387, "theoretical_loss": 3.486227766778714, "tokens_seen": 1653813248 }, { "epoch": 4.09, "learning_rate": 0.00025199598796389166, "loss": 2.6922, "theoretical_loss": 3.4862158134469494, "tokens_seen": 1653878784 }, { "epoch": 4.09, "learning_rate": 0.0002519859578736209, "loss": 2.7764, "theoretical_loss": 3.4862038607214525, "tokens_seen": 1653944320 }, { "epoch": 4.09, "learning_rate": 0.00025197592778335, "loss": 2.9424, "theoretical_loss": 3.4861919086021684, "tokens_seen": 1654009856 }, { "epoch": 4.09, "learning_rate": 0.00025196589769307926, "loss": 2.7903, "theoretical_loss": 3.4861799570890417, "tokens_seen": 1654075392 }, { "epoch": 4.09, "learning_rate": 0.00025195586760280844, "loss": 2.6777, "theoretical_loss": 3.4861680061820186, "tokens_seen": 1654140928 }, { "epoch": 4.09, "learning_rate": 0.0002519458375125376, "loss": 2.6262, "theoretical_loss": 3.486156055881043, "tokens_seen": 1654206464 }, { "epoch": 4.09, "learning_rate": 0.0002519358074222668, "loss": 2.6915, "theoretical_loss": 3.4861441061860616, "tokens_seen": 1654272000 }, { "epoch": 4.09, "learning_rate": 0.000251925777331996, "loss": 2.832, "theoretical_loss": 3.486132157097019, "tokens_seen": 1654337536 }, { "epoch": 4.09, "learning_rate": 0.00025191574724172516, "loss": 2.7395, "theoretical_loss": 3.48612020861386, "tokens_seen": 1654403072 }, { "epoch": 4.09, "learning_rate": 0.0002519057171514544, "loss": 2.776, "theoretical_loss": 3.4861082607365312, "tokens_seen": 1654468608 }, { "epoch": 4.09, "learning_rate": 0.0002518956870611835, "loss": 2.7993, "theoretical_loss": 3.4860963134649765, "tokens_seen": 1654534144 }, { "epoch": 4.09, "learning_rate": 0.00025188565697091276, "loss": 2.6229, "theoretical_loss": 3.486084366799142, "tokens_seen": 1654599680 }, { "epoch": 4.09, "learning_rate": 0.00025187562688064194, "loss": 2.7258, "theoretical_loss": 3.486072420738973, "tokens_seen": 1654665216 }, { "epoch": 4.09, "objective/train/docs_used": 2638273, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.831702947616577, "objective/train/theoretical_loss": 3.4860604752844147, "objective/train/tokens_used": 1675190752, "theoretical_loss": 3.4860604752844147, "tokens_seen": 1654730752 }, { "epoch": 4.09, "learning_rate": 0.0002518655967903711, "loss": 2.7974, "theoretical_loss": 3.4860604752844147, "tokens_seen": 1654730752 }, { "epoch": 4.09, "learning_rate": 0.0002518555667001003, "loss": 2.7072, "theoretical_loss": 3.4860485304354114, "tokens_seen": 1654796288 }, { "epoch": 4.09, "learning_rate": 0.0002518455366098295, "loss": 2.7107, "theoretical_loss": 3.48603658619191, "tokens_seen": 1654861824 }, { "epoch": 4.09, "learning_rate": 0.00025183550651955867, "loss": 2.8792, "theoretical_loss": 3.486024642553855, "tokens_seen": 1654927360 }, { "epoch": 4.09, "learning_rate": 0.0002518254764292879, "loss": 2.7459, "theoretical_loss": 3.4860126995211917, "tokens_seen": 1654992896 }, { "epoch": 4.09, "learning_rate": 0.00025181544633901703, "loss": 2.7044, "theoretical_loss": 3.486000757093866, "tokens_seen": 1655058432 }, { "epoch": 4.09, "learning_rate": 0.00025180541624874627, "loss": 2.9541, "theoretical_loss": 3.4859888152718224, "tokens_seen": 1655123968 }, { "epoch": 4.09, "learning_rate": 0.0002517953861584754, "loss": 2.8944, "theoretical_loss": 3.485976874055007, "tokens_seen": 1655189504 }, { "epoch": 4.09, "learning_rate": 0.00025178535606820463, "loss": 2.7485, "theoretical_loss": 3.4859649334433653, "tokens_seen": 1655255040 }, { "epoch": 4.09, "learning_rate": 0.0002517753259779338, "loss": 2.7764, "theoretical_loss": 3.4859529934368414, "tokens_seen": 1655320576 }, { "epoch": 4.09, "learning_rate": 0.000251765295887663, "loss": 2.7738, "theoretical_loss": 3.485941054035382, "tokens_seen": 1655386112 }, { "epoch": 4.09, "learning_rate": 0.00025175526579739217, "loss": 2.7653, "theoretical_loss": 3.4859291152389322, "tokens_seen": 1655451648 }, { "epoch": 4.09, "learning_rate": 0.00025174523570712135, "loss": 2.6459, "theoretical_loss": 3.485917177047437, "tokens_seen": 1655517184 }, { "epoch": 4.09, "learning_rate": 0.00025173520561685053, "loss": 2.8504, "theoretical_loss": 3.485905239460842, "tokens_seen": 1655582720 }, { "epoch": 4.09, "learning_rate": 0.00025172517552657977, "loss": 2.8644, "theoretical_loss": 3.4858933024790932, "tokens_seen": 1655648256 }, { "epoch": 4.09, "learning_rate": 0.0002517151454363089, "loss": 2.8609, "theoretical_loss": 3.4858813661021344, "tokens_seen": 1655713792 }, { "epoch": 4.09, "learning_rate": 0.00025170511534603813, "loss": 2.6374, "theoretical_loss": 3.485869430329913, "tokens_seen": 1655779328 }, { "epoch": 4.09, "learning_rate": 0.0002516950852557673, "loss": 2.7804, "theoretical_loss": 3.485857495162373, "tokens_seen": 1655844864 }, { "epoch": 4.09, "learning_rate": 0.0002516850551654965, "loss": 2.834, "theoretical_loss": 3.4858455605994605, "tokens_seen": 1655910400 }, { "epoch": 4.09, "learning_rate": 0.0002516750250752257, "loss": 2.6814, "theoretical_loss": 3.4858336266411207, "tokens_seen": 1655975936 }, { "epoch": 4.09, "learning_rate": 0.00025166499498495486, "loss": 2.7028, "theoretical_loss": 3.485821693287299, "tokens_seen": 1656041472 }, { "epoch": 4.09, "learning_rate": 0.00025165496489468404, "loss": 2.7644, "theoretical_loss": 3.4858097605379412, "tokens_seen": 1656107008 }, { "epoch": 4.09, "learning_rate": 0.00025164493480441327, "loss": 2.725, "theoretical_loss": 3.4857978283929922, "tokens_seen": 1656172544 }, { "epoch": 4.09, "learning_rate": 0.0002516349047141424, "loss": 2.776, "theoretical_loss": 3.4857858968523985, "tokens_seen": 1656238080 }, { "epoch": 4.09, "learning_rate": 0.00025162487462387163, "loss": 2.6704, "theoretical_loss": 3.4857739659161044, "tokens_seen": 1656303616 }, { "epoch": 4.09, "objective/train/docs_used": 2641271, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.56002140045166, "objective/train/theoretical_loss": 3.485762035584056, "objective/train/tokens_used": 1676829152, "theoretical_loss": 3.485762035584056, "tokens_seen": 1656369152 }, { "epoch": 4.09, "learning_rate": 0.00025161484453360076, "loss": 2.7568, "theoretical_loss": 3.485762035584056, "tokens_seen": 1656369152 }, { "epoch": 4.09, "learning_rate": 0.00025160481444333, "loss": 2.8012, "theoretical_loss": 3.4857501058561984, "tokens_seen": 1656434688 }, { "epoch": 4.09, "learning_rate": 0.0002515947843530592, "loss": 2.8062, "theoretical_loss": 3.4857381767324775, "tokens_seen": 1656500224 }, { "epoch": 4.09, "learning_rate": 0.00025158475426278836, "loss": 2.7265, "theoretical_loss": 3.485726248212839, "tokens_seen": 1656565760 }, { "epoch": 4.09, "learning_rate": 0.00025157472417251754, "loss": 2.6949, "theoretical_loss": 3.4857143202972276, "tokens_seen": 1656631296 }, { "epoch": 4.09, "learning_rate": 0.0002515646940822468, "loss": 2.9139, "theoretical_loss": 3.4857023929855897, "tokens_seen": 1656696832 }, { "epoch": 4.09, "learning_rate": 0.0002515546639919759, "loss": 2.7508, "theoretical_loss": 3.4856904662778705, "tokens_seen": 1656762368 }, { "epoch": 4.09, "learning_rate": 0.00025154463390170514, "loss": 2.7807, "theoretical_loss": 3.485678540174015, "tokens_seen": 1656827904 }, { "epoch": 4.09, "learning_rate": 0.00025153460381143426, "loss": 2.6357, "theoretical_loss": 3.4856666146739697, "tokens_seen": 1656893440 }, { "epoch": 4.09, "learning_rate": 0.0002515245737211635, "loss": 2.7508, "theoretical_loss": 3.4856546897776792, "tokens_seen": 1656958976 }, { "epoch": 4.09, "learning_rate": 0.0002515145436308927, "loss": 2.811, "theoretical_loss": 3.48564276548509, "tokens_seen": 1657024512 }, { "epoch": 4.09, "learning_rate": 0.00025150451354062186, "loss": 2.726, "theoretical_loss": 3.4856308417961466, "tokens_seen": 1657090048 }, { "epoch": 4.09, "learning_rate": 0.0002514944834503511, "loss": 2.632, "theoretical_loss": 3.485618918710795, "tokens_seen": 1657155584 }, { "epoch": 4.09, "learning_rate": 0.0002514844533600802, "loss": 2.9045, "theoretical_loss": 3.4856069962289817, "tokens_seen": 1657221120 }, { "epoch": 4.09, "learning_rate": 0.00025147442326980946, "loss": 2.734, "theoretical_loss": 3.4855950743506514, "tokens_seen": 1657286656 }, { "epoch": 4.09, "learning_rate": 0.00025146439317953864, "loss": 2.719, "theoretical_loss": 3.4855831530757495, "tokens_seen": 1657352192 }, { "epoch": 4.09, "learning_rate": 0.0002514543630892678, "loss": 2.6546, "theoretical_loss": 3.4855712324042223, "tokens_seen": 1657417728 }, { "epoch": 4.09, "learning_rate": 0.000251444332998997, "loss": 2.7806, "theoretical_loss": 3.4855593123360142, "tokens_seen": 1657483264 }, { "epoch": 4.09, "learning_rate": 0.0002514343029087262, "loss": 2.7324, "theoretical_loss": 3.485547392871072, "tokens_seen": 1657548800 }, { "epoch": 4.09, "learning_rate": 0.00025142427281845536, "loss": 2.6661, "theoretical_loss": 3.4855354740093416, "tokens_seen": 1657614336 }, { "epoch": 4.09, "learning_rate": 0.0002514142427281846, "loss": 2.5061, "theoretical_loss": 3.485523555750767, "tokens_seen": 1657679872 }, { "epoch": 4.09, "learning_rate": 0.0002514042126379137, "loss": 2.8511, "theoretical_loss": 3.4855116380952955, "tokens_seen": 1657745408 }, { "epoch": 4.09, "learning_rate": 0.00025139418254764296, "loss": 2.78, "theoretical_loss": 3.4854997210428715, "tokens_seen": 1657810944 }, { "epoch": 4.09, "learning_rate": 0.00025138415245737214, "loss": 2.8565, "theoretical_loss": 3.4854878045934417, "tokens_seen": 1657876480 }, { "epoch": 4.09, "learning_rate": 0.0002513741223671013, "loss": 2.6581, "theoretical_loss": 3.485475888746951, "tokens_seen": 1657942016 }, { "epoch": 4.09, "objective/train/docs_used": 2642760, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1132235527038574, "objective/train/theoretical_loss": 3.485463973503345, "objective/train/tokens_used": 1678467552, "theoretical_loss": 3.485463973503345, "tokens_seen": 1658007552 }, { "epoch": 4.09, "learning_rate": 0.0002513640922768305, "loss": 2.8962, "theoretical_loss": 3.485463973503345, "tokens_seen": 1658007552 }, { "epoch": 4.09, "learning_rate": 0.0002513540621865597, "loss": 2.6627, "theoretical_loss": 3.48545205886257, "tokens_seen": 1658073088 }, { "epoch": 4.09, "learning_rate": 0.00025134403209628887, "loss": 2.8016, "theoretical_loss": 3.485440144824571, "tokens_seen": 1658138624 }, { "epoch": 4.09, "learning_rate": 0.0002513340020060181, "loss": 2.7335, "theoretical_loss": 3.485428231389294, "tokens_seen": 1658204160 }, { "epoch": 4.09, "learning_rate": 0.00025132397191574723, "loss": 2.7174, "theoretical_loss": 3.4854163185566853, "tokens_seen": 1658269696 }, { "epoch": 4.09, "learning_rate": 0.00025131394182547647, "loss": 2.7369, "theoretical_loss": 3.4854044063266896, "tokens_seen": 1658335232 }, { "epoch": 4.09, "learning_rate": 0.0002513039117352056, "loss": 2.7999, "theoretical_loss": 3.485392494699253, "tokens_seen": 1658400768 }, { "epoch": 4.09, "learning_rate": 0.00025129388164493483, "loss": 2.8974, "theoretical_loss": 3.4853805836743215, "tokens_seen": 1658466304 }, { "epoch": 4.09, "learning_rate": 0.000251283851554664, "loss": 2.6047, "theoretical_loss": 3.4853686732518403, "tokens_seen": 1658531840 }, { "epoch": 4.09, "learning_rate": 0.0002512738214643932, "loss": 2.6835, "theoretical_loss": 3.4853567634317555, "tokens_seen": 1658597376 }, { "epoch": 4.09, "learning_rate": 0.00025126379137412237, "loss": 2.9307, "theoretical_loss": 3.4853448542140124, "tokens_seen": 1658662912 }, { "epoch": 4.09, "learning_rate": 0.00025125376128385155, "loss": 2.6242, "theoretical_loss": 3.4853329455985573, "tokens_seen": 1658728448 }, { "epoch": 4.09, "learning_rate": 0.00025124373119358073, "loss": 2.7535, "theoretical_loss": 3.4853210375853356, "tokens_seen": 1658793984 }, { "epoch": 4.09, "learning_rate": 0.00025123370110330997, "loss": 2.7362, "theoretical_loss": 3.4853091301742936, "tokens_seen": 1658859520 }, { "epoch": 4.09, "learning_rate": 0.0002512236710130391, "loss": 2.6004, "theoretical_loss": 3.485297223365376, "tokens_seen": 1658925056 }, { "epoch": 4.09, "learning_rate": 0.00025121364092276833, "loss": 2.7934, "theoretical_loss": 3.485285317158529, "tokens_seen": 1658990592 }, { "epoch": 4.09, "learning_rate": 0.0002512036108324975, "loss": 2.8656, "theoretical_loss": 3.485273411553699, "tokens_seen": 1659056128 }, { "epoch": 4.09, "learning_rate": 0.0002511935807422267, "loss": 2.795, "theoretical_loss": 3.485261506550831, "tokens_seen": 1659121664 }, { "epoch": 4.09, "learning_rate": 0.0002511835506519559, "loss": 2.6786, "theoretical_loss": 3.4852496021498713, "tokens_seen": 1659187200 }, { "epoch": 4.09, "learning_rate": 0.00025117352056168506, "loss": 2.8309, "theoretical_loss": 3.4852376983507654, "tokens_seen": 1659252736 }, { "epoch": 4.09, "learning_rate": 0.00025116349047141424, "loss": 2.8039, "theoretical_loss": 3.4852257951534593, "tokens_seen": 1659318272 }, { "epoch": 4.09, "learning_rate": 0.00025115346038114347, "loss": 2.7151, "theoretical_loss": 3.4852138925578986, "tokens_seen": 1659383808 }, { "epoch": 4.09, "learning_rate": 0.0002511434302908726, "loss": 2.8231, "theoretical_loss": 3.48520199056403, "tokens_seen": 1659449344 }, { "epoch": 4.09, "learning_rate": 0.00025113340020060183, "loss": 2.6269, "theoretical_loss": 3.4851900891717973, "tokens_seen": 1659514880 }, { "epoch": 4.09, "learning_rate": 0.00025112337011033096, "loss": 2.7521, "theoretical_loss": 3.4851781883811483, "tokens_seen": 1659580416 }, { "epoch": 4.09, "objective/train/docs_used": 2645641, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.598112106323242, "objective/train/theoretical_loss": 3.485166288192028, "objective/train/tokens_used": 1680105952, "theoretical_loss": 3.485166288192028, "tokens_seen": 1659645952 }, { "epoch": 4.09, "learning_rate": 0.0002511133400200602, "loss": 2.7417, "theoretical_loss": 3.485166288192028, "tokens_seen": 1659645952 }, { "epoch": 4.09, "learning_rate": 0.0002511033099297894, "loss": 2.6778, "theoretical_loss": 3.485154388604382, "tokens_seen": 1659711488 }, { "epoch": 4.09, "learning_rate": 0.00025109327983951856, "loss": 2.829, "theoretical_loss": 3.4851424896181573, "tokens_seen": 1659777024 }, { "epoch": 4.09, "learning_rate": 0.00025108324974924774, "loss": 2.6558, "theoretical_loss": 3.4851305912332986, "tokens_seen": 1659842560 }, { "epoch": 4.09, "learning_rate": 0.000251073219658977, "loss": 2.8285, "theoretical_loss": 3.485118693449752, "tokens_seen": 1659908096 }, { "epoch": 4.09, "learning_rate": 0.0002510631895687061, "loss": 2.7948, "theoretical_loss": 3.4851067962674636, "tokens_seen": 1659973632 }, { "epoch": 4.09, "learning_rate": 0.00025105315947843534, "loss": 2.8816, "theoretical_loss": 3.485094899686379, "tokens_seen": 1660039168 }, { "epoch": 4.09, "learning_rate": 0.00025104312938816446, "loss": 2.8058, "theoretical_loss": 3.4850830037064444, "tokens_seen": 1660104704 }, { "epoch": 4.09, "learning_rate": 0.0002510330992978937, "loss": 2.8877, "theoretical_loss": 3.4850711083276056, "tokens_seen": 1660170240 }, { "epoch": 4.09, "learning_rate": 0.0002510230692076229, "loss": 2.5803, "theoretical_loss": 3.4850592135498086, "tokens_seen": 1660235776 }, { "epoch": 4.09, "learning_rate": 0.00025101303911735206, "loss": 2.7703, "theoretical_loss": 3.485047319372999, "tokens_seen": 1660301312 }, { "epoch": 4.09, "learning_rate": 0.00025100300902708124, "loss": 2.7417, "theoretical_loss": 3.4850354257971228, "tokens_seen": 1660366848 }, { "epoch": 4.09, "learning_rate": 0.0002509929789368104, "loss": 2.8876, "theoretical_loss": 3.485023532822126, "tokens_seen": 1660432384 }, { "epoch": 4.09, "learning_rate": 0.0002509829488465396, "loss": 2.8623, "theoretical_loss": 3.485011640447955, "tokens_seen": 1660497920 }, { "epoch": 4.09, "learning_rate": 0.00025097291875626884, "loss": 2.7322, "theoretical_loss": 3.484999748674555, "tokens_seen": 1660563456 }, { "epoch": 4.09, "learning_rate": 0.00025096288866599797, "loss": 2.7054, "theoretical_loss": 3.484987857501872, "tokens_seen": 1660628992 }, { "epoch": 4.09, "learning_rate": 0.0002509528585757272, "loss": 2.7282, "theoretical_loss": 3.484975966929852, "tokens_seen": 1660694528 }, { "epoch": 4.09, "learning_rate": 0.00025094282848545633, "loss": 2.6582, "theoretical_loss": 3.484964076958442, "tokens_seen": 1660760064 }, { "epoch": 4.09, "learning_rate": 0.00025093279839518556, "loss": 2.7918, "theoretical_loss": 3.484952187587586, "tokens_seen": 1660825600 }, { "epoch": 4.09, "learning_rate": 0.00025092276830491475, "loss": 2.8419, "theoretical_loss": 3.484940298817232, "tokens_seen": 1660891136 }, { "epoch": 4.09, "learning_rate": 0.0002509127382146439, "loss": 2.7666, "theoretical_loss": 3.4849284106473246, "tokens_seen": 1660956672 }, { "epoch": 4.09, "learning_rate": 0.0002509027081243731, "loss": 2.7607, "theoretical_loss": 3.4849165230778105, "tokens_seen": 1661022208 }, { "epoch": 4.09, "learning_rate": 0.00025089267803410234, "loss": 2.7524, "theoretical_loss": 3.4849046361086353, "tokens_seen": 1661087744 }, { "epoch": 4.09, "learning_rate": 0.00025088264794383147, "loss": 2.8152, "theoretical_loss": 3.4848927497397444, "tokens_seen": 1661153280 }, { "epoch": 4.09, "learning_rate": 0.0002508726178535607, "loss": 2.9241, "theoretical_loss": 3.4848808639710853, "tokens_seen": 1661218816 }, { "epoch": 4.09, "objective/train/docs_used": 2648038, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6420907974243164, "objective/train/theoretical_loss": 3.484868978802603, "objective/train/tokens_used": 1681744352, "theoretical_loss": 3.484868978802603, "tokens_seen": 1661284352 }, { "epoch": 4.09, "learning_rate": 0.00025086258776328983, "loss": 2.7337, "theoretical_loss": 3.484868978802603, "tokens_seen": 1661284352 }, { "epoch": 4.09, "learning_rate": 0.00025085255767301907, "loss": 2.8819, "theoretical_loss": 3.4848570942342443, "tokens_seen": 1661349888 }, { "epoch": 4.09, "learning_rate": 0.00025084252758274825, "loss": 2.8771, "theoretical_loss": 3.484845210265954, "tokens_seen": 1661415424 }, { "epoch": 4.09, "learning_rate": 0.00025083249749247743, "loss": 2.8267, "theoretical_loss": 3.4848333268976788, "tokens_seen": 1661480960 }, { "epoch": 4.09, "learning_rate": 0.0002508224674022066, "loss": 2.6207, "theoretical_loss": 3.4848214441293646, "tokens_seen": 1661546496 }, { "epoch": 4.09, "learning_rate": 0.0002508124373119358, "loss": 2.9838, "theoretical_loss": 3.484809561960958, "tokens_seen": 1661612032 } ], "max_steps": 50354, "num_train_epochs": 9223372036854775807, "total_flos": 8.47980437962752e+17, "trial_name": null, "trial_params": null }