diff --git "a/checkpoint-5070/trainer_state.json" "b/checkpoint-5070/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5070/trainer_state.json" @@ -0,0 +1,37742 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10068713508360805, + "global_step": 5070, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-07, + "loss": 10.8421, + "theoretical_loss": 20.81281780154715, + "tokens_seen": 65536 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-06, + "loss": 10.794, + "theoretical_loss": 17.566201104328645, + "tokens_seen": 131072 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904763e-06, + "loss": 10.7002, + "theoretical_loss": 15.939477092836569, + "tokens_seen": 196608 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-06, + "loss": 10.6201, + "theoretical_loss": 14.89231675598857, + "tokens_seen": 262144 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-06, + "loss": 10.1305, + "theoretical_loss": 14.136216937762974, + "tokens_seen": 327680 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809525e-06, + "loss": 10.084, + "theoretical_loss": 13.552561472550224, + "tokens_seen": 393216 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-06, + "loss": 9.8364, + "theoretical_loss": 13.08180900140119, + "tokens_seen": 458752 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507936e-06, + "loss": 9.897, + "theoretical_loss": 12.690129625483323, + "tokens_seen": 524288 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571428e-06, + "loss": 9.5044, + "theoretical_loss": 12.356592463873625, + "tokens_seen": 589824 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-06, + "loss": 9.6321, + "theoretical_loss": 12.067412607035077, + "tokens_seen": 655360 + }, + { + "epoch": 0.0, + "learning_rate": 1.0912698412698412e-05, + "loss": 9.4371, + "theoretical_loss": 11.813066231101676, + "tokens_seen": 720896 + }, + { + "epoch": 0.0, + "learning_rate": 1.1904761904761905e-05, + "loss": 9.6176, + "theoretical_loss": 11.586719208706729, + "tokens_seen": 786432 + }, + { + "epoch": 0.0, + "learning_rate": 1.2896825396825396e-05, + "loss": 9.199, + "theoretical_loss": 11.383314140186787, + "tokens_seen": 851968 + }, + { + "epoch": 0.0, + "learning_rate": 1.3888888888888888e-05, + "loss": 8.8935, + "theoretical_loss": 11.199011702111871, + "tokens_seen": 917504 + }, + { + "epoch": 0.0, + "learning_rate": 1.4880952380952381e-05, + "loss": 9.3252, + "theoretical_loss": 11.030833917977912, + "tokens_seen": 983040 + }, + { + "epoch": 0.0, + "learning_rate": 1.5873015873015872e-05, + "loss": 8.6232, + "theoretical_loss": 10.87642808645695, + "tokens_seen": 1048576 + }, + { + "epoch": 0.0, + "learning_rate": 1.6865079365079364e-05, + "loss": 9.0944, + "theoretical_loss": 10.733905740062724, + "tokens_seen": 1114112 + }, + { + "epoch": 0.0, + "learning_rate": 1.7857142857142855e-05, + "loss": 8.899, + "theoretical_loss": 10.60172987623028, + "tokens_seen": 1179648 + }, + { + "epoch": 0.0, + "learning_rate": 1.884920634920635e-05, + "loss": 8.9962, + "theoretical_loss": 10.478634172356642, + "tokens_seen": 1245184 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-05, + "loss": 9.3635, + "theoretical_loss": 10.36356394376333, + "tokens_seen": 1310720 + }, + { + "epoch": 0.0, + "learning_rate": 2.0833333333333333e-05, + "loss": 8.7939, + "theoretical_loss": 10.255632220896747, + "tokens_seen": 1376256 + }, + { + "epoch": 0.0, + "learning_rate": 2.1825396825396824e-05, + "loss": 8.8135, + "theoretical_loss": 10.15408655327002, + "tokens_seen": 1441792 + }, + { + "epoch": 0.0, + "learning_rate": 2.2817460317460315e-05, + "loss": 8.5781, + "theoretical_loss": 10.058283561732598, + "tokens_seen": 1507328 + }, + { + "epoch": 0.0, + "learning_rate": 2.380952380952381e-05, + "loss": 9.316, + "theoretical_loss": 9.967669178840278, + "tokens_seen": 1572864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 13112, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 8.743459701538086, + "objective/train/theoretical_loss": 9.881763126393109, + "objective/train/tokens_used": 22098400, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.48015873015873e-05, + "loss": 8.8669, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.5793650793650793e-05, + "loss": 8.868, + "theoretical_loss": 9.80014659154056, + "tokens_seen": 1703936 + }, + { + "epoch": 0.0, + "learning_rate": 2.6785714285714284e-05, + "loss": 8.3477, + "theoretical_loss": 9.722452346907446, + "tokens_seen": 1769472 + }, + { + "epoch": 0.0, + "learning_rate": 2.7777777777777776e-05, + "loss": 8.5573, + "theoretical_loss": 9.648356759081546, + "tokens_seen": 1835008 + }, + { + "epoch": 0.0, + "learning_rate": 2.876984126984127e-05, + "loss": 8.7737, + "theoretical_loss": 9.577573271145639, + "tokens_seen": 1900544 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904762e-05, + "loss": 8.9664, + "theoretical_loss": 9.509847046764852, + "tokens_seen": 1966080 + }, + { + "epoch": 0.0, + "learning_rate": 3.075396825396825e-05, + "loss": 7.8002, + "theoretical_loss": 9.444950537631936, + "tokens_seen": 2031616 + }, + { + "epoch": 0.0, + "learning_rate": 3.1746031746031745e-05, + "loss": 8.6073, + "theoretical_loss": 9.382679790910457, + "tokens_seen": 2097152 + }, + { + "epoch": 0.0, + "learning_rate": 3.273809523809524e-05, + "loss": 8.7208, + "theoretical_loss": 9.32285135423398, + "tokens_seen": 2162688 + }, + { + "epoch": 0.0, + "learning_rate": 3.373015873015873e-05, + "loss": 8.7212, + "theoretical_loss": 9.265299666660276, + "tokens_seen": 2228224 + }, + { + "epoch": 0.0, + "learning_rate": 3.472222222222222e-05, + "loss": 8.7873, + "theoretical_loss": 9.209874847444755, + "tokens_seen": 2293760 + }, + { + "epoch": 0.0, + "learning_rate": 3.571428571428571e-05, + "loss": 8.4388, + "theoretical_loss": 9.156440812508292, + "tokens_seen": 2359296 + }, + { + "epoch": 0.0, + "learning_rate": 3.670634920634921e-05, + "loss": 8.1114, + "theoretical_loss": 9.10487366241335, + "tokens_seen": 2424832 + }, + { + "epoch": 0.0, + "learning_rate": 3.76984126984127e-05, + "loss": 8.2518, + "theoretical_loss": 9.055060296533734, + "tokens_seen": 2490368 + }, + { + "epoch": 0.0, + "learning_rate": 3.8690476190476195e-05, + "loss": 8.2561, + "theoretical_loss": 9.006897216643829, + "tokens_seen": 2555904 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-05, + "loss": 8.1331, + "theoretical_loss": 8.960289489909357, + "tokens_seen": 2621440 + }, + { + "epoch": 0.0, + "learning_rate": 4.067460317460318e-05, + "loss": 8.0787, + "theoretical_loss": 8.915149846640611, + "tokens_seen": 2686976 + }, + { + "epoch": 0.0, + "learning_rate": 4.1666666666666665e-05, + "loss": 8.3421, + "theoretical_loss": 8.871397892478225, + "tokens_seen": 2752512 + }, + { + "epoch": 0.0, + "learning_rate": 4.265873015873016e-05, + "loss": 8.0099, + "theoretical_loss": 8.828959418153499, + "tokens_seen": 2818048 + }, + { + "epoch": 0.0, + "learning_rate": 4.365079365079365e-05, + "loss": 8.2887, + "theoretical_loss": 8.787765792778412, + "tokens_seen": 2883584 + }, + { + "epoch": 0.0, + "learning_rate": 4.464285714285714e-05, + "loss": 8.5295, + "theoretical_loss": 8.747753428911455, + "tokens_seen": 2949120 + }, + { + "epoch": 0.0, + "learning_rate": 4.563492063492063e-05, + "loss": 8.2913, + "theoretical_loss": 8.708863309520833, + "tokens_seen": 3014656 + }, + { + "epoch": 0.0, + "learning_rate": 4.6626984126984126e-05, + "loss": 8.319, + "theoretical_loss": 8.671040568508847, + "tokens_seen": 3080192 + }, + { + "epoch": 0.0, + "learning_rate": 4.761904761904762e-05, + "loss": 7.8599, + "theoretical_loss": 8.634234117735474, + "tokens_seen": 3145728 + }, + { + "epoch": 0.0, + "learning_rate": 4.8611111111111115e-05, + "loss": 7.7079, + "theoretical_loss": 8.598396314536323, + "tokens_seen": 3211264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 13755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.561188220977783, + "objective/train/theoretical_loss": 8.563482664611069, + "objective/train/tokens_used": 23736800, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-05, + "loss": 7.8004, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 5.05952380952381e-05, + "loss": 8.0798, + "theoretical_loss": 8.529451555895115, + "tokens_seen": 3342336 + }, + { + "epoch": 0.0, + "learning_rate": 5.1587301587301586e-05, + "loss": 7.9158, + "theoretical_loss": 8.496264019646002, + "tokens_seen": 3407872 + }, + { + "epoch": 0.0, + "learning_rate": 5.257936507936508e-05, + "loss": 7.5747, + "theoretical_loss": 8.463883515497187, + "tokens_seen": 3473408 + }, + { + "epoch": 0.0, + "learning_rate": 5.357142857142857e-05, + "loss": 7.8708, + "theoretical_loss": 8.432275737672779, + "tokens_seen": 3538944 + }, + { + "epoch": 0.0, + "learning_rate": 5.4563492063492063e-05, + "loss": 7.7568, + "theoretical_loss": 8.401408439930716, + "tokens_seen": 3604480 + }, + { + "epoch": 0.0, + "learning_rate": 5.555555555555555e-05, + "loss": 7.86, + "theoretical_loss": 8.371251277120209, + "tokens_seen": 3670016 + }, + { + "epoch": 0.0, + "learning_rate": 5.6547619047619046e-05, + "loss": 7.5876, + "theoretical_loss": 8.341775661511075, + "tokens_seen": 3735552 + }, + { + "epoch": 0.0, + "learning_rate": 5.753968253968254e-05, + "loss": 7.0476, + "theoretical_loss": 8.31295463228533, + "tokens_seen": 3801088 + }, + { + "epoch": 0.0, + "learning_rate": 5.8531746031746036e-05, + "loss": 7.6287, + "theoretical_loss": 8.284762736781182, + "tokens_seen": 3866624 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809524e-05, + "loss": 7.8486, + "theoretical_loss": 8.257175922251864, + "tokens_seen": 3932160 + }, + { + "epoch": 0.0, + "learning_rate": 6.051587301587302e-05, + "loss": 7.7722, + "theoretical_loss": 8.230171437050114, + "tokens_seen": 3997696 + }, + { + "epoch": 0.0, + "learning_rate": 6.15079365079365e-05, + "loss": 7.7011, + "theoretical_loss": 8.20372774027797, + "tokens_seen": 4063232 + }, + { + "epoch": 0.0, + "learning_rate": 6.25e-05, + "loss": 7.8601, + "theoretical_loss": 8.177824419053046, + "tokens_seen": 4128768 + }, + { + "epoch": 0.0, + "learning_rate": 6.349206349206349e-05, + "loss": 7.4565, + "theoretical_loss": 8.152442112639616, + "tokens_seen": 4194304 + }, + { + "epoch": 0.0, + "learning_rate": 6.448412698412699e-05, + "loss": 7.3253, + "theoretical_loss": 8.1275624427775, + "tokens_seen": 4259840 + }, + { + "epoch": 0.0, + "learning_rate": 6.547619047619048e-05, + "loss": 7.1366, + "theoretical_loss": 8.10316794961571, + "tokens_seen": 4325376 + }, + { + "epoch": 0.0, + "learning_rate": 6.646825396825397e-05, + "loss": 7.4872, + "theoretical_loss": 8.07924203272264, + "tokens_seen": 4390912 + }, + { + "epoch": 0.0, + "learning_rate": 6.746031746031745e-05, + "loss": 7.5183, + "theoretical_loss": 8.055768896701416, + "tokens_seen": 4456448 + }, + { + "epoch": 0.0, + "learning_rate": 6.845238095238096e-05, + "loss": 7.0588, + "theoretical_loss": 8.032733500989007, + "tokens_seen": 4521984 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-05, + "loss": 7.3674, + "theoretical_loss": 8.010121513461836, + "tokens_seen": 4587520 + }, + { + "epoch": 0.0, + "learning_rate": 7.043650793650793e-05, + "loss": 7.0203, + "theoretical_loss": 7.987919267509379, + "tokens_seen": 4653056 + }, + { + "epoch": 0.0, + "learning_rate": 7.142857142857142e-05, + "loss": 7.0155, + "theoretical_loss": 7.966113722271801, + "tokens_seen": 4718592 + }, + { + "epoch": 0.0, + "learning_rate": 7.242063492063492e-05, + "loss": 7.2468, + "theoretical_loss": 7.944692425767988, + "tokens_seen": 4784128 + }, + { + "epoch": 0.0, + "learning_rate": 7.341269841269842e-05, + "loss": 7.022, + "theoretical_loss": 7.9236434806675184, + "tokens_seen": 4849664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 15099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.9753007888793945, + "objective/train/theoretical_loss": 7.902955512484067, + "objective/train/tokens_used": 25375200, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.440476190476191e-05, + "loss": 7.1984, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.53968253968254e-05, + "loss": 7.0639, + "theoretical_loss": 7.882617639989203, + "tokens_seen": 4980736 + }, + { + "epoch": 0.0, + "learning_rate": 7.63888888888889e-05, + "loss": 6.8249, + "theoretical_loss": 7.862619447664628, + "tokens_seen": 5046272 + }, + { + "epoch": 0.0, + "learning_rate": 7.738095238095239e-05, + "loss": 6.9557, + "theoretical_loss": 7.842950960027937, + "tokens_seen": 5111808 + }, + { + "epoch": 0.0, + "learning_rate": 7.837301587301588e-05, + "loss": 7.1772, + "theoretical_loss": 7.823602617682313, + "tokens_seen": 5177344 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507937e-05, + "loss": 6.7487, + "theoretical_loss": 7.804565254954165, + "tokens_seen": 5242880 + }, + { + "epoch": 0.0, + "learning_rate": 8.035714285714287e-05, + "loss": 6.7244, + "theoretical_loss": 7.7858300789950725, + "tokens_seen": 5308416 + }, + { + "epoch": 0.0, + "learning_rate": 8.134920634920635e-05, + "loss": 7.2012, + "theoretical_loss": 7.767388650235364, + "tokens_seen": 5373952 + }, + { + "epoch": 0.0, + "learning_rate": 8.234126984126984e-05, + "loss": 6.7945, + "theoretical_loss": 7.749232864086619, + "tokens_seen": 5439488 + }, + { + "epoch": 0.0, + "learning_rate": 8.333333333333333e-05, + "loss": 6.4889, + "theoretical_loss": 7.731354933799318, + "tokens_seen": 5505024 + }, + { + "epoch": 0.0, + "learning_rate": 8.432539682539683e-05, + "loss": 7.0238, + "theoretical_loss": 7.71374737438992, + "tokens_seen": 5570560 + }, + { + "epoch": 0.0, + "learning_rate": 8.531746031746032e-05, + "loss": 6.9503, + "theoretical_loss": 7.696402987558934, + "tokens_seen": 5636096 + }, + { + "epoch": 0.0, + "learning_rate": 8.630952380952381e-05, + "loss": 6.8864, + "theoretical_loss": 7.679314847528181, + "tokens_seen": 5701632 + }, + { + "epoch": 0.0, + "learning_rate": 8.73015873015873e-05, + "loss": 6.5425, + "theoretical_loss": 7.662476287731328, + "tokens_seen": 5767168 + }, + { + "epoch": 0.0, + "learning_rate": 8.82936507936508e-05, + "loss": 6.9681, + "theoretical_loss": 7.645880888297279, + "tokens_seen": 5832704 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571429e-05, + "loss": 6.4495, + "theoretical_loss": 7.629522464270861, + "tokens_seen": 5898240 + }, + { + "epoch": 0.0, + "learning_rate": 9.027777777777777e-05, + "loss": 6.5397, + "theoretical_loss": 7.613395054519696, + "tokens_seen": 5963776 + }, + { + "epoch": 0.0, + "learning_rate": 9.126984126984126e-05, + "loss": 6.4287, + "theoretical_loss": 7.59749291128028, + "tokens_seen": 6029312 + }, + { + "epoch": 0.0, + "learning_rate": 9.226190476190476e-05, + "loss": 6.5709, + "theoretical_loss": 7.581810490299888, + "tokens_seen": 6094848 + }, + { + "epoch": 0.0, + "learning_rate": 9.325396825396825e-05, + "loss": 6.7154, + "theoretical_loss": 7.5663424415343705, + "tokens_seen": 6160384 + }, + { + "epoch": 0.0, + "learning_rate": 9.424603174603175e-05, + "loss": 6.4567, + "theoretical_loss": 7.551083600364949, + "tokens_seen": 6225920 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-05, + "loss": 6.351, + "theoretical_loss": 7.536028979299919, + "tokens_seen": 6291456 + }, + { + "epoch": 0.0, + "learning_rate": 9.623015873015874e-05, + "loss": 6.7204, + "theoretical_loss": 7.521173760129762, + "tokens_seen": 6356992 + }, + { + "epoch": 0.0, + "learning_rate": 9.722222222222223e-05, + "loss": 6.9272, + "theoretical_loss": 7.506513286506497, + "tokens_seen": 6422528 + }, + { + "epoch": 0.0, + "learning_rate": 9.821428571428572e-05, + "loss": 6.3143, + "theoretical_loss": 7.492043056920249, + "tokens_seen": 6488064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 15777, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.954127788543701, + "objective/train/theoretical_loss": 7.4777587180480305, + "objective/train/tokens_used": 27013600, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-05, + "loss": 6.6163, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010019841269841271, + "loss": 6.3178, + "theoretical_loss": 7.463656058451462, + "tokens_seen": 6619136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001011904761904762, + "loss": 6.3722, + "theoretical_loss": 7.449731002601916, + "tokens_seen": 6684672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010218253968253968, + "loss": 6.2211, + "theoretical_loss": 7.435979605213019, + "tokens_seen": 6750208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010317460317460317, + "loss": 6.5174, + "theoretical_loss": 7.422398045861905, + "tokens_seen": 6815744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010416666666666667, + "loss": 6.6538, + "theoretical_loss": 7.408982623881875, + "tokens_seen": 6881280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010515873015873016, + "loss": 6.1797, + "theoretical_loss": 7.395729753510345, + "tokens_seen": 6946816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010615079365079365, + "loss": 6.5865, + "theoretical_loss": 7.3826359592770325, + "tokens_seen": 7012352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010714285714285714, + "loss": 6.5482, + "theoretical_loss": 7.369697871618373, + "tokens_seen": 7077888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010813492063492064, + "loss": 6.7304, + "theoretical_loss": 7.3569122227050885, + "tokens_seen": 7143424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010912698412698413, + "loss": 6.2506, + "theoretical_loss": 7.3442758424706875, + "tokens_seen": 7208960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011011904761904761, + "loss": 6.1023, + "theoretical_loss": 7.331785654829519, + "tokens_seen": 7274496 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001111111111111111, + "loss": 6.6284, + "theoretical_loss": 7.319438674073677, + "tokens_seen": 7340032 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001121031746031746, + "loss": 6.3081, + "theoretical_loss": 7.307232001438824, + "tokens_seen": 7405568 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011309523809523809, + "loss": 6.299, + "theoretical_loss": 7.295162821829564, + "tokens_seen": 7471104 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011408730158730158, + "loss": 6.5472, + "theoretical_loss": 7.283228400695652, + "tokens_seen": 7536640 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011507936507936508, + "loss": 6.292, + "theoretical_loss": 7.271426081050832, + "tokens_seen": 7602176 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011607142857142858, + "loss": 6.2414, + "theoretical_loss": 7.259753280626623, + "tokens_seen": 7667712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011706349206349207, + "loss": 6.3899, + "theoretical_loss": 7.24820748915387, + "tokens_seen": 7733248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011805555555555556, + "loss": 6.3743, + "theoretical_loss": 7.236786265765262, + "tokens_seen": 7798784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011904761904761905, + "loss": 5.9537, + "theoretical_loss": 7.225487236512497, + "tokens_seen": 7864320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012003968253968255, + "loss": 6.0297, + "theoretical_loss": 7.21430809199212, + "tokens_seen": 7929856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012103174603174604, + "loss": 6.5923, + "theoretical_loss": 7.2032465850744005, + "tokens_seen": 7995392 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012202380952380953, + "loss": 6.116, + "theoretical_loss": 7.192300528730015, + "tokens_seen": 8060928 + }, + { + "epoch": 0.0, + "learning_rate": 0.000123015873015873, + "loss": 6.4398, + "theoretical_loss": 7.1814677939495155, + "tokens_seen": 8126464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 17166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.351791858673096, + "objective/train/theoretical_loss": 7.1707463077509646, + "objective/train/tokens_used": 28652000, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001240079365079365, + "loss": 6.4479, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000125, + "loss": 6.335, + "theoretical_loss": 7.160134051271272, + "tokens_seen": 8257536 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001259920634920635, + "loss": 6.1976, + "theoretical_loss": 7.149629057937138, + "tokens_seen": 8323072 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012698412698412698, + "loss": 6.4173, + "theoretical_loss": 7.139229411711638, + "tokens_seen": 8388608 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012797619047619048, + "loss": 6.0317, + "theoretical_loss": 7.128933245412794, + "tokens_seen": 8454144 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012896825396825398, + "loss": 6.0573, + "theoretical_loss": 7.118738739100616, + "tokens_seen": 8519680 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012996031746031748, + "loss": 6.0885, + "theoretical_loss": 7.1086441185293445, + "tokens_seen": 8585216 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013095238095238096, + "loss": 6.3385, + "theoretical_loss": 7.09864765366177, + "tokens_seen": 8650752 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013194444444444446, + "loss": 6.1813, + "theoretical_loss": 7.088747657242693, + "tokens_seen": 8716288 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013293650793650793, + "loss": 6.368, + "theoretical_loss": 7.078942483428749, + "tokens_seen": 8781824 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013392857142857144, + "loss": 5.9535, + "theoretical_loss": 7.069230526471966, + "tokens_seen": 8847360 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001349206349206349, + "loss": 6.0473, + "theoretical_loss": 7.059610219454568, + "tokens_seen": 8912896 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001359126984126984, + "loss": 6.0929, + "theoretical_loss": 7.0500800330726685, + "tokens_seen": 8978432 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001369047619047619, + "loss": 6.3183, + "theoretical_loss": 7.040638474466625, + "tokens_seen": 9043968 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013789682539682541, + "loss": 6.2457, + "theoretical_loss": 7.031284086095933, + "tokens_seen": 9109504 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001388888888888889, + "loss": 6.2984, + "theoretical_loss": 7.022015444656678, + "tokens_seen": 9175040 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001398809523809524, + "loss": 6.1863, + "theoretical_loss": 7.012831160039609, + "tokens_seen": 9240576 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014087301587301586, + "loss": 6.3604, + "theoretical_loss": 7.003729874327071, + "tokens_seen": 9306112 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014186507936507937, + "loss": 6.0677, + "theoretical_loss": 6.994710260827057, + "tokens_seen": 9371648 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014285714285714284, + "loss": 6.2727, + "theoretical_loss": 6.98577102314278, + "tokens_seen": 9437184 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014384920634920634, + "loss": 6.2095, + "theoretical_loss": 6.976910894276189, + "tokens_seen": 9502720 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014484126984126984, + "loss": 6.2549, + "theoretical_loss": 6.968128635764015, + "tokens_seen": 9568256 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014583333333333335, + "loss": 6.3308, + "theoretical_loss": 6.959423036844894, + "tokens_seen": 9633792 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014682539682539685, + "loss": 6.364, + "theoretical_loss": 6.950792913656309, + "tokens_seen": 9699328 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014781746031746032, + "loss": 6.0516, + "theoretical_loss": 6.942237108460029, + "tokens_seen": 9764864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 17861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.117774486541748, + "objective/train/theoretical_loss": 6.9337544888949, + "objective/train/tokens_used": 30290400, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014880952380952382, + "loss": 6.1082, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001498015873015873, + "loss": 5.8009, + "theoretical_loss": 6.925343947255817, + "tokens_seen": 9895936 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001507936507936508, + "loss": 5.8948, + "theoretical_loss": 6.917004399797798, + "tokens_seen": 9961472 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015178571428571427, + "loss": 5.8529, + "theoretical_loss": 6.908734786064147, + "tokens_seen": 10027008 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001527777777777778, + "loss": 6.0551, + "theoretical_loss": 6.900534068237688, + "tokens_seen": 10092544 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015376984126984128, + "loss": 6.1428, + "theoretical_loss": 6.89240123051416, + "tokens_seen": 10158080 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015476190476190478, + "loss": 5.741, + "theoretical_loss": 6.884335278496871, + "tokens_seen": 10223616 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015575396825396825, + "loss": 5.8161, + "theoretical_loss": 6.87633523861175, + "tokens_seen": 10289152 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015674603174603175, + "loss": 6.1526, + "theoretical_loss": 6.868400157541997, + "tokens_seen": 10354688 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015773809523809523, + "loss": 5.9937, + "theoretical_loss": 6.860529101681551, + "tokens_seen": 10420224 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015873015873015873, + "loss": 6.0325, + "theoretical_loss": 6.85272115660663, + "tokens_seen": 10485760 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001597222222222222, + "loss": 5.938, + "theoretical_loss": 6.844975426564642, + "tokens_seen": 10551296 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016071428571428573, + "loss": 6.1597, + "theoretical_loss": 6.8372910339797945, + "tokens_seen": 10616832 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001617063492063492, + "loss": 5.92, + "theoretical_loss": 6.829667118974749, + "tokens_seen": 10682368 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001626984126984127, + "loss": 5.4308, + "theoretical_loss": 6.8221028389077185, + "tokens_seen": 10747904 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016369047619047618, + "loss": 6.2676, + "theoretical_loss": 6.814597367924395, + "tokens_seen": 10813440 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016468253968253969, + "loss": 6.0017, + "theoretical_loss": 6.807149896524181, + "tokens_seen": 10878976 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016567460317460316, + "loss": 6.1886, + "theoretical_loss": 6.799759631140145, + "tokens_seen": 10944512 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016666666666666666, + "loss": 5.7028, + "theoretical_loss": 6.7924257937322245, + "tokens_seen": 11010048 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016765873015873016, + "loss": 6.1481, + "theoretical_loss": 6.785147621393148, + "tokens_seen": 11075584 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016865079365079366, + "loss": 5.8603, + "theoretical_loss": 6.777924365966638, + "tokens_seen": 11141120 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016964285714285717, + "loss": 5.6239, + "theoretical_loss": 6.770755293677423, + "tokens_seen": 11206656 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017063492063492064, + "loss": 5.6818, + "theoretical_loss": 6.763639684772625, + "tokens_seen": 11272192 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017162698412698414, + "loss": 5.7517, + "theoretical_loss": 6.756576833174123, + "tokens_seen": 11337728 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017261904761904762, + "loss": 5.7609, + "theoretical_loss": 6.749566046141486, + "tokens_seen": 11403264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 18493, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.560274600982666, + "objective/train/theoretical_loss": 6.7426066439450905, + "objective/train/tokens_used": 31928800, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017361111111111112, + "loss": 5.9297, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001746031746031746, + "loss": 5.8221, + "theoretical_loss": 6.735697959549075, + "tokens_seen": 11534336 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001755952380952381, + "loss": 6.0875, + "theoretical_loss": 6.728839338303761, + "tokens_seen": 11599872 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001765873015873016, + "loss": 5.6213, + "theoretical_loss": 6.722030137647226, + "tokens_seen": 11665408 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001775793650793651, + "loss": 5.8156, + "theoretical_loss": 6.715269726815689, + "tokens_seen": 11730944 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017857142857142857, + "loss": 5.6855, + "theoretical_loss": 6.7085574865624125, + "tokens_seen": 11796480 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017956349206349207, + "loss": 6.0002, + "theoretical_loss": 6.701892808884824, + "tokens_seen": 11862016 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018055555555555555, + "loss": 5.4675, + "theoretical_loss": 6.695275096759559, + "tokens_seen": 11927552 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018154761904761905, + "loss": 5.6532, + "theoretical_loss": 6.68870376388518, + "tokens_seen": 11993088 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018253968253968252, + "loss": 5.6652, + "theoretical_loss": 6.682178234432274, + "tokens_seen": 12058624 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018353174603174602, + "loss": 5.7053, + "theoretical_loss": 6.675697942800715, + "tokens_seen": 12124160 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018452380952380953, + "loss": 5.638, + "theoretical_loss": 6.669262333383815, + "tokens_seen": 12189696 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018551587301587303, + "loss": 5.9672, + "theoretical_loss": 6.662870860339158, + "tokens_seen": 12255232 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001865079365079365, + "loss": 5.785, + "theoretical_loss": 6.656522987365879, + "tokens_seen": 12320768 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001875, + "loss": 5.7804, + "theoretical_loss": 6.6502181874881705, + "tokens_seen": 12386304 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001884920634920635, + "loss": 5.7629, + "theoretical_loss": 6.643955942844831, + "tokens_seen": 12451840 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018948412698412698, + "loss": 5.7272, + "theoretical_loss": 6.637735744484626, + "tokens_seen": 12517376 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019047619047619048, + "loss": 5.9396, + "theoretical_loss": 6.631557092167304, + "tokens_seen": 12582912 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019146825396825398, + "loss": 5.6727, + "theoretical_loss": 6.625419494170049, + "tokens_seen": 12648448 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019246031746031748, + "loss": 5.9043, + "theoretical_loss": 6.619322467099223, + "tokens_seen": 12713984 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019345238095238096, + "loss": 5.4898, + "theoretical_loss": 6.613265535707211, + "tokens_seen": 12779520 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019444444444444446, + "loss": 5.5153, + "theoretical_loss": 6.607248232714213, + "tokens_seen": 12845056 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019543650793650793, + "loss": 5.534, + "theoretical_loss": 6.60127009863481, + "tokens_seen": 12910592 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019642857142857144, + "loss": 5.7539, + "theoretical_loss": 6.59533068160918, + "tokens_seen": 12976128 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001974206349206349, + "loss": 5.633, + "theoretical_loss": 6.589429537238785, + "tokens_seen": 13041664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 19731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.428110122680664, + "objective/train/theoretical_loss": 6.583566228426414, + "objective/train/tokens_used": 33567200, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001984126984126984, + "loss": 5.3663, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019940476190476191, + "loss": 5.7325, + "theoretical_loss": 6.5777403252204305, + "tokens_seen": 13172736 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020039682539682542, + "loss": 5.8639, + "theoretical_loss": 6.571951404663098, + "tokens_seen": 13238272 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002013888888888889, + "loss": 5.6695, + "theoretical_loss": 6.566199050642863, + "tokens_seen": 13303808 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002023809523809524, + "loss": 5.5443, + "theoretical_loss": 6.560482853750463, + "tokens_seen": 13369344 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020337301587301587, + "loss": 5.599, + "theoretical_loss": 6.554802411138745, + "tokens_seen": 13434880 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020436507936507937, + "loss": 5.3973, + "theoretical_loss": 6.549157326386091, + "tokens_seen": 13500416 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020535714285714284, + "loss": 5.6837, + "theoretical_loss": 6.54354720936333, + "tokens_seen": 13565952 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020634920634920634, + "loss": 5.4813, + "theoretical_loss": 6.537971676104026, + "tokens_seen": 13631488 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020734126984126985, + "loss": 5.6414, + "theoretical_loss": 6.532430348678068, + "tokens_seen": 13697024 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020833333333333335, + "loss": 5.5939, + "theoretical_loss": 6.5269228550684195, + "tokens_seen": 13762560 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020932539682539685, + "loss": 5.2664, + "theoretical_loss": 6.521448829050978, + "tokens_seen": 13828096 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021031746031746032, + "loss": 5.7298, + "theoretical_loss": 6.516007910077416, + "tokens_seen": 13893632 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021130952380952382, + "loss": 5.7504, + "theoretical_loss": 6.51059974316095, + "tokens_seen": 13959168 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002123015873015873, + "loss": 5.6754, + "theoretical_loss": 6.50522397876491, + "tokens_seen": 14024704 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002132936507936508, + "loss": 6.0764, + "theoretical_loss": 6.499880272694068, + "tokens_seen": 14090240 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021428571428571427, + "loss": 5.7123, + "theoretical_loss": 6.494568285988618, + "tokens_seen": 14155776 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002152777777777778, + "loss": 5.5138, + "theoretical_loss": 6.489287684820745, + "tokens_seen": 14221312 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021626984126984128, + "loss": 5.9094, + "theoretical_loss": 6.484038140393699, + "tokens_seen": 14286848 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021726190476190478, + "loss": 5.6079, + "theoretical_loss": 6.4788193288433105, + "tokens_seen": 14352384 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021825396825396825, + "loss": 5.5548, + "theoretical_loss": 6.473630931141869, + "tokens_seen": 14417920 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021924603174603176, + "loss": 5.5485, + "theoretical_loss": 6.468472633004308, + "tokens_seen": 14483456 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022023809523809523, + "loss": 5.6022, + "theoretical_loss": 6.463344124796616, + "tokens_seen": 14548992 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022123015873015873, + "loss": 5.715, + "theoretical_loss": 6.45824510144643, + "tokens_seen": 14614528 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002222222222222222, + "loss": 5.67, + "theoretical_loss": 6.45317526235573, + "tokens_seen": 14680064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 20428, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.015859127044678, + "objective/train/theoretical_loss": 6.448134311315593, + "objective/train/tokens_used": 35205600, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022321428571428573, + "loss": 5.3803, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002242063492063492, + "loss": 5.4398, + "theoretical_loss": 6.443121956422939, + "tokens_seen": 14811136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002251984126984127, + "loss": 5.6742, + "theoretical_loss": 6.438137909999214, + "tokens_seen": 14876672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022619047619047618, + "loss": 5.7337, + "theoretical_loss": 6.433181888510964, + "tokens_seen": 14942208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022718253968253969, + "loss": 5.5962, + "theoretical_loss": 6.428253612492239, + "tokens_seen": 15007744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022817460317460316, + "loss": 5.8581, + "theoretical_loss": 6.4233528064687855, + "tokens_seen": 15073280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022916666666666666, + "loss": 5.7247, + "theoretical_loss": 6.418479198883969, + "tokens_seen": 15138816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023015873015873016, + "loss": 5.6249, + "theoretical_loss": 6.413632522026391, + "tokens_seen": 15204352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023115079365079367, + "loss": 5.6726, + "theoretical_loss": 6.40881251195914, + "tokens_seen": 15269888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023214285714285717, + "loss": 5.4304, + "theoretical_loss": 6.404018908450656, + "tokens_seen": 15335424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023313492063492064, + "loss": 5.4765, + "theoretical_loss": 6.399251454907132, + "tokens_seen": 15400960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023412698412698414, + "loss": 5.865, + "theoretical_loss": 6.394509898306452, + "tokens_seen": 15466496 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023511904761904762, + "loss": 5.7674, + "theoretical_loss": 6.389793989133574, + "tokens_seen": 15532032 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023611111111111112, + "loss": 5.2223, + "theoretical_loss": 6.385103481317387, + "tokens_seen": 15597568 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002371031746031746, + "loss": 5.6511, + "theoretical_loss": 6.380438132168923, + "tokens_seen": 15663104 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002380952380952381, + "loss": 5.4302, + "theoretical_loss": 6.375797702320966, + "tokens_seen": 15728640 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002390873015873016, + "loss": 5.4526, + "theoretical_loss": 6.371181955668966, + "tokens_seen": 15794176 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002400793650793651, + "loss": 5.3307, + "theoretical_loss": 6.366590659313248, + "tokens_seen": 15859712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024107142857142857, + "loss": 5.4796, + "theoretical_loss": 6.36202358350248, + "tokens_seen": 15925248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024206349206349207, + "loss": 5.4457, + "theoretical_loss": 6.357480501578371, + "tokens_seen": 15990784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024305555555555555, + "loss": 5.5368, + "theoretical_loss": 6.352961189921553, + "tokens_seen": 16056320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024404761904761905, + "loss": 5.3346, + "theoretical_loss": 6.348465427898629, + "tokens_seen": 16121856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024503968253968255, + "loss": 5.2158, + "theoretical_loss": 6.343992997810366, + "tokens_seen": 16187392 + }, + { + "epoch": 0.0, + "learning_rate": 0.000246031746031746, + "loss": 5.5594, + "theoretical_loss": 6.33954368484097, + "tokens_seen": 16252928 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024702380952380955, + "loss": 5.5983, + "theoretical_loss": 6.33511727700846, + "tokens_seen": 16318464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 21641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.571507930755615, + "objective/train/theoretical_loss": 6.330713565116083, + "objective/train/tokens_used": 36844000, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000248015873015873, + "loss": 5.6484, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002490079365079365, + "loss": 5.6139, + "theoretical_loss": 6.326332342704751, + "tokens_seen": 16449536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025, + "loss": 5.6404, + "theoretical_loss": 6.32197340600647, + "tokens_seen": 16515072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002509920634920635, + "loss": 5.5749, + "theoretical_loss": 6.3176365538987636, + "tokens_seen": 16580608 + }, + { + "epoch": 0.01, + "learning_rate": 0.000251984126984127, + "loss": 5.176, + "theoretical_loss": 6.313321587860021, + "tokens_seen": 16646144 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025297619047619046, + "loss": 5.4498, + "theoretical_loss": 6.309028311925785, + "tokens_seen": 16711680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025396825396825396, + "loss": 5.059, + "theoretical_loss": 6.304756532645939, + "tokens_seen": 16777216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025496031746031746, + "loss": 5.6538, + "theoretical_loss": 6.300506059042775, + "tokens_seen": 16842752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025595238095238096, + "loss": 5.4578, + "theoretical_loss": 6.296276702569918, + "tokens_seen": 16908288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002569444444444444, + "loss": 5.675, + "theoretical_loss": 6.292068277072099, + "tokens_seen": 16973824 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025793650793650796, + "loss": 5.6443, + "theoretical_loss": 6.28788059874573, + "tokens_seen": 17039360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025892857142857146, + "loss": 4.9578, + "theoretical_loss": 6.283713486100297, + "tokens_seen": 17104896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025992063492063497, + "loss": 5.6714, + "theoretical_loss": 6.279566759920507, + "tokens_seen": 17170432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002609126984126984, + "loss": 5.4797, + "theoretical_loss": 6.275440243229228, + "tokens_seen": 17235968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002619047619047619, + "loss": 5.2515, + "theoretical_loss": 6.271333761251142, + "tokens_seen": 17301504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002628968253968254, + "loss": 5.4154, + "theoretical_loss": 6.267247141377137, + "tokens_seen": 17367040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002638888888888889, + "loss": 5.5372, + "theoretical_loss": 6.2631802131294085, + "tokens_seen": 17432576 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026488095238095237, + "loss": 5.4489, + "theoretical_loss": 6.259132808127246, + "tokens_seen": 17498112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026587301587301587, + "loss": 5.5914, + "theoretical_loss": 6.255104760053497, + "tokens_seen": 17563648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026686507936507937, + "loss": 5.1736, + "theoretical_loss": 6.251095904621689, + "tokens_seen": 17629184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026785714285714287, + "loss": 5.6947, + "theoretical_loss": 6.247106079543801, + "tokens_seen": 17694720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002688492063492063, + "loss": 5.515, + "theoretical_loss": 6.243135124498652, + "tokens_seen": 17760256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002698412698412698, + "loss": 5.531, + "theoretical_loss": 6.239182881100916, + "tokens_seen": 17825792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002708333333333333, + "loss": 5.2666, + "theoretical_loss": 6.235249192870732, + "tokens_seen": 17891328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002718253968253968, + "loss": 5.5261, + "theoretical_loss": 6.231333905203899, + "tokens_seen": 17956864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 22253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.281318664550781, + "objective/train/theoretical_loss": 6.227436865342643, + "objective/train/tokens_used": 38482400, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002728174603174603, + "loss": 5.2597, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002738095238095238, + "loss": 5.6137, + "theoretical_loss": 6.223557922346955, + "tokens_seen": 18087936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002748015873015873, + "loss": 5.2549, + "theoretical_loss": 6.219696927066456, + "tokens_seen": 18153472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027579365079365083, + "loss": 5.3473, + "theoretical_loss": 6.215853732112821, + "tokens_seen": 18219008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027678571428571433, + "loss": 5.2604, + "theoretical_loss": 6.212028191832702, + "tokens_seen": 18284544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002777777777777778, + "loss": 5.3027, + "theoretical_loss": 6.208220162281178, + "tokens_seen": 18350080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002787698412698413, + "loss": 5.2227, + "theoretical_loss": 6.204429501195701, + "tokens_seen": 18415616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002797619047619048, + "loss": 5.4059, + "theoretical_loss": 6.20065606797053, + "tokens_seen": 18481152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002807539682539683, + "loss": 5.5029, + "theoretical_loss": 6.19689972363164, + "tokens_seen": 18546688 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028174603174603173, + "loss": 5.3127, + "theoretical_loss": 6.1931603308120975, + "tokens_seen": 18612224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028273809523809523, + "loss": 5.2913, + "theoretical_loss": 6.189437753727901, + "tokens_seen": 18677760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028373015873015873, + "loss": 5.3535, + "theoretical_loss": 6.185731858154261, + "tokens_seen": 18743296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028472222222222223, + "loss": 5.4967, + "theoretical_loss": 6.182042511402313, + "tokens_seen": 18808832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002857142857142857, + "loss": 5.325, + "theoretical_loss": 6.17836958229627, + "tokens_seen": 18874368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002867063492063492, + "loss": 5.3921, + "theoretical_loss": 6.1747129411509825, + "tokens_seen": 18939904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002876984126984127, + "loss": 4.9391, + "theoretical_loss": 6.171072459749913, + "tokens_seen": 19005440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002886904761904762, + "loss": 5.3633, + "theoretical_loss": 6.1674480113235095, + "tokens_seen": 19070976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002896825396825397, + "loss": 5.2082, + "theoretical_loss": 6.163839470527964, + "tokens_seen": 19136512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002906746031746032, + "loss": 5.3773, + "theoretical_loss": 6.160246713424372, + "tokens_seen": 19202048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002916666666666667, + "loss": 5.7411, + "theoretical_loss": 6.156669617458243, + "tokens_seen": 19267584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002926587301587302, + "loss": 4.9287, + "theoretical_loss": 6.153108061439397, + "tokens_seen": 19333120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002936507936507937, + "loss": 5.493, + "theoretical_loss": 6.149561925522211, + "tokens_seen": 19398656 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029464285714285714, + "loss": 5.3644, + "theoretical_loss": 6.146031091186222, + "tokens_seen": 19464192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029563492063492064, + "loss": 5.3852, + "theoretical_loss": 6.142515441217064, + "tokens_seen": 19529728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029662698412698414, + "loss": 5.2977, + "theoretical_loss": 6.1390148596877605, + "tokens_seen": 19595264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 23612, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.15675163269043, + "objective/train/theoretical_loss": 6.135529231940326, + "objective/train/tokens_used": 40120800, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029761904761904765, + "loss": 5.137, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002986111111111111, + "loss": 5.1525, + "theoretical_loss": 6.132058444567705, + "tokens_seen": 19726336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002996031746031746, + "loss": 5.0443, + "theoretical_loss": 6.128602385396022, + "tokens_seen": 19791872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003005952380952381, + "loss": 5.4228, + "theoretical_loss": 6.125160943467138, + "tokens_seen": 19857408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003015873015873016, + "loss": 5.194, + "theoretical_loss": 6.121734009021521, + "tokens_seen": 19922944 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030257936507936505, + "loss": 5.2838, + "theoretical_loss": 6.118321473481398, + "tokens_seen": 19988480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030357142857142855, + "loss": 5.1757, + "theoretical_loss": 6.114923229434213, + "tokens_seen": 20054016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030456349206349205, + "loss": 5.5533, + "theoretical_loss": 6.111539170616359, + "tokens_seen": 20119552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003055555555555556, + "loss": 5.0377, + "theoretical_loss": 6.108169191897195, + "tokens_seen": 20185088 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030654761904761905, + "loss": 5.006, + "theoretical_loss": 6.104813189263336, + "tokens_seen": 20250624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030753968253968255, + "loss": 5.2312, + "theoretical_loss": 6.101471059803204, + "tokens_seen": 20316160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030853174603174605, + "loss": 5.2477, + "theoretical_loss": 6.098142701691856, + "tokens_seen": 20381696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030952380952380956, + "loss": 5.1064, + "theoretical_loss": 6.094828014176053, + "tokens_seen": 20447232 + }, + { + "epoch": 0.01, + "learning_rate": 0.000310515873015873, + "loss": 5.202, + "theoretical_loss": 6.091526897559593, + "tokens_seen": 20512768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003115079365079365, + "loss": 5.0853, + "theoretical_loss": 6.088239253188885, + "tokens_seen": 20578304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003125, + "loss": 5.3478, + "theoretical_loss": 6.084964983438763, + "tokens_seen": 20643840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003134920634920635, + "loss": 5.2918, + "theoretical_loss": 6.0817039916985465, + "tokens_seen": 20709376 + }, + { + "epoch": 0.01, + "learning_rate": 0.000314484126984127, + "loss": 5.0812, + "theoretical_loss": 6.078456182358325, + "tokens_seen": 20774912 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031547619047619046, + "loss": 5.1169, + "theoretical_loss": 6.075221460795472, + "tokens_seen": 20840448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031646825396825396, + "loss": 5.279, + "theoretical_loss": 6.071999733361386, + "tokens_seen": 20905984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031746031746031746, + "loss": 5.3881, + "theoretical_loss": 6.068790907368448, + "tokens_seen": 20971520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031845238095238096, + "loss": 5.26, + "theoretical_loss": 6.0655948910771915, + "tokens_seen": 21037056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003194444444444444, + "loss": 5.2783, + "theoretical_loss": 6.062411593683687, + "tokens_seen": 21102592 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032043650793650796, + "loss": 5.062, + "theoretical_loss": 6.059240925307134, + "tokens_seen": 21168128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032142857142857147, + "loss": 5.2426, + "theoretical_loss": 6.056082796977648, + "tokens_seen": 21233664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 24330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.736891746520996, + "objective/train/theoretical_loss": 6.052937120624258, + "objective/train/tokens_used": 41759200, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032242063492063497, + "loss": 5.1217, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003234126984126984, + "loss": 5.4246, + "theoretical_loss": 6.049803809063083, + "tokens_seen": 21364736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003244047619047619, + "loss": 5.4049, + "theoretical_loss": 6.0466827759857145, + "tokens_seen": 21430272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003253968253968254, + "loss": 5.1161, + "theoretical_loss": 6.04357393594778, + "tokens_seen": 21495808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003263888888888889, + "loss": 5.2259, + "theoretical_loss": 6.040477204357686, + "tokens_seen": 21561344 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032738095238095237, + "loss": 5.4008, + "theoretical_loss": 6.037392497465552, + "tokens_seen": 21626880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032837301587301587, + "loss": 5.0396, + "theoretical_loss": 6.034319732352309, + "tokens_seen": 21692416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032936507936507937, + "loss": 5.1953, + "theoretical_loss": 6.031258826918979, + "tokens_seen": 21757952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033035714285714287, + "loss": 5.3939, + "theoretical_loss": 6.0282096998761245, + "tokens_seen": 21823488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003313492063492063, + "loss": 5.5128, + "theoretical_loss": 6.025172270733464, + "tokens_seen": 21889024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003323412698412698, + "loss": 5.1908, + "theoretical_loss": 6.0221464597896475, + "tokens_seen": 21954560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003333333333333333, + "loss": 4.7464, + "theoretical_loss": 6.0191321881221995, + "tokens_seen": 22020096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003343253968253968, + "loss": 5.2092, + "theoretical_loss": 6.016129377577614, + "tokens_seen": 22085632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003353174603174603, + "loss": 5.3966, + "theoretical_loss": 6.01313795076161, + "tokens_seen": 22151168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003363095238095238, + "loss": 4.8061, + "theoretical_loss": 6.010157831029533, + "tokens_seen": 22216704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033730158730158733, + "loss": 5.0669, + "theoretical_loss": 6.007188942476907, + "tokens_seen": 22282240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033829365079365083, + "loss": 5.2141, + "theoretical_loss": 6.0042312099301425, + "tokens_seen": 22347776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033928571428571433, + "loss": 5.1484, + "theoretical_loss": 6.001284558937368, + "tokens_seen": 22413312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003402777777777778, + "loss": 5.2602, + "theoretical_loss": 5.998348915759426, + "tokens_seen": 22478848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003412698412698413, + "loss": 5.1164, + "theoretical_loss": 5.995424207360987, + "tokens_seen": 22544384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003422619047619048, + "loss": 5.0515, + "theoretical_loss": 5.992510361401818, + "tokens_seen": 22609920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003432539682539683, + "loss": 5.218, + "theoretical_loss": 5.989607306228168, + "tokens_seen": 22675456 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034424603174603173, + "loss": 4.9102, + "theoretical_loss": 5.986714970864292, + "tokens_seen": 22740992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034523809523809523, + "loss": 5.0805, + "theoretical_loss": 5.983833285004112, + "tokens_seen": 22806528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034623015873015873, + "loss": 5.2809, + "theoretical_loss": 5.980962179002983, + "tokens_seen": 22872064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 24983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.134525299072266, + "objective/train/theoretical_loss": 5.978101583869607, + "objective/train/tokens_used": 43397600, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034722222222222224, + "loss": 5.1396, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003482142857142857, + "loss": 5.0727, + "theoretical_loss": 5.975251431258057, + "tokens_seen": 23003136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003492063492063492, + "loss": 5.1882, + "theoretical_loss": 5.972411653459913, + "tokens_seen": 23068672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003501984126984127, + "loss": 4.8682, + "theoretical_loss": 5.9695821833965335, + "tokens_seen": 23134208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003511904761904762, + "loss": 5.2343, + "theoretical_loss": 5.966762954611432, + "tokens_seen": 23199744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003521825396825397, + "loss": 5.195, + "theoretical_loss": 5.963953901262764, + "tokens_seen": 23265280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003531746031746032, + "loss": 5.2282, + "theoretical_loss": 5.961154958115937, + "tokens_seen": 23330816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003541666666666667, + "loss": 4.7545, + "theoretical_loss": 5.958366060536315, + "tokens_seen": 23396352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003551587301587302, + "loss": 5.1342, + "theoretical_loss": 5.955587144482044, + "tokens_seen": 23461888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003561507936507937, + "loss": 5.0144, + "theoretical_loss": 5.952818146496978, + "tokens_seen": 23527424 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035714285714285714, + "loss": 5.0659, + "theoretical_loss": 5.950059003703704, + "tokens_seen": 23592960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035813492063492064, + "loss": 5.1206, + "theoretical_loss": 5.94730965379668, + "tokens_seen": 23658496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035912698412698415, + "loss": 4.9633, + "theoretical_loss": 5.944570035035458, + "tokens_seen": 23724032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036011904761904765, + "loss": 5.1591, + "theoretical_loss": 5.941840086238027, + "tokens_seen": 23789568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003611111111111111, + "loss": 5.3314, + "theoretical_loss": 5.939119746774228, + "tokens_seen": 23855104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003621031746031746, + "loss": 5.2119, + "theoretical_loss": 5.936408956559284, + "tokens_seen": 23920640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003630952380952381, + "loss": 5.3978, + "theoretical_loss": 5.933707656047414, + "tokens_seen": 23986176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003640873015873016, + "loss": 4.9459, + "theoretical_loss": 5.93101578622554, + "tokens_seen": 24051712 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036507936507936505, + "loss": 5.031, + "theoretical_loss": 5.928333288607086, + "tokens_seen": 24117248 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036607142857142855, + "loss": 5.1772, + "theoretical_loss": 5.925660105225867, + "tokens_seen": 24182784 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036706349206349205, + "loss": 4.8824, + "theoretical_loss": 5.92299617863006, + "tokens_seen": 24248320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003680555555555556, + "loss": 5.0966, + "theoretical_loss": 5.920341451876267, + "tokens_seen": 24313856 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036904761904761905, + "loss": 5.1533, + "theoretical_loss": 5.9176958685236585, + "tokens_seen": 24379392 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037003968253968255, + "loss": 5.2939, + "theoretical_loss": 5.9150593726282015, + "tokens_seen": 24444928 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037103174603174606, + "loss": 5.0167, + "theoretical_loss": 5.912431908736972, + "tokens_seen": 24510464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 26439, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.887441635131836, + "objective/train/theoretical_loss": 5.909813421882534, + "objective/train/tokens_used": 45036000, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037202380952380956, + "loss": 5.2049, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000373015873015873, + "loss": 5.2346, + "theoretical_loss": 5.907203857577422, + "tokens_seen": 24641536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003740079365079365, + "loss": 5.2112, + "theoretical_loss": 5.9046031618086765, + "tokens_seen": 24707072 + }, + { + "epoch": 0.01, + "learning_rate": 0.000375, + "loss": 5.092, + "theoretical_loss": 5.902011281032472, + "tokens_seen": 24772608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003759920634920635, + "loss": 5.2228, + "theoretical_loss": 5.899428162168808, + "tokens_seen": 24838144 + }, + { + "epoch": 0.01, + "learning_rate": 0.000376984126984127, + "loss": 4.9695, + "theoretical_loss": 5.896853752596286, + "tokens_seen": 24903680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037797619047619046, + "loss": 5.1379, + "theoretical_loss": 5.894288000146949, + "tokens_seen": 24969216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037896825396825396, + "loss": 5.0639, + "theoretical_loss": 5.891730853101199, + "tokens_seen": 25034752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037996031746031746, + "loss": 5.0119, + "theoretical_loss": 5.88918226018278, + "tokens_seen": 25100288 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038095238095238096, + "loss": 5.0625, + "theoretical_loss": 5.8866421705538325, + "tokens_seen": 25165824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003819444444444444, + "loss": 5.2315, + "theoretical_loss": 5.8841105338100155, + "tokens_seen": 25231360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038293650793650797, + "loss": 4.8079, + "theoretical_loss": 5.881587299975694, + "tokens_seen": 25296896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038392857142857147, + "loss": 5.3085, + "theoretical_loss": 5.8790724194991935, + "tokens_seen": 25362432 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038492063492063497, + "loss": 5.1826, + "theoretical_loss": 5.876565843248124, + "tokens_seen": 25427968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003859126984126984, + "loss": 4.8865, + "theoretical_loss": 5.8740675225047525, + "tokens_seen": 25493504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003869047619047619, + "loss": 4.8372, + "theoretical_loss": 5.871577408961457, + "tokens_seen": 25559040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003878968253968254, + "loss": 5.1074, + "theoretical_loss": 5.869095454716231, + "tokens_seen": 25624576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003888888888888889, + "loss": 5.0627, + "theoretical_loss": 5.866621612268246, + "tokens_seen": 25690112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038988095238095237, + "loss": 5.1708, + "theoretical_loss": 5.864155834513486, + "tokens_seen": 25755648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039087301587301587, + "loss": 4.9657, + "theoretical_loss": 5.8616980747404295, + "tokens_seen": 25821184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039186507936507937, + "loss": 5.1406, + "theoretical_loss": 5.859248286625787, + "tokens_seen": 25886720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003928571428571429, + "loss": 5.253, + "theoretical_loss": 5.856806424230314, + "tokens_seen": 25952256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003938492063492063, + "loss": 4.9518, + "theoretical_loss": 5.854372441994654, + "tokens_seen": 26017792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003948412698412698, + "loss": 4.9569, + "theoretical_loss": 5.851946294735258, + "tokens_seen": 26083328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003958333333333333, + "loss": 4.9376, + "theoretical_loss": 5.849527937640345, + "tokens_seen": 26148864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 27060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.355391025543213, + "objective/train/theoretical_loss": 5.8471173262659235, + "objective/train/tokens_used": 46674400, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003968253968253968, + "loss": 4.8582, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003978174603174603, + "loss": 4.9394, + "theoretical_loss": 5.84471441653186, + "tokens_seen": 26279936 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039880952380952383, + "loss": 5.2367, + "theoretical_loss": 5.842319164718004, + "tokens_seen": 26345472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039980158730158733, + "loss": 5.1756, + "theoretical_loss": 5.83993152746036, + "tokens_seen": 26411008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040079365079365083, + "loss": 5.1252, + "theoretical_loss": 5.83755146174731, + "tokens_seen": 26476544 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040178571428571433, + "loss": 5.0547, + "theoretical_loss": 5.835178924915889, + "tokens_seen": 26542080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004027777777777778, + "loss": 5.0688, + "theoretical_loss": 5.832813874648102, + "tokens_seen": 26607616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004037698412698413, + "loss": 5.0608, + "theoretical_loss": 5.8304562689673, + "tokens_seen": 26673152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004047619047619048, + "loss": 5.0401, + "theoretical_loss": 5.828106066234588, + "tokens_seen": 26738688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004057539682539683, + "loss": 5.0167, + "theoretical_loss": 5.825763225145295, + "tokens_seen": 26804224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040674603174603173, + "loss": 5.1791, + "theoretical_loss": 5.823427704725473, + "tokens_seen": 26869760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040773809523809523, + "loss": 4.8063, + "theoretical_loss": 5.82109946432846, + "tokens_seen": 26935296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040873015873015874, + "loss": 4.9278, + "theoretical_loss": 5.818778463631473, + "tokens_seen": 27000832 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040972222222222224, + "loss": 4.7786, + "theoretical_loss": 5.816464662632243, + "tokens_seen": 27066368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004107142857142857, + "loss": 5.1196, + "theoretical_loss": 5.8141580216457065, + "tokens_seen": 27131904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004117063492063492, + "loss": 4.6589, + "theoretical_loss": 5.811858501300729, + "tokens_seen": 27197440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004126984126984127, + "loss": 4.8227, + "theoretical_loss": 5.809566062536868, + "tokens_seen": 27262976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004136904761904762, + "loss": 5.0245, + "theoretical_loss": 5.807280666601191, + "tokens_seen": 27328512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004146825396825397, + "loss": 4.9783, + "theoretical_loss": 5.805002275045111, + "tokens_seen": 27394048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004156746031746032, + "loss": 5.0457, + "theoretical_loss": 5.8027308497212875, + "tokens_seen": 27459584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004166666666666667, + "loss": 5.002, + "theoretical_loss": 5.800466352780546, + "tokens_seen": 27525120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004176587301587302, + "loss": 4.7979, + "theoretical_loss": 5.798208746668847, + "tokens_seen": 27590656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004186507936507937, + "loss": 5.0751, + "theoretical_loss": 5.795957994124291, + "tokens_seen": 27656192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00041964285714285714, + "loss": 4.5446, + "theoretical_loss": 5.7937140581741575, + "tokens_seen": 27721728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042063492063492065, + "loss": 4.9859, + "theoretical_loss": 5.791476902131985, + "tokens_seen": 27787264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 28453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.114999294281006, + "objective/train/theoretical_loss": 5.789246489594688, + "objective/train/tokens_used": 48312800, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042162698412698415, + "loss": 4.7322, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042261904761904765, + "loss": 4.9798, + "theoretical_loss": 5.787022784439701, + "tokens_seen": 27918336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004236111111111111, + "loss": 5.1612, + "theoretical_loss": 5.784805750822171, + "tokens_seen": 27983872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004246031746031746, + "loss": 5.0818, + "theoretical_loss": 5.782595353172176, + "tokens_seen": 28049408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004255952380952381, + "loss": 4.5939, + "theoretical_loss": 5.780391556191977, + "tokens_seen": 28114944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004265873015873016, + "loss": 4.9998, + "theoretical_loss": 5.778194324853311, + "tokens_seen": 28180480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042757936507936505, + "loss": 5.1566, + "theoretical_loss": 5.776003624394711, + "tokens_seen": 28246016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042857142857142855, + "loss": 5.2049, + "theoretical_loss": 5.773819420318858, + "tokens_seen": 28311552 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042956349206349205, + "loss": 4.7744, + "theoretical_loss": 5.771641678389971, + "tokens_seen": 28377088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004305555555555556, + "loss": 4.8862, + "theoretical_loss": 5.769470364631225, + "tokens_seen": 28442624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043154761904761905, + "loss": 4.7604, + "theoretical_loss": 5.767305445322201, + "tokens_seen": 28508160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043253968253968256, + "loss": 4.9808, + "theoretical_loss": 5.765146886996363, + "tokens_seen": 28573696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043353174603174606, + "loss": 4.7763, + "theoretical_loss": 5.762994656438579, + "tokens_seen": 28639232 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043452380952380956, + "loss": 5.0911, + "theoretical_loss": 5.760848720682651, + "tokens_seen": 28704768 + }, + { + "epoch": 0.01, + "learning_rate": 0.000435515873015873, + "loss": 5.0289, + "theoretical_loss": 5.758709047008894, + "tokens_seen": 28770304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004365079365079365, + "loss": 5.0362, + "theoretical_loss": 5.756575602941732, + "tokens_seen": 28835840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004375, + "loss": 5.1589, + "theoretical_loss": 5.75444835624733, + "tokens_seen": 28901376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004384920634920635, + "loss": 5.1524, + "theoretical_loss": 5.752327274931249, + "tokens_seen": 28966912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000439484126984127, + "loss": 4.6286, + "theoretical_loss": 5.750212327236129, + "tokens_seen": 29032448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044047619047619046, + "loss": 4.9918, + "theoretical_loss": 5.7481034816394105, + "tokens_seen": 29097984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044146825396825396, + "loss": 5.0182, + "theoretical_loss": 5.7460007068510635, + "tokens_seen": 29163520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044246031746031746, + "loss": 5.1442, + "theoretical_loss": 5.74390397181136, + "tokens_seen": 29229056 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044345238095238096, + "loss": 4.9284, + "theoretical_loss": 5.741813245688668, + "tokens_seen": 29294592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004444444444444444, + "loss": 4.7274, + "theoretical_loss": 5.739728497877267, + "tokens_seen": 29360128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044543650793650797, + "loss": 4.9829, + "theoretical_loss": 5.737649697995197, + "tokens_seen": 29425664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 29115, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.875042915344238, + "objective/train/theoretical_loss": 5.7355768158821245, + "objective/train/tokens_used": 49951200, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044642857142857147, + "loss": 5.0802, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044742063492063497, + "loss": 4.8054, + "theoretical_loss": 5.73350982159724, + "tokens_seen": 29556736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004484126984126984, + "loss": 4.9775, + "theoretical_loss": 5.731448685417178, + "tokens_seen": 29622272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004494047619047619, + "loss": 4.9123, + "theoretical_loss": 5.729393377833956, + "tokens_seen": 29687808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004503968253968254, + "loss": 4.5612, + "theoretical_loss": 5.7273438695529535, + "tokens_seen": 29753344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004513888888888889, + "loss": 5.0027, + "theoretical_loss": 5.725300131490888, + "tokens_seen": 29818880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045238095238095237, + "loss": 4.9368, + "theoretical_loss": 5.7232621347738455, + "tokens_seen": 29884416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045337301587301587, + "loss": 5.0227, + "theoretical_loss": 5.721229850735305, + "tokens_seen": 29949952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045436507936507937, + "loss": 5.0473, + "theoretical_loss": 5.719203250914208, + "tokens_seen": 30015488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004553571428571429, + "loss": 5.0973, + "theoretical_loss": 5.717182307053037, + "tokens_seen": 30081024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004563492063492063, + "loss": 4.757, + "theoretical_loss": 5.715166991095922, + "tokens_seen": 30146560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004573412698412698, + "loss": 4.8415, + "theoretical_loss": 5.713157275186761, + "tokens_seen": 30212096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004583333333333333, + "loss": 4.9583, + "theoretical_loss": 5.71115313166738, + "tokens_seen": 30277632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004593253968253968, + "loss": 5.1126, + "theoretical_loss": 5.709154533075688, + "tokens_seen": 30343168 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046031746031746033, + "loss": 4.9564, + "theoretical_loss": 5.707161452143879, + "tokens_seen": 30408704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046130952380952383, + "loss": 5.0017, + "theoretical_loss": 5.7051738617966326, + "tokens_seen": 30474240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046230158730158733, + "loss": 5.1598, + "theoretical_loss": 5.7031917351493515, + "tokens_seen": 30539776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046329365079365083, + "loss": 4.7815, + "theoretical_loss": 5.701215045506411, + "tokens_seen": 30605312 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046428571428571433, + "loss": 5.0404, + "theoretical_loss": 5.699243766359421, + "tokens_seen": 30670848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004652777777777778, + "loss": 4.958, + "theoretical_loss": 5.697277871385534, + "tokens_seen": 30736384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004662698412698413, + "loss": 4.8348, + "theoretical_loss": 5.695317334445736, + "tokens_seen": 30801920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004672619047619048, + "loss": 5.1101, + "theoretical_loss": 5.693362129583184, + "tokens_seen": 30867456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004682539682539683, + "loss": 5.1583, + "theoretical_loss": 5.691412231021549, + "tokens_seen": 30932992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046924603174603173, + "loss": 5.0497, + "theoretical_loss": 5.689467613163388, + "tokens_seen": 30998528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047023809523809523, + "loss": 5.0994, + "theoretical_loss": 5.687528250588518, + "tokens_seen": 31064064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 30152, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.741349697113037, + "objective/train/theoretical_loss": 5.6855941180524265, + "objective/train/tokens_used": 51589600, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047123015873015874, + "loss": 4.9423, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047222222222222224, + "loss": 4.9094, + "theoretical_loss": 5.683665190484683, + "tokens_seen": 31195136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004732142857142857, + "loss": 5.0356, + "theoretical_loss": 5.681741442987381, + "tokens_seen": 31260672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004742063492063492, + "loss": 4.8249, + "theoretical_loss": 5.679822850833591, + "tokens_seen": 31326208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004751984126984127, + "loss": 4.9261, + "theoretical_loss": 5.677909389465831, + "tokens_seen": 31391744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004761904761904762, + "loss": 4.5377, + "theoretical_loss": 5.676001034494554, + "tokens_seen": 31457280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004771825396825397, + "loss": 4.464, + "theoretical_loss": 5.674097761696653, + "tokens_seen": 31522816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004781746031746032, + "loss": 4.6988, + "theoretical_loss": 5.672199547013983, + "tokens_seen": 31588352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004791666666666667, + "loss": 4.6733, + "theoretical_loss": 5.670306366551898, + "tokens_seen": 31653888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004801587301587302, + "loss": 4.7771, + "theoretical_loss": 5.6684181965778, + "tokens_seen": 31719424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004811507936507937, + "loss": 5.0062, + "theoretical_loss": 5.666535013519715, + "tokens_seen": 31784960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048214285714285715, + "loss": 4.7684, + "theoretical_loss": 5.6646567939648715, + "tokens_seen": 31850496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048313492063492065, + "loss": 4.8464, + "theoretical_loss": 5.6627835146583045, + "tokens_seen": 31916032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048412698412698415, + "loss": 4.689, + "theoretical_loss": 5.660915152501465, + "tokens_seen": 31981568 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048511904761904765, + "loss": 4.5762, + "theoretical_loss": 5.659051684550857, + "tokens_seen": 32047104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004861111111111111, + "loss": 4.8833, + "theoretical_loss": 5.657193088016677, + "tokens_seen": 32112640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004871031746031746, + "loss": 4.6286, + "theoretical_loss": 5.655339340261474, + "tokens_seen": 32178176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004880952380952381, + "loss": 4.791, + "theoretical_loss": 5.653490418798825, + "tokens_seen": 32243712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004890873015873016, + "loss": 5.0112, + "theoretical_loss": 5.651646301292022, + "tokens_seen": 32309248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004900793650793651, + "loss": 4.7112, + "theoretical_loss": 5.649806965552774, + "tokens_seen": 32374784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004910714285714286, + "loss": 4.7717, + "theoretical_loss": 5.6479723895399205, + "tokens_seen": 32440320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000492063492063492, + "loss": 4.7971, + "theoretical_loss": 5.6461425513581665, + "tokens_seen": 32505856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004930555555555556, + "loss": 4.9546, + "theoretical_loss": 5.6443174292568195, + "tokens_seen": 32571392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004940476190476191, + "loss": 4.6853, + "theoretical_loss": 5.6424970016285485, + "tokens_seen": 32636928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004950396825396826, + "loss": 4.8721, + "theoretical_loss": 5.640681247008156, + "tokens_seen": 32702464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 30758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.8071393966674805, + "objective/train/theoretical_loss": 5.638870144071353, + "objective/train/tokens_used": 53228000, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000496031746031746, + "loss": 4.8628, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004970238095238095, + "loss": 4.7892, + "theoretical_loss": 5.637063671633564, + "tokens_seen": 32833536 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498015873015873, + "loss": 4.8607, + "theoretical_loss": 5.635261808648728, + "tokens_seen": 32899072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990079365079365, + "loss": 5.0888, + "theoretical_loss": 5.6334645342081195, + "tokens_seen": 32964608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 4.9737, + "theoretical_loss": 5.631671827539186, + "tokens_seen": 33030144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999899699097292, + "loss": 5.0126, + "theoretical_loss": 5.629883668004389, + "tokens_seen": 33095680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999799398194584, + "loss": 4.9297, + "theoretical_loss": 5.628100035100061, + "tokens_seen": 33161216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999699097291876, + "loss": 5.0938, + "theoretical_loss": 5.626320908455279, + "tokens_seen": 33226752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999598796389167, + "loss": 4.758, + "theoretical_loss": 5.6245462678307385, + "tokens_seen": 33292288 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499949849548646, + "loss": 4.6107, + "theoretical_loss": 5.622776093117652, + "tokens_seen": 33357824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999398194583751, + "loss": 4.9094, + "theoretical_loss": 5.621010364336651, + "tokens_seen": 33423360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999297893681044, + "loss": 4.7434, + "theoretical_loss": 5.619249061636698, + "tokens_seen": 33488896 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999197592778335, + "loss": 4.7376, + "theoretical_loss": 5.61749216529402, + "tokens_seen": 33554432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999097291875627, + "loss": 4.8733, + "theoretical_loss": 5.615739655711037, + "tokens_seen": 33619968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998996990972919, + "loss": 4.8815, + "theoretical_loss": 5.61399151341532, + "tokens_seen": 33685504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998896690070211, + "loss": 4.8804, + "theoretical_loss": 5.6122477190585425, + "tokens_seen": 33751040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998796389167503, + "loss": 4.6509, + "theoretical_loss": 5.610508253415453, + "tokens_seen": 33816576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998696088264795, + "loss": 5.0255, + "theoretical_loss": 5.6087730973828585, + "tokens_seen": 33882112 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998595787362087, + "loss": 4.7137, + "theoretical_loss": 5.6070422319786095, + "tokens_seen": 33947648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998495486459378, + "loss": 4.8675, + "theoretical_loss": 5.605315638340606, + "tokens_seen": 34013184 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499839518555667, + "loss": 4.8488, + "theoretical_loss": 5.603593297725807, + "tokens_seen": 34078720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998294884653962, + "loss": 5.0848, + "theoretical_loss": 5.601875191509249, + "tokens_seen": 34144256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998194583751254, + "loss": 4.4149, + "theoretical_loss": 5.600161301183084, + "tokens_seen": 34209792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998094282848546, + "loss": 5.0088, + "theoretical_loss": 5.598451608355614, + "tokens_seen": 34275328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997993981945837, + "loss": 4.9221, + "theoretical_loss": 5.596746094750342, + "tokens_seen": 34340864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 31355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.054786205291748, + "objective/train/theoretical_loss": 5.595044742205037, + "objective/train/tokens_used": 54866400, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997893681043129, + "loss": 5.0983, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997793380140421, + "loss": 4.5808, + "theoretical_loss": 5.5933475326707995, + "tokens_seen": 34471936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997693079237714, + "loss": 4.5657, + "theoretical_loss": 5.591654448211143, + "tokens_seen": 34537472 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997592778335005, + "loss": 4.8711, + "theoretical_loss": 5.589965471001077, + "tokens_seen": 34603008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997492477432298, + "loss": 4.8155, + "theoretical_loss": 5.5882805833262115, + "tokens_seen": 34668544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997392176529588, + "loss": 4.9002, + "theoretical_loss": 5.586599767581859, + "tokens_seen": 34734080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997291875626881, + "loss": 4.9815, + "theoretical_loss": 5.584923006272151, + "tokens_seen": 34799616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997191574724173, + "loss": 4.9042, + "theoretical_loss": 5.583250282009159, + "tokens_seen": 34865152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997091273821465, + "loss": 4.9647, + "theoretical_loss": 5.581581577512031, + "tokens_seen": 34930688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996990972918757, + "loss": 5.0026, + "theoretical_loss": 5.579916875606134, + "tokens_seen": 34996224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996890672016048, + "loss": 4.7549, + "theoretical_loss": 5.578256159222196, + "tokens_seen": 35061760 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499679037111334, + "loss": 4.6224, + "theoretical_loss": 5.576599411395472, + "tokens_seen": 35127296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996690070210632, + "loss": 4.7725, + "theoretical_loss": 5.574946615264906, + "tokens_seen": 35192832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996589769307924, + "loss": 4.7807, + "theoretical_loss": 5.5732977540723105, + "tokens_seen": 35258368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996489468405216, + "loss": 4.8423, + "theoretical_loss": 5.571652811161542, + "tokens_seen": 35323904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996389167502507, + "loss": 4.9602, + "theoretical_loss": 5.570011769977693, + "tokens_seen": 35389440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996288866599799, + "loss": 4.6312, + "theoretical_loss": 5.568374614066299, + "tokens_seen": 35454976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996188565697091, + "loss": 5.0662, + "theoretical_loss": 5.566741327072535, + "tokens_seen": 35520512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996088264794383, + "loss": 4.7143, + "theoretical_loss": 5.565111892740433, + "tokens_seen": 35586048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995987963891675, + "loss": 4.8642, + "theoretical_loss": 5.563486294912105, + "tokens_seen": 35651584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995887662988968, + "loss": 4.6962, + "theoretical_loss": 5.56186451752697, + "tokens_seen": 35717120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995787362086258, + "loss": 4.7386, + "theoretical_loss": 5.560246544620993, + "tokens_seen": 35782656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995687061183551, + "loss": 4.7368, + "theoretical_loss": 5.558632360325929, + "tokens_seen": 35848192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995586760280842, + "loss": 4.6722, + "theoretical_loss": 5.557021948868571, + "tokens_seen": 35913728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995486459378135, + "loss": 4.8891, + "theoretical_loss": 5.555415294570011, + "tokens_seen": 35979264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 32385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.4478864669799805, + "objective/train/theoretical_loss": 5.553812381844907, + "objective/train/tokens_used": 56504800, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995386158475427, + "loss": 4.8836, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995285857572718, + "loss": 4.7132, + "theoretical_loss": 5.552213195200755, + "tokens_seen": 36110336 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499518555667001, + "loss": 4.7041, + "theoretical_loss": 5.550617719237167, + "tokens_seen": 36175872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995085255767302, + "loss": 4.8574, + "theoretical_loss": 5.549025938645155, + "tokens_seen": 36241408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994984954864594, + "loss": 4.743, + "theoretical_loss": 5.547437838206435, + "tokens_seen": 36306944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994884653961886, + "loss": 4.7443, + "theoretical_loss": 5.545853402792717, + "tokens_seen": 36372480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994784353059178, + "loss": 4.8769, + "theoretical_loss": 5.544272617365014, + "tokens_seen": 36438016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994684052156469, + "loss": 4.8343, + "theoretical_loss": 5.542695466972956, + "tokens_seen": 36503552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994583751253761, + "loss": 4.7822, + "theoretical_loss": 5.541121936754111, + "tokens_seen": 36569088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994483450351053, + "loss": 4.7453, + "theoretical_loss": 5.539552011933312, + "tokens_seen": 36634624 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994383149448345, + "loss": 4.8192, + "theoretical_loss": 5.537985677821986, + "tokens_seen": 36700160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994282848545637, + "loss": 4.7944, + "theoretical_loss": 5.536422919817495, + "tokens_seen": 36765696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994182547642928, + "loss": 4.8368, + "theoretical_loss": 5.5348637234024824, + "tokens_seen": 36831232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994082246740221, + "loss": 4.8909, + "theoretical_loss": 5.53330807414422, + "tokens_seen": 36896768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993981945837512, + "loss": 4.7606, + "theoretical_loss": 5.5317559576939725, + "tokens_seen": 36962304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993881644934805, + "loss": 4.3933, + "theoretical_loss": 5.530207359786353, + "tokens_seen": 37027840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993781344032096, + "loss": 4.9105, + "theoretical_loss": 5.5286622662386975, + "tokens_seen": 37093376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993681043129389, + "loss": 4.725, + "theoretical_loss": 5.52712066295044, + "tokens_seen": 37158912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499358074222668, + "loss": 4.6152, + "theoretical_loss": 5.525582535902489, + "tokens_seen": 37224448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993480441323972, + "loss": 4.8967, + "theoretical_loss": 5.524047871156618, + "tokens_seen": 37289984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993380140421264, + "loss": 4.304, + "theoretical_loss": 5.52251665485486, + "tokens_seen": 37355520 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993279839518556, + "loss": 4.8395, + "theoretical_loss": 5.520988873218897, + "tokens_seen": 37421056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993179538615848, + "loss": 4.7291, + "theoretical_loss": 5.519464512549478, + "tokens_seen": 37486592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993079237713139, + "loss": 4.5346, + "theoretical_loss": 5.5179435592258095, + "tokens_seen": 37552128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992978936810431, + "loss": 4.7017, + "theoretical_loss": 5.516425999704987, + "tokens_seen": 37617664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 32883, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.716377258300781, + "objective/train/theoretical_loss": 5.514911820521407, + "objective/train/tokens_used": 58143200, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992878635907723, + "loss": 4.9009, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992778335005015, + "loss": 4.9884, + "theoretical_loss": 5.5134010082861895, + "tokens_seen": 37748736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992678034102307, + "loss": 4.7545, + "theoretical_loss": 5.511893549686616, + "tokens_seen": 37814272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992577733199598, + "loss": 4.7888, + "theoretical_loss": 5.51038943148556, + "tokens_seen": 37879808 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499247743229689, + "loss": 5.0533, + "theoretical_loss": 5.508888640520928, + "tokens_seen": 37945344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992377131394183, + "loss": 4.8807, + "theoretical_loss": 5.50739116370511, + "tokens_seen": 38010880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992276830491475, + "loss": 4.8592, + "theoretical_loss": 5.505896988024423, + "tokens_seen": 38076416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992176529588767, + "loss": 4.7331, + "theoretical_loss": 5.5044061005385725, + "tokens_seen": 38141952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992076228686059, + "loss": 4.8333, + "theoretical_loss": 5.502918488380116, + "tokens_seen": 38207488 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499197592778335, + "loss": 4.7589, + "theoretical_loss": 5.501434138753918, + "tokens_seen": 38273024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991875626880642, + "loss": 4.7598, + "theoretical_loss": 5.499953038936635, + "tokens_seen": 38338560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991775325977934, + "loss": 4.7997, + "theoretical_loss": 5.498475176276176, + "tokens_seen": 38404096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991675025075226, + "loss": 4.7163, + "theoretical_loss": 5.497000538191195, + "tokens_seen": 38469632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991574724172518, + "loss": 4.7824, + "theoretical_loss": 5.495529112170568, + "tokens_seen": 38535168 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499147442326981, + "loss": 4.8341, + "theoretical_loss": 5.494060885772887, + "tokens_seen": 38600704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991374122367101, + "loss": 4.7344, + "theoretical_loss": 5.492595846625951, + "tokens_seen": 38666240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991273821464393, + "loss": 4.7977, + "theoretical_loss": 5.491133982426266, + "tokens_seen": 38731776 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991173520561685, + "loss": 4.3782, + "theoretical_loss": 5.489675280938547, + "tokens_seen": 38797312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991073219658977, + "loss": 4.7085, + "theoretical_loss": 5.488219729995227, + "tokens_seen": 38862848 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499097291875627, + "loss": 4.5818, + "theoretical_loss": 5.486767317495966, + "tokens_seen": 38928384 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499087261785356, + "loss": 5.0102, + "theoretical_loss": 5.48531803140717, + "tokens_seen": 38993920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990772316950853, + "loss": 4.6955, + "theoretical_loss": 5.483871859761511, + "tokens_seen": 39059456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990672016048144, + "loss": 4.6598, + "theoretical_loss": 5.482428790657449, + "tokens_seen": 39124992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990571715145437, + "loss": 4.822, + "theoretical_loss": 5.480988812258763, + "tokens_seen": 39190528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990471414242729, + "loss": 4.8752, + "theoretical_loss": 5.479551912794086, + "tokens_seen": 39256064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 34166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.898425579071045, + "objective/train/theoretical_loss": 5.478118080556438, + "objective/train/tokens_used": 59781600, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499037111334002, + "loss": 4.7202, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990270812437312, + "loss": 4.6432, + "theoretical_loss": 5.476687303902768, + "tokens_seen": 39387136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990170511534604, + "loss": 4.8362, + "theoretical_loss": 5.475259571253502, + "tokens_seen": 39452672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990070210631896, + "loss": 4.5375, + "theoretical_loss": 5.473834871092089, + "tokens_seen": 39518208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989969909729188, + "loss": 4.6575, + "theoretical_loss": 5.4724131919645576, + "tokens_seen": 39583744 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498986960882648, + "loss": 4.6854, + "theoretical_loss": 5.470994522479069, + "tokens_seen": 39649280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989769307923771, + "loss": 4.5501, + "theoretical_loss": 5.4695788513054815, + "tokens_seen": 39714816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989669007021063, + "loss": 4.7545, + "theoretical_loss": 5.468166167174912, + "tokens_seen": 39780352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989568706118355, + "loss": 4.714, + "theoretical_loss": 5.466756458879306, + "tokens_seen": 39845888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989468405215647, + "loss": 4.5583, + "theoretical_loss": 5.465349715271013, + "tokens_seen": 39911424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989368104312939, + "loss": 4.7862, + "theoretical_loss": 5.463945925262355, + "tokens_seen": 39976960 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498926780341023, + "loss": 4.4681, + "theoretical_loss": 5.462545077825214, + "tokens_seen": 40042496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989167502507523, + "loss": 4.9631, + "theoretical_loss": 5.461147161990611, + "tokens_seen": 40108032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989067201604814, + "loss": 4.662, + "theoretical_loss": 5.459752166848292, + "tokens_seen": 40173568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988966900702107, + "loss": 4.7581, + "theoretical_loss": 5.458360081546321, + "tokens_seen": 40239104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988866599799398, + "loss": 4.448, + "theoretical_loss": 5.456970895290674, + "tokens_seen": 40304640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988766298896691, + "loss": 4.548, + "theoretical_loss": 5.455584597344835, + "tokens_seen": 40370176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988665997993982, + "loss": 4.5585, + "theoretical_loss": 5.454201177029395, + "tokens_seen": 40435712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988565697091274, + "loss": 4.6426, + "theoretical_loss": 5.452820623721662, + "tokens_seen": 40501248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988465396188566, + "loss": 4.5398, + "theoretical_loss": 5.45144292685526, + "tokens_seen": 40566784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988365095285858, + "loss": 4.5747, + "theoretical_loss": 5.450068075919752, + "tokens_seen": 40632320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498826479438315, + "loss": 4.6306, + "theoretical_loss": 5.44869606046024, + "tokens_seen": 40697856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988164493480441, + "loss": 4.6076, + "theoretical_loss": 5.447326870076996, + "tokens_seen": 40763392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988064192577733, + "loss": 4.6846, + "theoretical_loss": 5.445960494425072, + "tokens_seen": 40828928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987963891675025, + "loss": 4.5425, + "theoretical_loss": 5.444596923213931, + "tokens_seen": 40894464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 34866, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.677554607391357, + "objective/train/theoretical_loss": 5.443236146207074, + "objective/train/tokens_used": 61420000, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987863590772317, + "loss": 4.517, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987763289869609, + "loss": 4.4186, + "theoretical_loss": 5.441878153221662, + "tokens_seen": 41025536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049876629889669, + "loss": 4.3927, + "theoretical_loss": 5.440522934128164, + "tokens_seen": 41091072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987562688064192, + "loss": 4.521, + "theoretical_loss": 5.439170478849976, + "tokens_seen": 41156608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987462387161484, + "loss": 4.4519, + "theoretical_loss": 5.437820777363078, + "tokens_seen": 41222144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987362086258777, + "loss": 4.5882, + "theoretical_loss": 5.4364738196956655, + "tokens_seen": 41287680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987261785356068, + "loss": 4.4504, + "theoretical_loss": 5.435129595927794, + "tokens_seen": 41353216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987161484453361, + "loss": 4.1807, + "theoretical_loss": 5.433788096191039, + "tokens_seen": 41418752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987061183550651, + "loss": 4.5904, + "theoretical_loss": 5.432449310668134, + "tokens_seen": 41484288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986960882647944, + "loss": 4.5925, + "theoretical_loss": 5.4311132295926345, + "tokens_seen": 41549824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986860581745236, + "loss": 4.6165, + "theoretical_loss": 5.42977984324857, + "tokens_seen": 41615360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986760280842528, + "loss": 4.49, + "theoretical_loss": 5.428449141970107, + "tokens_seen": 41680896 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498665997993982, + "loss": 4.5819, + "theoretical_loss": 5.427121116141212, + "tokens_seen": 41746432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986559679037111, + "loss": 4.7141, + "theoretical_loss": 5.42579575619531, + "tokens_seen": 41811968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986459378134403, + "loss": 4.6584, + "theoretical_loss": 5.424473052614967, + "tokens_seen": 41877504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986359077231695, + "loss": 4.7053, + "theoretical_loss": 5.423152995931552, + "tokens_seen": 41943040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986258776328987, + "loss": 4.677, + "theoretical_loss": 5.421835576724906, + "tokens_seen": 42008576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986158475426279, + "loss": 4.5363, + "theoretical_loss": 5.420520785623031, + "tokens_seen": 42074112 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498605817452357, + "loss": 4.4347, + "theoretical_loss": 5.4192086133017625, + "tokens_seen": 42139648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985957873620862, + "loss": 4.5436, + "theoretical_loss": 5.417899050484451, + "tokens_seen": 42205184 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985857572718154, + "loss": 4.5196, + "theoretical_loss": 5.416592087941646, + "tokens_seen": 42270720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985757271815446, + "loss": 4.474, + "theoretical_loss": 5.415287716490787, + "tokens_seen": 42336256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985656970912738, + "loss": 4.9763, + "theoretical_loss": 5.413985926995892, + "tokens_seen": 42401792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985556670010031, + "loss": 4.4879, + "theoretical_loss": 5.412686710367245, + "tokens_seen": 42467328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985456369107321, + "loss": 4.7709, + "theoretical_loss": 5.411390057561097, + "tokens_seen": 42532864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 35868, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.189578056335449, + "objective/train/theoretical_loss": 5.410095959579362, + "objective/train/tokens_used": 63058400, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985356068204614, + "loss": 4.5885, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985255767301905, + "loss": 4.4367, + "theoretical_loss": 5.408804407469308, + "tokens_seen": 42663936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985155466399198, + "loss": 4.697, + "theoretical_loss": 5.407515392323276, + "tokens_seen": 42729472 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498505516549649, + "loss": 4.6402, + "theoretical_loss": 5.406228905278368, + "tokens_seen": 42795008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984954864593782, + "loss": 4.743, + "theoretical_loss": 5.404944937516161, + "tokens_seen": 42860544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984854563691073, + "loss": 4.4767, + "theoretical_loss": 5.403663480262418, + "tokens_seen": 42926080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984754262788365, + "loss": 4.2795, + "theoretical_loss": 5.402384524786797, + "tokens_seen": 42991616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984653961885657, + "loss": 4.5048, + "theoretical_loss": 5.401108062402562, + "tokens_seen": 43057152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984553660982949, + "loss": 4.285, + "theoretical_loss": 5.399834084466306, + "tokens_seen": 43122688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984453360080241, + "loss": 4.5244, + "theoretical_loss": 5.398562582377666, + "tokens_seen": 43188224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984353059177532, + "loss": 4.4912, + "theoretical_loss": 5.397293547579041, + "tokens_seen": 43253760 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984252758274825, + "loss": 4.5286, + "theoretical_loss": 5.396026971555319, + "tokens_seen": 43319296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984152457372116, + "loss": 4.635, + "theoretical_loss": 5.394762845833601, + "tokens_seen": 43384832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984052156469409, + "loss": 4.6563, + "theoretical_loss": 5.393501161982926, + "tokens_seen": 43450368 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049839518555667, + "loss": 4.5338, + "theoretical_loss": 5.392241911614005, + "tokens_seen": 43515904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983851554663993, + "loss": 4.6497, + "theoretical_loss": 5.390985086378949, + "tokens_seen": 43581440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983751253761284, + "loss": 4.656, + "theoretical_loss": 5.389730677971002, + "tokens_seen": 43646976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983650952858576, + "loss": 4.2885, + "theoretical_loss": 5.388478678124285, + "tokens_seen": 43712512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983550651955868, + "loss": 4.6653, + "theoretical_loss": 5.387229078613521, + "tokens_seen": 43778048 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498345035105316, + "loss": 4.8537, + "theoretical_loss": 5.385981871253785, + "tokens_seen": 43843584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983350050150452, + "loss": 4.5315, + "theoretical_loss": 5.384737047900243, + "tokens_seen": 43909120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983249749247743, + "loss": 4.4477, + "theoretical_loss": 5.3834946004478965, + "tokens_seen": 43974656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983149448345035, + "loss": 4.2702, + "theoretical_loss": 5.382254520831328, + "tokens_seen": 44040192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983049147442327, + "loss": 4.4734, + "theoretical_loss": 5.381016801024449, + "tokens_seen": 44105728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982948846539619, + "loss": 4.5673, + "theoretical_loss": 5.379781433040252, + "tokens_seen": 44171264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 36544, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.2153215408325195, + "objective/train/theoretical_loss": 5.378548408930558, + "objective/train/tokens_used": 64696800, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982848545636911, + "loss": 4.3754, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982748244734202, + "loss": 4.6866, + "theoretical_loss": 5.377317720785777, + "tokens_seen": 44302336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982647943831494, + "loss": 4.2689, + "theoretical_loss": 5.37608936073466, + "tokens_seen": 44367872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982547642928786, + "loss": 4.558, + "theoretical_loss": 5.374863320944057, + "tokens_seen": 44433408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982447342026079, + "loss": 4.3621, + "theoretical_loss": 5.373639593618675, + "tokens_seen": 44498944 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498234704112337, + "loss": 4.7059, + "theoretical_loss": 5.372418171000847, + "tokens_seen": 44564480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982246740220663, + "loss": 4.4685, + "theoretical_loss": 5.371199045370283, + "tokens_seen": 44630016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982146439317953, + "loss": 4.502, + "theoretical_loss": 5.369982209043851, + "tokens_seen": 44695552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982046138415246, + "loss": 4.5419, + "theoretical_loss": 5.368767654375327, + "tokens_seen": 44761088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981945837512538, + "loss": 4.6487, + "theoretical_loss": 5.367555373755179, + "tokens_seen": 44826624 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498184553660983, + "loss": 4.5139, + "theoretical_loss": 5.366345359610327, + "tokens_seen": 44892160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981745235707122, + "loss": 4.6486, + "theoretical_loss": 5.365137604403923, + "tokens_seen": 44957696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981644934804413, + "loss": 4.6411, + "theoretical_loss": 5.363932100635117, + "tokens_seen": 45023232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981544633901705, + "loss": 4.4685, + "theoretical_loss": 5.362728840838843, + "tokens_seen": 45088768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981444332998997, + "loss": 4.3304, + "theoretical_loss": 5.361527817585586, + "tokens_seen": 45154304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981344032096289, + "loss": 4.5782, + "theoretical_loss": 5.360329023481169, + "tokens_seen": 45219840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981243731193581, + "loss": 4.4319, + "theoretical_loss": 5.359132451166534, + "tokens_seen": 45285376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981143430290873, + "loss": 4.6029, + "theoretical_loss": 5.357938093317518, + "tokens_seen": 45350912 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981043129388164, + "loss": 4.458, + "theoretical_loss": 5.356745942644645, + "tokens_seen": 45416448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980942828485456, + "loss": 4.3535, + "theoretical_loss": 5.355555991892905, + "tokens_seen": 45481984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980842527582748, + "loss": 4.826, + "theoretical_loss": 5.35436823384155, + "tokens_seen": 45547520 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498074222668004, + "loss": 4.6619, + "theoretical_loss": 5.353182661303873, + "tokens_seen": 45613056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980641925777333, + "loss": 4.3435, + "theoretical_loss": 5.35199926712701, + "tokens_seen": 45678592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980541624874623, + "loss": 4.7139, + "theoretical_loss": 5.350818044191721, + "tokens_seen": 45744128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980441323971916, + "loss": 4.5414, + "theoretical_loss": 5.349638985412193, + "tokens_seen": 45809664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 37915, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.255738258361816, + "objective/train/theoretical_loss": 5.348462083735834, + "objective/train/tokens_used": 66335200, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980341023069207, + "loss": 4.2427, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049802407221665, + "loss": 4.4766, + "theoretical_loss": 5.347287332143064, + "tokens_seen": 45940736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980140421263792, + "loss": 4.6487, + "theoretical_loss": 5.346114723647119, + "tokens_seen": 46006272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980040120361084, + "loss": 4.744, + "theoretical_loss": 5.344944251293852, + "tokens_seen": 46071808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979939819458375, + "loss": 4.4822, + "theoretical_loss": 5.343775908161532, + "tokens_seen": 46137344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979839518555667, + "loss": 4.6656, + "theoretical_loss": 5.342609687360644, + "tokens_seen": 46202880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979739217652959, + "loss": 4.368, + "theoretical_loss": 5.341445582033705, + "tokens_seen": 46268416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979638916750251, + "loss": 4.5362, + "theoretical_loss": 5.3402835853550545, + "tokens_seen": 46333952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979538615847543, + "loss": 4.5253, + "theoretical_loss": 5.339123690530673, + "tokens_seen": 46399488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979438314944834, + "loss": 4.5891, + "theoretical_loss": 5.337965890797989, + "tokens_seen": 46465024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979338014042126, + "loss": 4.3895, + "theoretical_loss": 5.336810179425685, + "tokens_seen": 46530560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979237713139418, + "loss": 4.7045, + "theoretical_loss": 5.335656549713516, + "tokens_seen": 46596096 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497913741223671, + "loss": 4.3343, + "theoretical_loss": 5.334504994992115, + "tokens_seen": 46661632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979037111334002, + "loss": 4.5711, + "theoretical_loss": 5.333355508622814, + "tokens_seen": 46727168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978936810431293, + "loss": 4.5964, + "theoretical_loss": 5.332208083997459, + "tokens_seen": 46792704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978836509528586, + "loss": 4.3615, + "theoretical_loss": 5.33106271453822, + "tokens_seen": 46858240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978736208625877, + "loss": 4.6588, + "theoretical_loss": 5.329919393697422, + "tokens_seen": 46923776 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497863590772317, + "loss": 4.8373, + "theoretical_loss": 5.328778114957351, + "tokens_seen": 46989312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978535606820461, + "loss": 4.2761, + "theoretical_loss": 5.327638871830089, + "tokens_seen": 47054848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978435305917754, + "loss": 4.4197, + "theoretical_loss": 5.326501657857326, + "tokens_seen": 47120384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978335005015045, + "loss": 4.435, + "theoretical_loss": 5.32536646661019, + "tokens_seen": 47185920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978234704112337, + "loss": 4.6838, + "theoretical_loss": 5.324233291689069, + "tokens_seen": 47251456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978134403209629, + "loss": 4.3913, + "theoretical_loss": 5.323102126723439, + "tokens_seen": 47316992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978034102306921, + "loss": 4.6, + "theoretical_loss": 5.321972965371691, + "tokens_seen": 47382528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977933801404213, + "loss": 4.1382, + "theoretical_loss": 5.320845801320959, + "tokens_seen": 47448064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 38651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.045684814453125, + "objective/train/theoretical_loss": 5.319720628286955, + "objective/train/tokens_used": 67973600, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977833500501504, + "loss": 4.343, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977733199598796, + "loss": 4.3051, + "theoretical_loss": 5.318597440013795, + "tokens_seen": 47579136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977632898696088, + "loss": 4.6262, + "theoretical_loss": 5.317476230273831, + "tokens_seen": 47644672 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497753259779338, + "loss": 4.5661, + "theoretical_loss": 5.316356992867491, + "tokens_seen": 47710208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977432296890672, + "loss": 4.5954, + "theoretical_loss": 5.31523972162311, + "tokens_seen": 47775744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977331995987965, + "loss": 4.4414, + "theoretical_loss": 5.314124410396767, + "tokens_seen": 47841280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977231695085255, + "loss": 4.7758, + "theoretical_loss": 5.31301105307212, + "tokens_seen": 47906816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977131394182548, + "loss": 4.603, + "theoretical_loss": 5.311899643560251, + "tokens_seen": 47972352 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497703109327984, + "loss": 4.6245, + "theoretical_loss": 5.310790175799497, + "tokens_seen": 48037888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976930792377132, + "loss": 4.5712, + "theoretical_loss": 5.3096826437553, + "tokens_seen": 48103424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976830491474424, + "loss": 4.5459, + "theoretical_loss": 5.308577041420046, + "tokens_seen": 48168960 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976730190571715, + "loss": 4.6043, + "theoretical_loss": 5.3074733628129005, + "tokens_seen": 48234496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976629889669007, + "loss": 4.5158, + "theoretical_loss": 5.3063716019796665, + "tokens_seen": 48300032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976529588766299, + "loss": 4.4244, + "theoretical_loss": 5.305271752992619, + "tokens_seen": 48365568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976429287863591, + "loss": 4.5599, + "theoretical_loss": 5.304173809950358, + "tokens_seen": 48431104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976328986960883, + "loss": 4.5297, + "theoretical_loss": 5.303077766977653, + "tokens_seen": 48496640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976228686058175, + "loss": 4.3215, + "theoretical_loss": 5.3019836182252895, + "tokens_seen": 48562176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976128385155466, + "loss": 4.3245, + "theoretical_loss": 5.300891357869929, + "tokens_seen": 48627712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976028084252758, + "loss": 4.2442, + "theoretical_loss": 5.299800980113945, + "tokens_seen": 48693248 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497592778335005, + "loss": 4.2681, + "theoretical_loss": 5.298712479185288, + "tokens_seen": 48758784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975827482447342, + "loss": 4.5281, + "theoretical_loss": 5.297625849337331, + "tokens_seen": 48824320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975727181544635, + "loss": 4.5414, + "theoretical_loss": 5.296541084848727, + "tokens_seen": 48889856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975626880641925, + "loss": 4.3081, + "theoretical_loss": 5.295458180023262, + "tokens_seen": 48955392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975526579739218, + "loss": 4.1252, + "theoretical_loss": 5.294377129189715, + "tokens_seen": 49020928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975426278836509, + "loss": 4.3003, + "theoretical_loss": 5.293297926701706, + "tokens_seen": 49086464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 39958, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.954103946685791, + "objective/train/theoretical_loss": 5.292220566937567, + "objective/train/tokens_used": 69612000, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975325977933802, + "loss": 4.5056, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975225677031094, + "loss": 4.549, + "theoretical_loss": 5.29114504430019, + "tokens_seen": 49217536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975125376128386, + "loss": 4.2312, + "theoretical_loss": 5.290071353216895, + "tokens_seen": 49283072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975025075225677, + "loss": 4.4122, + "theoretical_loss": 5.288999488139284, + "tokens_seen": 49348608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974924774322969, + "loss": 4.4153, + "theoretical_loss": 5.28792944354311, + "tokens_seen": 49414144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974824473420261, + "loss": 4.5622, + "theoretical_loss": 5.286861213928137, + "tokens_seen": 49479680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974724172517553, + "loss": 4.4577, + "theoretical_loss": 5.285794793817999, + "tokens_seen": 49545216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974623871614845, + "loss": 4.3913, + "theoretical_loss": 5.284730177760077, + "tokens_seen": 49610752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974523570712136, + "loss": 4.0983, + "theoretical_loss": 5.283667360325351, + "tokens_seen": 49676288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974423269809428, + "loss": 4.3874, + "theoretical_loss": 5.2826063361082785, + "tokens_seen": 49741824 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497432296890672, + "loss": 4.4498, + "theoretical_loss": 5.281547099726654, + "tokens_seen": 49807360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974222668004012, + "loss": 4.2229, + "theoretical_loss": 5.280489645821483, + "tokens_seen": 49872896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974122367101304, + "loss": 4.4956, + "theoretical_loss": 5.279433969056848, + "tokens_seen": 49938432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974022066198595, + "loss": 4.2938, + "theoretical_loss": 5.278380064119782, + "tokens_seen": 50003968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973921765295888, + "loss": 4.2649, + "theoretical_loss": 5.277327925720137, + "tokens_seen": 50069504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973821464393179, + "loss": 4.6124, + "theoretical_loss": 5.276277548590457, + "tokens_seen": 50135040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973721163490472, + "loss": 4.4276, + "theoretical_loss": 5.275228927485855, + "tokens_seen": 50200576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973620862587763, + "loss": 4.4451, + "theoretical_loss": 5.2741820571838804, + "tokens_seen": 50266112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973520561685056, + "loss": 4.4044, + "theoretical_loss": 5.273136932484399, + "tokens_seen": 50331648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973420260782347, + "loss": 4.1472, + "theoretical_loss": 5.272093548209467, + "tokens_seen": 50397184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973319959879639, + "loss": 4.3286, + "theoretical_loss": 5.271051899203207, + "tokens_seen": 50462720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973219658976931, + "loss": 4.4202, + "theoretical_loss": 5.270011980331685, + "tokens_seen": 50528256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973119358074223, + "loss": 4.3812, + "theoretical_loss": 5.268973786482794, + "tokens_seen": 50593792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973019057171515, + "loss": 4.4977, + "theoretical_loss": 5.267937312566123, + "tokens_seen": 50659328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972918756268806, + "loss": 4.4049, + "theoretical_loss": 5.266902553512847, + "tokens_seen": 50724864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 40584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.171596527099609, + "objective/train/theoretical_loss": 5.265869504275602, + "objective/train/tokens_used": 71250400, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972818455366098, + "loss": 4.5105, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497271815446339, + "loss": 4.2898, + "theoretical_loss": 5.264838159828369, + "tokens_seen": 50855936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972617853560682, + "loss": 4.5284, + "theoretical_loss": 5.263808515166355, + "tokens_seen": 50921472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972517552657974, + "loss": 4.1863, + "theoretical_loss": 5.262780565305875, + "tokens_seen": 50987008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972417251755266, + "loss": 4.1704, + "theoretical_loss": 5.261754305284241, + "tokens_seen": 51052544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972316950852557, + "loss": 4.4493, + "theoretical_loss": 5.260729730159641, + "tokens_seen": 51118080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972216649949849, + "loss": 4.1237, + "theoretical_loss": 5.259706835011027, + "tokens_seen": 51183616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972116349047142, + "loss": 4.3399, + "theoretical_loss": 5.2586856149380035, + "tokens_seen": 51249152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972016048144433, + "loss": 4.0849, + "theoretical_loss": 5.257666065060709, + "tokens_seen": 51314688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971915747241726, + "loss": 4.4541, + "theoretical_loss": 5.256648180519708, + "tokens_seen": 51380224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971815446339017, + "loss": 4.353, + "theoretical_loss": 5.255631956475881, + "tokens_seen": 51445760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971715145436309, + "loss": 4.2266, + "theoretical_loss": 5.25461738811031, + "tokens_seen": 51511296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971614844533601, + "loss": 4.4598, + "theoretical_loss": 5.25360447062417, + "tokens_seen": 51576832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971514543630893, + "loss": 4.5701, + "theoretical_loss": 5.252593199238619, + "tokens_seen": 51642368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971414242728185, + "loss": 4.2901, + "theoretical_loss": 5.2515835691946915, + "tokens_seen": 51707904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971313941825477, + "loss": 4.3521, + "theoretical_loss": 5.2505755757531904, + "tokens_seen": 51773440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971213640922768, + "loss": 4.3861, + "theoretical_loss": 5.24956921419458, + "tokens_seen": 51838976 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497111334002006, + "loss": 4.3723, + "theoretical_loss": 5.248564479818876, + "tokens_seen": 51904512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971013039117352, + "loss": 4.1067, + "theoretical_loss": 5.247561367945544, + "tokens_seen": 51970048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970912738214644, + "loss": 4.3448, + "theoretical_loss": 5.246559873913396, + "tokens_seen": 52035584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970812437311936, + "loss": 4.2188, + "theoretical_loss": 5.245559993080484, + "tokens_seen": 52101120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970712136409227, + "loss": 4.0193, + "theoretical_loss": 5.24456172082399, + "tokens_seen": 52166656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970611835506519, + "loss": 4.537, + "theoretical_loss": 5.243565052540136, + "tokens_seen": 52232192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970511534603811, + "loss": 4.4592, + "theoretical_loss": 5.242569983644074, + "tokens_seen": 52297728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970411233701103, + "loss": 4.2549, + "theoretical_loss": 5.241576509569784, + "tokens_seen": 52363264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 41941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.2740397453308105, + "objective/train/theoretical_loss": 5.240584625769978, + "objective/train/tokens_used": 72888800, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970310932798396, + "loss": 4.1928, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970210631895686, + "loss": 4.4434, + "theoretical_loss": 5.239594327715992, + "tokens_seen": 52494336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970110330992979, + "loss": 4.5166, + "theoretical_loss": 5.238605610897698, + "tokens_seen": 52559872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970010030090271, + "loss": 3.9728, + "theoretical_loss": 5.237618470823394, + "tokens_seen": 52625408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969909729187563, + "loss": 4.3201, + "theoretical_loss": 5.2366329030197125, + "tokens_seen": 52690944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969809428284855, + "loss": 4.236, + "theoretical_loss": 5.235648903031521, + "tokens_seen": 52756480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969709127382147, + "loss": 4.3479, + "theoretical_loss": 5.2346664664218245, + "tokens_seen": 52822016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969608826479438, + "loss": 4.3036, + "theoretical_loss": 5.233685588771669, + "tokens_seen": 52887552 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496950852557673, + "loss": 4.3347, + "theoretical_loss": 5.232706265680049, + "tokens_seen": 52953088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969408224674022, + "loss": 4.3565, + "theoretical_loss": 5.231728492763811, + "tokens_seen": 53018624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969307923771314, + "loss": 4.6295, + "theoretical_loss": 5.230752265657554, + "tokens_seen": 53084160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969207622868606, + "loss": 4.3937, + "theoretical_loss": 5.229777580013545, + "tokens_seen": 53149696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969107321965897, + "loss": 4.4336, + "theoretical_loss": 5.228804431501619, + "tokens_seen": 53215232 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496900702106319, + "loss": 4.2758, + "theoretical_loss": 5.227832815809087, + "tokens_seen": 53280768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968906720160481, + "loss": 4.1819, + "theoretical_loss": 5.226862728640651, + "tokens_seen": 53346304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968806419257774, + "loss": 4.365, + "theoretical_loss": 5.2258941657183, + "tokens_seen": 53411840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968706118355065, + "loss": 4.316, + "theoretical_loss": 5.2249271227812315, + "tokens_seen": 53477376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968605817452358, + "loss": 4.0596, + "theoretical_loss": 5.223961595585755, + "tokens_seen": 53542912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968505516549649, + "loss": 4.3552, + "theoretical_loss": 5.222997579905204, + "tokens_seen": 53608448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968405215646941, + "loss": 4.435, + "theoretical_loss": 5.222035071529845, + "tokens_seen": 53673984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968304914744233, + "loss": 4.4653, + "theoretical_loss": 5.2210740662667945, + "tokens_seen": 53739520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968204613841525, + "loss": 4.4802, + "theoretical_loss": 5.220114559939923, + "tokens_seen": 53805056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968104312938817, + "loss": 4.3459, + "theoretical_loss": 5.219156548389775, + "tokens_seen": 53870592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968004012036108, + "loss": 4.4126, + "theoretical_loss": 5.218200027473481, + "tokens_seen": 53936128 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049679037111334, + "loss": 3.9646, + "theoretical_loss": 5.217244993064664, + "tokens_seen": 54001664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 42708, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.427297115325928, + "objective/train/theoretical_loss": 5.216291441053366, + "objective/train/tokens_used": 74527200, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967803410230692, + "loss": 4.2018, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967703109327984, + "loss": 4.5827, + "theoretical_loss": 5.215339367345955, + "tokens_seen": 54132736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967602808425276, + "loss": 4.4737, + "theoretical_loss": 5.214388767865036, + "tokens_seen": 54198272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967502507522568, + "loss": 4.2806, + "theoretical_loss": 5.2134396385493815, + "tokens_seen": 54263808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967402206619859, + "loss": 4.5709, + "theoretical_loss": 5.212491975353835, + "tokens_seen": 54329344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967301905717151, + "loss": 4.2733, + "theoretical_loss": 5.211545774249233, + "tokens_seen": 54394880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967201604814444, + "loss": 4.1417, + "theoretical_loss": 5.210601031222324, + "tokens_seen": 54460416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967101303911735, + "loss": 4.3607, + "theoretical_loss": 5.209657742275683, + "tokens_seen": 54525952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967001003009028, + "loss": 4.3571, + "theoretical_loss": 5.208715903427631, + "tokens_seen": 54591488 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496690070210632, + "loss": 4.1888, + "theoretical_loss": 5.207775510712159, + "tokens_seen": 54657024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966800401203611, + "loss": 4.3009, + "theoretical_loss": 5.2068365601788384, + "tokens_seen": 54722560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966700100300903, + "loss": 4.217, + "theoretical_loss": 5.205899047892753, + "tokens_seen": 54788096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966599799398195, + "loss": 4.2894, + "theoretical_loss": 5.2049629699344075, + "tokens_seen": 54853632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966499498495487, + "loss": 4.1144, + "theoretical_loss": 5.204028322399658, + "tokens_seen": 54919168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966399197592779, + "loss": 4.0488, + "theoretical_loss": 5.203095101399628, + "tokens_seen": 54984704 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496629889669007, + "loss": 4.272, + "theoretical_loss": 5.202163303060633, + "tokens_seen": 55050240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966198595787362, + "loss": 4.055, + "theoretical_loss": 5.201232923524104, + "tokens_seen": 55115776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966098294884654, + "loss": 4.5155, + "theoretical_loss": 5.20030395894651, + "tokens_seen": 55181312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965997993981946, + "loss": 4.467, + "theoretical_loss": 5.199376405499277, + "tokens_seen": 55246848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965897693079238, + "loss": 4.1735, + "theoretical_loss": 5.198450259368721, + "tokens_seen": 55312384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965797392176529, + "loss": 4.4801, + "theoretical_loss": 5.197525516755965, + "tokens_seen": 55377920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965697091273821, + "loss": 4.0844, + "theoretical_loss": 5.196602173876867, + "tokens_seen": 55443456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965596790371113, + "loss": 4.2085, + "theoretical_loss": 5.195680226961947, + "tokens_seen": 55508992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965496489468405, + "loss": 4.1159, + "theoretical_loss": 5.194759672256309, + "tokens_seen": 55574528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965396188565698, + "loss": 4.256, + "theoretical_loss": 5.19384050601957, + "tokens_seen": 55640064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 44116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.755551815032959, + "objective/train/theoretical_loss": 5.192922724525789, + "objective/train/tokens_used": 76165600, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965295887662988, + "loss": 4.1988, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965195586760281, + "loss": 4.4217, + "theoretical_loss": 5.19200632406339, + "tokens_seen": 55771136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965095285857573, + "loss": 4.2675, + "theoretical_loss": 5.19109130093509, + "tokens_seen": 55836672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964994984954865, + "loss": 4.1937, + "theoretical_loss": 5.190177651457833, + "tokens_seen": 55902208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964894684052157, + "loss": 4.4199, + "theoretical_loss": 5.189265371962712, + "tokens_seen": 55967744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964794383149449, + "loss": 3.8862, + "theoretical_loss": 5.188354458794902, + "tokens_seen": 56033280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496469408224674, + "loss": 4.3709, + "theoretical_loss": 5.187444908313586, + "tokens_seen": 56098816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964593781344032, + "loss": 3.9035, + "theoretical_loss": 5.186536716891892, + "tokens_seen": 56164352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964493480441324, + "loss": 4.1296, + "theoretical_loss": 5.185629880916814, + "tokens_seen": 56229888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964393179538616, + "loss": 4.0278, + "theoretical_loss": 5.18472439678915, + "tokens_seen": 56295424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964292878635908, + "loss": 4.1566, + "theoretical_loss": 5.18382026092343, + "tokens_seen": 56360960 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049641925777332, + "loss": 4.2582, + "theoretical_loss": 5.182917469747851, + "tokens_seen": 56426496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964092276830491, + "loss": 3.9274, + "theoretical_loss": 5.182016019704204, + "tokens_seen": 56492032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963991975927783, + "loss": 4.1733, + "theoretical_loss": 5.1811159072478095, + "tokens_seen": 56557568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963891675025075, + "loss": 4.1092, + "theoretical_loss": 5.180217128847451, + "tokens_seen": 56623104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963791374122367, + "loss": 3.9516, + "theoretical_loss": 5.17931968098531, + "tokens_seen": 56688640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963691073219659, + "loss": 4.113, + "theoretical_loss": 5.178423560156894, + "tokens_seen": 56754176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963590772316951, + "loss": 4.232, + "theoretical_loss": 5.177528762870973, + "tokens_seen": 56819712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963490471414242, + "loss": 4.2272, + "theoretical_loss": 5.176635285649521, + "tokens_seen": 56885248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963390170511535, + "loss": 4.1245, + "theoretical_loss": 5.175743125027638, + "tokens_seen": 56950784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963289869608827, + "loss": 4.2108, + "theoretical_loss": 5.174852277553498, + "tokens_seen": 57016320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963189568706119, + "loss": 4.4959, + "theoretical_loss": 5.173962739788276, + "tokens_seen": 57081856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496308926780341, + "loss": 4.3544, + "theoretical_loss": 5.17307450830609, + "tokens_seen": 57147392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962988966900702, + "loss": 4.1915, + "theoretical_loss": 5.172187579693933, + "tokens_seen": 57212928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962888665997994, + "loss": 4.1823, + "theoretical_loss": 5.1713019505516105, + "tokens_seen": 57278464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 44698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.967449188232422, + "objective/train/theoretical_loss": 5.170417617491682, + "objective/train/tokens_used": 77804000, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962788365095286, + "loss": 4.3458, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962688064192578, + "loss": 4.5212, + "theoretical_loss": 5.169534577139395, + "tokens_seen": 57409536 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496258776328987, + "loss": 4.2458, + "theoretical_loss": 5.168652826132623, + "tokens_seen": 57475072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962487462387161, + "loss": 4.1007, + "theoretical_loss": 5.167772361121805, + "tokens_seen": 57540608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962387161484453, + "loss": 4.2059, + "theoretical_loss": 5.166893178769884, + "tokens_seen": 57606144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962286860581746, + "loss": 4.1405, + "theoretical_loss": 5.1660152757522475, + "tokens_seen": 57671680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962186559679037, + "loss": 4.0437, + "theoretical_loss": 5.165138648756665, + "tokens_seen": 57737216 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496208625877633, + "loss": 3.8546, + "theoretical_loss": 5.164263294483226, + "tokens_seen": 57802752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961985957873621, + "loss": 3.9834, + "theoretical_loss": 5.163389209644287, + "tokens_seen": 57868288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961885656970913, + "loss": 3.9333, + "theoretical_loss": 5.162516390964408, + "tokens_seen": 57933824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961785356068205, + "loss": 4.0711, + "theoretical_loss": 5.1616448351802875, + "tokens_seen": 57999360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961685055165497, + "loss": 4.2007, + "theoretical_loss": 5.160774539040716, + "tokens_seen": 58064896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961584754262789, + "loss": 4.1938, + "theoretical_loss": 5.159905499306511, + "tokens_seen": 58130432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961484453360081, + "loss": 4.0338, + "theoretical_loss": 5.159037712750455, + "tokens_seen": 58195968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961384152457372, + "loss": 4.1705, + "theoretical_loss": 5.158171176157245, + "tokens_seen": 58261504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961283851554664, + "loss": 4.2802, + "theoretical_loss": 5.157305886323435, + "tokens_seen": 58327040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961183550651956, + "loss": 4.2534, + "theoretical_loss": 5.156441840057371, + "tokens_seen": 58392576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961083249749248, + "loss": 4.072, + "theoretical_loss": 5.155579034179144, + "tokens_seen": 58458112 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496098294884654, + "loss": 4.2144, + "theoretical_loss": 5.15471746552053, + "tokens_seen": 58523648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960882647943831, + "loss": 4.1997, + "theoretical_loss": 5.153857130924929, + "tokens_seen": 58589184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960782347041123, + "loss": 4.2174, + "theoretical_loss": 5.1529980272473175, + "tokens_seen": 58654720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960682046138415, + "loss": 4.3491, + "theoretical_loss": 5.152140151354191, + "tokens_seen": 58720256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960581745235707, + "loss": 4.1085, + "theoretical_loss": 5.151283500123505, + "tokens_seen": 58785792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960481444333, + "loss": 4.1087, + "theoretical_loss": 5.150428070444621, + "tokens_seen": 58851328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496038114343029, + "loss": 4.0075, + "theoretical_loss": 5.149573859218261, + "tokens_seen": 58916864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 46033, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9614341259002686, + "objective/train/theoretical_loss": 5.1487208633564405, + "objective/train/tokens_used": 79442400, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960280842527583, + "loss": 4.125, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960180541624875, + "loss": 4.0149, + "theoretical_loss": 5.147869079782423, + "tokens_seen": 59047936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960080240722167, + "loss": 4.2106, + "theoretical_loss": 5.147018505430666, + "tokens_seen": 59113472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959979939819459, + "loss": 4.2445, + "theoretical_loss": 5.146169137246765, + "tokens_seen": 59179008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959879638916751, + "loss": 4.2846, + "theoretical_loss": 5.145320972187402, + "tokens_seen": 59244544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959779338014042, + "loss": 4.0537, + "theoretical_loss": 5.144474007220293, + "tokens_seen": 59310080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959679037111334, + "loss": 4.3207, + "theoretical_loss": 5.143628239324139, + "tokens_seen": 59375616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959578736208626, + "loss": 4.0034, + "theoretical_loss": 5.142783665488567, + "tokens_seen": 59441152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959478435305918, + "loss": 3.8394, + "theoretical_loss": 5.1419402827140885, + "tokens_seen": 59506688 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495937813440321, + "loss": 4.0548, + "theoretical_loss": 5.141098088012036, + "tokens_seen": 59572224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959277833500501, + "loss": 3.6982, + "theoretical_loss": 5.140257078404524, + "tokens_seen": 59637760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959177532597793, + "loss": 4.1571, + "theoretical_loss": 5.13941725092439, + "tokens_seen": 59703296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959077231695085, + "loss": 4.1842, + "theoretical_loss": 5.138578602615146, + "tokens_seen": 59768832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958976930792377, + "loss": 4.2649, + "theoretical_loss": 5.137741130530934, + "tokens_seen": 59834368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958876629889669, + "loss": 4.1585, + "theoretical_loss": 5.1369048317364685, + "tokens_seen": 59899904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495877632898696, + "loss": 4.1038, + "theoretical_loss": 5.13606970330699, + "tokens_seen": 59965440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958676028084253, + "loss": 4.0452, + "theoretical_loss": 5.135235742328217, + "tokens_seen": 60030976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958575727181544, + "loss": 4.2665, + "theoretical_loss": 5.134402945896297, + "tokens_seen": 60096512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958475426278837, + "loss": 4.0482, + "theoretical_loss": 5.133571311117755, + "tokens_seen": 60162048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958375125376129, + "loss": 4.1642, + "theoretical_loss": 5.132740835109448, + "tokens_seen": 60227584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958274824473421, + "loss": 4.2918, + "theoretical_loss": 5.131911514998518, + "tokens_seen": 60293120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958174523570712, + "loss": 4.0217, + "theoretical_loss": 5.131083347922338, + "tokens_seen": 60358656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958074222668004, + "loss": 4.0733, + "theoretical_loss": 5.130256331028474, + "tokens_seen": 60424192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957973921765296, + "loss": 4.5194, + "theoretical_loss": 5.129430461474628, + "tokens_seen": 60489728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957873620862588, + "loss": 4.0056, + "theoretical_loss": 5.128605736428597, + "tokens_seen": 60555264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 46630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.424596786499023, + "objective/train/theoretical_loss": 5.127782153068225, + "objective/train/tokens_used": 81080800, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495777331995988, + "loss": 4.4411, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957673019057172, + "loss": 3.919, + "theoretical_loss": 5.126959708581356, + "tokens_seen": 60686336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957572718154463, + "loss": 4.032, + "theoretical_loss": 5.1261384001657895, + "tokens_seen": 60751872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957472417251755, + "loss": 4.1885, + "theoretical_loss": 5.125318225029231, + "tokens_seen": 60817408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957372116349047, + "loss": 4.0579, + "theoretical_loss": 5.124499180389249, + "tokens_seen": 60882944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957271815446339, + "loss": 3.8165, + "theoretical_loss": 5.12368126347323, + "tokens_seen": 60948480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957171514543631, + "loss": 3.8252, + "theoretical_loss": 5.122864471518334, + "tokens_seen": 61014016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957071213640923, + "loss": 4.0573, + "theoretical_loss": 5.122048801771443, + "tokens_seen": 61079552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956970912738214, + "loss": 4.0186, + "theoretical_loss": 5.121234251489128, + "tokens_seen": 61145088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956870611835507, + "loss": 4.0424, + "theoretical_loss": 5.120420817937591, + "tokens_seen": 61210624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956770310932798, + "loss": 4.2235, + "theoretical_loss": 5.119608498392633, + "tokens_seen": 61276160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956670010030091, + "loss": 3.7588, + "theoretical_loss": 5.118797290139605, + "tokens_seen": 61341696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956569709127383, + "loss": 4.0559, + "theoretical_loss": 5.117987190473361, + "tokens_seen": 61407232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956469408224674, + "loss": 4.3216, + "theoretical_loss": 5.1171781966982195, + "tokens_seen": 61472768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956369107321966, + "loss": 3.99, + "theoretical_loss": 5.116370306127921, + "tokens_seen": 61538304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956268806419258, + "loss": 3.96, + "theoretical_loss": 5.11556351608558, + "tokens_seen": 61603840 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495616850551655, + "loss": 3.9305, + "theoretical_loss": 5.114757823903647, + "tokens_seen": 61669376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956068204613842, + "loss": 3.885, + "theoretical_loss": 5.113953226923864, + "tokens_seen": 61734912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955967903711133, + "loss": 4.2558, + "theoretical_loss": 5.113149722497221, + "tokens_seen": 61800448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955867602808425, + "loss": 4.0224, + "theoretical_loss": 5.112347307983919, + "tokens_seen": 61865984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955767301905717, + "loss": 4.0078, + "theoretical_loss": 5.111545980753322, + "tokens_seen": 61931520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955667001003009, + "loss": 4.2075, + "theoretical_loss": 5.110745738183919, + "tokens_seen": 61997056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955566700100301, + "loss": 4.2822, + "theoretical_loss": 5.109946577663284, + "tokens_seen": 62062592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955466399197592, + "loss": 3.8552, + "theoretical_loss": 5.109148496588032, + "tokens_seen": 62128128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955366098294884, + "loss": 4.1157, + "theoretical_loss": 5.108351492363779, + "tokens_seen": 62193664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 47896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.253730773925781, + "objective/train/theoretical_loss": 5.107555562405102, + "objective/train/tokens_used": 82719200, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955265797392177, + "loss": 3.8586, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955165496489468, + "loss": 4.043, + "theoretical_loss": 5.106760704135499, + "tokens_seen": 62324736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955065195586761, + "loss": 3.9028, + "theoretical_loss": 5.105966914987349, + "tokens_seen": 62390272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954964894684052, + "loss": 4.1462, + "theoretical_loss": 5.1051741924018685, + "tokens_seen": 62455808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954864593781344, + "loss": 3.9346, + "theoretical_loss": 5.10438253382908, + "tokens_seen": 62521344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954764292878636, + "loss": 4.1546, + "theoretical_loss": 5.103591936727762, + "tokens_seen": 62586880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954663991975928, + "loss": 4.1114, + "theoretical_loss": 5.102802398565418, + "tokens_seen": 62652416 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495456369107322, + "loss": 4.2456, + "theoretical_loss": 5.102013916818235, + "tokens_seen": 62717952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954463390170512, + "loss": 4.1117, + "theoretical_loss": 5.101226488971042, + "tokens_seen": 62783488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954363089267803, + "loss": 3.8726, + "theoretical_loss": 5.100440112517276, + "tokens_seen": 62849024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954262788365095, + "loss": 3.9981, + "theoretical_loss": 5.09965478495894, + "tokens_seen": 62914560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954162487462387, + "loss": 4.048, + "theoretical_loss": 5.098870503806567, + "tokens_seen": 62980096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954062186559679, + "loss": 3.8959, + "theoretical_loss": 5.09808726657918, + "tokens_seen": 63045632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953961885656971, + "loss": 3.7875, + "theoretical_loss": 5.097305070804255, + "tokens_seen": 63111168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953861584754263, + "loss": 4.108, + "theoretical_loss": 5.096523914017688, + "tokens_seen": 63176704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953761283851555, + "loss": 4.1691, + "theoretical_loss": 5.095743793763747, + "tokens_seen": 63242240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953660982948846, + "loss": 4.0009, + "theoretical_loss": 5.094964707595047, + "tokens_seen": 63307776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953560682046139, + "loss": 3.7485, + "theoretical_loss": 5.094186653072505, + "tokens_seen": 63373312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953460381143431, + "loss": 4.0331, + "theoretical_loss": 5.093409627765306, + "tokens_seen": 63438848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953360080240723, + "loss": 3.9678, + "theoretical_loss": 5.092633629250866, + "tokens_seen": 63504384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953259779338014, + "loss": 4.031, + "theoretical_loss": 5.091858655114796, + "tokens_seen": 63569920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953159478435306, + "loss": 4.0263, + "theoretical_loss": 5.091084702950868, + "tokens_seen": 63635456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953059177532598, + "loss": 3.9067, + "theoretical_loss": 5.090311770360971, + "tokens_seen": 63700992 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495295887662989, + "loss": 4.0007, + "theoretical_loss": 5.089539854955088, + "tokens_seen": 63766528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952858575727182, + "loss": 3.7926, + "theoretical_loss": 5.088768954351249, + "tokens_seen": 63832064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 48551, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.194922924041748, + "objective/train/theoretical_loss": 5.087999066175502, + "objective/train/tokens_used": 84357600, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952758274824474, + "loss": 4.0708, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952657973921765, + "loss": 4.289, + "theoretical_loss": 5.0872301880618735, + "tokens_seen": 63963136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952557673019057, + "loss": 4.0031, + "theoretical_loss": 5.086462317652341, + "tokens_seen": 64028672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952457372116349, + "loss": 3.7044, + "theoretical_loss": 5.085695452596788, + "tokens_seen": 64094208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952357071213641, + "loss": 3.8383, + "theoretical_loss": 5.084929590552976, + "tokens_seen": 64159744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952256770310933, + "loss": 3.8943, + "theoretical_loss": 5.0841647291865115, + "tokens_seen": 64225280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952156469408225, + "loss": 4.0131, + "theoretical_loss": 5.083400866170806, + "tokens_seen": 64290816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952056168505516, + "loss": 3.8292, + "theoretical_loss": 5.082637999187046, + "tokens_seen": 64356352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951955867602809, + "loss": 3.8947, + "theoretical_loss": 5.081876125924159, + "tokens_seen": 64421888 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049518555667001, + "loss": 3.8301, + "theoretical_loss": 5.0811152440787755, + "tokens_seen": 64487424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951755265797393, + "loss": 3.9172, + "theoretical_loss": 5.0803553513552036, + "tokens_seen": 64552960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951654964894685, + "loss": 3.9521, + "theoretical_loss": 5.079596445465386, + "tokens_seen": 64618496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951554663991976, + "loss": 3.874, + "theoretical_loss": 5.078838524128878, + "tokens_seen": 64684032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951454363089268, + "loss": 3.886, + "theoretical_loss": 5.078081585072802, + "tokens_seen": 64749568 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495135406218656, + "loss": 3.9186, + "theoretical_loss": 5.077325626031826, + "tokens_seen": 64815104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951253761283852, + "loss": 3.8648, + "theoretical_loss": 5.076570644748123, + "tokens_seen": 64880640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951153460381144, + "loss": 4.1708, + "theoretical_loss": 5.075816638971341, + "tokens_seen": 64946176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951053159478435, + "loss": 3.8125, + "theoretical_loss": 5.075063606458576, + "tokens_seen": 65011712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950952858575727, + "loss": 3.7121, + "theoretical_loss": 5.074311544974331, + "tokens_seen": 65077248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950852557673019, + "loss": 3.7576, + "theoretical_loss": 5.07356045229049, + "tokens_seen": 65142784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950752256770311, + "loss": 3.9499, + "theoretical_loss": 5.072810326186285, + "tokens_seen": 65208320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950651955867603, + "loss": 3.7499, + "theoretical_loss": 5.072061164448261, + "tokens_seen": 65273856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950551654964894, + "loss": 3.833, + "theoretical_loss": 5.071312964870252, + "tokens_seen": 65339392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950451354062186, + "loss": 4.1678, + "theoretical_loss": 5.070565725253344, + "tokens_seen": 65404928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950351053159479, + "loss": 3.7484, + "theoretical_loss": 5.069819443405842, + "tokens_seen": 65470464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 49128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.299447536468506, + "objective/train/theoretical_loss": 5.069074117143246, + "objective/train/tokens_used": 85996000, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495025075225677, + "loss": 4.2265, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950150451354063, + "loss": 3.8292, + "theoretical_loss": 5.068329744288216, + "tokens_seen": 65601536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950050150451354, + "loss": 3.9529, + "theoretical_loss": 5.067586322670541, + "tokens_seen": 65667072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949949849548646, + "loss": 3.9573, + "theoretical_loss": 5.0668438501271105, + "tokens_seen": 65732608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949849548645938, + "loss": 3.8014, + "theoretical_loss": 5.066102324501883, + "tokens_seen": 65798144 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494974924774323, + "loss": 4.0132, + "theoretical_loss": 5.065361743645855, + "tokens_seen": 65863680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949648946840522, + "loss": 3.8756, + "theoretical_loss": 5.064622105417033, + "tokens_seen": 65929216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949548645937814, + "loss": 3.938, + "theoretical_loss": 5.063883407680405, + "tokens_seen": 65994752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949448345035105, + "loss": 3.7738, + "theoretical_loss": 5.063145648307904, + "tokens_seen": 66060288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949348044132397, + "loss": 3.8514, + "theoretical_loss": 5.062408825178388, + "tokens_seen": 66125824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949247743229689, + "loss": 3.8407, + "theoretical_loss": 5.061672936177604, + "tokens_seen": 66191360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949147442326981, + "loss": 3.949, + "theoretical_loss": 5.06093797919816, + "tokens_seen": 66256896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949047141424273, + "loss": 3.9103, + "theoretical_loss": 5.060203952139497, + "tokens_seen": 66322432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948946840521565, + "loss": 3.7828, + "theoretical_loss": 5.059470852907861, + "tokens_seen": 66387968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948846539618856, + "loss": 4.0738, + "theoretical_loss": 5.0587386794162725, + "tokens_seen": 66453504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948746238716148, + "loss": 3.7987, + "theoretical_loss": 5.058007429584498, + "tokens_seen": 66519040 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494864593781344, + "loss": 4.0708, + "theoretical_loss": 5.057277101339023, + "tokens_seen": 66584576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948545636910733, + "loss": 3.9817, + "theoretical_loss": 5.056547692613021, + "tokens_seen": 66650112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948445336008024, + "loss": 3.9141, + "theoretical_loss": 5.055819201346331, + "tokens_seen": 66715648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948345035105316, + "loss": 3.7643, + "theoretical_loss": 5.055091625485421, + "tokens_seen": 66781184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948244734202607, + "loss": 3.9492, + "theoretical_loss": 5.054364962983367, + "tokens_seen": 66846720 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049481444332999, + "loss": 3.7609, + "theoretical_loss": 5.053639211799824, + "tokens_seen": 66912256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948044132397192, + "loss": 4.1554, + "theoretical_loss": 5.052914369900997, + "tokens_seen": 66977792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947943831494484, + "loss": 3.6762, + "theoretical_loss": 5.052190435259614, + "tokens_seen": 67043328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947843530591776, + "loss": 3.8481, + "theoretical_loss": 5.051467405854897, + "tokens_seen": 67108864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 50658, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4438347816467285, + "objective/train/theoretical_loss": 5.05074527967254, + "objective/train/tokens_used": 87634400, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947743229689067, + "loss": 3.7927, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947642928786359, + "loss": 3.9986, + "theoretical_loss": 5.050024054704677, + "tokens_seen": 67239936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947542627883651, + "loss": 3.8366, + "theoretical_loss": 5.049303728949859, + "tokens_seen": 67305472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947442326980943, + "loss": 3.6166, + "theoretical_loss": 5.048584300413019, + "tokens_seen": 67371008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947342026078235, + "loss": 3.666, + "theoretical_loss": 5.04786576710546, + "tokens_seen": 67436544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947241725175527, + "loss": 3.6312, + "theoretical_loss": 5.0471481270448155, + "tokens_seen": 67502080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947141424272818, + "loss": 3.6529, + "theoretical_loss": 5.046431378255027, + "tokens_seen": 67567616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947041123370111, + "loss": 4.1165, + "theoretical_loss": 5.045715518766322, + "tokens_seen": 67633152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946940822467402, + "loss": 3.9446, + "theoretical_loss": 5.0450005466151815, + "tokens_seen": 67698688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946840521564695, + "loss": 3.916, + "theoretical_loss": 5.044286459844319, + "tokens_seen": 67764224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946740220661987, + "loss": 4.0487, + "theoretical_loss": 5.043573256502652, + "tokens_seen": 67829760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946639919759278, + "loss": 3.751, + "theoretical_loss": 5.0428609346452795, + "tokens_seen": 67895296 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494653961885657, + "loss": 4.055, + "theoretical_loss": 5.042149492333452, + "tokens_seen": 67960832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946439317953862, + "loss": 3.8691, + "theoretical_loss": 5.041438927634549, + "tokens_seen": 68026368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946339017051154, + "loss": 3.9087, + "theoretical_loss": 5.040729238622053, + "tokens_seen": 68091904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946238716148446, + "loss": 3.9294, + "theoretical_loss": 5.040020423375525, + "tokens_seen": 68157440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946138415245737, + "loss": 3.8602, + "theoretical_loss": 5.039312479980579, + "tokens_seen": 68222976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946038114343029, + "loss": 3.9095, + "theoretical_loss": 5.038605406528857, + "tokens_seen": 68288512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945937813440321, + "loss": 4.2147, + "theoretical_loss": 5.037899201118005, + "tokens_seen": 68354048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945837512537613, + "loss": 3.6029, + "theoretical_loss": 5.037193861851646, + "tokens_seen": 68419584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945737211634905, + "loss": 3.8454, + "theoretical_loss": 5.03648938683936, + "tokens_seen": 68485120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945636910732196, + "loss": 3.9214, + "theoretical_loss": 5.035785774196654, + "tokens_seen": 68550656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945536609829488, + "loss": 3.6581, + "theoretical_loss": 5.035083022044944, + "tokens_seen": 68616192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945436308926781, + "loss": 3.6618, + "theoretical_loss": 5.034381128511525, + "tokens_seen": 68681728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945336008024072, + "loss": 3.7457, + "theoretical_loss": 5.0336800917295506, + "tokens_seen": 68747264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 51428, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8635945320129395, + "objective/train/theoretical_loss": 5.032979909838007, + "objective/train/tokens_used": 89272800, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945235707121365, + "loss": 3.842, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945135406218656, + "loss": 4.0085, + "theoretical_loss": 5.032280580981691, + "tokens_seen": 68878336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945035105315948, + "loss": 3.8896, + "theoretical_loss": 5.031582103311187, + "tokens_seen": 68943872 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494493480441324, + "loss": 3.8326, + "theoretical_loss": 5.030884474982842, + "tokens_seen": 69009408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944834503510532, + "loss": 3.8144, + "theoretical_loss": 5.030187694158739, + "tokens_seen": 69074944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944734202607824, + "loss": 3.797, + "theoretical_loss": 5.02949175900668, + "tokens_seen": 69140480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944633901705116, + "loss": 3.9856, + "theoretical_loss": 5.028796667700159, + "tokens_seen": 69206016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944533600802407, + "loss": 3.8468, + "theoretical_loss": 5.0281024184183405, + "tokens_seen": 69271552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944433299899699, + "loss": 3.706, + "theoretical_loss": 5.0274090093460355, + "tokens_seen": 69337088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944332998996991, + "loss": 3.692, + "theoretical_loss": 5.026716438673677, + "tokens_seen": 69402624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944232698094283, + "loss": 3.9909, + "theoretical_loss": 5.0260247045973045, + "tokens_seen": 69468160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944132397191575, + "loss": 3.9516, + "theoretical_loss": 5.02533380531853, + "tokens_seen": 69533696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944032096288867, + "loss": 4.0669, + "theoretical_loss": 5.024643739044526, + "tokens_seen": 69599232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943931795386158, + "loss": 3.8696, + "theoretical_loss": 5.023954503987998, + "tokens_seen": 69664768 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494383149448345, + "loss": 3.8331, + "theoretical_loss": 5.023266098367161, + "tokens_seen": 69730304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943731193580742, + "loss": 3.9494, + "theoretical_loss": 5.022578520405721, + "tokens_seen": 69795840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943630892678035, + "loss": 3.4961, + "theoretical_loss": 5.0218917683328534, + "tokens_seen": 69861376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943530591775326, + "loss": 3.8677, + "theoretical_loss": 5.021205840383175, + "tokens_seen": 69926912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943430290872618, + "loss": 3.6711, + "theoretical_loss": 5.020520734796728, + "tokens_seen": 69992448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943329989969909, + "loss": 3.8103, + "theoretical_loss": 5.019836449818957, + "tokens_seen": 70057984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943229689067202, + "loss": 3.9293, + "theoretical_loss": 5.019152983700687, + "tokens_seen": 70123520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943129388164494, + "loss": 3.6354, + "theoretical_loss": 5.018470334698101, + "tokens_seen": 70189056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943029087261786, + "loss": 3.5086, + "theoretical_loss": 5.01778850107272, + "tokens_seen": 70254592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942928786359078, + "loss": 3.5231, + "theoretical_loss": 5.017107481091379, + "tokens_seen": 70320128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942828485456369, + "loss": 3.7571, + "theoretical_loss": 5.016427273026212, + "tokens_seen": 70385664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 52769, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7651798725128174, + "objective/train/theoretical_loss": 5.015747875154622, + "objective/train/tokens_used": 90911200, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942728184553661, + "loss": 3.8123, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942627883650953, + "loss": 3.593, + "theoretical_loss": 5.015069285759269, + "tokens_seen": 70516736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942527582748245, + "loss": 3.6413, + "theoretical_loss": 5.01439150312804, + "tokens_seen": 70582272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942427281845537, + "loss": 3.6802, + "theoretical_loss": 5.0137145255540405, + "tokens_seen": 70647808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942326980942828, + "loss": 3.6497, + "theoretical_loss": 5.013038351335559, + "tokens_seen": 70713344 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494222668004012, + "loss": 4.0304, + "theoretical_loss": 5.012362978776057, + "tokens_seen": 70778880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942126379137412, + "loss": 3.6552, + "theoretical_loss": 5.011688406184147, + "tokens_seen": 70844416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942026078234704, + "loss": 3.5983, + "theoretical_loss": 5.011014631873566, + "tokens_seen": 70909952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941925777331996, + "loss": 3.9619, + "theoretical_loss": 5.010341654163167, + "tokens_seen": 70975488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941825476429289, + "loss": 3.6699, + "theoretical_loss": 5.009669471376882, + "tokens_seen": 71041024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941725175526579, + "loss": 3.8407, + "theoretical_loss": 5.008998081843721, + "tokens_seen": 71106560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941624874623872, + "loss": 3.7259, + "theoretical_loss": 5.008327483897736, + "tokens_seen": 71172096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941524573721163, + "loss": 3.5894, + "theoretical_loss": 5.00765767587801, + "tokens_seen": 71237632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941424272818456, + "loss": 3.576, + "theoretical_loss": 5.006988656128635, + "tokens_seen": 71303168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941323971915748, + "loss": 3.9922, + "theoretical_loss": 5.006320422998691, + "tokens_seen": 71368704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941223671013039, + "loss": 3.8547, + "theoretical_loss": 5.00565297484223, + "tokens_seen": 71434240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941123370110331, + "loss": 3.8379, + "theoretical_loss": 5.004986310018252, + "tokens_seen": 71499776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941023069207623, + "loss": 3.7754, + "theoretical_loss": 5.004320426890686, + "tokens_seen": 71565312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940922768304915, + "loss": 3.6593, + "theoretical_loss": 5.003655323828376, + "tokens_seen": 71630848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940822467402207, + "loss": 3.6119, + "theoretical_loss": 5.002990999205057, + "tokens_seen": 71696384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940722166499498, + "loss": 3.9359, + "theoretical_loss": 5.002327451399335, + "tokens_seen": 71761920 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494062186559679, + "loss": 3.947, + "theoretical_loss": 5.001664678794671, + "tokens_seen": 71827456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940521564694082, + "loss": 4.1257, + "theoretical_loss": 5.001002679779363, + "tokens_seen": 71892992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940421263791374, + "loss": 3.5797, + "theoretical_loss": 5.0003414527465235, + "tokens_seen": 71958528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940320962888666, + "loss": 3.4931, + "theoretical_loss": 4.99968099609406, + "tokens_seen": 72024064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 53229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.443143129348755, + "objective/train/theoretical_loss": 4.999021308224664, + "objective/train/tokens_used": 92549600, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940220661985958, + "loss": 3.8379, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940120361083249, + "loss": 3.7169, + "theoretical_loss": 4.998362387545782, + "tokens_seen": 72155136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940020060180542, + "loss": 3.7454, + "theoretical_loss": 4.997704232469606, + "tokens_seen": 72220672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939919759277834, + "loss": 3.7546, + "theoretical_loss": 4.997046841413049, + "tokens_seen": 72286208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939819458375126, + "loss": 3.7608, + "theoretical_loss": 4.996390212797728, + "tokens_seen": 72351744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939719157472418, + "loss": 3.9281, + "theoretical_loss": 4.995734345049949, + "tokens_seen": 72417280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493961885656971, + "loss": 3.6721, + "theoretical_loss": 4.995079236600686, + "tokens_seen": 72482816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939518555667001, + "loss": 3.5009, + "theoretical_loss": 4.994424885885564, + "tokens_seen": 72548352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939418254764293, + "loss": 4.0443, + "theoretical_loss": 4.993771291344839, + "tokens_seen": 72613888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939317953861585, + "loss": 3.851, + "theoretical_loss": 4.993118451423381, + "tokens_seen": 72679424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939217652958877, + "loss": 3.8891, + "theoretical_loss": 4.992466364570659, + "tokens_seen": 72744960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939117352056169, + "loss": 3.7893, + "theoretical_loss": 4.991815029240721, + "tokens_seen": 72810496 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493901705115346, + "loss": 3.8364, + "theoretical_loss": 4.991164443892175, + "tokens_seen": 72876032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938916750250752, + "loss": 3.4055, + "theoretical_loss": 4.990514606988173, + "tokens_seen": 72941568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938816449348044, + "loss": 3.4979, + "theoretical_loss": 4.989865516996396, + "tokens_seen": 73007104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938716148445337, + "loss": 3.8367, + "theoretical_loss": 4.98921717238903, + "tokens_seen": 73072640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938615847542628, + "loss": 3.8357, + "theoretical_loss": 4.988569571642756, + "tokens_seen": 73138176 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493851554663992, + "loss": 3.7075, + "theoretical_loss": 4.98792271323873, + "tokens_seen": 73203712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938415245737211, + "loss": 3.939, + "theoretical_loss": 4.9872765956625615, + "tokens_seen": 73269248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938314944834504, + "loss": 3.8696, + "theoretical_loss": 4.9866312174043035, + "tokens_seen": 73334784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938214643931796, + "loss": 3.8818, + "theoretical_loss": 4.9859865769584335, + "tokens_seen": 73400320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938114343029088, + "loss": 3.8674, + "theoretical_loss": 4.9853426728238315, + "tokens_seen": 73465856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493801404212638, + "loss": 3.7658, + "theoretical_loss": 4.984699503503771, + "tokens_seen": 73531392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937913741223671, + "loss": 3.7394, + "theoretical_loss": 4.984057067505898, + "tokens_seen": 73596928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937813440320963, + "loss": 3.8743, + "theoretical_loss": 4.9834153633422105, + "tokens_seen": 73662464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 53865, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.384488582611084, + "objective/train/theoretical_loss": 4.982774389529053, + "objective/train/tokens_used": 94188000, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937713139418255, + "loss": 4.1487, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937612838515547, + "loss": 3.535, + "theoretical_loss": 4.9821341445870875, + "tokens_seen": 73793536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937512537612839, + "loss": 3.769, + "theoretical_loss": 4.981494627041286, + "tokens_seen": 73859072 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493741223671013, + "loss": 3.6317, + "theoretical_loss": 4.98085583542091, + "tokens_seen": 73924608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937311935807422, + "loss": 3.9226, + "theoretical_loss": 4.980217768259496, + "tokens_seen": 73990144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937211634904714, + "loss": 3.7611, + "theoretical_loss": 4.979580424094836, + "tokens_seen": 74055680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937111334002006, + "loss": 3.765, + "theoretical_loss": 4.978943801468967, + "tokens_seen": 74121216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937011033099298, + "loss": 3.6153, + "theoretical_loss": 4.978307898928149, + "tokens_seen": 74186752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936910732196591, + "loss": 3.7389, + "theoretical_loss": 4.977672715022855, + "tokens_seen": 74252288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936810431293881, + "loss": 3.5885, + "theoretical_loss": 4.97703824830775, + "tokens_seen": 74317824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936710130391174, + "loss": 3.7634, + "theoretical_loss": 4.976404497341676, + "tokens_seen": 74383360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936609829488465, + "loss": 3.62, + "theoretical_loss": 4.975771460687641, + "tokens_seen": 74448896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936509528585758, + "loss": 3.8546, + "theoretical_loss": 4.975139136912794, + "tokens_seen": 74514432 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493640922768305, + "loss": 3.768, + "theoretical_loss": 4.974507524588424, + "tokens_seen": 74579968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936308926780341, + "loss": 3.9096, + "theoretical_loss": 4.973876622289927, + "tokens_seen": 74645504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936208625877633, + "loss": 3.6973, + "theoretical_loss": 4.973246428596802, + "tokens_seen": 74711040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936108324974925, + "loss": 3.672, + "theoretical_loss": 4.972616942092634, + "tokens_seen": 74776576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936008024072217, + "loss": 3.4177, + "theoretical_loss": 4.971988161365077, + "tokens_seen": 74842112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935907723169509, + "loss": 3.9026, + "theoretical_loss": 4.9713600850058395, + "tokens_seen": 74907648 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049358074222668, + "loss": 3.5683, + "theoretical_loss": 4.970732711610667, + "tokens_seen": 74973184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935707121364092, + "loss": 3.6196, + "theoretical_loss": 4.97010603977933, + "tokens_seen": 75038720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935606820461384, + "loss": 3.8518, + "theoretical_loss": 4.96948006811561, + "tokens_seen": 75104256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935506519558676, + "loss": 3.8093, + "theoretical_loss": 4.968854795227281, + "tokens_seen": 75169792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935406218655968, + "loss": 3.7438, + "theoretical_loss": 4.968230219726093, + "tokens_seen": 75235328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493530591775326, + "loss": 3.6534, + "theoretical_loss": 4.967606340227765, + "tokens_seen": 75300864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 54881, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.100285053253174, + "objective/train/theoretical_loss": 4.966983155351962, + "objective/train/tokens_used": 95826400, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935205616850551, + "loss": 3.7242, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935105315947844, + "loss": 3.7258, + "theoretical_loss": 4.966360663722287, + "tokens_seen": 75431936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935005015045135, + "loss": 3.5934, + "theoretical_loss": 4.96573886396626, + "tokens_seen": 75497472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934904714142428, + "loss": 3.4784, + "theoretical_loss": 4.965117754715307, + "tokens_seen": 75563008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934804413239719, + "loss": 3.4207, + "theoretical_loss": 4.964497334604748, + "tokens_seen": 75628544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934704112337011, + "loss": 3.8166, + "theoretical_loss": 4.963877602273776, + "tokens_seen": 75694080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934603811434303, + "loss": 3.8047, + "theoretical_loss": 4.963258556365449, + "tokens_seen": 75759616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934503510531595, + "loss": 3.7029, + "theoretical_loss": 4.962640195526673, + "tokens_seen": 75825152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934403209628887, + "loss": 3.7634, + "theoretical_loss": 4.962022518408183, + "tokens_seen": 75890688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934302908726179, + "loss": 3.5837, + "theoretical_loss": 4.96140552366454, + "tokens_seen": 75956224 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493420260782347, + "loss": 3.6947, + "theoretical_loss": 4.9607892099541075, + "tokens_seen": 76021760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934102306920762, + "loss": 3.6329, + "theoretical_loss": 4.9601735759390415, + "tokens_seen": 76087296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934002006018054, + "loss": 3.615, + "theoretical_loss": 4.959558620285274, + "tokens_seen": 76152832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933901705115346, + "loss": 3.7512, + "theoretical_loss": 4.958944341662502, + "tokens_seen": 76218368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933801404212638, + "loss": 3.6382, + "theoretical_loss": 4.958330738744172, + "tokens_seen": 76283904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493370110330993, + "loss": 3.7997, + "theoretical_loss": 4.957717810207466, + "tokens_seen": 76349440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933600802407221, + "loss": 3.6714, + "theoretical_loss": 4.957105554733289, + "tokens_seen": 76414976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933500501504513, + "loss": 3.4102, + "theoretical_loss": 4.956493971006253, + "tokens_seen": 76480512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933400200601805, + "loss": 3.6513, + "theoretical_loss": 4.955883057714669, + "tokens_seen": 76546048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933299899699098, + "loss": 3.7063, + "theoretical_loss": 4.955272813550524, + "tokens_seen": 76611584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933199598796389, + "loss": 3.5682, + "theoretical_loss": 4.954663237209477, + "tokens_seen": 76677120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933099297893682, + "loss": 3.859, + "theoretical_loss": 4.954054327390841, + "tokens_seen": 76742656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932998996990972, + "loss": 3.6904, + "theoretical_loss": 4.9534460827975675, + "tokens_seen": 76808192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932898696088265, + "loss": 3.5948, + "theoretical_loss": 4.952838502136241, + "tokens_seen": 76873728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932798395185557, + "loss": 3.6899, + "theoretical_loss": 4.952231584117056, + "tokens_seen": 76939264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 55424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.752882957458496, + "objective/train/theoretical_loss": 4.951625327453812, + "objective/train/tokens_used": 97464800, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932698094282849, + "loss": 3.6065, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932597793380141, + "loss": 3.69, + "theoretical_loss": 4.951019730863894, + "tokens_seen": 77070336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932497492477432, + "loss": 3.5959, + "theoretical_loss": 4.950414793068266, + "tokens_seen": 77135872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932397191574724, + "loss": 3.7588, + "theoretical_loss": 4.94981051279145, + "tokens_seen": 77201408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932296890672016, + "loss": 3.8397, + "theoretical_loss": 4.94920688876152, + "tokens_seen": 77266944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932196589769308, + "loss": 3.4345, + "theoretical_loss": 4.948603919710088, + "tokens_seen": 77332480 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049320962888666, + "loss": 3.6935, + "theoretical_loss": 4.948001604372287, + "tokens_seen": 77398016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931995987963893, + "loss": 3.6558, + "theoretical_loss": 4.947399941486762, + "tokens_seen": 77463552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931895687061183, + "loss": 4.0196, + "theoretical_loss": 4.946798929795658, + "tokens_seen": 77529088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931795386158476, + "loss": 3.5714, + "theoretical_loss": 4.946198568044602, + "tokens_seen": 77594624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931695085255767, + "loss": 3.4172, + "theoretical_loss": 4.945598854982698, + "tokens_seen": 77660160 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493159478435306, + "loss": 3.8399, + "theoretical_loss": 4.944999789362508, + "tokens_seen": 77725696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931494483450352, + "loss": 3.7901, + "theoretical_loss": 4.944401369940043, + "tokens_seen": 77791232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931394182547643, + "loss": 3.5237, + "theoretical_loss": 4.9438035954747495, + "tokens_seen": 77856768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931293881644935, + "loss": 3.4995, + "theoretical_loss": 4.9432064647294975, + "tokens_seen": 77922304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931193580742227, + "loss": 3.8164, + "theoretical_loss": 4.942609976470566, + "tokens_seen": 77987840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931093279839519, + "loss": 3.5615, + "theoretical_loss": 4.942014129467637, + "tokens_seen": 78053376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930992978936811, + "loss": 3.3958, + "theoretical_loss": 4.941418922493774, + "tokens_seen": 78118912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930892678034102, + "loss": 3.5015, + "theoretical_loss": 4.940824354325419, + "tokens_seen": 78184448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930792377131394, + "loss": 3.5466, + "theoretical_loss": 4.940230423742372, + "tokens_seen": 78249984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930692076228686, + "loss": 3.503, + "theoretical_loss": 4.939637129527789, + "tokens_seen": 78315520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930591775325978, + "loss": 3.6357, + "theoretical_loss": 4.939044470468156, + "tokens_seen": 78381056 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493049147442327, + "loss": 3.7812, + "theoretical_loss": 4.938452445353294, + "tokens_seen": 78446592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930391173520562, + "loss": 3.4835, + "theoretical_loss": 4.937861052976332, + "tokens_seen": 78512128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930290872617853, + "loss": 3.1521, + "theoretical_loss": 4.937270292133704, + "tokens_seen": 78577664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 56825, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.739461898803711, + "objective/train/theoretical_loss": 4.9366801616251355, + "objective/train/tokens_used": 99103200, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930190571715146, + "loss": 3.8046, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930090270812437, + "loss": 3.5368, + "theoretical_loss": 4.93609066025363, + "tokens_seen": 78708736 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492998996990973, + "loss": 3.878, + "theoretical_loss": 4.935501786825457, + "tokens_seen": 78774272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929889669007021, + "loss": 3.6171, + "theoretical_loss": 4.934913540150143, + "tokens_seen": 78839808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929789368104313, + "loss": 3.5845, + "theoretical_loss": 4.934325919040461, + "tokens_seen": 78905344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929689067201605, + "loss": 3.5543, + "theoretical_loss": 4.933738922312413, + "tokens_seen": 78970880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929588766298897, + "loss": 3.3024, + "theoretical_loss": 4.933152548785222, + "tokens_seen": 79036416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929488465396189, + "loss": 3.6027, + "theoretical_loss": 4.932566797281324, + "tokens_seen": 79101952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929388164493481, + "loss": 3.9121, + "theoretical_loss": 4.931981666626351, + "tokens_seen": 79167488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929287863590773, + "loss": 3.5938, + "theoretical_loss": 4.931397155649121, + "tokens_seen": 79233024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929187562688064, + "loss": 3.7354, + "theoretical_loss": 4.930813263181631, + "tokens_seen": 79298560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929087261785356, + "loss": 3.433, + "theoretical_loss": 4.93022998805904, + "tokens_seen": 79364096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928986960882648, + "loss": 3.6705, + "theoretical_loss": 4.929647329119659, + "tokens_seen": 79429632 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492888665997994, + "loss": 3.3607, + "theoretical_loss": 4.9290652852049455, + "tokens_seen": 79495168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928786359077232, + "loss": 3.5638, + "theoretical_loss": 4.928483855159485, + "tokens_seen": 79560704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928686058174523, + "loss": 3.8926, + "theoretical_loss": 4.927903037830983, + "tokens_seen": 79626240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928585757271815, + "loss": 3.8163, + "theoretical_loss": 4.9273228320702565, + "tokens_seen": 79691776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928485456369107, + "loss": 3.6785, + "theoretical_loss": 4.926743236731218, + "tokens_seen": 79757312 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049283851554664, + "loss": 3.5302, + "theoretical_loss": 4.926164250670868, + "tokens_seen": 79822848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928284854563691, + "loss": 3.4988, + "theoretical_loss": 4.925585872749284, + "tokens_seen": 79888384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928184553660984, + "loss": 3.6865, + "theoretical_loss": 4.925008101829608, + "tokens_seen": 79953920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928084252758274, + "loss": 3.5066, + "theoretical_loss": 4.9244309367780374, + "tokens_seen": 80019456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927983951855567, + "loss": 3.8955, + "theoretical_loss": 4.923854376463816, + "tokens_seen": 80084992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927883650952859, + "loss": 3.7358, + "theoretical_loss": 4.923278419759217, + "tokens_seen": 80150528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927783350050151, + "loss": 3.7513, + "theoretical_loss": 4.92270306553954, + "tokens_seen": 80216064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 57296, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.519014596939087, + "objective/train/theoretical_loss": 4.922128312683096, + "objective/train/tokens_used": 100741600, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927683049147443, + "loss": 3.5332, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927582748244734, + "loss": 3.5115, + "theoretical_loss": 4.921554160071194, + "tokens_seen": 80347136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927482447342026, + "loss": 3.4584, + "theoretical_loss": 4.920980606588142, + "tokens_seen": 80412672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927382146439318, + "loss": 3.6266, + "theoretical_loss": 4.920407651121222, + "tokens_seen": 80478208 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492728184553661, + "loss": 3.8078, + "theoretical_loss": 4.919835292560689, + "tokens_seen": 80543744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927181544633902, + "loss": 3.8442, + "theoretical_loss": 4.919263529799759, + "tokens_seen": 80609280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927081243731193, + "loss": 3.8532, + "theoretical_loss": 4.918692361734598, + "tokens_seen": 80674816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926980942828485, + "loss": 3.4396, + "theoretical_loss": 4.91812178726431, + "tokens_seen": 80740352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926880641925777, + "loss": 3.6955, + "theoretical_loss": 4.917551805290929, + "tokens_seen": 80805888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926780341023069, + "loss": 3.5437, + "theoretical_loss": 4.916982414719408, + "tokens_seen": 80871424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926680040120361, + "loss": 3.8737, + "theoretical_loss": 4.9164136144576105, + "tokens_seen": 80936960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926579739217654, + "loss": 3.464, + "theoretical_loss": 4.915845403416299, + "tokens_seen": 81002496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926479438314944, + "loss": 3.6247, + "theoretical_loss": 4.915277780509124, + "tokens_seen": 81068032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926379137412237, + "loss": 3.4444, + "theoretical_loss": 4.914710744652614, + "tokens_seen": 81133568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926278836509528, + "loss": 3.5921, + "theoretical_loss": 4.914144294766169, + "tokens_seen": 81199104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926178535606821, + "loss": 3.5012, + "theoretical_loss": 4.913578429772047, + "tokens_seen": 81264640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926078234704113, + "loss": 3.3114, + "theoretical_loss": 4.913013148595355, + "tokens_seen": 81330176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925977933801404, + "loss": 3.7158, + "theoretical_loss": 4.912448450164041, + "tokens_seen": 81395712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925877632898696, + "loss": 3.7015, + "theoretical_loss": 4.91188433340888, + "tokens_seen": 81461248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925777331995988, + "loss": 3.9955, + "theoretical_loss": 4.911320797263471, + "tokens_seen": 81526784 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492567703109328, + "loss": 3.8269, + "theoretical_loss": 4.910757840664219, + "tokens_seen": 81592320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925576730190572, + "loss": 3.5523, + "theoretical_loss": 4.910195462550334, + "tokens_seen": 81657856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925476429287864, + "loss": 3.552, + "theoretical_loss": 4.909633661863811, + "tokens_seen": 81723392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925376128385155, + "loss": 3.8094, + "theoretical_loss": 4.909072437549434, + "tokens_seen": 81788928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925275827482447, + "loss": 3.7451, + "theoretical_loss": 4.908511788554753, + "tokens_seen": 81854464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 58605, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.980154275894165, + "objective/train/theoretical_loss": 4.907951713830082, + "objective/train/tokens_used": 102380000, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925175526579739, + "loss": 3.833, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925075225677031, + "loss": 3.5955, + "theoretical_loss": 4.907392212328489, + "tokens_seen": 81985536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924974924774323, + "loss": 3.3892, + "theoretical_loss": 4.906833283005785, + "tokens_seen": 82051072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924874623871615, + "loss": 3.7307, + "theoretical_loss": 4.906274924820515, + "tokens_seen": 82116608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924774322968907, + "loss": 3.7447, + "theoretical_loss": 4.90571713673395, + "tokens_seen": 82182144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924674022066199, + "loss": 3.5136, + "theoretical_loss": 4.905159917710073, + "tokens_seen": 82247680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924573721163491, + "loss": 3.6628, + "theoretical_loss": 4.904603266715578, + "tokens_seen": 82313216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924473420260783, + "loss": 3.7499, + "theoretical_loss": 4.904047182719854, + "tokens_seen": 82378752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924373119358075, + "loss": 3.4528, + "theoretical_loss": 4.903491664694977, + "tokens_seen": 82444288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924272818455366, + "loss": 3.614, + "theoretical_loss": 4.902936711615702, + "tokens_seen": 82509824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924172517552658, + "loss": 3.7331, + "theoretical_loss": 4.902382322459456, + "tokens_seen": 82575360 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492407221664995, + "loss": 3.5698, + "theoretical_loss": 4.901828496206322, + "tokens_seen": 82640896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923971915747242, + "loss": 3.6838, + "theoretical_loss": 4.90127523183904, + "tokens_seen": 82706432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923871614844534, + "loss": 3.7826, + "theoretical_loss": 4.900722528342988, + "tokens_seen": 82771968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923771313941825, + "loss": 3.6392, + "theoretical_loss": 4.900170384706181, + "tokens_seen": 82837504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923671013039117, + "loss": 3.491, + "theoretical_loss": 4.899618799919256, + "tokens_seen": 82903040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923570712136409, + "loss": 3.4612, + "theoretical_loss": 4.899067772975469, + "tokens_seen": 82968576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923470411233702, + "loss": 3.687, + "theoretical_loss": 4.898517302870679, + "tokens_seen": 83034112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923370110330993, + "loss": 3.7585, + "theoretical_loss": 4.897967388603346, + "tokens_seen": 83099648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923269809428286, + "loss": 3.5586, + "theoretical_loss": 4.897418029174519, + "tokens_seen": 83165184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923169508525576, + "loss": 3.9548, + "theoretical_loss": 4.896869223587828, + "tokens_seen": 83230720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923069207622869, + "loss": 3.8464, + "theoretical_loss": 4.896320970849472, + "tokens_seen": 83296256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922968906720161, + "loss": 3.5297, + "theoretical_loss": 4.895773269968219, + "tokens_seen": 83361792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922868605817453, + "loss": 3.7033, + "theoretical_loss": 4.895226119955386, + "tokens_seen": 83427328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922768304914745, + "loss": 3.9995, + "theoretical_loss": 4.894679519824841, + "tokens_seen": 83492864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 59310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.926790952682495, + "objective/train/theoretical_loss": 4.894133468592984, + "objective/train/tokens_used": 104018400, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922668004012036, + "loss": 3.6992, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922567703109328, + "loss": 3.6724, + "theoretical_loss": 4.8935879652787495, + "tokens_seen": 83623936 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492246740220662, + "loss": 3.7634, + "theoretical_loss": 4.893043008903591, + "tokens_seen": 83689472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922367101303912, + "loss": 3.701, + "theoretical_loss": 4.892498598491473, + "tokens_seen": 83755008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922266800401204, + "loss": 3.6549, + "theoretical_loss": 4.891954733068863, + "tokens_seen": 83820544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922166499498495, + "loss": 3.6165, + "theoretical_loss": 4.891411411664727, + "tokens_seen": 83886080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922066198595787, + "loss": 3.682, + "theoretical_loss": 4.890868633310515, + "tokens_seen": 83951616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921965897693079, + "loss": 3.4424, + "theoretical_loss": 4.890326397040158, + "tokens_seen": 84017152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921865596790371, + "loss": 3.6114, + "theoretical_loss": 4.889784701890056, + "tokens_seen": 84082688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921765295887663, + "loss": 3.5193, + "theoretical_loss": 4.8892435468990705, + "tokens_seen": 84148224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921664994984956, + "loss": 3.2842, + "theoretical_loss": 4.88870293110852, + "tokens_seen": 84213760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921564694082246, + "loss": 3.7096, + "theoretical_loss": 4.888162853562166, + "tokens_seen": 84279296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921464393179539, + "loss": 3.5071, + "theoretical_loss": 4.88762331330621, + "tokens_seen": 84344832 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492136409227683, + "loss": 3.4462, + "theoretical_loss": 4.88708430938928, + "tokens_seen": 84410368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921263791374123, + "loss": 3.7536, + "theoretical_loss": 4.8865458408624285, + "tokens_seen": 84475904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921163490471415, + "loss": 3.493, + "theoretical_loss": 4.8860079067791204, + "tokens_seen": 84541440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921063189568706, + "loss": 3.7587, + "theoretical_loss": 4.885470506195227, + "tokens_seen": 84606976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920962888665998, + "loss": 3.6445, + "theoretical_loss": 4.884933638169014, + "tokens_seen": 84672512 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492086258776329, + "loss": 3.5871, + "theoretical_loss": 4.88439730176114, + "tokens_seen": 84738048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920762286860582, + "loss": 3.772, + "theoretical_loss": 4.883861496034644, + "tokens_seen": 84803584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920661985957874, + "loss": 3.4383, + "theoretical_loss": 4.88332622005494, + "tokens_seen": 84869120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920561685055166, + "loss": 3.4985, + "theoretical_loss": 4.8827914728898065, + "tokens_seen": 84934656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920461384152457, + "loss": 3.5046, + "theoretical_loss": 4.88225725360938, + "tokens_seen": 85000192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920361083249749, + "loss": 3.682, + "theoretical_loss": 4.881723561286149, + "tokens_seen": 85065728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920260782347041, + "loss": 3.422, + "theoretical_loss": 4.881190394994943, + "tokens_seen": 85131264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 60492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.639910936355591, + "objective/train/theoretical_loss": 4.880657753812926, + "objective/train/tokens_used": 105656800, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920160481444333, + "loss": 3.7149, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920060180541625, + "loss": 3.5313, + "theoretical_loss": 4.880125636819594, + "tokens_seen": 85262336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919959879638916, + "loss": 3.6137, + "theoretical_loss": 4.879594043096755, + "tokens_seen": 85327872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919859578736209, + "loss": 3.4991, + "theoretical_loss": 4.879062971728534, + "tokens_seen": 85393408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049197592778335, + "loss": 3.6767, + "theoretical_loss": 4.87853242180136, + "tokens_seen": 85458944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919658976930793, + "loss": 3.7686, + "theoretical_loss": 4.878002392403959, + "tokens_seen": 85524480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919558676028084, + "loss": 3.5113, + "theoretical_loss": 4.877472882627343, + "tokens_seen": 85590016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919458375125377, + "loss": 3.7634, + "theoretical_loss": 4.8769438915648085, + "tokens_seen": 85655552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919358074222668, + "loss": 3.4673, + "theoretical_loss": 4.876415418311928, + "tokens_seen": 85721088 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491925777331996, + "loss": 3.562, + "theoretical_loss": 4.875887461966537, + "tokens_seen": 85786624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919157472417252, + "loss": 3.7082, + "theoretical_loss": 4.875360021628733, + "tokens_seen": 85852160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919057171514544, + "loss": 3.558, + "theoretical_loss": 4.874833096400865, + "tokens_seen": 85917696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918956870611836, + "loss": 3.259, + "theoretical_loss": 4.874306685387525, + "tokens_seen": 85983232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918856569709127, + "loss": 3.6862, + "theoretical_loss": 4.873780787695547, + "tokens_seen": 86048768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918756268806419, + "loss": 3.546, + "theoretical_loss": 4.87325540243399, + "tokens_seen": 86114304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918655967903711, + "loss": 3.5487, + "theoretical_loss": 4.872730528714139, + "tokens_seen": 86179840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918555667001003, + "loss": 3.57, + "theoretical_loss": 4.872206165649493, + "tokens_seen": 86245376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918455366098295, + "loss": 3.6859, + "theoretical_loss": 4.871682312355761, + "tokens_seen": 86310912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918355065195586, + "loss": 3.588, + "theoretical_loss": 4.871158967950852, + "tokens_seen": 86376448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918254764292878, + "loss": 3.6822, + "theoretical_loss": 4.870636131554869, + "tokens_seen": 86441984 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491815446339017, + "loss": 3.4897, + "theoretical_loss": 4.8701138022901045, + "tokens_seen": 86507520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918054162487463, + "loss": 3.5301, + "theoretical_loss": 4.869591979281028, + "tokens_seen": 86573056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917953861584754, + "loss": 3.3777, + "theoretical_loss": 4.8690706616542805, + "tokens_seen": 86638592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917853560682047, + "loss": 3.3358, + "theoretical_loss": 4.868549848538675, + "tokens_seen": 86704128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917753259779337, + "loss": 3.3347, + "theoretical_loss": 4.868029539065176, + "tokens_seen": 86769664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 61005, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4489526748657227, + "objective/train/theoretical_loss": 4.867509732366907, + "objective/train/tokens_used": 107295200, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491765295887663, + "loss": 3.6654, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917552657973922, + "loss": 3.8192, + "theoretical_loss": 4.866990427579129, + "tokens_seen": 86900736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917452357071214, + "loss": 3.5657, + "theoretical_loss": 4.866471623839248, + "tokens_seen": 86966272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917352056168506, + "loss": 3.348, + "theoretical_loss": 4.8659533202867955, + "tokens_seen": 87031808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917251755265797, + "loss": 3.4467, + "theoretical_loss": 4.86543551606343, + "tokens_seen": 87097344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917151454363089, + "loss": 3.5397, + "theoretical_loss": 4.864918210312927, + "tokens_seen": 87162880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917051153460381, + "loss": 3.5919, + "theoretical_loss": 4.864401402181173, + "tokens_seen": 87228416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916950852557673, + "loss": 3.5553, + "theoretical_loss": 4.863885090816158, + "tokens_seen": 87293952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916850551654965, + "loss": 3.6125, + "theoretical_loss": 4.863369275367968, + "tokens_seen": 87359488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916750250752258, + "loss": 3.4756, + "theoretical_loss": 4.862853954988781, + "tokens_seen": 87425024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916649949849548, + "loss": 3.5864, + "theoretical_loss": 4.862339128832857, + "tokens_seen": 87490560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916549648946841, + "loss": 3.6655, + "theoretical_loss": 4.861824796056533, + "tokens_seen": 87556096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916449348044132, + "loss": 3.5427, + "theoretical_loss": 4.861310955818219, + "tokens_seen": 87621632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916349047141425, + "loss": 3.5534, + "theoretical_loss": 4.860797607278385, + "tokens_seen": 87687168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916248746238717, + "loss": 3.7963, + "theoretical_loss": 4.86028474959956, + "tokens_seen": 87752704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916148445336008, + "loss": 3.6729, + "theoretical_loss": 4.859772381946323, + "tokens_seen": 87818240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049160481444333, + "loss": 3.6349, + "theoretical_loss": 4.859260503485298, + "tokens_seen": 87883776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915947843530592, + "loss": 3.7414, + "theoretical_loss": 4.858749113385144, + "tokens_seen": 87949312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915847542627884, + "loss": 3.2873, + "theoretical_loss": 4.858238210816554, + "tokens_seen": 88014848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915747241725176, + "loss": 3.5819, + "theoretical_loss": 4.8577277949522415, + "tokens_seen": 88080384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915646940822468, + "loss": 3.6767, + "theoretical_loss": 4.857217864966943, + "tokens_seen": 88145920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915546639919759, + "loss": 3.3539, + "theoretical_loss": 4.856708420037402, + "tokens_seen": 88211456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915446339017051, + "loss": 3.8479, + "theoretical_loss": 4.8561994593423705, + "tokens_seen": 88276992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915346038114343, + "loss": 3.5546, + "theoretical_loss": 4.8556909820625975, + "tokens_seen": 88342528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915245737211635, + "loss": 3.8083, + "theoretical_loss": 4.855182987380823, + "tokens_seen": 88408064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 61597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9370856285095215, + "objective/train/theoretical_loss": 4.854675474481779, + "objective/train/tokens_used": 108933600, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915145436308927, + "loss": 3.6345, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915045135406218, + "loss": 3.4992, + "theoretical_loss": 4.8541684425521705, + "tokens_seen": 88539136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914944834503511, + "loss": 3.4698, + "theoretical_loss": 4.85366189078068, + "tokens_seen": 88604672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914844533600802, + "loss": 3.5404, + "theoretical_loss": 4.853155818357957, + "tokens_seen": 88670208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914744232698095, + "loss": 3.5519, + "theoretical_loss": 4.852650224476609, + "tokens_seen": 88735744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914643931795386, + "loss": 3.2668, + "theoretical_loss": 4.852145108331205, + "tokens_seen": 88801280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914543630892679, + "loss": 3.7668, + "theoretical_loss": 4.851640469118255, + "tokens_seen": 88866816 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491444332998997, + "loss": 3.6371, + "theoretical_loss": 4.851136306036219, + "tokens_seen": 88932352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914343029087262, + "loss": 3.5714, + "theoretical_loss": 4.850632618285486, + "tokens_seen": 88997888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914242728184554, + "loss": 3.4973, + "theoretical_loss": 4.850129405068383, + "tokens_seen": 89063424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914142427281846, + "loss": 3.6218, + "theoretical_loss": 4.849626665589156, + "tokens_seen": 89128960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914042126379138, + "loss": 3.3802, + "theoretical_loss": 4.849124399053969, + "tokens_seen": 89194496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913941825476429, + "loss": 3.456, + "theoretical_loss": 4.8486226046709024, + "tokens_seen": 89260032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913841524573721, + "loss": 3.6214, + "theoretical_loss": 4.8481212816499415, + "tokens_seen": 89325568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913741223671013, + "loss": 3.7055, + "theoretical_loss": 4.847620429202967, + "tokens_seen": 89391104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913640922768305, + "loss": 3.6888, + "theoretical_loss": 4.847120046543763, + "tokens_seen": 89456640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913540621865597, + "loss": 3.803, + "theoretical_loss": 4.846620132887992, + "tokens_seen": 89522176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913440320962888, + "loss": 3.392, + "theoretical_loss": 4.8461206874532055, + "tokens_seen": 89587712 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491334002006018, + "loss": 3.5634, + "theoretical_loss": 4.845621709458831, + "tokens_seen": 89653248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913239719157472, + "loss": 3.6485, + "theoretical_loss": 4.845123198126162, + "tokens_seen": 89718784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913139418254765, + "loss": 3.6994, + "theoretical_loss": 4.844625152678364, + "tokens_seen": 89784320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913039117352056, + "loss": 3.6592, + "theoretical_loss": 4.844127572340455, + "tokens_seen": 89849856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912938816449349, + "loss": 3.6714, + "theoretical_loss": 4.84363045633931, + "tokens_seen": 89915392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912838515546639, + "loss": 3.3055, + "theoretical_loss": 4.843133803903651, + "tokens_seen": 89980928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912738214643932, + "loss": 3.6851, + "theoretical_loss": 4.84263761426404, + "tokens_seen": 90046464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 63098, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.415714979171753, + "objective/train/theoretical_loss": 4.842141886652876, + "objective/train/tokens_used": 110572000, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912637913741224, + "loss": 3.496, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912537612838516, + "loss": 3.7029, + "theoretical_loss": 4.841646620304388, + "tokens_seen": 90177536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912437311935808, + "loss": 3.4109, + "theoretical_loss": 4.841151814454632, + "tokens_seen": 90243072 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049123370110331, + "loss": 3.5139, + "theoretical_loss": 4.840657468341476, + "tokens_seen": 90308608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912236710130391, + "loss": 3.7375, + "theoretical_loss": 4.84016358120461, + "tokens_seen": 90374144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912136409227683, + "loss": 3.4378, + "theoretical_loss": 4.839670152285526, + "tokens_seen": 90439680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912036108324975, + "loss": 3.4727, + "theoretical_loss": 4.8391771808275195, + "tokens_seen": 90505216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911935807422267, + "loss": 3.4128, + "theoretical_loss": 4.838684666075682, + "tokens_seen": 90570752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911835506519559, + "loss": 3.5232, + "theoretical_loss": 4.838192607276896, + "tokens_seen": 90636288 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491173520561685, + "loss": 3.5274, + "theoretical_loss": 4.837701003679829, + "tokens_seen": 90701824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911634904714142, + "loss": 3.573, + "theoretical_loss": 4.8372098545349305, + "tokens_seen": 90767360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911534603811434, + "loss": 3.4979, + "theoretical_loss": 4.836719159094422, + "tokens_seen": 90832896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911434302908726, + "loss": 3.7194, + "theoretical_loss": 4.836228916612292, + "tokens_seen": 90898432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911334002006019, + "loss": 3.71, + "theoretical_loss": 4.835739126344298, + "tokens_seen": 90963968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911233701103309, + "loss": 3.4962, + "theoretical_loss": 4.8352497875479505, + "tokens_seen": 91029504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911133400200602, + "loss": 3.4739, + "theoretical_loss": 4.834760899482514, + "tokens_seen": 91095040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911033099297893, + "loss": 3.452, + "theoretical_loss": 4.834272461409001, + "tokens_seen": 91160576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910932798395186, + "loss": 3.8125, + "theoretical_loss": 4.833784472590165, + "tokens_seen": 91226112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910832497492478, + "loss": 3.3993, + "theoretical_loss": 4.833296932290495, + "tokens_seen": 91291648 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491073219658977, + "loss": 3.5064, + "theoretical_loss": 4.832809839776213, + "tokens_seen": 91357184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910631895687061, + "loss": 3.4702, + "theoretical_loss": 4.832323194315265, + "tokens_seen": 91422720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910531594784353, + "loss": 3.576, + "theoretical_loss": 4.831836995177319, + "tokens_seen": 91488256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910431293881645, + "loss": 3.6835, + "theoretical_loss": 4.831351241633756, + "tokens_seen": 91553792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910330992978937, + "loss": 3.6733, + "theoretical_loss": 4.8308659329576695, + "tokens_seen": 91619328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910230692076229, + "loss": 3.7039, + "theoretical_loss": 4.830381068423856, + "tokens_seen": 91684864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 63840, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2570016384124756, + "objective/train/theoretical_loss": 4.8298966473088125, + "objective/train/tokens_used": 112210400, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491013039117352, + "loss": 3.5049, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910030090270812, + "loss": 3.7075, + "theoretical_loss": 4.829412668890729, + "tokens_seen": 91815936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909929789368104, + "loss": 3.3632, + "theoretical_loss": 4.8289291324494865, + "tokens_seen": 91881472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909829488465397, + "loss": 3.4331, + "theoretical_loss": 4.828446037266647, + "tokens_seen": 91947008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909729187562688, + "loss": 3.5859, + "theoretical_loss": 4.827963382625454, + "tokens_seen": 92012544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909628886659981, + "loss": 3.4441, + "theoretical_loss": 4.827481167810825, + "tokens_seen": 92078080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909528585757272, + "loss": 3.6517, + "theoretical_loss": 4.826999392109344, + "tokens_seen": 92143616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909428284854564, + "loss": 3.4679, + "theoretical_loss": 4.826518054809259, + "tokens_seen": 92209152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909327983951856, + "loss": 3.6007, + "theoretical_loss": 4.826037155200478, + "tokens_seen": 92274688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909227683049148, + "loss": 3.4858, + "theoretical_loss": 4.825556692574562, + "tokens_seen": 92340224 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490912738214644, + "loss": 3.5593, + "theoretical_loss": 4.825076666224717, + "tokens_seen": 92405760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909027081243731, + "loss": 3.5697, + "theoretical_loss": 4.824597075445799, + "tokens_seen": 92471296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908926780341023, + "loss": 3.5531, + "theoretical_loss": 4.824117919534297, + "tokens_seen": 92536832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908826479438315, + "loss": 3.5856, + "theoretical_loss": 4.823639197788334, + "tokens_seen": 92602368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908726178535607, + "loss": 3.6203, + "theoretical_loss": 4.823160909507665, + "tokens_seen": 92667904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908625877632899, + "loss": 3.5859, + "theoretical_loss": 4.822683053993664, + "tokens_seen": 92733440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490852557673019, + "loss": 3.4464, + "theoretical_loss": 4.822205630549329, + "tokens_seen": 92798976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908425275827482, + "loss": 3.1732, + "theoretical_loss": 4.821728638479267, + "tokens_seen": 92864512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908324974924774, + "loss": 3.6278, + "theoretical_loss": 4.821252077089696, + "tokens_seen": 92930048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908224674022067, + "loss": 3.6351, + "theoretical_loss": 4.820775945688437, + "tokens_seen": 92995584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908124373119358, + "loss": 3.4334, + "theoretical_loss": 4.820300243584913, + "tokens_seen": 93061120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908024072216651, + "loss": 3.4374, + "theoretical_loss": 4.819824970090138, + "tokens_seen": 93126656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907923771313941, + "loss": 3.8253, + "theoretical_loss": 4.819350124516717, + "tokens_seen": 93192192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907823470411234, + "loss": 3.5773, + "theoretical_loss": 4.818875706178841, + "tokens_seen": 93257728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907723169508526, + "loss": 3.7615, + "theoretical_loss": 4.818401714392279, + "tokens_seen": 93323264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 65224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.483311653137207, + "objective/train/theoretical_loss": 4.817928148474378, + "objective/train/tokens_used": 113848800, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907622868605818, + "loss": 3.571, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490752256770311, + "loss": 3.5792, + "theoretical_loss": 4.817455007744052, + "tokens_seen": 93454336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907422266800401, + "loss": 3.4639, + "theoretical_loss": 4.816982291521785, + "tokens_seen": 93519872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907321965897693, + "loss": 3.3508, + "theoretical_loss": 4.816509999129618, + "tokens_seen": 93585408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907221664994985, + "loss": 3.5891, + "theoretical_loss": 4.816038129891151, + "tokens_seen": 93650944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907121364092277, + "loss": 3.4847, + "theoretical_loss": 4.815566683131536, + "tokens_seen": 93716480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907021063189569, + "loss": 3.7072, + "theoretical_loss": 4.815095658177472, + "tokens_seen": 93782016 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490692076228686, + "loss": 3.6021, + "theoretical_loss": 4.814625054357199, + "tokens_seen": 93847552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906820461384152, + "loss": 3.4324, + "theoretical_loss": 4.814154871000497, + "tokens_seen": 93913088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906720160481444, + "loss": 3.5133, + "theoretical_loss": 4.813685107438679, + "tokens_seen": 93978624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906619859578736, + "loss": 3.5597, + "theoretical_loss": 4.813215763004585, + "tokens_seen": 94044160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906519558676028, + "loss": 3.4239, + "theoretical_loss": 4.812746837032582, + "tokens_seen": 94109696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906419257773321, + "loss": 3.373, + "theoretical_loss": 4.812278328858554, + "tokens_seen": 94175232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906318956870611, + "loss": 3.5417, + "theoretical_loss": 4.811810237819904, + "tokens_seen": 94240768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906218655967904, + "loss": 3.3743, + "theoretical_loss": 4.81134256325554, + "tokens_seen": 94306304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906118355065195, + "loss": 3.5175, + "theoretical_loss": 4.810875304505881, + "tokens_seen": 94371840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906018054162488, + "loss": 3.3378, + "theoretical_loss": 4.810408460912846, + "tokens_seen": 94437376 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490591775325978, + "loss": 3.8392, + "theoretical_loss": 4.809942031819853, + "tokens_seen": 94502912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905817452357072, + "loss": 3.4628, + "theoretical_loss": 4.809476016571809, + "tokens_seen": 94568448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905717151454363, + "loss": 3.699, + "theoretical_loss": 4.809010414515113, + "tokens_seen": 94633984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905616850551655, + "loss": 3.6345, + "theoretical_loss": 4.808545224997644, + "tokens_seen": 94699520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905516549648947, + "loss": 3.3299, + "theoretical_loss": 4.808080447368766, + "tokens_seen": 94765056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905416248746239, + "loss": 3.5958, + "theoretical_loss": 4.807616080979315, + "tokens_seen": 94830592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905315947843531, + "loss": 3.752, + "theoretical_loss": 4.807152125181597, + "tokens_seen": 94896128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905215646940822, + "loss": 3.7867, + "theoretical_loss": 4.806688579329387, + "tokens_seen": 94961664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 65494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3390982151031494, + "objective/train/theoretical_loss": 4.8062254427779205, + "objective/train/tokens_used": 115487200, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905115346038114, + "loss": 3.5332, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905015045135406, + "loss": 3.6865, + "theoretical_loss": 4.80576271488389, + "tokens_seen": 95092736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904914744232698, + "loss": 3.6984, + "theoretical_loss": 4.805300395005444, + "tokens_seen": 95158272 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490481444332999, + "loss": 3.7887, + "theoretical_loss": 4.804838482502181, + "tokens_seen": 95223808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904714142427281, + "loss": 3.5508, + "theoretical_loss": 4.8043769767351385, + "tokens_seen": 95289344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904613841524574, + "loss": 3.2765, + "theoretical_loss": 4.8039158770668005, + "tokens_seen": 95354880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904513540621865, + "loss": 3.4666, + "theoretical_loss": 4.803455182861087, + "tokens_seen": 95420416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904413239719158, + "loss": 3.4331, + "theoretical_loss": 4.802994893483348, + "tokens_seen": 95485952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904312938816449, + "loss": 3.4208, + "theoretical_loss": 4.802535008300364, + "tokens_seen": 95551488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904212637913742, + "loss": 3.3395, + "theoretical_loss": 4.802075526680335, + "tokens_seen": 95617024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904112337011033, + "loss": 3.3708, + "theoretical_loss": 4.801616447992888, + "tokens_seen": 95682560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904012036108325, + "loss": 3.6168, + "theoretical_loss": 4.801157771609061, + "tokens_seen": 95748096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903911735205617, + "loss": 3.3959, + "theoretical_loss": 4.8006994969013, + "tokens_seen": 95813632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903811434302909, + "loss": 3.6339, + "theoretical_loss": 4.800241623243467, + "tokens_seen": 95879168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903711133400201, + "loss": 3.3917, + "theoretical_loss": 4.799784150010819, + "tokens_seen": 95944704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903610832497492, + "loss": 3.6765, + "theoretical_loss": 4.799327076580017, + "tokens_seen": 96010240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903510531594784, + "loss": 3.6297, + "theoretical_loss": 4.798870402329115, + "tokens_seen": 96075776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903410230692076, + "loss": 3.4202, + "theoretical_loss": 4.798414126637558, + "tokens_seen": 96141312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903309929789368, + "loss": 3.417, + "theoretical_loss": 4.797958248886179, + "tokens_seen": 96206848 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490320962888666, + "loss": 3.5827, + "theoretical_loss": 4.797502768457193, + "tokens_seen": 96272384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903109327983952, + "loss": 3.5392, + "theoretical_loss": 4.797047684734192, + "tokens_seen": 96337920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903009027081243, + "loss": 3.744, + "theoretical_loss": 4.796592997102147, + "tokens_seen": 96403456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902908726178535, + "loss": 3.823, + "theoretical_loss": 4.796138704947397, + "tokens_seen": 96468992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902808425275828, + "loss": 3.475, + "theoretical_loss": 4.795684807657649, + "tokens_seen": 96534528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902708124373119, + "loss": 3.6812, + "theoretical_loss": 4.795231304621968, + "tokens_seen": 96600064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 66055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.559401750564575, + "objective/train/theoretical_loss": 4.794778195230787, + "objective/train/tokens_used": 117125600, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902607823470412, + "loss": 3.6274, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902507522567703, + "loss": 3.5801, + "theoretical_loss": 4.794325478875885, + "tokens_seen": 96731136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902407221664995, + "loss": 3.6061, + "theoretical_loss": 4.793873154950399, + "tokens_seen": 96796672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902306920762287, + "loss": 3.6615, + "theoretical_loss": 4.793421222848808, + "tokens_seen": 96862208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902206619859579, + "loss": 3.3969, + "theoretical_loss": 4.7929696819669365, + "tokens_seen": 96927744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902106318956871, + "loss": 3.8594, + "theoretical_loss": 4.792518531701948, + "tokens_seen": 96993280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902006018054163, + "loss": 3.6387, + "theoretical_loss": 4.792067771452341, + "tokens_seen": 97058816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901905717151454, + "loss": 3.4593, + "theoretical_loss": 4.791617400617948, + "tokens_seen": 97124352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901805416248746, + "loss": 3.5499, + "theoretical_loss": 4.791167418599925, + "tokens_seen": 97189888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901705115346038, + "loss": 3.3165, + "theoretical_loss": 4.790717824800755, + "tokens_seen": 97255424 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490160481444333, + "loss": 3.5253, + "theoretical_loss": 4.790268618624239, + "tokens_seen": 97320960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901504513540623, + "loss": 3.2939, + "theoretical_loss": 4.789819799475499, + "tokens_seen": 97386496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901404212637913, + "loss": 3.7183, + "theoretical_loss": 4.789371366760961, + "tokens_seen": 97452032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901303911735206, + "loss": 3.3984, + "theoretical_loss": 4.788923319888369, + "tokens_seen": 97517568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901203610832497, + "loss": 3.4614, + "theoretical_loss": 4.788475658266766, + "tokens_seen": 97583104 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490110330992979, + "loss": 3.2819, + "theoretical_loss": 4.788028381306497, + "tokens_seen": 97648640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901003009027082, + "loss": 3.3862, + "theoretical_loss": 4.787581488419207, + "tokens_seen": 97714176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900902708124374, + "loss": 3.7962, + "theoretical_loss": 4.787134979017832, + "tokens_seen": 97779712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900802407221665, + "loss": 3.3423, + "theoretical_loss": 4.786688852516599, + "tokens_seen": 97845248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900702106318957, + "loss": 3.5008, + "theoretical_loss": 4.786243108331024, + "tokens_seen": 97910784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900601805416249, + "loss": 3.736, + "theoretical_loss": 4.7857977458779, + "tokens_seen": 97976320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900501504513541, + "loss": 3.6437, + "theoretical_loss": 4.785352764575304, + "tokens_seen": 98041856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900401203610833, + "loss": 3.4917, + "theoretical_loss": 4.784908163842585, + "tokens_seen": 98107392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900300902708124, + "loss": 3.6996, + "theoretical_loss": 4.784463943100367, + "tokens_seen": 98172928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900200601805416, + "loss": 3.5085, + "theoretical_loss": 4.7840201017705395, + "tokens_seen": 98238464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 66835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.026576995849609, + "objective/train/theoretical_loss": 4.783576639276257, + "objective/train/tokens_used": 118764000, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900100300902708, + "loss": 3.6122, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049, + "loss": 3.6488, + "theoretical_loss": 4.783133555041934, + "tokens_seen": 98369536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899899699097292, + "loss": 3.4521, + "theoretical_loss": 4.782690848493245, + "tokens_seen": 98435072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899799398194583, + "loss": 3.4946, + "theoretical_loss": 4.7822485190571165, + "tokens_seen": 98500608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899699097291876, + "loss": 3.1609, + "theoretical_loss": 4.781806566161723, + "tokens_seen": 98566144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899598796389167, + "loss": 3.2605, + "theoretical_loss": 4.781364989236488, + "tokens_seen": 98631680 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489949849548646, + "loss": 3.5803, + "theoretical_loss": 4.78092378771208, + "tokens_seen": 98697216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899398194583751, + "loss": 3.6634, + "theoretical_loss": 4.780482961020402, + "tokens_seen": 98762752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899297893681044, + "loss": 3.6734, + "theoretical_loss": 4.780042508594596, + "tokens_seen": 98828288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899197592778335, + "loss": 3.6452, + "theoretical_loss": 4.779602429869035, + "tokens_seen": 98893824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899097291875627, + "loss": 3.4789, + "theoretical_loss": 4.779162724279324, + "tokens_seen": 98959360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898996990972919, + "loss": 3.5272, + "theoretical_loss": 4.7787233912622895, + "tokens_seen": 99024896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898896690070211, + "loss": 3.9395, + "theoretical_loss": 4.778284430255981, + "tokens_seen": 99090432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898796389167503, + "loss": 3.803, + "theoretical_loss": 4.77784584069967, + "tokens_seen": 99155968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898696088264794, + "loss": 3.5448, + "theoretical_loss": 4.777407622033838, + "tokens_seen": 99221504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898595787362086, + "loss": 3.3819, + "theoretical_loss": 4.776969773700181, + "tokens_seen": 99287040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898495486459378, + "loss": 3.5351, + "theoretical_loss": 4.776532295141601, + "tokens_seen": 99352576 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489839518555667, + "loss": 3.6881, + "theoretical_loss": 4.776095185802211, + "tokens_seen": 99418112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898294884653962, + "loss": 3.8074, + "theoretical_loss": 4.775658445127318, + "tokens_seen": 99483648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898194583751254, + "loss": 3.4411, + "theoretical_loss": 4.775222072563429, + "tokens_seen": 99549184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898094282848545, + "loss": 3.4496, + "theoretical_loss": 4.7747860675582485, + "tokens_seen": 99614720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897993981945837, + "loss": 3.348, + "theoretical_loss": 4.77435042956067, + "tokens_seen": 99680256 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489789368104313, + "loss": 3.5864, + "theoretical_loss": 4.773915158020776, + "tokens_seen": 99745792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897793380140421, + "loss": 3.4012, + "theoretical_loss": 4.773480252389831, + "tokens_seen": 99811328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897693079237714, + "loss": 3.6203, + "theoretical_loss": 4.773045712120284, + "tokens_seen": 99876864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 67525, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8278653621673584, + "objective/train/theoretical_loss": 4.77261153666576, + "objective/train/tokens_used": 120402400, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897592778335005, + "loss": 3.5655, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897492477432297, + "loss": 3.6504, + "theoretical_loss": 4.772177725481062, + "tokens_seen": 100007936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897392176529589, + "loss": 3.5848, + "theoretical_loss": 4.77174427802216, + "tokens_seen": 100073472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897291875626881, + "loss": 3.6098, + "theoretical_loss": 4.771311193746191, + "tokens_seen": 100139008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897191574724173, + "loss": 3.2068, + "theoretical_loss": 4.770878472111465, + "tokens_seen": 100204544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897091273821465, + "loss": 3.6295, + "theoretical_loss": 4.770446112577445, + "tokens_seen": 100270080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896990972918756, + "loss": 3.4111, + "theoretical_loss": 4.770014114604756, + "tokens_seen": 100335616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896890672016048, + "loss": 3.5571, + "theoretical_loss": 4.769582477655177, + "tokens_seen": 100401152 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489679037111334, + "loss": 3.2238, + "theoretical_loss": 4.769151201191641, + "tokens_seen": 100466688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896690070210632, + "loss": 3.501, + "theoretical_loss": 4.768720284678228, + "tokens_seen": 100532224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896589769307924, + "loss": 3.6858, + "theoretical_loss": 4.768289727580161, + "tokens_seen": 100597760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896489468405215, + "loss": 3.6342, + "theoretical_loss": 4.767859529363809, + "tokens_seen": 100663296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896389167502507, + "loss": 3.3918, + "theoretical_loss": 4.767429689496682, + "tokens_seen": 100728832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896288866599799, + "loss": 3.4626, + "theoretical_loss": 4.767000207447417, + "tokens_seen": 100794368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896188565697091, + "loss": 3.4291, + "theoretical_loss": 4.766571082685794, + "tokens_seen": 100859904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896088264794384, + "loss": 3.4262, + "theoretical_loss": 4.766142314682716, + "tokens_seen": 100925440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895987963891674, + "loss": 3.8865, + "theoretical_loss": 4.765713902910214, + "tokens_seen": 100990976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895887662988967, + "loss": 3.4615, + "theoretical_loss": 4.765285846841444, + "tokens_seen": 101056512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895787362086259, + "loss": 3.4272, + "theoretical_loss": 4.76485814595068, + "tokens_seen": 101122048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895687061183551, + "loss": 3.5119, + "theoretical_loss": 4.764430799713314, + "tokens_seen": 101187584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895586760280843, + "loss": 3.6724, + "theoretical_loss": 4.764003807605853, + "tokens_seen": 101253120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895486459378135, + "loss": 3.6045, + "theoretical_loss": 4.763577169105912, + "tokens_seen": 101318656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895386158475426, + "loss": 3.4985, + "theoretical_loss": 4.763150883692218, + "tokens_seen": 101384192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895285857572718, + "loss": 3.5738, + "theoretical_loss": 4.762724950844598, + "tokens_seen": 101449728 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489518555667001, + "loss": 3.4495, + "theoretical_loss": 4.762299370043984, + "tokens_seen": 101515264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 68617, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4468700885772705, + "objective/train/theoretical_loss": 4.761874140772408, + "objective/train/tokens_used": 122040800, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895085255767302, + "loss": 3.5015, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894984954864594, + "loss": 3.7421, + "theoretical_loss": 4.761449262512993, + "tokens_seen": 101646336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894884653961885, + "loss": 3.8129, + "theoretical_loss": 4.761024734749958, + "tokens_seen": 101711872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894784353059178, + "loss": 3.7874, + "theoretical_loss": 4.76060055696861, + "tokens_seen": 101777408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894684052156469, + "loss": 3.7841, + "theoretical_loss": 4.760176728655345, + "tokens_seen": 101842944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894583751253762, + "loss": 3.4924, + "theoretical_loss": 4.75975324929764, + "tokens_seen": 101908480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894483450351053, + "loss": 3.3683, + "theoretical_loss": 4.759330118384053, + "tokens_seen": 101974016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894383149448346, + "loss": 3.7427, + "theoretical_loss": 4.758907335404221, + "tokens_seen": 102039552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894282848545637, + "loss": 3.6284, + "theoretical_loss": 4.758484899848854, + "tokens_seen": 102105088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894182547642929, + "loss": 3.4244, + "theoretical_loss": 4.7580628112097365, + "tokens_seen": 102170624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894082246740221, + "loss": 3.4994, + "theoretical_loss": 4.7576410689797175, + "tokens_seen": 102236160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893981945837513, + "loss": 3.5959, + "theoretical_loss": 4.757219672652717, + "tokens_seen": 102301696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893881644934805, + "loss": 3.1601, + "theoretical_loss": 4.756798621723712, + "tokens_seen": 102367232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893781344032096, + "loss": 3.6548, + "theoretical_loss": 4.756377915688748, + "tokens_seen": 102432768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893681043129388, + "loss": 3.4761, + "theoretical_loss": 4.755957554044917, + "tokens_seen": 102498304 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489358074222668, + "loss": 3.5187, + "theoretical_loss": 4.755537536290373, + "tokens_seen": 102563840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893480441323972, + "loss": 3.4874, + "theoretical_loss": 4.755117861924321, + "tokens_seen": 102629376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893380140421264, + "loss": 3.387, + "theoretical_loss": 4.754698530447009, + "tokens_seen": 102694912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893279839518556, + "loss": 3.3774, + "theoretical_loss": 4.754279541359738, + "tokens_seen": 102760448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893179538615847, + "loss": 3.4792, + "theoretical_loss": 4.753860894164845, + "tokens_seen": 102825984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893079237713139, + "loss": 3.5093, + "theoretical_loss": 4.75344258836571, + "tokens_seen": 102891520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892978936810432, + "loss": 3.5069, + "theoretical_loss": 4.753024623466752, + "tokens_seen": 102957056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892878635907723, + "loss": 3.7293, + "theoretical_loss": 4.752606998973421, + "tokens_seen": 103022592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892778335005016, + "loss": 3.5777, + "theoretical_loss": 4.752189714392202, + "tokens_seen": 103088128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892678034102307, + "loss": 3.5236, + "theoretical_loss": 4.7517727692306035, + "tokens_seen": 103153664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 69265, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0723447799682617, + "objective/train/theoretical_loss": 4.751356162997164, + "objective/train/tokens_used": 123679200, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892577733199599, + "loss": 3.525, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892477432296891, + "loss": 3.602, + "theoretical_loss": 4.750939895201443, + "tokens_seen": 103284736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892377131394183, + "loss": 3.4738, + "theoretical_loss": 4.750523965354024, + "tokens_seen": 103350272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892276830491475, + "loss": 3.3909, + "theoretical_loss": 4.750108372966501, + "tokens_seen": 103415808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892176529588767, + "loss": 3.2759, + "theoretical_loss": 4.749693117551491, + "tokens_seen": 103481344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892076228686058, + "loss": 3.7655, + "theoretical_loss": 4.749278198622617, + "tokens_seen": 103546880 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489197592778335, + "loss": 3.4825, + "theoretical_loss": 4.748863615694514, + "tokens_seen": 103612416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891875626880642, + "loss": 3.7804, + "theoretical_loss": 4.748449368282822, + "tokens_seen": 103677952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891775325977934, + "loss": 3.4161, + "theoretical_loss": 4.748035455904185, + "tokens_seen": 103743488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891675025075226, + "loss": 3.5973, + "theoretical_loss": 4.747621878076252, + "tokens_seen": 103809024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891574724172517, + "loss": 3.5654, + "theoretical_loss": 4.747208634317664, + "tokens_seen": 103874560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891474423269809, + "loss": 3.6798, + "theoretical_loss": 4.746795724148061, + "tokens_seen": 103940096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891374122367101, + "loss": 3.7902, + "theoretical_loss": 4.746383147088078, + "tokens_seen": 104005632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891273821464393, + "loss": 3.2944, + "theoretical_loss": 4.745970902659338, + "tokens_seen": 104071168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891173520561686, + "loss": 3.6076, + "theoretical_loss": 4.745558990384451, + "tokens_seen": 104136704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891073219658976, + "loss": 3.3813, + "theoretical_loss": 4.7451474097870125, + "tokens_seen": 104202240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890972918756269, + "loss": 3.245, + "theoretical_loss": 4.744736160391602, + "tokens_seen": 104267776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890872617853561, + "loss": 3.6455, + "theoretical_loss": 4.744325241723777, + "tokens_seen": 104333312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890772316950853, + "loss": 3.6057, + "theoretical_loss": 4.743914653310073, + "tokens_seen": 104398848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890672016048145, + "loss": 3.2381, + "theoretical_loss": 4.743504394678, + "tokens_seen": 104464384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890571715145437, + "loss": 3.5268, + "theoretical_loss": 4.743094465356039, + "tokens_seen": 104529920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890471414242728, + "loss": 3.4362, + "theoretical_loss": 4.742684864873641, + "tokens_seen": 104595456 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489037111334002, + "loss": 3.2982, + "theoretical_loss": 4.742275592761223, + "tokens_seen": 104660992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890270812437312, + "loss": 3.5981, + "theoretical_loss": 4.741866648550168, + "tokens_seen": 104726528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890170511534604, + "loss": 3.5663, + "theoretical_loss": 4.741458031772817, + "tokens_seen": 104792064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 70556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0196893215179443, + "objective/train/theoretical_loss": 4.741049741962473, + "objective/train/tokens_used": 125317600, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890070210631896, + "loss": 3.3116, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889969909729187, + "loss": 3.3268, + "theoretical_loss": 4.740641778653395, + "tokens_seen": 104923136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889869608826479, + "loss": 3.535, + "theoretical_loss": 4.740234141380794, + "tokens_seen": 104988672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889769307923771, + "loss": 3.5609, + "theoretical_loss": 4.739826829680833, + "tokens_seen": 105054208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889669007021063, + "loss": 3.2398, + "theoretical_loss": 4.739419843090626, + "tokens_seen": 105119744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889568706118355, + "loss": 3.6536, + "theoretical_loss": 4.739013181148229, + "tokens_seen": 105185280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889468405215647, + "loss": 3.2403, + "theoretical_loss": 4.738606843392644, + "tokens_seen": 105250816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889368104312939, + "loss": 3.3196, + "theoretical_loss": 4.738200829363815, + "tokens_seen": 105316352 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488926780341023, + "loss": 3.5581, + "theoretical_loss": 4.737795138602624, + "tokens_seen": 105381888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889167502507523, + "loss": 3.478, + "theoretical_loss": 4.737389770650887, + "tokens_seen": 105447424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889067201604815, + "loss": 3.4703, + "theoretical_loss": 4.736984725051357, + "tokens_seen": 105512960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888966900702107, + "loss": 3.5458, + "theoretical_loss": 4.736580001347717, + "tokens_seen": 105578496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888866599799398, + "loss": 3.5523, + "theoretical_loss": 4.736175599084576, + "tokens_seen": 105644032 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488876629889669, + "loss": 3.4106, + "theoretical_loss": 4.735771517807473, + "tokens_seen": 105709568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888665997993982, + "loss": 3.6541, + "theoretical_loss": 4.735367757062869, + "tokens_seen": 105775104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888565697091274, + "loss": 3.4943, + "theoretical_loss": 4.734964316398148, + "tokens_seen": 105840640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888465396188566, + "loss": 3.4438, + "theoretical_loss": 4.734561195361609, + "tokens_seen": 105906176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888365095285858, + "loss": 3.5359, + "theoretical_loss": 4.734158393502471, + "tokens_seen": 105971712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888264794383149, + "loss": 3.6042, + "theoretical_loss": 4.733755910370867, + "tokens_seen": 106037248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888164493480441, + "loss": 3.4694, + "theoretical_loss": 4.73335374551784, + "tokens_seen": 106102784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888064192577733, + "loss": 3.3701, + "theoretical_loss": 4.732951898495341, + "tokens_seen": 106168320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887963891675025, + "loss": 3.4071, + "theoretical_loss": 4.7325503688562325, + "tokens_seen": 106233856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887863590772317, + "loss": 3.4026, + "theoretical_loss": 4.732149156154276, + "tokens_seen": 106299392 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488776328986961, + "loss": 3.5821, + "theoretical_loss": 4.731748259944139, + "tokens_seen": 106364928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048876629889669, + "loss": 3.1001, + "theoretical_loss": 4.731347679781386, + "tokens_seen": 106430464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 70950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.365251302719116, + "objective/train/theoretical_loss": 4.730947415222481, + "objective/train/tokens_used": 126956000, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887562688064193, + "loss": 3.3721, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887462387161484, + "loss": 3.3913, + "theoretical_loss": 4.730547465824781, + "tokens_seen": 106561536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887362086258777, + "loss": 3.477, + "theoretical_loss": 4.730147831146537, + "tokens_seen": 106627072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887261785356069, + "loss": 3.4452, + "theoretical_loss": 4.72974851074689, + "tokens_seen": 106692608 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488716148445336, + "loss": 3.3808, + "theoretical_loss": 4.729349504185867, + "tokens_seen": 106758144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887061183550652, + "loss": 3.6197, + "theoretical_loss": 4.728950811024383, + "tokens_seen": 106823680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886960882647944, + "loss": 3.2728, + "theoretical_loss": 4.7285524308242355, + "tokens_seen": 106889216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886860581745236, + "loss": 3.4828, + "theoretical_loss": 4.728154363148102, + "tokens_seen": 106954752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886760280842528, + "loss": 3.645, + "theoretical_loss": 4.72775660755954, + "tokens_seen": 107020288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886659979939819, + "loss": 3.5389, + "theoretical_loss": 4.72735916362298, + "tokens_seen": 107085824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886559679037111, + "loss": 3.2972, + "theoretical_loss": 4.7269620309037315, + "tokens_seen": 107151360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886459378134403, + "loss": 3.3874, + "theoretical_loss": 4.726565208967973, + "tokens_seen": 107216896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886359077231695, + "loss": 3.375, + "theoretical_loss": 4.726168697382751, + "tokens_seen": 107282432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886258776328988, + "loss": 3.6601, + "theoretical_loss": 4.725772495715983, + "tokens_seen": 107347968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886158475426278, + "loss": 3.369, + "theoretical_loss": 4.725376603536446, + "tokens_seen": 107413504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886058174523571, + "loss": 3.2596, + "theoretical_loss": 4.724981020413787, + "tokens_seen": 107479040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885957873620863, + "loss": 3.3279, + "theoretical_loss": 4.724585745918505, + "tokens_seen": 107544576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885857572718155, + "loss": 3.4956, + "theoretical_loss": 4.7241907796219635, + "tokens_seen": 107610112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885757271815447, + "loss": 3.4726, + "theoretical_loss": 4.723796121096381, + "tokens_seen": 107675648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885656970912739, + "loss": 3.5414, + "theoretical_loss": 4.723401769914824, + "tokens_seen": 107741184 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488555667001003, + "loss": 3.2837, + "theoretical_loss": 4.723007725651219, + "tokens_seen": 107806720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885456369107322, + "loss": 3.4393, + "theoretical_loss": 4.722613987880335, + "tokens_seen": 107872256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885356068204614, + "loss": 3.4046, + "theoretical_loss": 4.722220556177792, + "tokens_seen": 107937792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885255767301906, + "loss": 3.3551, + "theoretical_loss": 4.721827430120053, + "tokens_seen": 108003328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885155466399198, + "loss": 3.4638, + "theoretical_loss": 4.721434609284424, + "tokens_seen": 108068864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 72030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6380910873413086, + "objective/train/theoretical_loss": 4.721042093249051, + "objective/train/tokens_used": 128594400, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488505516549649, + "loss": 3.2402, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884954864593781, + "loss": 3.3689, + "theoretical_loss": 4.720649881592919, + "tokens_seen": 108199936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884854563691073, + "loss": 3.5546, + "theoretical_loss": 4.7202579738958494, + "tokens_seen": 108265472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884754262788365, + "loss": 3.2907, + "theoretical_loss": 4.7198663697384955, + "tokens_seen": 108331008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884653961885657, + "loss": 3.763, + "theoretical_loss": 4.719475068702346, + "tokens_seen": 108396544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884553660982949, + "loss": 3.3741, + "theoretical_loss": 4.719084070369714, + "tokens_seen": 108462080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884453360080241, + "loss": 3.5399, + "theoretical_loss": 4.718693374323747, + "tokens_seen": 108527616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884353059177532, + "loss": 3.5136, + "theoretical_loss": 4.718302980148412, + "tokens_seen": 108593152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884252758274825, + "loss": 3.3874, + "theoretical_loss": 4.717912887428501, + "tokens_seen": 108658688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884152457372117, + "loss": 3.2129, + "theoretical_loss": 4.717523095749626, + "tokens_seen": 108724224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884052156469409, + "loss": 3.1945, + "theoretical_loss": 4.717133604698222, + "tokens_seen": 108789760 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048839518555667, + "loss": 3.4207, + "theoretical_loss": 4.7167444138615355, + "tokens_seen": 108855296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883851554663992, + "loss": 3.6135, + "theoretical_loss": 4.716355522827633, + "tokens_seen": 108920832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883751253761284, + "loss": 3.7415, + "theoretical_loss": 4.715966931185388, + "tokens_seen": 108986368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883650952858576, + "loss": 3.6946, + "theoretical_loss": 4.715578638524491, + "tokens_seen": 109051904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883550651955868, + "loss": 3.4421, + "theoretical_loss": 4.715190644435435, + "tokens_seen": 109117440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488345035105316, + "loss": 3.6584, + "theoretical_loss": 4.714802948509522, + "tokens_seen": 109182976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883350050150451, + "loss": 3.3107, + "theoretical_loss": 4.71441555033886, + "tokens_seen": 109248512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883249749247743, + "loss": 3.5758, + "theoretical_loss": 4.714028449516356, + "tokens_seen": 109314048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883149448345035, + "loss": 3.3739, + "theoretical_loss": 4.713641645635718, + "tokens_seen": 109379584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883049147442327, + "loss": 3.1408, + "theoretical_loss": 4.713255138291454, + "tokens_seen": 109445120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882948846539619, + "loss": 3.6864, + "theoretical_loss": 4.712868927078868, + "tokens_seen": 109510656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882848545636911, + "loss": 3.2615, + "theoretical_loss": 4.712483011594056, + "tokens_seen": 109576192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882748244734203, + "loss": 3.5947, + "theoretical_loss": 4.7120973914339075, + "tokens_seen": 109641728 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048826479438314946, + "loss": 3.3371, + "theoretical_loss": 4.7117120661961005, + "tokens_seen": 109707264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 72698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.991194009780884, + "objective/train/theoretical_loss": 4.711327035479103, + "objective/train/tokens_used": 130232800, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048825476429287864, + "loss": 3.2689, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882447342026078, + "loss": 3.4525, + "theoretical_loss": 4.710942298882169, + "tokens_seen": 109838336 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488234704112337, + "loss": 3.3659, + "theoretical_loss": 4.710557856005335, + "tokens_seen": 109903872 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048822467402206624, + "loss": 3.4411, + "theoretical_loss": 4.710173706449419, + "tokens_seen": 109969408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048821464393179536, + "loss": 3.382, + "theoretical_loss": 4.709789849816021, + "tokens_seen": 110034944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882046138415246, + "loss": 3.3798, + "theoretical_loss": 4.7094062857075185, + "tokens_seen": 110100480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881945837512537, + "loss": 3.5293, + "theoretical_loss": 4.709023013727063, + "tokens_seen": 110166016 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048818455366098296, + "loss": 3.6122, + "theoretical_loss": 4.708640033478584, + "tokens_seen": 110231552 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048817452357071214, + "loss": 3.3274, + "theoretical_loss": 4.708257344566778, + "tokens_seen": 110297088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881644934804413, + "loss": 3.554, + "theoretical_loss": 4.7078749465971175, + "tokens_seen": 110362624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881544633901705, + "loss": 3.3524, + "theoretical_loss": 4.707492839175837, + "tokens_seen": 110428160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048814443329989974, + "loss": 3.5396, + "theoretical_loss": 4.707111021909941, + "tokens_seen": 110493696 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048813440320962887, + "loss": 3.5515, + "theoretical_loss": 4.706729494407197, + "tokens_seen": 110559232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881243731193581, + "loss": 3.2664, + "theoretical_loss": 4.706348256276138, + "tokens_seen": 110624768 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048811434302908723, + "loss": 3.4359, + "theoretical_loss": 4.705967307126051, + "tokens_seen": 110690304 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048810431293881646, + "loss": 3.6256, + "theoretical_loss": 4.705586646566987, + "tokens_seen": 110755840 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048809428284854564, + "loss": 3.5248, + "theoretical_loss": 4.705206274209751, + "tokens_seen": 110821376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880842527582748, + "loss": 3.5576, + "theoretical_loss": 4.704826189665905, + "tokens_seen": 110886912 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488074222668004, + "loss": 3.3691, + "theoretical_loss": 4.704446392547759, + "tokens_seen": 110952448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880641925777332, + "loss": 3.6296, + "theoretical_loss": 4.7040668824683785, + "tokens_seen": 111017984 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048805416248746237, + "loss": 3.5711, + "theoretical_loss": 4.7036876590415755, + "tokens_seen": 111083520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880441323971916, + "loss": 3.1637, + "theoretical_loss": 4.7033087218819105, + "tokens_seen": 111149056 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048803410230692073, + "loss": 3.4891, + "theoretical_loss": 4.7029300706046895, + "tokens_seen": 111214592 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048802407221664997, + "loss": 3.4164, + "theoretical_loss": 4.702551704825957, + "tokens_seen": 111280128 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048801404212637915, + "loss": 3.2079, + "theoretical_loss": 4.702173624162507, + "tokens_seen": 111345664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 74080, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.588409900665283, + "objective/train/theoretical_loss": 4.701795828231866, + "objective/train/tokens_used": 131871200, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048800401203610833, + "loss": 3.4279, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879939819458375, + "loss": 3.4107, + "theoretical_loss": 4.701418316652299, + "tokens_seen": 111476736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879839518555667, + "loss": 3.4536, + "theoretical_loss": 4.701041089042813, + "tokens_seen": 111542272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879739217652959, + "loss": 3.7348, + "theoretical_loss": 4.700664145023142, + "tokens_seen": 111607808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879638916750251, + "loss": 3.4685, + "theoretical_loss": 4.700287484213753, + "tokens_seen": 111673344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879538615847543, + "loss": 3.4494, + "theoretical_loss": 4.699911106235849, + "tokens_seen": 111738880 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048794383149448347, + "loss": 3.5418, + "theoretical_loss": 4.6995350107113545, + "tokens_seen": 111804416 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048793380140421265, + "loss": 3.5618, + "theoretical_loss": 4.699159197262922, + "tokens_seen": 111869952 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048792377131394183, + "loss": 3.3049, + "theoretical_loss": 4.698783665513934, + "tokens_seen": 111935488 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048791374122367107, + "loss": 3.4054, + "theoretical_loss": 4.698408415088491, + "tokens_seen": 112001024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879037111334002, + "loss": 3.5376, + "theoretical_loss": 4.698033445611415, + "tokens_seen": 112066560 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048789368104312943, + "loss": 3.5404, + "theoretical_loss": 4.6976587567082495, + "tokens_seen": 112132096 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048788365095285856, + "loss": 3.6666, + "theoretical_loss": 4.697284348005253, + "tokens_seen": 112197632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878736208625878, + "loss": 3.6087, + "theoretical_loss": 4.696910219129402, + "tokens_seen": 112263168 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048786359077231697, + "loss": 3.5387, + "theoretical_loss": 4.696536369708386, + "tokens_seen": 112328704 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048785356068204615, + "loss": 3.3868, + "theoretical_loss": 4.696162799370606, + "tokens_seen": 112394240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048784353059177533, + "loss": 3.6141, + "theoretical_loss": 4.695789507745176, + "tokens_seen": 112459776 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048783350050150457, + "loss": 3.5212, + "theoretical_loss": 4.695416494461917, + "tokens_seen": 112525312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878234704112337, + "loss": 3.4791, + "theoretical_loss": 4.695043759151353, + "tokens_seen": 112590848 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048781344032096293, + "loss": 3.4949, + "theoretical_loss": 4.694671301444722, + "tokens_seen": 112656384 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048780341023069206, + "loss": 3.6483, + "theoretical_loss": 4.694299120973957, + "tokens_seen": 112721920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877933801404213, + "loss": 3.5765, + "theoretical_loss": 4.693927217371698, + "tokens_seen": 112787456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877833500501505, + "loss": 3.5005, + "theoretical_loss": 4.693555590271282, + "tokens_seen": 112852992 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048777331995987966, + "loss": 3.3525, + "theoretical_loss": 4.693184239306744, + "tokens_seen": 112918528 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048776328986960884, + "loss": 3.5055, + "theoretical_loss": 4.692813164112819, + "tokens_seen": 112984064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 74817, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.344128131866455, + "objective/train/theoretical_loss": 4.692442364324931, + "objective/train/tokens_used": 133509600, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487753259779338, + "loss": 3.3455, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877432296890672, + "loss": 3.4303, + "theoretical_loss": 4.692071839579201, + "tokens_seen": 113115136 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048773319959879644, + "loss": 3.4279, + "theoretical_loss": 4.6917015895124425, + "tokens_seen": 113180672 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048772316950852556, + "loss": 3.7147, + "theoretical_loss": 4.691331613762153, + "tokens_seen": 113246208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877131394182548, + "loss": 3.4796, + "theoretical_loss": 4.690961911966523, + "tokens_seen": 113311744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877031093279839, + "loss": 3.4674, + "theoretical_loss": 4.690592483764427, + "tokens_seen": 113377280 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048769307923771316, + "loss": 3.4433, + "theoretical_loss": 4.690223328795424, + "tokens_seen": 113442816 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048768304914744234, + "loss": 3.2091, + "theoretical_loss": 4.689854446699757, + "tokens_seen": 113508352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876730190571715, + "loss": 3.345, + "theoretical_loss": 4.689485837118347, + "tokens_seen": 113573888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876629889669007, + "loss": 3.3691, + "theoretical_loss": 4.689117499692798, + "tokens_seen": 113639424 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048765295887662994, + "loss": 3.5643, + "theoretical_loss": 4.688749434065389, + "tokens_seen": 113704960 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048764292878635907, + "loss": 3.241, + "theoretical_loss": 4.688381639879076, + "tokens_seen": 113770496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876328986960883, + "loss": 3.4272, + "theoretical_loss": 4.68801411677749, + "tokens_seen": 113836032 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048762286860581743, + "loss": 3.3181, + "theoretical_loss": 4.687646864404934, + "tokens_seen": 113901568 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048761283851554666, + "loss": 3.6051, + "theoretical_loss": 4.687279882406381, + "tokens_seen": 113967104 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048760280842527584, + "loss": 3.1655, + "theoretical_loss": 4.686913170427477, + "tokens_seen": 114032640 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487592778335005, + "loss": 3.3195, + "theoretical_loss": 4.68654672811453, + "tokens_seen": 114098176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875827482447342, + "loss": 3.4204, + "theoretical_loss": 4.68618055511452, + "tokens_seen": 114163712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875727181544634, + "loss": 3.3986, + "theoretical_loss": 4.685814651075088, + "tokens_seen": 114229248 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048756268806419257, + "loss": 3.3853, + "theoretical_loss": 4.685449015644537, + "tokens_seen": 114294784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875526579739218, + "loss": 3.1979, + "theoretical_loss": 4.685083648471835, + "tokens_seen": 114360320 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048754262788365093, + "loss": 3.3925, + "theoretical_loss": 4.684718549206607, + "tokens_seen": 114425856 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048753259779338017, + "loss": 3.4082, + "theoretical_loss": 4.6843537174991345, + "tokens_seen": 114491392 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048752256770310935, + "loss": 3.3881, + "theoretical_loss": 4.6839891530003595, + "tokens_seen": 114556928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048751253761283853, + "loss": 3.4909, + "theoretical_loss": 4.683624855361876, + "tokens_seen": 114622464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 76169, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.169177770614624, + "objective/train/theoretical_loss": 4.68326082423593, + "objective/train/tokens_used": 135148000, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875025075225677, + "loss": 3.1599, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874924774322969, + "loss": 3.744, + "theoretical_loss": 4.682897059275422, + "tokens_seen": 114753536 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048748244734202607, + "loss": 3.5137, + "theoretical_loss": 4.682533560133901, + "tokens_seen": 114819072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874724172517553, + "loss": 3.3442, + "theoretical_loss": 4.682170326465565, + "tokens_seen": 114884608 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048746238716148443, + "loss": 3.4037, + "theoretical_loss": 4.681807357925257, + "tokens_seen": 114950144 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048745235707121367, + "loss": 3.6075, + "theoretical_loss": 4.681444654168468, + "tokens_seen": 115015680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874423269809428, + "loss": 3.3067, + "theoretical_loss": 4.68108221485133, + "tokens_seen": 115081216 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048743229689067203, + "loss": 3.5086, + "theoretical_loss": 4.680720039630617, + "tokens_seen": 115146752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874222668004012, + "loss": 3.1917, + "theoretical_loss": 4.680358128163747, + "tokens_seen": 115212288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874122367101304, + "loss": 3.6048, + "theoretical_loss": 4.679996480108773, + "tokens_seen": 115277824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874022066198596, + "loss": 3.3722, + "theoretical_loss": 4.6796350951243895, + "tokens_seen": 115343360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048739217652958876, + "loss": 3.491, + "theoretical_loss": 4.679273972869922, + "tokens_seen": 115408896 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048738214643931794, + "loss": 3.3156, + "theoretical_loss": 4.678913113005333, + "tokens_seen": 115474432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048737211634904717, + "loss": 3.3652, + "theoretical_loss": 4.6785525151912175, + "tokens_seen": 115539968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873620862587763, + "loss": 3.3324, + "theoretical_loss": 4.678192179088802, + "tokens_seen": 115605504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048735205616850553, + "loss": 3.6088, + "theoretical_loss": 4.6778321043599425, + "tokens_seen": 115671040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873420260782347, + "loss": 3.3234, + "theoretical_loss": 4.677472290667122, + "tokens_seen": 115736576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873319959879639, + "loss": 3.3235, + "theoretical_loss": 4.677112737673453, + "tokens_seen": 115802112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873219658976931, + "loss": 3.3194, + "theoretical_loss": 4.676753445042669, + "tokens_seen": 115867648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048731193580742226, + "loss": 3.0035, + "theoretical_loss": 4.676394412439132, + "tokens_seen": 115933184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048730190571715144, + "loss": 3.2693, + "theoretical_loss": 4.6760356395278215, + "tokens_seen": 115998720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872918756268807, + "loss": 3.3124, + "theoretical_loss": 4.675677125974339, + "tokens_seen": 116064256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872818455366098, + "loss": 3.4606, + "theoretical_loss": 4.675318871444908, + "tokens_seen": 116129792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048727181544633904, + "loss": 3.3459, + "theoretical_loss": 4.674960875606366, + "tokens_seen": 116195328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048726178535606816, + "loss": 3.463, + "theoretical_loss": 4.674603138126168, + "tokens_seen": 116260864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 76931, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.525822162628174, + "objective/train/theoretical_loss": 4.674245658672382, + "objective/train/tokens_used": 136786400, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872517552657974, + "loss": 3.5205, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872417251755266, + "loss": 3.2274, + "theoretical_loss": 4.673888436913694, + "tokens_seen": 116391936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048723169508525576, + "loss": 3.524, + "theoretical_loss": 4.673531472519397, + "tokens_seen": 116457472 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487221664994985, + "loss": 3.5341, + "theoretical_loss": 4.673174765159393, + "tokens_seen": 116523008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872116349047141, + "loss": 3.3575, + "theoretical_loss": 4.672818314504198, + "tokens_seen": 116588544 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048720160481444336, + "loss": 3.2956, + "theoretical_loss": 4.6724621202249335, + "tokens_seen": 116654080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048719157472417254, + "loss": 3.4414, + "theoretical_loss": 4.672106181993324, + "tokens_seen": 116719616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871815446339017, + "loss": 3.4029, + "theoretical_loss": 4.6717504994817, + "tokens_seen": 116785152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871715145436309, + "loss": 3.3539, + "theoretical_loss": 4.671395072362996, + "tokens_seen": 116850688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048716148445336014, + "loss": 3.7625, + "theoretical_loss": 4.671039900310747, + "tokens_seen": 116916224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048715145436308927, + "loss": 3.2591, + "theoretical_loss": 4.670684982999088, + "tokens_seen": 116981760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871414242728185, + "loss": 3.4151, + "theoretical_loss": 4.670330320102753, + "tokens_seen": 117047296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048713139418254763, + "loss": 3.4562, + "theoretical_loss": 4.669975911297072, + "tokens_seen": 117112832 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048712136409227686, + "loss": 3.4337, + "theoretical_loss": 4.669621756257971, + "tokens_seen": 117178368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048711133400200604, + "loss": 3.3579, + "theoretical_loss": 4.669267854661973, + "tokens_seen": 117243904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871013039117352, + "loss": 3.3642, + "theoretical_loss": 4.668914206186189, + "tokens_seen": 117309440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870912738214644, + "loss": 3.4332, + "theoretical_loss": 4.6685608105083265, + "tokens_seen": 117374976 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870812437311936, + "loss": 3.5261, + "theoretical_loss": 4.66820766730668, + "tokens_seen": 117440512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048707121364092277, + "loss": 3.2805, + "theoretical_loss": 4.667854776260132, + "tokens_seen": 117506048 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487061183550652, + "loss": 3.5186, + "theoretical_loss": 4.667502137048155, + "tokens_seen": 117571584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048705115346038113, + "loss": 3.3778, + "theoretical_loss": 4.667149749350805, + "tokens_seen": 117637120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048704112337011037, + "loss": 3.1395, + "theoretical_loss": 4.666797612848723, + "tokens_seen": 117702656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048703109327983955, + "loss": 3.3278, + "theoretical_loss": 4.666445727223134, + "tokens_seen": 117768192 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048702106318956873, + "loss": 3.2112, + "theoretical_loss": 4.666094092155843, + "tokens_seen": 117833728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870110330992979, + "loss": 3.6812, + "theoretical_loss": 4.665742707329238, + "tokens_seen": 117899264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 78354, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.753464460372925, + "objective/train/theoretical_loss": 4.665391572426282, + "objective/train/tokens_used": 138424800, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870010030090271, + "loss": 3.4575, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048699097291875627, + "loss": 3.2806, + "theoretical_loss": 4.665040687130518, + "tokens_seen": 118030336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869809428284855, + "loss": 3.2856, + "theoretical_loss": 4.664690051126065, + "tokens_seen": 118095872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048697091273821463, + "loss": 3.6047, + "theoretical_loss": 4.664339664097617, + "tokens_seen": 118161408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048696088264794387, + "loss": 3.3602, + "theoretical_loss": 4.66398952573044, + "tokens_seen": 118226944 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486950852557673, + "loss": 3.6397, + "theoretical_loss": 4.663639635710373, + "tokens_seen": 118292480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048694082246740223, + "loss": 3.5041, + "theoretical_loss": 4.663289993723826, + "tokens_seen": 118358016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869307923771314, + "loss": 3.6349, + "theoretical_loss": 4.662940599457777, + "tokens_seen": 118423552 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869207622868606, + "loss": 3.311, + "theoretical_loss": 4.662591452599774, + "tokens_seen": 118489088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869107321965898, + "loss": 3.3616, + "theoretical_loss": 4.662242552837929, + "tokens_seen": 118554624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048690070210631896, + "loss": 3.5774, + "theoretical_loss": 4.661893899860923, + "tokens_seen": 118620160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048689067201604814, + "loss": 3.5329, + "theoretical_loss": 4.6615454933579965, + "tokens_seen": 118685696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048688064192577737, + "loss": 3.3496, + "theoretical_loss": 4.661197333018957, + "tokens_seen": 118751232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868706118355065, + "loss": 3.1691, + "theoretical_loss": 4.66084941853417, + "tokens_seen": 118816768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048686058174523573, + "loss": 3.4726, + "theoretical_loss": 4.6605017495945615, + "tokens_seen": 118882304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868505516549649, + "loss": 3.2301, + "theoretical_loss": 4.660154325891618, + "tokens_seen": 118947840 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868405215646941, + "loss": 3.3908, + "theoretical_loss": 4.659807147117382, + "tokens_seen": 119013376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868304914744233, + "loss": 3.5186, + "theoretical_loss": 4.6594602129644525, + "tokens_seen": 119078912 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048682046138415246, + "loss": 3.5771, + "theoretical_loss": 4.659113523125981, + "tokens_seen": 119144448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048681043129388164, + "loss": 3.4647, + "theoretical_loss": 4.6587670772956775, + "tokens_seen": 119209984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868004012036109, + "loss": 3.359, + "theoretical_loss": 4.658420875167799, + "tokens_seen": 119275520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048679037111334, + "loss": 3.2057, + "theoretical_loss": 4.658074916437155, + "tokens_seen": 119341056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048678034102306924, + "loss": 3.5888, + "theoretical_loss": 4.657729200799105, + "tokens_seen": 119406592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048677031093279836, + "loss": 3.3121, + "theoretical_loss": 4.657383727949558, + "tokens_seen": 119472128 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867602808425276, + "loss": 3.4127, + "theoretical_loss": 4.657038497584967, + "tokens_seen": 119537664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 78905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3278000354766846, + "objective/train/theoretical_loss": 4.656693509402331, + "objective/train/tokens_used": 140063200, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867502507522568, + "loss": 3.5357, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048674022066198596, + "loss": 3.3471, + "theoretical_loss": 4.6563487630991975, + "tokens_seen": 119668736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048673019057171514, + "loss": 3.5287, + "theoretical_loss": 4.656004258373651, + "tokens_seen": 119734272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867201604814443, + "loss": 3.5142, + "theoretical_loss": 4.655659994924323, + "tokens_seen": 119799808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867101303911735, + "loss": 3.2315, + "theoretical_loss": 4.655315972450383, + "tokens_seen": 119865344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048670010030090274, + "loss": 3.1649, + "theoretical_loss": 4.65497219065154, + "tokens_seen": 119930880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048669007021063187, + "loss": 3.5037, + "theoretical_loss": 4.654628649228041, + "tokens_seen": 119996416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866800401203611, + "loss": 3.2458, + "theoretical_loss": 4.654285347880672, + "tokens_seen": 120061952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866700100300903, + "loss": 3.407, + "theoretical_loss": 4.653942286310749, + "tokens_seen": 120127488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048665997993981947, + "loss": 3.4908, + "theoretical_loss": 4.653599464220129, + "tokens_seen": 120193024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048664994984954865, + "loss": 3.3596, + "theoretical_loss": 4.653256881311198, + "tokens_seen": 120258560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048663991975927783, + "loss": 3.489, + "theoretical_loss": 4.6529145372868745, + "tokens_seen": 120324096 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486629889669007, + "loss": 3.3819, + "theoretical_loss": 4.652572431850608, + "tokens_seen": 120389632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048661985957873624, + "loss": 3.2704, + "theoretical_loss": 4.652230564706377, + "tokens_seen": 120455168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048660982948846537, + "loss": 3.2807, + "theoretical_loss": 4.651888935558688, + "tokens_seen": 120520704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865997993981946, + "loss": 3.4143, + "theoretical_loss": 4.651547544112575, + "tokens_seen": 120586240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048658976930792373, + "loss": 3.3822, + "theoretical_loss": 4.651206390073597, + "tokens_seen": 120651776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048657973921765297, + "loss": 3.0172, + "theoretical_loss": 4.650865473147837, + "tokens_seen": 120717312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048656970912738215, + "loss": 3.2458, + "theoretical_loss": 4.650524793041903, + "tokens_seen": 120782848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048655967903711133, + "loss": 3.3165, + "theoretical_loss": 4.650184349462922, + "tokens_seen": 120848384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865496489468405, + "loss": 3.5201, + "theoretical_loss": 4.649844142118544, + "tokens_seen": 120913920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048653961885656975, + "loss": 3.2466, + "theoretical_loss": 4.6495041707169396, + "tokens_seen": 120979456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865295887662989, + "loss": 3.4625, + "theoretical_loss": 4.649164434966794, + "tokens_seen": 121044992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865195586760281, + "loss": 3.4472, + "theoretical_loss": 4.648824934577313, + "tokens_seen": 121110528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048650952858575724, + "loss": 3.454, + "theoretical_loss": 4.648485669258216, + "tokens_seen": 121176064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 79692, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7176690101623535, + "objective/train/theoretical_loss": 4.648146638719739, + "objective/train/tokens_used": 141701600, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048649949849548647, + "loss": 3.6455, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048648946840521565, + "loss": 3.2591, + "theoretical_loss": 4.647807842672631, + "tokens_seen": 121307136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048647943831494483, + "loss": 3.1074, + "theoretical_loss": 4.647469280828153, + "tokens_seen": 121372672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048646940822467407, + "loss": 3.625, + "theoretical_loss": 4.647130952898077, + "tokens_seen": 121438208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864593781344032, + "loss": 3.6184, + "theoretical_loss": 4.646792858594686, + "tokens_seen": 121503744 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048644934804413243, + "loss": 3.3147, + "theoretical_loss": 4.64645499763077, + "tokens_seen": 121569280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864393179538616, + "loss": 3.3483, + "theoretical_loss": 4.646117369719629, + "tokens_seen": 121634816 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864292878635908, + "loss": 3.6571, + "theoretical_loss": 4.645779974575069, + "tokens_seen": 121700352 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048641925777332, + "loss": 3.392, + "theoretical_loss": 4.6454428119113995, + "tokens_seen": 121765888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048640922768304916, + "loss": 3.4246, + "theoretical_loss": 4.6451058814434365, + "tokens_seen": 121831424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048639919759277834, + "loss": 3.2547, + "theoretical_loss": 4.644769182886495, + "tokens_seen": 121896960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048638916750250757, + "loss": 3.4761, + "theoretical_loss": 4.644432715956399, + "tokens_seen": 121962496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863791374122367, + "loss": 3.3089, + "theoretical_loss": 4.644096480369466, + "tokens_seen": 122028032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048636910732196593, + "loss": 3.4689, + "theoretical_loss": 4.643760475842518, + "tokens_seen": 122093568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863590772316951, + "loss": 3.3779, + "theoretical_loss": 4.6434247020928705, + "tokens_seen": 122159104 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863490471414243, + "loss": 3.3198, + "theoretical_loss": 4.643089158838341, + "tokens_seen": 122224640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863390170511535, + "loss": 3.4073, + "theoretical_loss": 4.642753845797243, + "tokens_seen": 122290176 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048632898696088266, + "loss": 3.3571, + "theoretical_loss": 4.642418762688379, + "tokens_seen": 122355712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048631895687061184, + "loss": 3.4689, + "theoretical_loss": 4.642083909231053, + "tokens_seen": 122421248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863089267803411, + "loss": 3.1746, + "theoretical_loss": 4.641749285145057, + "tokens_seen": 122486784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862988966900702, + "loss": 3.339, + "theoretical_loss": 4.641414890150675, + "tokens_seen": 122552320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048628886659979944, + "loss": 3.2195, + "theoretical_loss": 4.641080723968684, + "tokens_seen": 122617856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048627883650952857, + "loss": 3.3498, + "theoretical_loss": 4.6407467863203475, + "tokens_seen": 122683392 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862688064192578, + "loss": 3.6232, + "theoretical_loss": 4.640413076927418, + "tokens_seen": 122748928 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486258776328987, + "loss": 3.3464, + "theoretical_loss": 4.6400795955121374, + "tokens_seen": 122814464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 81051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.561691999435425, + "objective/train/theoretical_loss": 4.639746341797229, + "objective/train/tokens_used": 143340000, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048624874623871616, + "loss": 3.3018, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048623871614844534, + "loss": 3.3855, + "theoretical_loss": 4.639413315505905, + "tokens_seen": 122945536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862286860581745, + "loss": 3.3287, + "theoretical_loss": 4.639080516361861, + "tokens_seen": 123011072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862186559679037, + "loss": 3.3515, + "theoretical_loss": 4.638747944089273, + "tokens_seen": 123076608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048620862587763294, + "loss": 3.5609, + "theoretical_loss": 4.638415598412799, + "tokens_seen": 123142144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048619859578736207, + "loss": 3.1183, + "theoretical_loss": 4.638083479057579, + "tokens_seen": 123207680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861885656970913, + "loss": 3.2753, + "theoretical_loss": 4.637751585749234, + "tokens_seen": 123273216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861785356068205, + "loss": 3.3623, + "theoretical_loss": 4.6374199182138565, + "tokens_seen": 123338752 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048616850551654967, + "loss": 3.2958, + "theoretical_loss": 4.637088476178025, + "tokens_seen": 123404288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048615847542627885, + "loss": 3.3467, + "theoretical_loss": 4.636757259368787, + "tokens_seen": 123469824 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048614844533600803, + "loss": 3.6196, + "theoretical_loss": 4.636426267513668, + "tokens_seen": 123535360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861384152457372, + "loss": 3.4397, + "theoretical_loss": 4.636095500340669, + "tokens_seen": 123600896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048612838515546644, + "loss": 3.3183, + "theoretical_loss": 4.635764957578261, + "tokens_seen": 123666432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048611835506519557, + "loss": 3.5385, + "theoretical_loss": 4.635434638955388, + "tokens_seen": 123731968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861083249749248, + "loss": 3.4339, + "theoretical_loss": 4.635104544201465, + "tokens_seen": 123797504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048609829488465393, + "loss": 3.4177, + "theoretical_loss": 4.634774673046376, + "tokens_seen": 123863040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048608826479438317, + "loss": 3.5387, + "theoretical_loss": 4.634445025220475, + "tokens_seen": 123928576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048607823470411235, + "loss": 3.2727, + "theoretical_loss": 4.634115600454582, + "tokens_seen": 123994112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048606820461384153, + "loss": 3.5262, + "theoretical_loss": 4.633786398479983, + "tokens_seen": 124059648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860581745235707, + "loss": 3.3889, + "theoretical_loss": 4.6334574190284314, + "tokens_seen": 124125184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048604814443329995, + "loss": 3.1077, + "theoretical_loss": 4.633128661832145, + "tokens_seen": 124190720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860381143430291, + "loss": 3.5107, + "theoretical_loss": 4.632800126623803, + "tokens_seen": 124256256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860280842527583, + "loss": 3.4446, + "theoretical_loss": 4.632471813136547, + "tokens_seen": 124321792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048601805416248744, + "loss": 3.3011, + "theoretical_loss": 4.632143721103983, + "tokens_seen": 124387328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048600802407221667, + "loss": 3.4728, + "theoretical_loss": 4.631815850260173, + "tokens_seen": 124452864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 81846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6328256130218506, + "objective/train/theoretical_loss": 4.631488200339643, + "objective/train/tokens_used": 144978400, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048599799398194585, + "loss": 3.3451, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048598796389167503, + "loss": 3.3085, + "theoretical_loss": 4.63116077107737, + "tokens_seen": 124583936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859779338014042, + "loss": 3.3846, + "theoretical_loss": 4.630833562208797, + "tokens_seen": 124649472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859679037111334, + "loss": 3.4496, + "theoretical_loss": 4.630506573469815, + "tokens_seen": 124715008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859578736208626, + "loss": 3.0359, + "theoretical_loss": 4.630179804596775, + "tokens_seen": 124780544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859478435305918, + "loss": 3.3915, + "theoretical_loss": 4.629853255326481, + "tokens_seen": 124846080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048593781344032094, + "loss": 3.1342, + "theoretical_loss": 4.629526925396189, + "tokens_seen": 124911616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859277833500502, + "loss": 3.5865, + "theoretical_loss": 4.6292008145436085, + "tokens_seen": 124977152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859177532597793, + "loss": 3.3983, + "theoretical_loss": 4.628874922506897, + "tokens_seen": 125042688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048590772316950854, + "loss": 3.3449, + "theoretical_loss": 4.628549249024666, + "tokens_seen": 125108224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858976930792377, + "loss": 3.3839, + "theoretical_loss": 4.628223793835975, + "tokens_seen": 125173760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858876629889669, + "loss": 3.1372, + "theoretical_loss": 4.627898556680327, + "tokens_seen": 125239296 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858776328986961, + "loss": 3.2933, + "theoretical_loss": 4.627573537297678, + "tokens_seen": 125304832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858676028084253, + "loss": 3.506, + "theoretical_loss": 4.627248735428427, + "tokens_seen": 125370368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048585757271815444, + "loss": 3.4041, + "theoretical_loss": 4.6269241508134185, + "tokens_seen": 125435904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858475426278837, + "loss": 3.5242, + "theoretical_loss": 4.6265997831939405, + "tokens_seen": 125501440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858375125376128, + "loss": 3.3199, + "theoretical_loss": 4.6262756323117245, + "tokens_seen": 125566976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048582748244734204, + "loss": 3.1673, + "theoretical_loss": 4.625951697908944, + "tokens_seen": 125632512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858174523570712, + "loss": 3.5243, + "theoretical_loss": 4.625627979728212, + "tokens_seen": 125698048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858074222668004, + "loss": 3.404, + "theoretical_loss": 4.625304477512584, + "tokens_seen": 125763584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857973921765296, + "loss": 3.2105, + "theoretical_loss": 4.624981191005554, + "tokens_seen": 125829120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048578736208625877, + "loss": 3.0283, + "theoretical_loss": 4.624658119951052, + "tokens_seen": 125894656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048577733199598795, + "loss": 3.1752, + "theoretical_loss": 4.624335264093447, + "tokens_seen": 125960192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857673019057172, + "loss": 3.5489, + "theoretical_loss": 4.624012623177544, + "tokens_seen": 126025728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857572718154463, + "loss": 3.3505, + "theoretical_loss": 4.623690196948582, + "tokens_seen": 126091264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 83055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7693979740142822, + "objective/train/theoretical_loss": 4.623367985152234, + "objective/train/tokens_used": 146616800, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048574724172517554, + "loss": 3.554, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048573721163490467, + "loss": 3.4829, + "theoretical_loss": 4.623045987534609, + "tokens_seen": 126222336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857271815446339, + "loss": 3.5869, + "theoretical_loss": 4.622724203842246, + "tokens_seen": 126287872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048571715145436314, + "loss": 3.3389, + "theoretical_loss": 4.622402633822114, + "tokens_seen": 126353408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048570712136409227, + "loss": 3.3537, + "theoretical_loss": 4.622081277221616, + "tokens_seen": 126418944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856970912738215, + "loss": 3.2713, + "theoretical_loss": 4.62176013378858, + "tokens_seen": 126484480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856870611835507, + "loss": 3.4149, + "theoretical_loss": 4.621439203271267, + "tokens_seen": 126550016 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048567703109327987, + "loss": 3.4411, + "theoretical_loss": 4.621118485418362, + "tokens_seen": 126615552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048566700100300905, + "loss": 3.5645, + "theoretical_loss": 4.620797979978978, + "tokens_seen": 126681088 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048565697091273823, + "loss": 3.4029, + "theoretical_loss": 4.620477686702651, + "tokens_seen": 126746624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856469408224674, + "loss": 3.5134, + "theoretical_loss": 4.620157605339347, + "tokens_seen": 126812160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048563691073219664, + "loss": 3.2403, + "theoretical_loss": 4.619837735639452, + "tokens_seen": 126877696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048562688064192577, + "loss": 3.2403, + "theoretical_loss": 4.619518077353776, + "tokens_seen": 126943232 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485616850551655, + "loss": 3.3605, + "theoretical_loss": 4.619198630233547, + "tokens_seen": 127008768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048560682046138413, + "loss": 3.6782, + "theoretical_loss": 4.6188793940304205, + "tokens_seen": 127074304 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048559679037111337, + "loss": 3.291, + "theoretical_loss": 4.618560368496466, + "tokens_seen": 127139840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048558676028084255, + "loss": 3.2539, + "theoretical_loss": 4.618241553384175, + "tokens_seen": 127205376 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048557673019057173, + "loss": 3.3349, + "theoretical_loss": 4.617922948446459, + "tokens_seen": 127270912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855667001003009, + "loss": 3.3863, + "theoretical_loss": 4.617604553436642, + "tokens_seen": 127336448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048555667001003015, + "loss": 3.3637, + "theoretical_loss": 4.617286368108466, + "tokens_seen": 127401984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855466399197593, + "loss": 3.2942, + "theoretical_loss": 4.6169683922160925, + "tokens_seen": 127467520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855366098294885, + "loss": 3.4269, + "theoretical_loss": 4.616650625514091, + "tokens_seen": 127533056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048552657973921764, + "loss": 3.3937, + "theoretical_loss": 4.616333067757449, + "tokens_seen": 127598592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048551654964894687, + "loss": 3.4983, + "theoretical_loss": 4.616015718701563, + "tokens_seen": 127664128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048550651955867605, + "loss": 3.3911, + "theoretical_loss": 4.615698578102245, + "tokens_seen": 127729664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 83592, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.880964994430542, + "objective/train/theoretical_loss": 4.615381645715717, + "objective/train/tokens_used": 148255200, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048549648946840523, + "loss": 3.5533, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854864593781344, + "loss": 3.4536, + "theoretical_loss": 4.615064921298608, + "tokens_seen": 127860736 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854764292878636, + "loss": 3.377, + "theoretical_loss": 4.61474840460796, + "tokens_seen": 127926272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854663991975928, + "loss": 3.5236, + "theoretical_loss": 4.614432095401219, + "tokens_seen": 127991808 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485456369107322, + "loss": 3.4254, + "theoretical_loss": 4.614115993436242, + "tokens_seen": 128057344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048544633901705114, + "loss": 3.4557, + "theoretical_loss": 4.613800098471291, + "tokens_seen": 128122880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854363089267804, + "loss": 3.5225, + "theoretical_loss": 4.613484410265032, + "tokens_seen": 128188416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854262788365095, + "loss": 3.3142, + "theoretical_loss": 4.613168928576538, + "tokens_seen": 128253952 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048541624874623874, + "loss": 3.3601, + "theoretical_loss": 4.612853653165283, + "tokens_seen": 128319488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854062186559679, + "loss": 3.4458, + "theoretical_loss": 4.612538583791146, + "tokens_seen": 128385024 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853961885656971, + "loss": 3.1733, + "theoretical_loss": 4.612223720214407, + "tokens_seen": 128450560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853861584754263, + "loss": 3.2368, + "theoretical_loss": 4.611909062195749, + "tokens_seen": 128516096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853761283851555, + "loss": 3.3807, + "theoretical_loss": 4.61159460949625, + "tokens_seen": 128581632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048536609829488464, + "loss": 3.3186, + "theoretical_loss": 4.611280361877393, + "tokens_seen": 128647168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853560682046139, + "loss": 3.4312, + "theoretical_loss": 4.610966319101056, + "tokens_seen": 128712704 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485346038114343, + "loss": 3.3996, + "theoretical_loss": 4.610652480929515, + "tokens_seen": 128778240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048533600802407224, + "loss": 3.3225, + "theoretical_loss": 4.610338847125445, + "tokens_seen": 128843776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853259779338014, + "loss": 3.5473, + "theoretical_loss": 4.610025417451913, + "tokens_seen": 128909312 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853159478435306, + "loss": 3.339, + "theoretical_loss": 4.6097121916723856, + "tokens_seen": 128974848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853059177532598, + "loss": 3.5529, + "theoretical_loss": 4.609399169550718, + "tokens_seen": 129040384 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048529588766298897, + "loss": 3.282, + "theoretical_loss": 4.609086350851165, + "tokens_seen": 129105920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048528585757271815, + "loss": 3.3268, + "theoretical_loss": 4.6087737353383655, + "tokens_seen": 129171456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852758274824474, + "loss": 3.2374, + "theoretical_loss": 4.6084613227773605, + "tokens_seen": 129236992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852657973921765, + "loss": 3.2591, + "theoretical_loss": 4.608149112933571, + "tokens_seen": 129302528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048525576730190574, + "loss": 3.0899, + "theoretical_loss": 4.607837105572816, + "tokens_seen": 129368064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 84814, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1867215633392334, + "objective/train/theoretical_loss": 4.607525300461299, + "objective/train/tokens_used": 149893600, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048524573721163487, + "loss": 3.1172, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852357071213641, + "loss": 3.1282, + "theoretical_loss": 4.607213697365613, + "tokens_seen": 129499136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852256770310933, + "loss": 3.3892, + "theoretical_loss": 4.606902296052739, + "tokens_seen": 129564672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048521564694082247, + "loss": 3.2205, + "theoretical_loss": 4.6065910962900425, + "tokens_seen": 129630208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048520561685055165, + "loss": 3.4858, + "theoretical_loss": 4.606280097845277, + "tokens_seen": 129695744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851955867602809, + "loss": 3.2573, + "theoretical_loss": 4.60596930048658, + "tokens_seen": 129761280 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048518555667001, + "loss": 3.3325, + "theoretical_loss": 4.605658703982471, + "tokens_seen": 129826816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048517552657973925, + "loss": 3.3328, + "theoretical_loss": 4.6053483081018545, + "tokens_seen": 129892352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851654964894684, + "loss": 3.3861, + "theoretical_loss": 4.605038112614018, + "tokens_seen": 129957888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851554663991976, + "loss": 3.1793, + "theoretical_loss": 4.604728117288631, + "tokens_seen": 130023424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851454363089268, + "loss": 3.275, + "theoretical_loss": 4.604418321895739, + "tokens_seen": 130088960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048513540621865597, + "loss": 3.4315, + "theoretical_loss": 4.604108726205774, + "tokens_seen": 130154496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048512537612838515, + "loss": 3.3219, + "theoretical_loss": 4.60379932998954, + "tokens_seen": 130220032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048511534603811433, + "loss": 3.4426, + "theoretical_loss": 4.6034901330182265, + "tokens_seen": 130285568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851053159478435, + "loss": 3.685, + "theoretical_loss": 4.603181135063394, + "tokens_seen": 130351104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048509528585757275, + "loss": 3.2137, + "theoretical_loss": 4.6028723358969845, + "tokens_seen": 130416640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850852557673019, + "loss": 3.3579, + "theoretical_loss": 4.602563735291312, + "tokens_seen": 130482176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850752256770311, + "loss": 3.3985, + "theoretical_loss": 4.602255333019068, + "tokens_seen": 130547712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048506519558676024, + "loss": 3.4027, + "theoretical_loss": 4.6019471288533165, + "tokens_seen": 130613248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850551654964895, + "loss": 3.4611, + "theoretical_loss": 4.601639122567497, + "tokens_seen": 130678784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048504513540621866, + "loss": 3.2935, + "theoretical_loss": 4.601331313935418, + "tokens_seen": 130744320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048503510531594784, + "loss": 3.3577, + "theoretical_loss": 4.601023702731264, + "tokens_seen": 130809856 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485025075225677, + "loss": 3.3172, + "theoretical_loss": 4.600716288729587, + "tokens_seen": 130875392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048501504513540625, + "loss": 3.2354, + "theoretical_loss": 4.600409071705312, + "tokens_seen": 130940928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850050150451354, + "loss": 3.406, + "theoretical_loss": 4.60010205143373, + "tokens_seen": 131006464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 85560, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.442735195159912, + "objective/train/theoretical_loss": 4.599795227690505, + "objective/train/tokens_used": 151532000, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849949849548646, + "loss": 3.4252, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849849548645938, + "loss": 3.2521, + "theoretical_loss": 4.5994886002516635, + "tokens_seen": 131137536 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484974924774323, + "loss": 3.2135, + "theoretical_loss": 4.599182168893604, + "tokens_seen": 131203072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849648946840522, + "loss": 3.29, + "theoretical_loss": 4.598875933393089, + "tokens_seen": 131268608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048495486459378134, + "loss": 3.4736, + "theoretical_loss": 4.5985698935272445, + "tokens_seen": 131334144 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849448345035106, + "loss": 3.361, + "theoretical_loss": 4.598264049073565, + "tokens_seen": 131399680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849348044132397, + "loss": 3.1653, + "theoretical_loss": 4.597958399809908, + "tokens_seen": 131465216 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048492477432296894, + "loss": 3.0195, + "theoretical_loss": 4.59765294551449, + "tokens_seen": 131530752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849147442326981, + "loss": 3.4434, + "theoretical_loss": 4.597347685965897, + "tokens_seen": 131596288 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849047141424273, + "loss": 3.579, + "theoretical_loss": 4.597042620943069, + "tokens_seen": 131661824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848946840521565, + "loss": 3.1773, + "theoretical_loss": 4.596737750225311, + "tokens_seen": 131727360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848846539618857, + "loss": 3.3462, + "theoretical_loss": 4.596433073592289, + "tokens_seen": 131792896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048487462387161484, + "loss": 3.169, + "theoretical_loss": 4.596128590824026, + "tokens_seen": 131858432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848645937813441, + "loss": 3.3918, + "theoretical_loss": 4.595824301700904, + "tokens_seen": 131923968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848545636910732, + "loss": 3.1064, + "theoretical_loss": 4.595520206003663, + "tokens_seen": 131989504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048484453360080244, + "loss": 3.4125, + "theoretical_loss": 4.595216303513399, + "tokens_seen": 132055040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848345035105316, + "loss": 3.5448, + "theoretical_loss": 4.594912594011566, + "tokens_seen": 132120576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848244734202608, + "loss": 3.1756, + "theoretical_loss": 4.594609077279973, + "tokens_seen": 132186112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048481444332999, + "loss": 3.3843, + "theoretical_loss": 4.594305753100782, + "tokens_seen": 132251648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048480441323971917, + "loss": 3.4335, + "theoretical_loss": 4.594002621256511, + "tokens_seen": 132317184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048479438314944835, + "loss": 3.4907, + "theoretical_loss": 4.59369968153003, + "tokens_seen": 132382720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847843530591776, + "loss": 3.3829, + "theoretical_loss": 4.593396933704562, + "tokens_seen": 132448256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847743229689067, + "loss": 3.4217, + "theoretical_loss": 4.593094377563681, + "tokens_seen": 132513792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048476429287863594, + "loss": 3.5836, + "theoretical_loss": 4.592792012891314, + "tokens_seen": 132579328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048475426278836507, + "loss": 3.1773, + "theoretical_loss": 4.592489839471735, + "tokens_seen": 132644864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 86181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.529838800430298, + "objective/train/theoretical_loss": 4.592187857089571, + "objective/train/tokens_used": 153170400, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847442326980943, + "loss": 3.3219, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847342026078235, + "loss": 3.0821, + "theoretical_loss": 4.591886065529795, + "tokens_seen": 132775936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048472417251755267, + "loss": 3.2018, + "theoretical_loss": 4.591584464577728, + "tokens_seen": 132841472 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048471414242728185, + "loss": 3.3234, + "theoretical_loss": 4.591283054019041, + "tokens_seen": 132907008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847041123370111, + "loss": 3.3791, + "theoretical_loss": 4.5909818336397485, + "tokens_seen": 132972544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846940822467402, + "loss": 3.2039, + "theoretical_loss": 4.590680803226213, + "tokens_seen": 133038080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048468405215646945, + "loss": 3.4776, + "theoretical_loss": 4.590379962565141, + "tokens_seen": 133103616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846740220661986, + "loss": 3.337, + "theoretical_loss": 4.590079311443583, + "tokens_seen": 133169152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846639919759278, + "loss": 3.431, + "theoretical_loss": 4.589778849648934, + "tokens_seen": 133234688 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484653961885657, + "loss": 3.3345, + "theoretical_loss": 4.589478576968932, + "tokens_seen": 133300224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048464393179538617, + "loss": 3.4218, + "theoretical_loss": 4.589178493191655, + "tokens_seen": 133365760 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048463390170511535, + "loss": 3.4226, + "theoretical_loss": 4.588878598105527, + "tokens_seen": 133431296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048462387161484453, + "loss": 3.3897, + "theoretical_loss": 4.588578891499308, + "tokens_seen": 133496832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846138415245737, + "loss": 3.3441, + "theoretical_loss": 4.588279373162101, + "tokens_seen": 133562368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048460381143430295, + "loss": 3.1719, + "theoretical_loss": 4.587980042883347, + "tokens_seen": 133627904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845937813440321, + "loss": 3.5295, + "theoretical_loss": 4.587680900452824, + "tokens_seen": 133693440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845837512537613, + "loss": 3.3176, + "theoretical_loss": 4.587381945660653, + "tokens_seen": 133758976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048457372116349044, + "loss": 3.1641, + "theoretical_loss": 4.587083178297288, + "tokens_seen": 133824512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845636910732197, + "loss": 3.1573, + "theoretical_loss": 4.5867845981535185, + "tokens_seen": 133890048 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048455366098294886, + "loss": 3.3356, + "theoretical_loss": 4.586486205020474, + "tokens_seen": 133955584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048454363089267804, + "loss": 3.331, + "theoretical_loss": 4.586187998689616, + "tokens_seen": 134021120 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845336008024072, + "loss": 3.464, + "theoretical_loss": 4.585889978952741, + "tokens_seen": 134086656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048452357071213645, + "loss": 3.1701, + "theoretical_loss": 4.58559214560198, + "tokens_seen": 134152192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845135406218656, + "loss": 3.6266, + "theoretical_loss": 4.585294498429796, + "tokens_seen": 134217728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845035105315948, + "loss": 3.3988, + "theoretical_loss": 4.584997037228986, + "tokens_seen": 134283264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 87535, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1884214878082275, + "objective/train/theoretical_loss": 4.584699761792674, + "objective/train/tokens_used": 154808800, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048449348044132394, + "loss": 3.2752, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844834503510532, + "loss": 3.3727, + "theoretical_loss": 4.5844026719143205, + "tokens_seen": 134414336 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048447342026078236, + "loss": 3.4484, + "theoretical_loss": 4.5841057673877135, + "tokens_seen": 134479872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048446339017051154, + "loss": 3.429, + "theoretical_loss": 4.5838090480069695, + "tokens_seen": 134545408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844533600802407, + "loss": 3.1828, + "theoretical_loss": 4.5835125135665375, + "tokens_seen": 134610944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844433299899699, + "loss": 3.2808, + "theoretical_loss": 4.583216163861191, + "tokens_seen": 134676480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844332998996991, + "loss": 3.4666, + "theoretical_loss": 4.58291999868603, + "tokens_seen": 134742016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844232698094283, + "loss": 3.0757, + "theoretical_loss": 4.582624017836489, + "tokens_seen": 134807552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048441323971915745, + "loss": 3.5777, + "theoretical_loss": 4.582328221108318, + "tokens_seen": 134873088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844032096288867, + "loss": 3.1727, + "theoretical_loss": 4.5820326082976, + "tokens_seen": 134938624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048439317953861586, + "loss": 3.6033, + "theoretical_loss": 4.581737179200739, + "tokens_seen": 135004160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048438314944834504, + "loss": 3.3186, + "theoretical_loss": 4.581441933614466, + "tokens_seen": 135069696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843731193580742, + "loss": 3.3087, + "theoretical_loss": 4.581146871335832, + "tokens_seen": 135135232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843630892678034, + "loss": 3.0668, + "theoretical_loss": 4.580851992162214, + "tokens_seen": 135200768 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843530591775326, + "loss": 3.2295, + "theoretical_loss": 4.5805572958913086, + "tokens_seen": 135266304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843430290872618, + "loss": 3.5356, + "theoretical_loss": 4.580262782321135, + "tokens_seen": 135331840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048433299899699095, + "loss": 3.3037, + "theoretical_loss": 4.579968451250032, + "tokens_seen": 135397376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843229689067202, + "loss": 3.2675, + "theoretical_loss": 4.579674302476661, + "tokens_seen": 135462912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843129388164493, + "loss": 3.2687, + "theoretical_loss": 4.579380335800001, + "tokens_seen": 135528448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048430290872617855, + "loss": 3.5504, + "theoretical_loss": 4.579086551019348, + "tokens_seen": 135593984 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048429287863590773, + "loss": 3.4594, + "theoretical_loss": 4.5787929479343195, + "tokens_seen": 135659520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842828485456369, + "loss": 3.2982, + "theoretical_loss": 4.578499526344848, + "tokens_seen": 135725056 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842728184553661, + "loss": 3.1927, + "theoretical_loss": 4.578206286051184, + "tokens_seen": 135790592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048426278836509527, + "loss": 3.3044, + "theoretical_loss": 4.5779132268538945, + "tokens_seen": 135856128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048425275827482445, + "loss": 3.4211, + "theoretical_loss": 4.577620348553859, + "tokens_seen": 135921664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 88189, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8670549392700195, + "objective/train/theoretical_loss": 4.577327650952276, + "objective/train/tokens_used": 156447200, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842427281845537, + "loss": 3.3964, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048423269809428287, + "loss": 3.0862, + "theoretical_loss": 4.5770351338506545, + "tokens_seen": 136052736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048422266800401205, + "loss": 3.0829, + "theoretical_loss": 4.57674279705082, + "tokens_seen": 136118272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842126379137413, + "loss": 3.4358, + "theoretical_loss": 4.57645064035491, + "tokens_seen": 136183808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842026078234704, + "loss": 3.4142, + "theoretical_loss": 4.576158663565371, + "tokens_seen": 136249344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048419257773319965, + "loss": 3.3851, + "theoretical_loss": 4.575866866484967, + "tokens_seen": 136314880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841825476429288, + "loss": 3.4905, + "theoretical_loss": 4.575575248916767, + "tokens_seen": 136380416 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484172517552658, + "loss": 3.2962, + "theoretical_loss": 4.575283810664155, + "tokens_seen": 136445952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841624874623872, + "loss": 3.3953, + "theoretical_loss": 4.574992551530822, + "tokens_seen": 136511488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048415245737211637, + "loss": 3.4929, + "theoretical_loss": 4.574701471320768, + "tokens_seen": 136577024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048414242728184555, + "loss": 3.2127, + "theoretical_loss": 4.574410569838304, + "tokens_seen": 136642560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048413239719157473, + "loss": 3.3957, + "theoretical_loss": 4.574119846888045, + "tokens_seen": 136708096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841223671013039, + "loss": 3.4099, + "theoretical_loss": 4.573829302274915, + "tokens_seen": 136773632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048411233701103315, + "loss": 3.4686, + "theoretical_loss": 4.573538935804146, + "tokens_seen": 136839168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841023069207623, + "loss": 3.0463, + "theoretical_loss": 4.573248747281273, + "tokens_seen": 136904704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840922768304915, + "loss": 3.7113, + "theoretical_loss": 4.5729587365121365, + "tokens_seen": 136970240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048408224674022064, + "loss": 3.3974, + "theoretical_loss": 4.572668903302886, + "tokens_seen": 137035776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840722166499499, + "loss": 3.184, + "theoretical_loss": 4.572379247459969, + "tokens_seen": 137101312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048406218655967906, + "loss": 3.4432, + "theoretical_loss": 4.57208976879014, + "tokens_seen": 137166848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048405215646940824, + "loss": 3.5294, + "theoretical_loss": 4.571800467100456, + "tokens_seen": 137232384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840421263791374, + "loss": 3.0596, + "theoretical_loss": 4.5715113421982725, + "tokens_seen": 137297920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048403209628886665, + "loss": 3.3131, + "theoretical_loss": 4.571222393891253, + "tokens_seen": 137363456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840220661985958, + "loss": 3.51, + "theoretical_loss": 4.570933621987356, + "tokens_seen": 137428992 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484012036108325, + "loss": 3.4489, + "theoretical_loss": 4.570645026294844, + "tokens_seen": 137494528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048400200601805414, + "loss": 3.3655, + "theoretical_loss": 4.570356606622278, + "tokens_seen": 137560064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 88907, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2466330528259277, + "objective/train/theoretical_loss": 4.570068362778516, + "objective/train/tokens_used": 158085600, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839919759277834, + "loss": 3.4146, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048398194583751256, + "loss": 3.3105, + "theoretical_loss": 4.569780294572718, + "tokens_seen": 137691136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048397191574724174, + "loss": 3.5781, + "theoretical_loss": 4.569492401814339, + "tokens_seen": 137756672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839618856569709, + "loss": 3.047, + "theoretical_loss": 4.569204684313133, + "tokens_seen": 137822208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839518555667001, + "loss": 3.4353, + "theoretical_loss": 4.568917141879149, + "tokens_seen": 137887744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839418254764293, + "loss": 3.1746, + "theoretical_loss": 4.568629774322736, + "tokens_seen": 137953280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839317953861585, + "loss": 3.4604, + "theoretical_loss": 4.568342581454532, + "tokens_seen": 138018816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048392176529588765, + "loss": 3.5407, + "theoretical_loss": 4.568055563085476, + "tokens_seen": 138084352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839117352056169, + "loss": 3.2899, + "theoretical_loss": 4.567768719026797, + "tokens_seen": 138149888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048390170511534606, + "loss": 3.3983, + "theoretical_loss": 4.567482049090019, + "tokens_seen": 138215424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048389167502507524, + "loss": 3.2304, + "theoretical_loss": 4.567195553086961, + "tokens_seen": 138280960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838816449348044, + "loss": 3.3718, + "theoretical_loss": 4.566909230829729, + "tokens_seen": 138346496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838716148445336, + "loss": 3.4189, + "theoretical_loss": 4.566623082130729, + "tokens_seen": 138412032 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838615847542628, + "loss": 3.197, + "theoretical_loss": 4.566337106802651, + "tokens_seen": 138477568 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483851554663992, + "loss": 3.4958, + "theoretical_loss": 4.56605130465848, + "tokens_seen": 138543104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048384152457372115, + "loss": 3.245, + "theoretical_loss": 4.565765675511487, + "tokens_seen": 138608640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838314944834504, + "loss": 3.3454, + "theoretical_loss": 4.565480219175237, + "tokens_seen": 138674176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838214643931795, + "loss": 3.5074, + "theoretical_loss": 4.56519493546358, + "tokens_seen": 138739712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048381143430290875, + "loss": 3.5971, + "theoretical_loss": 4.56490982419066, + "tokens_seen": 138805248 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048380140421263793, + "loss": 3.3556, + "theoretical_loss": 4.564624885170902, + "tokens_seen": 138870784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837913741223671, + "loss": 3.4486, + "theoretical_loss": 4.564340118219022, + "tokens_seen": 138936320 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837813440320963, + "loss": 3.3375, + "theoretical_loss": 4.56405552315002, + "tokens_seen": 139001856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048377131394182547, + "loss": 3.5742, + "theoretical_loss": 4.563771099779187, + "tokens_seen": 139067392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048376128385155465, + "loss": 3.402, + "theoretical_loss": 4.563486847922093, + "tokens_seen": 139132928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837512537612839, + "loss": 3.221, + "theoretical_loss": 4.563202767394597, + "tokens_seen": 139198464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 90119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3730666637420654, + "objective/train/theoretical_loss": 4.562918858012843, + "objective/train/tokens_used": 159724000, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483741223671013, + "loss": 3.459, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048373119358074225, + "loss": 3.4475, + "theoretical_loss": 4.562635119593255, + "tokens_seen": 139329536 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048372116349047143, + "loss": 3.523, + "theoretical_loss": 4.562351551952542, + "tokens_seen": 139395072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837111334002006, + "loss": 3.434, + "theoretical_loss": 4.5620681549076965, + "tokens_seen": 139460608 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837011033099298, + "loss": 3.4291, + "theoretical_loss": 4.561784928275992, + "tokens_seen": 139526144 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483691073219659, + "loss": 3.3811, + "theoretical_loss": 4.561501871874984, + "tokens_seen": 139591680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048368104312938816, + "loss": 3.3562, + "theoretical_loss": 4.561218985522507, + "tokens_seen": 139657216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836710130391174, + "loss": 3.7033, + "theoretical_loss": 4.560936269036679, + "tokens_seen": 139722752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836609829488465, + "loss": 3.4809, + "theoretical_loss": 4.560653722235895, + "tokens_seen": 139788288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048365095285857575, + "loss": 3.6892, + "theoretical_loss": 4.560371344938831, + "tokens_seen": 139853824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836409227683049, + "loss": 3.3982, + "theoretical_loss": 4.560089136964439, + "tokens_seen": 139919360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836308926780341, + "loss": 3.7439, + "theoretical_loss": 4.559807098131953, + "tokens_seen": 139984896 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836208625877633, + "loss": 3.2185, + "theoretical_loss": 4.559525228260882, + "tokens_seen": 140050432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836108324974925, + "loss": 3.2183, + "theoretical_loss": 4.559243527171011, + "tokens_seen": 140115968 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048360080240722166, + "loss": 3.4715, + "theoretical_loss": 4.558961994682403, + "tokens_seen": 140181504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048359077231695084, + "loss": 3.5439, + "theoretical_loss": 4.558680630615397, + "tokens_seen": 140247040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048358074222668, + "loss": 3.4811, + "theoretical_loss": 4.558399434790607, + "tokens_seen": 140312576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048357071213640926, + "loss": 3.4954, + "theoretical_loss": 4.558118407028921, + "tokens_seen": 140378112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835606820461384, + "loss": 3.3851, + "theoretical_loss": 4.557837547151502, + "tokens_seen": 140443648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835506519558676, + "loss": 3.3218, + "theoretical_loss": 4.557556854979786, + "tokens_seen": 140509184 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835406218655968, + "loss": 3.4642, + "theoretical_loss": 4.5572763303354815, + "tokens_seen": 140574720 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483530591775326, + "loss": 3.237, + "theoretical_loss": 4.556995973040574, + "tokens_seen": 140640256 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048352056168505516, + "loss": 3.4711, + "theoretical_loss": 4.556715782917314, + "tokens_seen": 140705792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048351053159478434, + "loss": 3.3196, + "theoretical_loss": 4.556435759788229, + "tokens_seen": 140771328 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835005015045135, + "loss": 3.2629, + "theoretical_loss": 4.556155903476114, + "tokens_seen": 140836864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 90792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9938900470733643, + "objective/train/theoretical_loss": 4.555876213804037, + "objective/train/tokens_used": 161362400, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048349047141424276, + "loss": 3.6836, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048348044132397194, + "loss": 3.4205, + "theoretical_loss": 4.555596690595333, + "tokens_seen": 140967936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834704112337011, + "loss": 3.3216, + "theoretical_loss": 4.555317333673611, + "tokens_seen": 141033472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834603811434303, + "loss": 3.247, + "theoretical_loss": 4.555038142862742, + "tokens_seen": 141099008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834503510531595, + "loss": 3.3618, + "theoretical_loss": 4.5547591179868725, + "tokens_seen": 141164544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834403209628887, + "loss": 3.3634, + "theoretical_loss": 4.554480258870409, + "tokens_seen": 141230080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048343029087261785, + "loss": 3.4449, + "theoretical_loss": 4.554201565338033, + "tokens_seen": 141295616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834202607823471, + "loss": 3.212, + "theoretical_loss": 4.5539230372146875, + "tokens_seen": 141361152 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048341023069207626, + "loss": 3.2166, + "theoretical_loss": 4.553644674325584, + "tokens_seen": 141426688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048340020060180544, + "loss": 3.4181, + "theoretical_loss": 4.553366476496198, + "tokens_seen": 141492224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833901705115346, + "loss": 3.2078, + "theoretical_loss": 4.553088443552269, + "tokens_seen": 141557760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833801404212638, + "loss": 3.3662, + "theoretical_loss": 4.552810575319806, + "tokens_seen": 141623296 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483370110330993, + "loss": 3.2753, + "theoretical_loss": 4.552532871625077, + "tokens_seen": 141688832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833600802407222, + "loss": 3.4694, + "theoretical_loss": 4.5522553322946155, + "tokens_seen": 141754368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048335005015045135, + "loss": 3.3584, + "theoretical_loss": 4.551977957155217, + "tokens_seen": 141819904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833400200601806, + "loss": 3.3498, + "theoretical_loss": 4.5517007460339425, + "tokens_seen": 141885440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833299899699097, + "loss": 3.4718, + "theoretical_loss": 4.551423698758111, + "tokens_seen": 141950976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048331995987963895, + "loss": 3.4992, + "theoretical_loss": 4.551146815155304, + "tokens_seen": 142016512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048330992978936813, + "loss": 3.1915, + "theoretical_loss": 4.550870095053366, + "tokens_seen": 142082048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832998996990973, + "loss": 3.4141, + "theoretical_loss": 4.550593538280398, + "tokens_seen": 142147584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832898696088265, + "loss": 3.4559, + "theoretical_loss": 4.550317144664766, + "tokens_seen": 142213120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048327983951855567, + "loss": 3.1613, + "theoretical_loss": 4.55004091403509, + "tokens_seen": 142278656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048326980942828485, + "loss": 3.4072, + "theoretical_loss": 4.5497648462202545, + "tokens_seen": 142344192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832597793380141, + "loss": 3.4498, + "theoretical_loss": 4.549488941049397, + "tokens_seen": 142409728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832497492477432, + "loss": 3.1324, + "theoretical_loss": 4.549213198351914, + "tokens_seen": 142475264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 92175, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.230867624282837, + "objective/train/theoretical_loss": 4.548937617957463, + "objective/train/tokens_used": 163000800, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048323971915747245, + "loss": 3.3699, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048322968906720163, + "loss": 3.3031, + "theoretical_loss": 4.548662199695954, + "tokens_seen": 142606336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832196589769308, + "loss": 3.6595, + "theoretical_loss": 4.548386943397556, + "tokens_seen": 142671872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048320962888666, + "loss": 3.518, + "theoretical_loss": 4.548111848892693, + "tokens_seen": 142737408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831995987963892, + "loss": 3.1302, + "theoretical_loss": 4.547836916012042, + "tokens_seen": 142802944 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048318956870611836, + "loss": 3.4835, + "theoretical_loss": 4.547562144586539, + "tokens_seen": 142868480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831795386158476, + "loss": 3.4292, + "theoretical_loss": 4.547287534447372, + "tokens_seen": 142934016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831695085255767, + "loss": 3.3998, + "theoretical_loss": 4.5470130854259825, + "tokens_seen": 142999552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048315947843530595, + "loss": 3.5444, + "theoretical_loss": 4.546738797354065, + "tokens_seen": 143065088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831494483450351, + "loss": 3.3672, + "theoretical_loss": 4.546464670063569, + "tokens_seen": 143130624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831394182547643, + "loss": 3.3033, + "theoretical_loss": 4.546190703386695, + "tokens_seen": 143196160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831293881644935, + "loss": 3.2802, + "theoretical_loss": 4.545916897155894, + "tokens_seen": 143261696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831193580742227, + "loss": 3.2078, + "theoretical_loss": 4.54564325120387, + "tokens_seen": 143327232 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048310932798395186, + "loss": 3.369, + "theoretical_loss": 4.545369765363578, + "tokens_seen": 143392768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048309929789368104, + "loss": 3.0522, + "theoretical_loss": 4.545096439468223, + "tokens_seen": 143458304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830892678034102, + "loss": 3.457, + "theoretical_loss": 4.544823273351257, + "tokens_seen": 143523840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048307923771313946, + "loss": 3.2289, + "theoretical_loss": 4.544550266846388, + "tokens_seen": 143589376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830692076228686, + "loss": 3.491, + "theoretical_loss": 4.544277419787566, + "tokens_seen": 143654912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830591775325978, + "loss": 3.273, + "theoretical_loss": 4.544004732008993, + "tokens_seen": 143720448 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483049147442327, + "loss": 3.4575, + "theoretical_loss": 4.543732203345119, + "tokens_seen": 143785984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830391173520562, + "loss": 3.4134, + "theoretical_loss": 4.543459833630639, + "tokens_seen": 143851520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048302908726178536, + "loss": 3.1865, + "theoretical_loss": 4.543187622700497, + "tokens_seen": 143917056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048301905717151454, + "loss": 3.4901, + "theoretical_loss": 4.542915570389884, + "tokens_seen": 143982592 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830090270812437, + "loss": 3.3153, + "theoretical_loss": 4.542643676534234, + "tokens_seen": 144048128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048299899699097296, + "loss": 3.4697, + "theoretical_loss": 4.542371940969231, + "tokens_seen": 144113664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 92798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.180187702178955, + "objective/train/theoretical_loss": 4.542100363530799, + "objective/train/tokens_used": 164639200, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829889669007021, + "loss": 3.4208, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829789368104313, + "loss": 3.4833, + "theoretical_loss": 4.54182894405511, + "tokens_seen": 144244736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048296890672016045, + "loss": 3.5622, + "theoretical_loss": 4.5415576823785795, + "tokens_seen": 144310272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829588766298897, + "loss": 3.3664, + "theoretical_loss": 4.541286578337866, + "tokens_seen": 144375808 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048294884653961887, + "loss": 3.3885, + "theoretical_loss": 4.541015631769872, + "tokens_seen": 144441344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048293881644934805, + "loss": 3.4071, + "theoretical_loss": 4.5407448425117405, + "tokens_seen": 144506880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048292878635907723, + "loss": 3.2401, + "theoretical_loss": 4.540474210400859, + "tokens_seen": 144572416 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048291875626880646, + "loss": 3.3437, + "theoretical_loss": 4.540203735274855, + "tokens_seen": 144637952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829087261785356, + "loss": 3.2116, + "theoretical_loss": 4.5399334169716, + "tokens_seen": 144703488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828986960882648, + "loss": 3.6115, + "theoretical_loss": 4.539663255329202, + "tokens_seen": 144769024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048288866599799395, + "loss": 3.0955, + "theoretical_loss": 4.539393250186015, + "tokens_seen": 144834560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828786359077232, + "loss": 3.4351, + "theoretical_loss": 4.539123401380625, + "tokens_seen": 144900096 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048286860581745237, + "loss": 3.5045, + "theoretical_loss": 4.538853708751866, + "tokens_seen": 144965632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048285857572718155, + "loss": 3.3471, + "theoretical_loss": 4.538584172138804, + "tokens_seen": 145031168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048284854563691073, + "loss": 3.2684, + "theoretical_loss": 4.538314791380748, + "tokens_seen": 145096704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828385155466399, + "loss": 3.2516, + "theoretical_loss": 4.538045566317242, + "tokens_seen": 145162240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828284854563691, + "loss": 3.4594, + "theoretical_loss": 4.537776496788071, + "tokens_seen": 145227776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048281845536609833, + "loss": 3.603, + "theoretical_loss": 4.537507582633253, + "tokens_seen": 145293312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048280842527582746, + "loss": 3.2846, + "theoretical_loss": 4.537238823693045, + "tokens_seen": 145358848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827983951855567, + "loss": 3.4105, + "theoretical_loss": 4.536970219807939, + "tokens_seen": 145424384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827883650952858, + "loss": 3.3511, + "theoretical_loss": 4.536701770818665, + "tokens_seen": 145489920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048277833500501505, + "loss": 3.2664, + "theoretical_loss": 4.536433476566185, + "tokens_seen": 145555456 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048276830491474423, + "loss": 3.2244, + "theoretical_loss": 4.536165336891699, + "tokens_seen": 145620992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827582748244734, + "loss": 3.5109, + "theoretical_loss": 4.535897351636638, + "tokens_seen": 145686528 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827482447342026, + "loss": 3.509, + "theoretical_loss": 4.53562952064267, + "tokens_seen": 145752064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 94036, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4322946071624756, + "objective/train/theoretical_loss": 4.535361843751696, + "objective/train/tokens_used": 166277600, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048273821464393183, + "loss": 3.2137, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482728184553661, + "loss": 3.0617, + "theoretical_loss": 4.535094320805847, + "tokens_seen": 145883136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827181544633902, + "loss": 3.4118, + "theoretical_loss": 4.534826951647489, + "tokens_seen": 145948672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827081243731194, + "loss": 3.3963, + "theoretical_loss": 4.5345597361192205, + "tokens_seen": 146014208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048269809428284856, + "loss": 3.4053, + "theoretical_loss": 4.53429267406387, + "tokens_seen": 146079744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826880641925778, + "loss": 3.1817, + "theoretical_loss": 4.5340257653244995, + "tokens_seen": 146145280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826780341023069, + "loss": 3.6942, + "theoretical_loss": 4.5337590097444, + "tokens_seen": 146210816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048266800401203615, + "loss": 3.4861, + "theoretical_loss": 4.533492407167093, + "tokens_seen": 146276352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826579739217653, + "loss": 3.1617, + "theoretical_loss": 4.53322595743633, + "tokens_seen": 146341888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826479438314945, + "loss": 3.1373, + "theoretical_loss": 4.5329596603960916, + "tokens_seen": 146407424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826379137412237, + "loss": 3.1347, + "theoretical_loss": 4.53269351589059, + "tokens_seen": 146472960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826278836509529, + "loss": 3.3526, + "theoretical_loss": 4.532427523764261, + "tokens_seen": 146538496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048261785356068206, + "loss": 3.2586, + "theoretical_loss": 4.532161683861773, + "tokens_seen": 146604032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048260782347041124, + "loss": 3.3128, + "theoretical_loss": 4.5318959960280205, + "tokens_seen": 146669568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825977933801404, + "loss": 3.39, + "theoretical_loss": 4.531630460108125, + "tokens_seen": 146735104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048258776328986966, + "loss": 3.4165, + "theoretical_loss": 4.531365075947434, + "tokens_seen": 146800640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825777331995988, + "loss": 3.2288, + "theoretical_loss": 4.531099843391524, + "tokens_seen": 146866176 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482567703109328, + "loss": 3.3832, + "theoretical_loss": 4.5308347622861955, + "tokens_seen": 146931712 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825576730190572, + "loss": 3.6141, + "theoretical_loss": 4.5305698324774735, + "tokens_seen": 146997248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825476429287864, + "loss": 3.4217, + "theoretical_loss": 4.530305053811611, + "tokens_seen": 147062784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048253761283851556, + "loss": 3.2163, + "theoretical_loss": 4.530040426135084, + "tokens_seen": 147128320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048252758274824474, + "loss": 3.3166, + "theoretical_loss": 4.529775949294593, + "tokens_seen": 147193856 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825175526579739, + "loss": 3.3181, + "theoretical_loss": 4.529511623137061, + "tokens_seen": 147259392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048250752256770316, + "loss": 3.2954, + "theoretical_loss": 4.529247447509637, + "tokens_seen": 147324928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824974924774323, + "loss": 3.2769, + "theoretical_loss": 4.528983422259691, + "tokens_seen": 147390464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 94726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4257872104644775, + "objective/train/theoretical_loss": 4.528719547234816, + "objective/train/tokens_used": 167916000, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824874623871615, + "loss": 3.3119, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048247743229689065, + "loss": 3.3536, + "theoretical_loss": 4.528455822282828, + "tokens_seen": 147521536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824674022066199, + "loss": 3.4758, + "theoretical_loss": 4.528192247251763, + "tokens_seen": 147587072 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048245737211634907, + "loss": 3.1317, + "theoretical_loss": 4.52792882198988, + "tokens_seen": 147652608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048244734202607825, + "loss": 3.3368, + "theoretical_loss": 4.527665546345656, + "tokens_seen": 147718144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048243731193580743, + "loss": 3.6421, + "theoretical_loss": 4.5274024201677925, + "tokens_seen": 147783680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048242728184553666, + "loss": 3.0839, + "theoretical_loss": 4.527139443305209, + "tokens_seen": 147849216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824172517552658, + "loss": 3.4025, + "theoretical_loss": 4.526876615607042, + "tokens_seen": 147914752 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482407221664995, + "loss": 3.1217, + "theoretical_loss": 4.526613936922654, + "tokens_seen": 147980288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048239719157472415, + "loss": 3.3364, + "theoretical_loss": 4.526351407101618, + "tokens_seen": 148045824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823871614844534, + "loss": 3.0166, + "theoretical_loss": 4.526089025993732, + "tokens_seen": 148111360 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048237713139418257, + "loss": 3.263, + "theoretical_loss": 4.525826793449008, + "tokens_seen": 148176896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048236710130391175, + "loss": 3.4591, + "theoretical_loss": 4.525564709317678, + "tokens_seen": 148242432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048235707121364093, + "loss": 3.4337, + "theoretical_loss": 4.525302773450187, + "tokens_seen": 148307968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823470411233701, + "loss": 3.3084, + "theoretical_loss": 4.525040985697203, + "tokens_seen": 148373504 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823370110330993, + "loss": 3.5317, + "theoretical_loss": 4.524779345909604, + "tokens_seen": 148439040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048232698094282853, + "loss": 3.4339, + "theoretical_loss": 4.524517853938489, + "tokens_seen": 148504576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048231695085255766, + "loss": 3.1791, + "theoretical_loss": 4.524256509635169, + "tokens_seen": 148570112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004823069207622869, + "loss": 3.1341, + "theoretical_loss": 4.523995312851174, + "tokens_seen": 148635648 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482296890672016, + "loss": 3.2603, + "theoretical_loss": 4.523734263438241, + "tokens_seen": 148701184 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048228686058174525, + "loss": 3.2534, + "theoretical_loss": 4.52347336124833, + "tokens_seen": 148766720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048227683049147443, + "loss": 3.4183, + "theoretical_loss": 4.52321260613361, + "tokens_seen": 148832256 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822668004012036, + "loss": 3.3214, + "theoretical_loss": 4.522951997946466, + "tokens_seen": 148897792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822567703109328, + "loss": 3.3674, + "theoretical_loss": 4.522691536539492, + "tokens_seen": 148963328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048224674022066203, + "loss": 3.2784, + "theoretical_loss": 4.522431221765498, + "tokens_seen": 149028864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 95869, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1724908351898193, + "objective/train/theoretical_loss": 4.522171053477507, + "objective/train/tokens_used": 169554400, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048223671013039116, + "loss": 3.2823, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822266800401204, + "loss": 3.2289, + "theoretical_loss": 4.5219110315287505, + "tokens_seen": 149159936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822166499498495, + "loss": 3.3112, + "theoretical_loss": 4.521651155772675, + "tokens_seen": 149225472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048220661985957876, + "loss": 3.5058, + "theoretical_loss": 4.521391426062934, + "tokens_seen": 149291008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048219658976930794, + "loss": 3.5517, + "theoretical_loss": 4.521131842253396, + "tokens_seen": 149356544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821865596790371, + "loss": 3.4992, + "theoretical_loss": 4.520872404198139, + "tokens_seen": 149422080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821765295887663, + "loss": 3.4315, + "theoretical_loss": 4.520613111751445, + "tokens_seen": 149487616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821664994984955, + "loss": 3.3585, + "theoretical_loss": 4.520353964767814, + "tokens_seen": 149553152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048215646940822466, + "loss": 3.3818, + "theoretical_loss": 4.5200949631019505, + "tokens_seen": 149618688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821464393179539, + "loss": 3.183, + "theoretical_loss": 4.519836106608768, + "tokens_seen": 149684224 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482136409227683, + "loss": 3.2419, + "theoretical_loss": 4.519577395143388, + "tokens_seen": 149749760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048212637913741226, + "loss": 3.139, + "theoretical_loss": 4.519318828561142, + "tokens_seen": 149815296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821163490471414, + "loss": 3.1937, + "theoretical_loss": 4.519060406717565, + "tokens_seen": 149880832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821063189568706, + "loss": 3.1968, + "theoretical_loss": 4.518802129468405, + "tokens_seen": 149946368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820962888665998, + "loss": 3.0867, + "theoretical_loss": 4.51854399666961, + "tokens_seen": 150011904 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482086258776329, + "loss": 3.1493, + "theoretical_loss": 4.518286008177341, + "tokens_seen": 150077440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048207622868605816, + "loss": 3.4273, + "theoretical_loss": 4.51802816384796, + "tokens_seen": 150142976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820661985957874, + "loss": 3.3718, + "theoretical_loss": 4.517770463538038, + "tokens_seen": 150208512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048205616850551653, + "loss": 3.4009, + "theoretical_loss": 4.517512907104347, + "tokens_seen": 150274048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048204613841524576, + "loss": 3.2267, + "theoretical_loss": 4.517255494403868, + "tokens_seen": 150339584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820361083249749, + "loss": 3.2091, + "theoretical_loss": 4.516998225293785, + "tokens_seen": 150405120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820260782347041, + "loss": 3.4881, + "theoretical_loss": 4.516741099631485, + "tokens_seen": 150470656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820160481444333, + "loss": 3.2292, + "theoretical_loss": 4.51648411727456, + "tokens_seen": 150536192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820060180541625, + "loss": 3.291, + "theoretical_loss": 4.5162272780808035, + "tokens_seen": 150601728 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048199598796389167, + "loss": 3.1278, + "theoretical_loss": 4.515970581908216, + "tokens_seen": 150667264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 96442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.100712299346924, + "objective/train/theoretical_loss": 4.515714028614996, + "objective/train/tokens_used": 171192800, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048198595787362085, + "loss": 3.182, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819759277833501, + "loss": 3.4292, + "theoretical_loss": 4.515457618059546, + "tokens_seen": 150798336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048196589769307927, + "loss": 3.469, + "theoretical_loss": 4.515201350100471, + "tokens_seen": 150863872 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048195586760280845, + "loss": 3.0502, + "theoretical_loss": 4.514945224596577, + "tokens_seen": 150929408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048194583751253763, + "loss": 3.0543, + "theoretical_loss": 4.5146892414068684, + "tokens_seen": 150994944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048193580742226686, + "loss": 3.1962, + "theoretical_loss": 4.514433400390554, + "tokens_seen": 151060480 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481925777331996, + "loss": 3.2645, + "theoretical_loss": 4.514177701407042, + "tokens_seen": 151126016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819157472417252, + "loss": 3.411, + "theoretical_loss": 4.51392214431594, + "tokens_seen": 151191552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048190571715145435, + "loss": 3.4078, + "theoretical_loss": 4.513666728977054, + "tokens_seen": 151257088 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818956870611836, + "loss": 3.1651, + "theoretical_loss": 4.51341145525039, + "tokens_seen": 151322624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048188565697091277, + "loss": 3.0852, + "theoretical_loss": 4.513156322996155, + "tokens_seen": 151388160 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048187562688064195, + "loss": 3.223, + "theoretical_loss": 4.512901332074751, + "tokens_seen": 151453696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048186559679037113, + "loss": 3.2013, + "theoretical_loss": 4.5126464823467805, + "tokens_seen": 151519232 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818555667001003, + "loss": 3.4013, + "theoretical_loss": 4.512391773673042, + "tokens_seen": 151584768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818455366098295, + "loss": 3.1806, + "theoretical_loss": 4.5121372059145335, + "tokens_seen": 151650304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048183550651955873, + "loss": 3.0253, + "theoretical_loss": 4.511882778932447, + "tokens_seen": 151715840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048182547642928786, + "loss": 3.437, + "theoretical_loss": 4.511628492588174, + "tokens_seen": 151781376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818154463390171, + "loss": 3.2479, + "theoretical_loss": 4.5113743467433, + "tokens_seen": 151846912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818054162487462, + "loss": 3.2266, + "theoretical_loss": 4.511120341259608, + "tokens_seen": 151912448 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048179538615847545, + "loss": 3.065, + "theoretical_loss": 4.510866475999077, + "tokens_seen": 151977984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048178535606820463, + "loss": 3.1586, + "theoretical_loss": 4.510612750823878, + "tokens_seen": 152043520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817753259779338, + "loss": 2.9635, + "theoretical_loss": 4.5103591655963795, + "tokens_seen": 152109056 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481765295887663, + "loss": 3.2094, + "theoretical_loss": 4.510105720179144, + "tokens_seen": 152174592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048175526579739223, + "loss": 3.1597, + "theoretical_loss": 4.5098524144349295, + "tokens_seen": 152240128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048174523570712136, + "loss": 3.1867, + "theoretical_loss": 4.509599248226683, + "tokens_seen": 152305664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 97778, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6629927158355713, + "objective/train/theoretical_loss": 4.509346221417552, + "objective/train/tokens_used": 172831200, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817352056168506, + "loss": 3.2688, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817251755265797, + "loss": 3.4886, + "theoretical_loss": 4.509093333870869, + "tokens_seen": 152436736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048171514543630896, + "loss": 3.2402, + "theoretical_loss": 4.508840585450166, + "tokens_seen": 152502272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048170511534603814, + "loss": 3.3127, + "theoretical_loss": 4.508587976019164, + "tokens_seen": 152567808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816950852557673, + "loss": 3.2319, + "theoretical_loss": 4.508335505441774, + "tokens_seen": 152633344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816850551654965, + "loss": 3.3341, + "theoretical_loss": 4.508083173582105, + "tokens_seen": 152698880 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816750250752257, + "loss": 3.0618, + "theoretical_loss": 4.507830980304451, + "tokens_seen": 152764416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048166499498495486, + "loss": 3.4654, + "theoretical_loss": 4.5075789254733, + "tokens_seen": 152829952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816549648946841, + "loss": 2.7021, + "theoretical_loss": 4.507327008953329, + "tokens_seen": 152895488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816449348044132, + "loss": 3.5096, + "theoretical_loss": 4.507075230609407, + "tokens_seen": 152961024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048163490471414246, + "loss": 3.1766, + "theoretical_loss": 4.506823590306591, + "tokens_seen": 153026560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816248746238716, + "loss": 3.4232, + "theoretical_loss": 4.506572087910127, + "tokens_seen": 153092096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816148445336008, + "loss": 3.3578, + "theoretical_loss": 4.506320723285455, + "tokens_seen": 153157632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048160481444333, + "loss": 3.4206, + "theoretical_loss": 4.506069496298198, + "tokens_seen": 153223168 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815947843530592, + "loss": 3.5393, + "theoretical_loss": 4.5058184068141705, + "tokens_seen": 153288704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048158475426278837, + "loss": 2.9682, + "theoretical_loss": 4.505567454699373, + "tokens_seen": 153354240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815747241725176, + "loss": 3.3476, + "theoretical_loss": 4.505316639819997, + "tokens_seen": 153419776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048156469408224673, + "loss": 3.4125, + "theoretical_loss": 4.505065962042418, + "tokens_seen": 153485312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048155466399197596, + "loss": 3.0099, + "theoretical_loss": 4.504815421233202, + "tokens_seen": 153550848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815446339017051, + "loss": 3.2099, + "theoretical_loss": 4.504565017259097, + "tokens_seen": 153616384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815346038114343, + "loss": 3.1983, + "theoretical_loss": 4.504314749987044, + "tokens_seen": 153681920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815245737211635, + "loss": 3.4997, + "theoretical_loss": 4.504064619284163, + "tokens_seen": 153747456 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815145436308927, + "loss": 3.3542, + "theoretical_loss": 4.503814625017766, + "tokens_seen": 153812992 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048150451354062187, + "loss": 3.4172, + "theoretical_loss": 4.5035647670553445, + "tokens_seen": 153878528 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048149448345035105, + "loss": 3.0587, + "theoretical_loss": 4.503315045264581, + "tokens_seen": 153944064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 98099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.236271858215332, + "objective/train/theoretical_loss": 4.503065459513339, + "objective/train/tokens_used": 174469600, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048148445336008023, + "loss": 3.1071, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048147442326980947, + "loss": 3.4448, + "theoretical_loss": 4.502816009669665, + "tokens_seen": 154075136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814643931795386, + "loss": 3.301, + "theoretical_loss": 4.502566695601795, + "tokens_seen": 154140672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048145436308926783, + "loss": 3.3052, + "theoretical_loss": 4.502317517178142, + "tokens_seen": 154206208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048144433299899696, + "loss": 3.1486, + "theoretical_loss": 4.502068474267309, + "tokens_seen": 154271744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814343029087262, + "loss": 2.9085, + "theoretical_loss": 4.501819566738076, + "tokens_seen": 154337280 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048142427281845537, + "loss": 3.271, + "theoretical_loss": 4.501570794459411, + "tokens_seen": 154402816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048141424272818455, + "loss": 3.4126, + "theoretical_loss": 4.501322157300461, + "tokens_seen": 154468352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048140421263791373, + "loss": 3.3014, + "theoretical_loss": 4.501073655130554, + "tokens_seen": 154533888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048139418254764297, + "loss": 3.3142, + "theoretical_loss": 4.500825287819205, + "tokens_seen": 154599424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813841524573721, + "loss": 3.3548, + "theoretical_loss": 4.500577055236104, + "tokens_seen": 154664960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048137412236710133, + "loss": 3.179, + "theoretical_loss": 4.500328957251128, + "tokens_seen": 154730496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048136409227683046, + "loss": 3.2137, + "theoretical_loss": 4.500080993734329, + "tokens_seen": 154796032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813540621865597, + "loss": 3.3226, + "theoretical_loss": 4.499833164555944, + "tokens_seen": 154861568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813440320962889, + "loss": 3.3445, + "theoretical_loss": 4.499585469586387, + "tokens_seen": 154927104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048133400200601806, + "loss": 3.2798, + "theoretical_loss": 4.499337908696255, + "tokens_seen": 154992640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048132397191574724, + "loss": 3.1576, + "theoretical_loss": 4.499090481756321, + "tokens_seen": 155058176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813139418254764, + "loss": 3.3429, + "theoretical_loss": 4.498843188637538, + "tokens_seen": 155123712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813039117352056, + "loss": 3.2675, + "theoretical_loss": 4.498596029211041, + "tokens_seen": 155189248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048129388164493483, + "loss": 3.3317, + "theoretical_loss": 4.498349003348137, + "tokens_seen": 155254784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048128385155466396, + "loss": 3.3357, + "theoretical_loss": 4.4981021109203185, + "tokens_seen": 155320320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812738214643932, + "loss": 3.4323, + "theoretical_loss": 4.49785535179925, + "tokens_seen": 155385856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812637913741223, + "loss": 3.3122, + "theoretical_loss": 4.497608725856776, + "tokens_seen": 155451392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048125376128385156, + "loss": 3.2995, + "theoretical_loss": 4.497362232964919, + "tokens_seen": 155516928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048124373119358074, + "loss": 3.3192, + "theoretical_loss": 4.497115872995876, + "tokens_seen": 155582464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 99496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4420876502990723, + "objective/train/theoretical_loss": 4.496869645822022, + "objective/train/tokens_used": 176108000, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812337011033099, + "loss": 3.2141, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048122367101303916, + "loss": 3.2491, + "theoretical_loss": 4.496623551315908, + "tokens_seen": 155713536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048121364092276834, + "loss": 3.5781, + "theoretical_loss": 4.496377589350261, + "tokens_seen": 155779072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812036108324975, + "loss": 3.5251, + "theoretical_loss": 4.496131759797984, + "tokens_seen": 155844608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811935807422267, + "loss": 3.1931, + "theoretical_loss": 4.495886062532153, + "tokens_seen": 155910144 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811835506519559, + "loss": 3.425, + "theoretical_loss": 4.495640497426023, + "tokens_seen": 155975680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048117352056168506, + "loss": 3.1489, + "theoretical_loss": 4.495395064353019, + "tokens_seen": 156041216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811634904714143, + "loss": 3.3909, + "theoretical_loss": 4.4951497631867445, + "tokens_seen": 156106752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811534603811434, + "loss": 3.505, + "theoretical_loss": 4.494904593800973, + "tokens_seen": 156172288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048114343029087266, + "loss": 3.4255, + "theoretical_loss": 4.4946595560696565, + "tokens_seen": 156237824 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811334002006018, + "loss": 3.2254, + "theoretical_loss": 4.494414649866915, + "tokens_seen": 156303360 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481123370110331, + "loss": 3.285, + "theoretical_loss": 4.494169875067046, + "tokens_seen": 156368896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811133400200602, + "loss": 3.1795, + "theoretical_loss": 4.493925231544516, + "tokens_seen": 156434432 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811033099297894, + "loss": 3.4117, + "theoretical_loss": 4.493680719173968, + "tokens_seen": 156499968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048109327983951857, + "loss": 3.1854, + "theoretical_loss": 4.4934363378302145, + "tokens_seen": 156565504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810832497492478, + "loss": 3.1177, + "theoretical_loss": 4.493192087388239, + "tokens_seen": 156631040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048107321965897693, + "loss": 3.3501, + "theoretical_loss": 4.4929479677232, + "tokens_seen": 156696576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048106318956870616, + "loss": 3.3047, + "theoretical_loss": 4.4927039787104235, + "tokens_seen": 156762112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810531594784353, + "loss": 3.3947, + "theoretical_loss": 4.4924601202254095, + "tokens_seen": 156827648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810431293881645, + "loss": 3.217, + "theoretical_loss": 4.492216392143826, + "tokens_seen": 156893184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810330992978937, + "loss": 3.3287, + "theoretical_loss": 4.491972794341514, + "tokens_seen": 156958720 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810230692076229, + "loss": 3.4629, + "theoretical_loss": 4.49172932669448, + "tokens_seen": 157024256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048101303911735207, + "loss": 3.2084, + "theoretical_loss": 4.491485989078906, + "tokens_seen": 157089792 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048100300902708125, + "loss": 3.4562, + "theoretical_loss": 4.491242781371138, + "tokens_seen": 157155328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048099297893681043, + "loss": 3.2665, + "theoretical_loss": 4.490999703447697, + "tokens_seen": 157220864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 100000, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1658103466033936, + "objective/train/theoretical_loss": 4.4907567551852665, + "objective/train/tokens_used": 177746400, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048098294884653967, + "loss": 3.3429, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809729187562688, + "loss": 3.416, + "theoretical_loss": 4.490513936460702, + "tokens_seen": 157351936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048096288866599803, + "loss": 3.1769, + "theoretical_loss": 4.490271247151027, + "tokens_seen": 157417472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048095285857572716, + "loss": 3.4918, + "theoretical_loss": 4.490028687133432, + "tokens_seen": 157483008 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809428284854564, + "loss": 3.1271, + "theoretical_loss": 4.489786256285276, + "tokens_seen": 157548544 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048093279839518557, + "loss": 3.3183, + "theoretical_loss": 4.489543954484084, + "tokens_seen": 157614080 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048092276830491475, + "loss": 3.176, + "theoretical_loss": 4.489301781607551, + "tokens_seen": 157679616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048091273821464393, + "loss": 3.2113, + "theoretical_loss": 4.489059737533534, + "tokens_seen": 157745152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048090270812437317, + "loss": 3.2051, + "theoretical_loss": 4.48881782214006, + "tokens_seen": 157810688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808926780341023, + "loss": 3.2277, + "theoretical_loss": 4.48857603530532, + "tokens_seen": 157876224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048088264794383153, + "loss": 3.3081, + "theoretical_loss": 4.488334376907673, + "tokens_seen": 157941760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048087261785356066, + "loss": 3.4009, + "theoretical_loss": 4.4880928468256425, + "tokens_seen": 158007296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808625877632899, + "loss": 3.2696, + "theoretical_loss": 4.487851444937916, + "tokens_seen": 158072832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808525576730191, + "loss": 3.3747, + "theoretical_loss": 4.487610171123347, + "tokens_seen": 158138368 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048084252758274826, + "loss": 3.2603, + "theoretical_loss": 4.487369025260954, + "tokens_seen": 158203904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048083249749247744, + "loss": 3.3168, + "theoretical_loss": 4.48712800722992, + "tokens_seen": 158269440 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808224674022066, + "loss": 3.0748, + "theoretical_loss": 4.48688711690959, + "tokens_seen": 158334976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808124373119358, + "loss": 3.2276, + "theoretical_loss": 4.486646354179475, + "tokens_seen": 158400512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048080240722166503, + "loss": 3.3016, + "theoretical_loss": 4.48640571891925, + "tokens_seen": 158466048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048079237713139416, + "loss": 3.2242, + "theoretical_loss": 4.48616521100875, + "tokens_seen": 158531584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807823470411234, + "loss": 3.5084, + "theoretical_loss": 4.485924830327974, + "tokens_seen": 158597120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807723169508525, + "loss": 3.2451, + "theoretical_loss": 4.485684576757087, + "tokens_seen": 158662656 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048076228686058176, + "loss": 3.5179, + "theoretical_loss": 4.485444450176413, + "tokens_seen": 158728192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048075225677031094, + "loss": 3.4267, + "theoretical_loss": 4.485204450466437, + "tokens_seen": 158793728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807422266800401, + "loss": 3.4344, + "theoretical_loss": 4.484964577507808, + "tokens_seen": 158859264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 101476, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1423563957214355, + "objective/train/theoretical_loss": 4.484724831181337, + "objective/train/tokens_used": 179384800, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807321965897693, + "loss": 3.3062, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048072216649949854, + "loss": 3.222, + "theoretical_loss": 4.4844852113679945, + "tokens_seen": 158990336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048071213640922766, + "loss": 3.2831, + "theoretical_loss": 4.484245717948913, + "tokens_seen": 159055872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807021063189569, + "loss": 3.2262, + "theoretical_loss": 4.484006350805385, + "tokens_seen": 159121408 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480692076228686, + "loss": 3.3261, + "theoretical_loss": 4.483767109818862, + "tokens_seen": 159186944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048068204613841526, + "loss": 3.483, + "theoretical_loss": 4.483527994870958, + "tokens_seen": 159252480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048067201604814444, + "loss": 3.2439, + "theoretical_loss": 4.483289005843445, + "tokens_seen": 159318016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806619859578736, + "loss": 3.4449, + "theoretical_loss": 4.483050142618255, + "tokens_seen": 159383552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806519558676028, + "loss": 3.1009, + "theoretical_loss": 4.482811405077482, + "tokens_seen": 159449088 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480641925777332, + "loss": 3.2611, + "theoretical_loss": 4.482572793103373, + "tokens_seen": 159514624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048063189568706117, + "loss": 3.3544, + "theoretical_loss": 4.482334306578339, + "tokens_seen": 159580160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806218655967904, + "loss": 3.1545, + "theoretical_loss": 4.482095945384946, + "tokens_seen": 159645696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048061183550651953, + "loss": 3.2616, + "theoretical_loss": 4.481857709405919, + "tokens_seen": 159711232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048060180541624877, + "loss": 3.3525, + "theoretical_loss": 4.4816195985241425, + "tokens_seen": 159776768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048059177532597795, + "loss": 3.1274, + "theoretical_loss": 4.481381612622657, + "tokens_seen": 159842304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048058174523570713, + "loss": 3.5142, + "theoretical_loss": 4.481143751584659, + "tokens_seen": 159907840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805717151454363, + "loss": 3.4438, + "theoretical_loss": 4.480906015293505, + "tokens_seen": 159973376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805616850551655, + "loss": 3.3428, + "theoretical_loss": 4.480668403632706, + "tokens_seen": 160038912 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048055165496489467, + "loss": 3.4011, + "theoretical_loss": 4.480430916485929, + "tokens_seen": 160104448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805416248746239, + "loss": 3.1368, + "theoretical_loss": 4.480193553736999, + "tokens_seen": 160169984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048053159478435303, + "loss": 3.1091, + "theoretical_loss": 4.479956315269897, + "tokens_seen": 160235520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048052156469408227, + "loss": 3.4397, + "theoretical_loss": 4.479719200968757, + "tokens_seen": 160301056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805115346038114, + "loss": 3.4909, + "theoretical_loss": 4.479482210717871, + "tokens_seen": 160366592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048050150451354063, + "loss": 3.3116, + "theoretical_loss": 4.479245344401685, + "tokens_seen": 160432128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804914744232698, + "loss": 3.2769, + "theoretical_loss": 4.479008601904798, + "tokens_seen": 160497664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 102054, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4178307056427, + "objective/train/theoretical_loss": 4.478771983111967, + "objective/train/tokens_used": 181023200, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480481444332999, + "loss": 3.467, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048047141424272823, + "loss": 3.4466, + "theoretical_loss": 4.478535487908101, + "tokens_seen": 160628736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048046138415245736, + "loss": 3.3249, + "theoretical_loss": 4.478299116178265, + "tokens_seen": 160694272 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804513540621866, + "loss": 3.2936, + "theoretical_loss": 4.478062867807674, + "tokens_seen": 160759808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048044132397191577, + "loss": 3.466, + "theoretical_loss": 4.4778267426817, + "tokens_seen": 160825344 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048043129388164495, + "loss": 3.4092, + "theoretical_loss": 4.477590740685867, + "tokens_seen": 160890880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048042126379137413, + "loss": 3.2282, + "theoretical_loss": 4.47735486170585, + "tokens_seen": 160956416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048041123370110337, + "loss": 3.088, + "theoretical_loss": 4.47711910562748, + "tokens_seen": 161021952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804012036108325, + "loss": 3.1203, + "theoretical_loss": 4.4768834723367394, + "tokens_seen": 161087488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048039117352056173, + "loss": 3.5309, + "theoretical_loss": 4.4766479617197605, + "tokens_seen": 161153024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048038114343029086, + "loss": 3.1474, + "theoretical_loss": 4.476412573662829, + "tokens_seen": 161218560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803711133400201, + "loss": 3.1828, + "theoretical_loss": 4.4761773080523835, + "tokens_seen": 161284096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803610832497493, + "loss": 3.3428, + "theoretical_loss": 4.475942164775013, + "tokens_seen": 161349632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048035105315947846, + "loss": 3.0744, + "theoretical_loss": 4.475707143717455, + "tokens_seen": 161415168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048034102306920764, + "loss": 3.1276, + "theoretical_loss": 4.475472244766601, + "tokens_seen": 161480704 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803309929789368, + "loss": 3.0242, + "theoretical_loss": 4.475237467809492, + "tokens_seen": 161546240 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480320962888666, + "loss": 3.4158, + "theoretical_loss": 4.47500281273332, + "tokens_seen": 161611776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048031093279839523, + "loss": 3.2473, + "theoretical_loss": 4.474768279425424, + "tokens_seen": 161677312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048030090270812436, + "loss": 3.1312, + "theoretical_loss": 4.474533867773299, + "tokens_seen": 161742848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802908726178536, + "loss": 3.3392, + "theoretical_loss": 4.474299577664581, + "tokens_seen": 161808384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802808425275827, + "loss": 3.3211, + "theoretical_loss": 4.474065408987063, + "tokens_seen": 161873920 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048027081243731196, + "loss": 3.3298, + "theoretical_loss": 4.473831361628682, + "tokens_seen": 161939456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048026078234704114, + "loss": 3.3306, + "theoretical_loss": 4.473597435477526, + "tokens_seen": 162004992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802507522567703, + "loss": 3.1812, + "theoretical_loss": 4.473363630421831, + "tokens_seen": 162070528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802407221664995, + "loss": 3.1926, + "theoretical_loss": 4.473129946349982, + "tokens_seen": 162136064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 102718, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.263267993927002, + "objective/train/theoretical_loss": 4.472896383150508, + "objective/train/tokens_used": 182661600, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048023069207622874, + "loss": 3.1614, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048022066198595786, + "loss": 3.4761, + "theoretical_loss": 4.472662940712091, + "tokens_seen": 162267136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802106318956871, + "loss": 3.1622, + "theoretical_loss": 4.472429618923558, + "tokens_seen": 162332672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048020060180541623, + "loss": 3.2604, + "theoretical_loss": 4.472196417673883, + "tokens_seen": 162398208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048019057171514546, + "loss": 3.373, + "theoretical_loss": 4.471963336852187, + "tokens_seen": 162463744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048018054162487464, + "loss": 3.1342, + "theoretical_loss": 4.471730376347738, + "tokens_seen": 162529280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801705115346038, + "loss": 3.1844, + "theoretical_loss": 4.4714975360499505, + "tokens_seen": 162594816 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480160481444333, + "loss": 3.4885, + "theoretical_loss": 4.471264815848384, + "tokens_seen": 162660352 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801504513540622, + "loss": 3.2073, + "theoretical_loss": 4.471032215632746, + "tokens_seen": 162725888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048014042126379137, + "loss": 3.1441, + "theoretical_loss": 4.470799735292889, + "tokens_seen": 162791424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801303911735206, + "loss": 3.3775, + "theoretical_loss": 4.470567374718808, + "tokens_seen": 162856960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048012036108324973, + "loss": 3.4188, + "theoretical_loss": 4.470335133800649, + "tokens_seen": 162922496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048011033099297897, + "loss": 3.1071, + "theoretical_loss": 4.470103012428696, + "tokens_seen": 162988032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048010030090270815, + "loss": 3.3467, + "theoretical_loss": 4.469871010493383, + "tokens_seen": 163053568 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048009027081243733, + "loss": 3.3758, + "theoretical_loss": 4.469639127885287, + "tokens_seen": 163119104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800802407221665, + "loss": 3.1418, + "theoretical_loss": 4.4694073644951295, + "tokens_seen": 163184640 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800702106318957, + "loss": 3.2458, + "theoretical_loss": 4.469175720213771, + "tokens_seen": 163250176 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048006018054162487, + "loss": 3.3036, + "theoretical_loss": 4.468944194932225, + "tokens_seen": 163315712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800501504513541, + "loss": 3.2241, + "theoretical_loss": 4.468712788541639, + "tokens_seen": 163381248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048004012036108323, + "loss": 3.2485, + "theoretical_loss": 4.46848150093331, + "tokens_seen": 163446784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048003009027081247, + "loss": 3.286, + "theoretical_loss": 4.468250331998676, + "tokens_seen": 163512320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800200601805416, + "loss": 3.0841, + "theoretical_loss": 4.468019281629316, + "tokens_seen": 163577856 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048001003009027083, + "loss": 3.1869, + "theoretical_loss": 4.467788349716955, + "tokens_seen": 163643392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048, + "loss": 3.2902, + "theoretical_loss": 4.467557536153457, + "tokens_seen": 163708928 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799899699097292, + "loss": 3.179, + "theoretical_loss": 4.467326840830829, + "tokens_seen": 163774464 + }, + { + "debugging/Self-BLEU-5": 0.47376287031010694, + "debugging/distinct-1-grams": 0.7718165351312889, + "debugging/distinct-2-grams": 0.9624982234606337, + "debugging/entropy-1-grams": 5.847463360327344, + "debugging/entropy-2-grams": 6.816946097439353, + "debugging/length": 477.7857142857143, + "debugging/num_segments": 14, + "epoch": 0.05, + "objective/train/docs_used": 104069, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2056386470794678, + "objective/train/theoretical_loss": 4.467096263641219, + "objective/train/tokens_used": 184300000, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799799398194584, + "loss": 3.24, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047996990972918756, + "loss": 3.1111, + "theoretical_loss": 4.466865804476919, + "tokens_seen": 163905536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047995987963891674, + "loss": 3.1284, + "theoretical_loss": 4.466635463230359, + "tokens_seen": 163971072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047994984954864597, + "loss": 3.3292, + "theoretical_loss": 4.466405239794113, + "tokens_seen": 164036608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799398194583751, + "loss": 3.3509, + "theoretical_loss": 4.466175134060894, + "tokens_seen": 164102144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047992978936810433, + "loss": 3.4643, + "theoretical_loss": 4.465945145923554, + "tokens_seen": 164167680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799197592778335, + "loss": 3.0926, + "theoretical_loss": 4.4657152752750875, + "tokens_seen": 164233216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799097291875627, + "loss": 3.2696, + "theoretical_loss": 4.465485522008629, + "tokens_seen": 164298752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798996990972919, + "loss": 3.158, + "theoretical_loss": 4.465255886017452, + "tokens_seen": 164364288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047988966900702106, + "loss": 3.3011, + "theoretical_loss": 4.465026367194971, + "tokens_seen": 164429824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047987963891675024, + "loss": 3.0533, + "theoretical_loss": 4.464796965434738, + "tokens_seen": 164495360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798696088264795, + "loss": 3.1821, + "theoretical_loss": 4.464567680630443, + "tokens_seen": 164560896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798595787362086, + "loss": 3.2458, + "theoretical_loss": 4.464338512675919, + "tokens_seen": 164626432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047984954864593784, + "loss": 3.3066, + "theoretical_loss": 4.464109461465133, + "tokens_seen": 164691968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047983951855566696, + "loss": 3.1252, + "theoretical_loss": 4.4638805268921935, + "tokens_seen": 164757504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798294884653962, + "loss": 3.1787, + "theoretical_loss": 4.463651708851346, + "tokens_seen": 164823040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798194583751254, + "loss": 3.209, + "theoretical_loss": 4.463423007236974, + "tokens_seen": 164888576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047980942828485456, + "loss": 3.1093, + "theoretical_loss": 4.4631944219436, + "tokens_seen": 164954112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047979939819458374, + "loss": 3.1704, + "theoretical_loss": 4.462965952865879, + "tokens_seen": 165019648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797893681043129, + "loss": 3.3571, + "theoretical_loss": 4.46273759989861, + "tokens_seen": 165085184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797793380140421, + "loss": 3.3585, + "theoretical_loss": 4.462509362936723, + "tokens_seen": 165150720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047976930792377134, + "loss": 3.4484, + "theoretical_loss": 4.46228124187529, + "tokens_seen": 165216256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047975927783350047, + "loss": 3.2982, + "theoretical_loss": 4.462053236609516, + "tokens_seen": 165281792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797492477432297, + "loss": 3.2709, + "theoretical_loss": 4.461825347034742, + "tokens_seen": 165347328 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797392176529589, + "loss": 3.5718, + "theoretical_loss": 4.461597573046449, + "tokens_seen": 165412864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 105300, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3106820583343506, + "objective/train/theoretical_loss": 4.461369914540247, + "objective/train/tokens_used": 185938400, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047972918756268807, + "loss": 3.3645, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797191574724173, + "loss": 3.322, + "theoretical_loss": 4.4611423714118885, + "tokens_seen": 165543936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047970912738214643, + "loss": 3.3285, + "theoretical_loss": 4.460914943557256, + "tokens_seen": 165609472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047969909729187566, + "loss": 3.2845, + "theoretical_loss": 4.460687630872371, + "tokens_seen": 165675008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047968906720160484, + "loss": 3.2704, + "theoretical_loss": 4.46046043325339, + "tokens_seen": 165740544 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479679037111334, + "loss": 3.5378, + "theoretical_loss": 4.460233350596599, + "tokens_seen": 165806080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796690070210632, + "loss": 3.2292, + "theoretical_loss": 4.460006382798425, + "tokens_seen": 165871616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796589769307924, + "loss": 3.4241, + "theoretical_loss": 4.459779529755423, + "tokens_seen": 165937152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047964894684052157, + "loss": 3.3032, + "theoretical_loss": 4.459552791364288, + "tokens_seen": 166002688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796389167502508, + "loss": 3.0771, + "theoretical_loss": 4.459326167521844, + "tokens_seen": 166068224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047962888665997993, + "loss": 3.6786, + "theoretical_loss": 4.4590996581250515, + "tokens_seen": 166133760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047961885656970917, + "loss": 3.4545, + "theoretical_loss": 4.458873263071002, + "tokens_seen": 166199296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047960882647943835, + "loss": 3.228, + "theoretical_loss": 4.458646982256921, + "tokens_seen": 166264832 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047959879638916753, + "loss": 3.2841, + "theoretical_loss": 4.458420815580169, + "tokens_seen": 166330368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795887662988967, + "loss": 2.9158, + "theoretical_loss": 4.458194762938234, + "tokens_seen": 166395904 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795787362086259, + "loss": 3.1962, + "theoretical_loss": 4.457968824228743, + "tokens_seen": 166461440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047956870611835507, + "loss": 3.3127, + "theoretical_loss": 4.457742999349449, + "tokens_seen": 166526976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795586760280843, + "loss": 3.1228, + "theoretical_loss": 4.4575172881982414, + "tokens_seen": 166592512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047954864593781343, + "loss": 3.3468, + "theoretical_loss": 4.457291690673139, + "tokens_seen": 166658048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047953861584754267, + "loss": 3.1293, + "theoretical_loss": 4.457066206672291, + "tokens_seen": 166723584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795285857572718, + "loss": 3.4924, + "theoretical_loss": 4.456840836093983, + "tokens_seen": 166789120 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047951855566700103, + "loss": 3.3401, + "theoretical_loss": 4.456615578836625, + "tokens_seen": 166854656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795085255767302, + "loss": 3.3277, + "theoretical_loss": 4.456390434798762, + "tokens_seen": 166920192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794984954864594, + "loss": 3.1098, + "theoretical_loss": 4.45616540387907, + "tokens_seen": 166985728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794884653961886, + "loss": 3.3298, + "theoretical_loss": 4.4559404859763525, + "tokens_seen": 167051264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 106148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4106695652008057, + "objective/train/theoretical_loss": 4.455715680989545, + "objective/train/tokens_used": 187576800, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047947843530591776, + "loss": 2.9402, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047946840521564694, + "loss": 3.432, + "theoretical_loss": 4.455490988817713, + "tokens_seen": 167182336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047945837512537617, + "loss": 3.1718, + "theoretical_loss": 4.4552664093600525, + "tokens_seen": 167247872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794483450351053, + "loss": 3.1644, + "theoretical_loss": 4.455041942515887, + "tokens_seen": 167313408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047943831494483453, + "loss": 3.1954, + "theoretical_loss": 4.454817588184669, + "tokens_seen": 167378944 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794282848545637, + "loss": 3.2134, + "theoretical_loss": 4.454593346265984, + "tokens_seen": 167444480 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794182547642929, + "loss": 3.4663, + "theoretical_loss": 4.454369216659542, + "tokens_seen": 167510016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794082246740221, + "loss": 3.3214, + "theoretical_loss": 4.454145199265183, + "tokens_seen": 167575552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047939819458375126, + "loss": 3.2135, + "theoretical_loss": 4.453921293982877, + "tokens_seen": 167641088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047938816449348044, + "loss": 3.3655, + "theoretical_loss": 4.453697500712722, + "tokens_seen": 167706624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793781344032097, + "loss": 3.202, + "theoretical_loss": 4.453473819354942, + "tokens_seen": 167772160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793681043129388, + "loss": 3.2527, + "theoretical_loss": 4.453250249809889, + "tokens_seen": 167837696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047935807422266804, + "loss": 3.2188, + "theoretical_loss": 4.453026791978045, + "tokens_seen": 167903232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047934804413239716, + "loss": 3.2968, + "theoretical_loss": 4.4528034457600185, + "tokens_seen": 167968768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793380140421264, + "loss": 3.133, + "theoretical_loss": 4.452580211056542, + "tokens_seen": 168034304 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793279839518556, + "loss": 3.1103, + "theoretical_loss": 4.452357087768481, + "tokens_seen": 168099840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047931795386158476, + "loss": 3.1566, + "theoretical_loss": 4.45213407579682, + "tokens_seen": 168165376 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047930792377131394, + "loss": 3.3208, + "theoretical_loss": 4.451911175042679, + "tokens_seen": 168230912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792978936810431, + "loss": 3.3578, + "theoretical_loss": 4.451688385407296, + "tokens_seen": 168296448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792878635907723, + "loss": 3.0301, + "theoretical_loss": 4.451465706792041, + "tokens_seen": 168361984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047927783350050154, + "loss": 3.3597, + "theoretical_loss": 4.4512431390984055, + "tokens_seen": 168427520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047926780341023067, + "loss": 3.2, + "theoretical_loss": 4.451020682228011, + "tokens_seen": 168493056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792577733199599, + "loss": 3.3903, + "theoretical_loss": 4.450798336082601, + "tokens_seen": 168558592 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792477432296891, + "loss": 3.5916, + "theoretical_loss": 4.450576100564046, + "tokens_seen": 168624128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047923771313941827, + "loss": 3.0493, + "theoretical_loss": 4.450353975574341, + "tokens_seen": 168689664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 106755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.232574701309204, + "objective/train/theoretical_loss": 4.450131961015606, + "objective/train/tokens_used": 189215200, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047922768304914745, + "loss": 3.3465, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047921765295887663, + "loss": 3.4464, + "theoretical_loss": 4.449910056790086, + "tokens_seen": 168820736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792076228686058, + "loss": 3.2509, + "theoretical_loss": 4.44968826280015, + "tokens_seen": 168886272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047919759277833504, + "loss": 3.4296, + "theoretical_loss": 4.4494665789482895, + "tokens_seen": 168951808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047918756268806417, + "loss": 3.2661, + "theoretical_loss": 4.449245005137125, + "tokens_seen": 169017344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791775325977934, + "loss": 3.2495, + "theoretical_loss": 4.449023541269395, + "tokens_seen": 169082880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047916750250752253, + "loss": 3.1478, + "theoretical_loss": 4.448802187247966, + "tokens_seen": 169148416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047915747241725177, + "loss": 3.4552, + "theoretical_loss": 4.448580942975825, + "tokens_seen": 169213952 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047914744232698095, + "loss": 3.34, + "theoretical_loss": 4.448359808356084, + "tokens_seen": 169279488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047913741223671013, + "loss": 3.2944, + "theoretical_loss": 4.448138783291979, + "tokens_seen": 169345024 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791273821464393, + "loss": 3.2205, + "theoretical_loss": 4.447917867686863, + "tokens_seen": 169410560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047911735205616855, + "loss": 3.1967, + "theoretical_loss": 4.44769706144422, + "tokens_seen": 169476096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791073219658977, + "loss": 3.0132, + "theoretical_loss": 4.44747636446765, + "tokens_seen": 169541632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790972918756269, + "loss": 3.4747, + "theoretical_loss": 4.447255776660878, + "tokens_seen": 169607168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047908726178535604, + "loss": 3.3565, + "theoretical_loss": 4.44703529792775, + "tokens_seen": 169672704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047907723169508527, + "loss": 3.0567, + "theoretical_loss": 4.446814928172234, + "tokens_seen": 169738240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047906720160481445, + "loss": 3.1763, + "theoretical_loss": 4.446594667298421, + "tokens_seen": 169803776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047905717151454363, + "loss": 3.3902, + "theoretical_loss": 4.446374515210521, + "tokens_seen": 169869312 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790471414242728, + "loss": 3.4254, + "theoretical_loss": 4.446154471812866, + "tokens_seen": 169934848 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479037111334002, + "loss": 3.2961, + "theoretical_loss": 4.445934537009911, + "tokens_seen": 170000384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790270812437312, + "loss": 2.8793, + "theoretical_loss": 4.445714710706228, + "tokens_seen": 170065920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790170511534604, + "loss": 2.8962, + "theoretical_loss": 4.445494992806513, + "tokens_seen": 170131456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047900702106318954, + "loss": 3.2095, + "theoretical_loss": 4.44527538321558, + "tokens_seen": 170196992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789969909729188, + "loss": 3.4441, + "theoretical_loss": 4.445055881838365, + "tokens_seen": 170262528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789869608826479, + "loss": 3.3936, + "theoretical_loss": 4.444836488579924, + "tokens_seen": 170328064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 107830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1494503021240234, + "objective/train/theoretical_loss": 4.44461720334543, + "objective/train/tokens_used": 190853600, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047897693079237714, + "loss": 3.2369, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047896690070210637, + "loss": 3.3457, + "theoretical_loss": 4.444398026040179, + "tokens_seen": 170459136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789568706118355, + "loss": 3.3824, + "theoretical_loss": 4.444178956569585, + "tokens_seen": 170524672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047894684052156473, + "loss": 3.4014, + "theoretical_loss": 4.443959994839181, + "tokens_seen": 170590208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789368104312939, + "loss": 3.3058, + "theoretical_loss": 4.44374114075462, + "tokens_seen": 170655744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789267803410231, + "loss": 3.1202, + "theoretical_loss": 4.443522394221671, + "tokens_seen": 170721280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789167502507523, + "loss": 3.3846, + "theoretical_loss": 4.443303755146225, + "tokens_seen": 170786816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047890672016048146, + "loss": 3.2755, + "theoretical_loss": 4.443085223434291, + "tokens_seen": 170852352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047889669007021064, + "loss": 3.0001, + "theoretical_loss": 4.442866798991993, + "tokens_seen": 170917888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788866599799399, + "loss": 3.5228, + "theoretical_loss": 4.442648481725577, + "tokens_seen": 170983424 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478876629889669, + "loss": 3.5152, + "theoretical_loss": 4.442430271541404, + "tokens_seen": 171048960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047886659979939824, + "loss": 3.0459, + "theoretical_loss": 4.442212168345956, + "tokens_seen": 171114496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047885656970912736, + "loss": 3.2609, + "theoretical_loss": 4.4419941720458285, + "tokens_seen": 171180032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788465396188566, + "loss": 3.1621, + "theoretical_loss": 4.441776282547736, + "tokens_seen": 171245568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788365095285858, + "loss": 3.2143, + "theoretical_loss": 4.441558499758511, + "tokens_seen": 171311104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047882647943831496, + "loss": 3.3814, + "theoretical_loss": 4.441340823585101, + "tokens_seen": 171376640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047881644934804414, + "loss": 3.4502, + "theoretical_loss": 4.441123253934572, + "tokens_seen": 171442176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788064192577733, + "loss": 3.2365, + "theoretical_loss": 4.440905790714105, + "tokens_seen": 171507712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787963891675025, + "loss": 2.9738, + "theoretical_loss": 4.440688433830999, + "tokens_seen": 171573248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047878635907723174, + "loss": 3.2246, + "theoretical_loss": 4.440471183192667, + "tokens_seen": 171638784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047877632898696087, + "loss": 3.3308, + "theoretical_loss": 4.440254038706639, + "tokens_seen": 171704320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787662988966901, + "loss": 3.1173, + "theoretical_loss": 4.440037000280561, + "tokens_seen": 171769856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787562688064193, + "loss": 3.2074, + "theoretical_loss": 4.439820067822195, + "tokens_seen": 171835392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047874623871614847, + "loss": 3.115, + "theoretical_loss": 4.439603241239416, + "tokens_seen": 171900928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047873620862587765, + "loss": 3.3423, + "theoretical_loss": 4.439386520440218, + "tokens_seen": 171966464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 108471, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.364562749862671, + "objective/train/theoretical_loss": 4.439169905332706, + "objective/train/tokens_used": 192492000, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047872617853560683, + "loss": 3.1072, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478716148445336, + "loss": 3.1176, + "theoretical_loss": 4.438953395825102, + "tokens_seen": 172097536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047870611835506524, + "loss": 3.3633, + "theoretical_loss": 4.438736991825744, + "tokens_seen": 172163072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047869608826479437, + "loss": 3.38, + "theoretical_loss": 4.438520693243079, + "tokens_seen": 172228608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786860581745236, + "loss": 3.0177, + "theoretical_loss": 4.4383044999856756, + "tokens_seen": 172294144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047867602808425273, + "loss": 3.1664, + "theoretical_loss": 4.438088411962211, + "tokens_seen": 172359680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047866599799398197, + "loss": 3.3626, + "theoretical_loss": 4.437872429081477, + "tokens_seen": 172425216 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047865596790371115, + "loss": 3.0495, + "theoretical_loss": 4.437656551252381, + "tokens_seen": 172490752 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047864593781344033, + "loss": 3.2428, + "theoretical_loss": 4.4374407783839445, + "tokens_seen": 172556288 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786359077231695, + "loss": 3.2672, + "theoretical_loss": 4.437225110385297, + "tokens_seen": 172621824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047862587763289875, + "loss": 3.1621, + "theoretical_loss": 4.4370095471656885, + "tokens_seen": 172687360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786158475426279, + "loss": 3.3215, + "theoretical_loss": 4.436794088634477, + "tokens_seen": 172752896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786058174523571, + "loss": 3.1152, + "theoretical_loss": 4.4365787347011345, + "tokens_seen": 172818432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047859578736208624, + "loss": 3.1985, + "theoretical_loss": 4.436363485275246, + "tokens_seen": 172883968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047858575727181547, + "loss": 3.3297, + "theoretical_loss": 4.436148340266508, + "tokens_seen": 172949504 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047857572718154465, + "loss": 3.3167, + "theoretical_loss": 4.435933299584729, + "tokens_seen": 173015040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047856569709127383, + "loss": 3.0363, + "theoretical_loss": 4.4357183631398325, + "tokens_seen": 173080576 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478555667001003, + "loss": 3.1073, + "theoretical_loss": 4.435503530841849, + "tokens_seen": 173146112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785456369107322, + "loss": 3.1795, + "theoretical_loss": 4.435288802600926, + "tokens_seen": 173211648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785356068204614, + "loss": 3.4799, + "theoretical_loss": 4.4350741783273175, + "tokens_seen": 173277184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785255767301906, + "loss": 3.3446, + "theoretical_loss": 4.434859657931392, + "tokens_seen": 173342720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047851554663991974, + "loss": 3.1328, + "theoretical_loss": 4.434645241323629, + "tokens_seen": 173408256 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478505516549649, + "loss": 2.8784, + "theoretical_loss": 4.434430928414617, + "tokens_seen": 173473792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784954864593781, + "loss": 3.2111, + "theoretical_loss": 4.434216719115057, + "tokens_seen": 173539328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047848545636910734, + "loss": 3.3716, + "theoretical_loss": 4.43400261333576, + "tokens_seen": 173604864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 109645, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0553133487701416, + "objective/train/theoretical_loss": 4.433788610987646, + "objective/train/tokens_used": 194130400, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784754262788365, + "loss": 3.2501, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784653961885657, + "loss": 3.4322, + "theoretical_loss": 4.433574711981749, + "tokens_seen": 173735936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784553660982949, + "loss": 3.2243, + "theoretical_loss": 4.433360916229209, + "tokens_seen": 173801472 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784453360080241, + "loss": 3.2569, + "theoretical_loss": 4.433147223641278, + "tokens_seen": 173867008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047843530591775324, + "loss": 3.2601, + "theoretical_loss": 4.432933634129318, + "tokens_seen": 173932544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784252758274825, + "loss": 3.2892, + "theoretical_loss": 4.4327201476047975, + "tokens_seen": 173998080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784152457372116, + "loss": 3.3204, + "theoretical_loss": 4.432506763979299, + "tokens_seen": 174063616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047840521564694084, + "loss": 3.3412, + "theoretical_loss": 4.432293483164512, + "tokens_seen": 174129152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047839518555667, + "loss": 3.0976, + "theoretical_loss": 4.432080305072233, + "tokens_seen": 174194688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783851554663992, + "loss": 3.1193, + "theoretical_loss": 4.43186722961437, + "tokens_seen": 174260224 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783751253761284, + "loss": 3.2963, + "theoretical_loss": 4.431654256702938, + "tokens_seen": 174325760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047836509528585756, + "loss": 3.3016, + "theoretical_loss": 4.431441386250063, + "tokens_seen": 174391296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047835506519558675, + "loss": 3.2971, + "theoretical_loss": 4.4312286181679745, + "tokens_seen": 174456832 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478345035105316, + "loss": 3.123, + "theoretical_loss": 4.431015952369016, + "tokens_seen": 174522368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783350050150451, + "loss": 3.4087, + "theoretical_loss": 4.430803388765636, + "tokens_seen": 174587904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047832497492477434, + "loss": 3.5089, + "theoretical_loss": 4.430590927270388, + "tokens_seen": 174653440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047831494483450347, + "loss": 3.1789, + "theoretical_loss": 4.430378567795938, + "tokens_seen": 174718976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783049147442327, + "loss": 3.3577, + "theoretical_loss": 4.430166310255057, + "tokens_seen": 174784512 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782948846539619, + "loss": 3.044, + "theoretical_loss": 4.429954154560624, + "tokens_seen": 174850048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047828485456369107, + "loss": 3.467, + "theoretical_loss": 4.429742100625624, + "tokens_seen": 174915584 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047827482447342025, + "loss": 3.0549, + "theoretical_loss": 4.429530148363151, + "tokens_seen": 174981120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782647943831495, + "loss": 3.323, + "theoretical_loss": 4.429318297686402, + "tokens_seen": 175046656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782547642928786, + "loss": 3.1715, + "theoretical_loss": 4.429106548508685, + "tokens_seen": 175112192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047824473420260785, + "loss": 3.3082, + "theoretical_loss": 4.428894900743411, + "tokens_seen": 175177728 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478234704112337, + "loss": 3.0858, + "theoretical_loss": 4.428683354304098, + "tokens_seen": 175243264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 110166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.895432710647583, + "objective/train/theoretical_loss": 4.428471909104372, + "objective/train/tokens_used": 195768800, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782246740220662, + "loss": 3.1331, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047821464393179544, + "loss": 3.3663, + "theoretical_loss": 4.428260565057964, + "tokens_seen": 175374336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047820461384152457, + "loss": 3.3574, + "theoretical_loss": 4.428049322078708, + "tokens_seen": 175439872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781945837512538, + "loss": 2.9226, + "theoretical_loss": 4.427838180080547, + "tokens_seen": 175505408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047818455366098293, + "loss": 2.922, + "theoretical_loss": 4.4276271389775275, + "tokens_seen": 175570944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047817452357071217, + "loss": 3.5267, + "theoretical_loss": 4.427416198683803, + "tokens_seen": 175636480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047816449348044135, + "loss": 3.2307, + "theoretical_loss": 4.427205359113629, + "tokens_seen": 175702016 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047815446339017053, + "loss": 3.3649, + "theoretical_loss": 4.42699462018137, + "tokens_seen": 175767552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781444332998997, + "loss": 3.0675, + "theoretical_loss": 4.42678398180149, + "tokens_seen": 175833088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047813440320962895, + "loss": 3.3193, + "theoretical_loss": 4.426573443888563, + "tokens_seen": 175898624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781243731193581, + "loss": 3.1539, + "theoretical_loss": 4.426363006357263, + "tokens_seen": 175964160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781143430290873, + "loss": 3.1723, + "theoretical_loss": 4.426152669122374, + "tokens_seen": 176029696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047810431293881644, + "loss": 3.3604, + "theoretical_loss": 4.425942432098774, + "tokens_seen": 176095232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047809428284854567, + "loss": 3.3204, + "theoretical_loss": 4.425732295201455, + "tokens_seen": 176160768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047808425275827485, + "loss": 3.0399, + "theoretical_loss": 4.425522258345508, + "tokens_seen": 176226304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047807422266800403, + "loss": 3.1259, + "theoretical_loss": 4.425312321446127, + "tokens_seen": 176291840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780641925777332, + "loss": 3.087, + "theoretical_loss": 4.425102484418613, + "tokens_seen": 176357376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780541624874624, + "loss": 3.0982, + "theoretical_loss": 4.424892747178365, + "tokens_seen": 176422912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780441323971916, + "loss": 3.1463, + "theoretical_loss": 4.42468310964089, + "tokens_seen": 176488448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780341023069208, + "loss": 3.0898, + "theoretical_loss": 4.424473571721794, + "tokens_seen": 176553984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047802407221664994, + "loss": 2.6555, + "theoretical_loss": 4.42426413333679, + "tokens_seen": 176619520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780140421263792, + "loss": 3.364, + "theoretical_loss": 4.424054794401689, + "tokens_seen": 176685056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780040120361083, + "loss": 3.2562, + "theoretical_loss": 4.423845554832406, + "tokens_seen": 176750592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047799398194583754, + "loss": 3.1273, + "theoretical_loss": 4.42363641454496, + "tokens_seen": 176816128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779839518555667, + "loss": 2.7713, + "theoretical_loss": 4.423427373455471, + "tokens_seen": 176881664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 110915, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4182002544403076, + "objective/train/theoretical_loss": 4.42321843148016, + "objective/train/tokens_used": 197407200, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779739217652959, + "loss": 3.4005, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779638916750251, + "loss": 3.0105, + "theoretical_loss": 4.423009588535351, + "tokens_seen": 177012736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779538615847543, + "loss": 3.3242, + "theoretical_loss": 4.422800844537466, + "tokens_seen": 177078272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047794383149448344, + "loss": 3.23, + "theoretical_loss": 4.422592199403036, + "tokens_seen": 177143808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779338014042127, + "loss": 3.1387, + "theoretical_loss": 4.422383653048685, + "tokens_seen": 177209344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779237713139418, + "loss": 3.328, + "theoretical_loss": 4.422175205391145, + "tokens_seen": 177274880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047791374122367104, + "loss": 3.1981, + "theoretical_loss": 4.421966856347243, + "tokens_seen": 177340416 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779037111334002, + "loss": 3.1989, + "theoretical_loss": 4.421758605833912, + "tokens_seen": 177405952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778936810431294, + "loss": 3.0267, + "theoretical_loss": 4.421550453768181, + "tokens_seen": 177471488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778836509528586, + "loss": 3.4539, + "theoretical_loss": 4.421342400067183, + "tokens_seen": 177537024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047787362086258776, + "loss": 3.2626, + "theoretical_loss": 4.42113444464815, + "tokens_seen": 177602560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047786359077231695, + "loss": 3.4596, + "theoretical_loss": 4.420926587428411, + "tokens_seen": 177668096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778535606820462, + "loss": 3.1495, + "theoretical_loss": 4.420718828325403, + "tokens_seen": 177733632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778435305917753, + "loss": 3.2311, + "theoretical_loss": 4.420511167256656, + "tokens_seen": 177799168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047783350050150454, + "loss": 3.2253, + "theoretical_loss": 4.4203036041398, + "tokens_seen": 177864704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047782347041123367, + "loss": 3.1878, + "theoretical_loss": 4.420096138892568, + "tokens_seen": 177930240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778134403209629, + "loss": 3.0134, + "theoretical_loss": 4.419888771432789, + "tokens_seen": 177995776 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778034102306921, + "loss": 2.9021, + "theoretical_loss": 4.419681501678395, + "tokens_seen": 178061312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047779338014042127, + "loss": 3.3523, + "theoretical_loss": 4.419474329547413, + "tokens_seen": 178126848 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047778335005015045, + "loss": 2.9923, + "theoretical_loss": 4.419267254957971, + "tokens_seen": 178192384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777733199598797, + "loss": 3.1929, + "theoretical_loss": 4.419060277828295, + "tokens_seen": 178257920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777632898696088, + "loss": 3.4043, + "theoretical_loss": 4.41885339807671, + "tokens_seen": 178323456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047775325977933805, + "loss": 3.2683, + "theoretical_loss": 4.4186466156216415, + "tokens_seen": 178388992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777432296890672, + "loss": 3.3439, + "theoretical_loss": 4.418439930381609, + "tokens_seen": 178454528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777331995987964, + "loss": 3.3543, + "theoretical_loss": 4.418233342275233, + "tokens_seen": 178520064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 112413, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1684422492980957, + "objective/train/theoretical_loss": 4.418026851221231, + "objective/train/tokens_used": 199045600, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777231695085256, + "loss": 3.2288, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047771313941825477, + "loss": 3.31, + "theoretical_loss": 4.4178204571384185, + "tokens_seen": 178651136 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047770310932798395, + "loss": 3.2361, + "theoretical_loss": 4.41761415994571, + "tokens_seen": 178716672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047769307923771313, + "loss": 2.9928, + "theoretical_loss": 4.417407959562116, + "tokens_seen": 178782208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776830491474423, + "loss": 3.21, + "theoretical_loss": 4.417201855906742, + "tokens_seen": 178847744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047767301905717155, + "loss": 3.3201, + "theoretical_loss": 4.416995848898797, + "tokens_seen": 178913280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776629889669007, + "loss": 3.2827, + "theoretical_loss": 4.4167899384575815, + "tokens_seen": 178978816 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776529588766299, + "loss": 3.3293, + "theoretical_loss": 4.416584124502495, + "tokens_seen": 179044352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047764292878635904, + "loss": 3.3263, + "theoretical_loss": 4.416378406953033, + "tokens_seen": 179109888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776328986960883, + "loss": 3.3501, + "theoretical_loss": 4.41617278572879, + "tokens_seen": 179175424 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047762286860581746, + "loss": 2.9366, + "theoretical_loss": 4.4159672607494524, + "tokens_seen": 179240960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047761283851554664, + "loss": 3.0323, + "theoretical_loss": 4.415761831934808, + "tokens_seen": 179306496 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776028084252758, + "loss": 2.9697, + "theoretical_loss": 4.415556499204737, + "tokens_seen": 179372032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047759277833500505, + "loss": 3.0708, + "theoretical_loss": 4.415351262479216, + "tokens_seen": 179437568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775827482447342, + "loss": 3.3015, + "theoretical_loss": 4.415146121678321, + "tokens_seen": 179503104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775727181544634, + "loss": 3.1837, + "theoretical_loss": 4.414941076722219, + "tokens_seen": 179568640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047756268806419254, + "loss": 3.2274, + "theoretical_loss": 4.4147361275311745, + "tokens_seen": 179634176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775526579739218, + "loss": 3.3152, + "theoretical_loss": 4.414531274025548, + "tokens_seen": 179699712 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047754262788365096, + "loss": 3.1656, + "theoretical_loss": 4.414326516125795, + "tokens_seen": 179765248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047753259779338014, + "loss": 3.0617, + "theoretical_loss": 4.414121853752466, + "tokens_seen": 179830784 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775225677031093, + "loss": 3.0591, + "theoretical_loss": 4.413917286826205, + "tokens_seen": 179896320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775125376128385, + "loss": 3.4019, + "theoretical_loss": 4.413712815267752, + "tokens_seen": 179961856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775025075225677, + "loss": 3.4241, + "theoretical_loss": 4.413508438997944, + "tokens_seen": 180027392 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774924774322969, + "loss": 2.8506, + "theoretical_loss": 4.4133041579377075, + "tokens_seen": 180092928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047748244734202605, + "loss": 3.3765, + "theoretical_loss": 4.413099972008068, + "tokens_seen": 180158464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 112982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9989802837371826, + "objective/train/theoretical_loss": 4.412895881130142, + "objective/train/tokens_used": 200684000, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774724172517553, + "loss": 3.1511, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774623871614845, + "loss": 3.2793, + "theoretical_loss": 4.412691885225141, + "tokens_seen": 180289536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047745235707121364, + "loss": 3.1662, + "theoretical_loss": 4.412487984214373, + "tokens_seen": 180355072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774423269809429, + "loss": 2.9821, + "theoretical_loss": 4.412284178019235, + "tokens_seen": 180420608 + }, + { + "epoch": 0.05, + "learning_rate": 0.000477432296890672, + "loss": 3.2647, + "theoretical_loss": 4.412080466561221, + "tokens_seen": 180486144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047742226680040124, + "loss": 3.041, + "theoretical_loss": 4.411876849761917, + "tokens_seen": 180551680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774122367101304, + "loss": 3.229, + "theoretical_loss": 4.411673327543005, + "tokens_seen": 180617216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774022066198596, + "loss": 2.9898, + "theoretical_loss": 4.4114698998262565, + "tokens_seen": 180682752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773921765295888, + "loss": 3.4276, + "theoretical_loss": 4.411266566533539, + "tokens_seen": 180748288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047738214643931797, + "loss": 3.1725, + "theoretical_loss": 4.41106332758681, + "tokens_seen": 180813824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047737211634904715, + "loss": 3.1015, + "theoretical_loss": 4.41086018290812, + "tokens_seen": 180879360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773620862587764, + "loss": 3.1636, + "theoretical_loss": 4.410657132419617, + "tokens_seen": 180944896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773520561685055, + "loss": 3.2115, + "theoretical_loss": 4.410454176043537, + "tokens_seen": 181010432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047734202607823474, + "loss": 3.1097, + "theoretical_loss": 4.410251313702208, + "tokens_seen": 181075968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047733199598796387, + "loss": 3.322, + "theoretical_loss": 4.410048545318052, + "tokens_seen": 181141504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773219658976931, + "loss": 3.1962, + "theoretical_loss": 4.409845870813582, + "tokens_seen": 181207040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773119358074223, + "loss": 3.3378, + "theoretical_loss": 4.409643290111404, + "tokens_seen": 181272576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047730190571715147, + "loss": 3.0122, + "theoretical_loss": 4.409440803134215, + "tokens_seen": 181338112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047729187562688065, + "loss": 3.3294, + "theoretical_loss": 4.409238409804804, + "tokens_seen": 181403648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004772818455366099, + "loss": 3.3616, + "theoretical_loss": 4.409036110046051, + "tokens_seen": 181469184 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477271815446339, + "loss": 2.9033, + "theoretical_loss": 4.408833903780926, + "tokens_seen": 181534720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047726178535606825, + "loss": 3.1769, + "theoretical_loss": 4.408631790932494, + "tokens_seen": 181600256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772517552657974, + "loss": 3.2627, + "theoretical_loss": 4.408429771423909, + "tokens_seen": 181665792 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772417251755266, + "loss": 3.2006, + "theoretical_loss": 4.408227845178414, + "tokens_seen": 181731328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772316950852558, + "loss": 3.3776, + "theoretical_loss": 4.408026012119344, + "tokens_seen": 181796864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 114216, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.247993230819702, + "objective/train/theoretical_loss": 4.407824272170128, + "objective/train/tokens_used": 202322400, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047722166499498497, + "loss": 3.1762, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047721163490471415, + "loss": 3.2095, + "theoretical_loss": 4.407622625254279, + "tokens_seen": 181927936 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047720160481444333, + "loss": 3.2396, + "theoretical_loss": 4.407421071295406, + "tokens_seen": 181993472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771915747241725, + "loss": 3.2236, + "theoretical_loss": 4.407219610217206, + "tokens_seen": 182059008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047718154463390175, + "loss": 3.2076, + "theoretical_loss": 4.407018241943467, + "tokens_seen": 182124544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771715145436309, + "loss": 3.3943, + "theoretical_loss": 4.406816966398064, + "tokens_seen": 182190080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771614844533601, + "loss": 3.0633, + "theoretical_loss": 4.406615783504965, + "tokens_seen": 182255616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047715145436308924, + "loss": 3.3245, + "theoretical_loss": 4.4064146931882275, + "tokens_seen": 182321152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771414242728185, + "loss": 3.2816, + "theoretical_loss": 4.406213695371996, + "tokens_seen": 182386688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047713139418254766, + "loss": 3.0674, + "theoretical_loss": 4.406012789980506, + "tokens_seen": 182452224 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047712136409227684, + "loss": 3.2889, + "theoretical_loss": 4.405811976938084, + "tokens_seen": 182517760 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477111334002006, + "loss": 2.9432, + "theoretical_loss": 4.405611256169143, + "tokens_seen": 182583296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047710130391173525, + "loss": 3.1513, + "theoretical_loss": 4.405410627598185, + "tokens_seen": 182648832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770912738214644, + "loss": 3.2537, + "theoretical_loss": 4.405210091149802, + "tokens_seen": 182714368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770812437311936, + "loss": 3.2721, + "theoretical_loss": 4.405009646748674, + "tokens_seen": 182779904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047707121364092274, + "loss": 3.2594, + "theoretical_loss": 4.404809294319572, + "tokens_seen": 182845440 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477061183550652, + "loss": 3.0547, + "theoretical_loss": 4.40460903378735, + "tokens_seen": 182910976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047705115346038116, + "loss": 3.238, + "theoretical_loss": 4.404408865076955, + "tokens_seen": 182976512 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047704112337011034, + "loss": 3.0375, + "theoretical_loss": 4.404208788113422, + "tokens_seen": 183042048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770310932798395, + "loss": 2.9986, + "theoretical_loss": 4.404008802821871, + "tokens_seen": 183107584 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770210631895687, + "loss": 3.4418, + "theoretical_loss": 4.4038089091275125, + "tokens_seen": 183173120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770110330992979, + "loss": 3.1524, + "theoretical_loss": 4.403609106955645, + "tokens_seen": 183238656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770010030090271, + "loss": 3.4428, + "theoretical_loss": 4.403409396231651, + "tokens_seen": 183304192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047699097291875625, + "loss": 3.097, + "theoretical_loss": 4.403209776881004, + "tokens_seen": 183369728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769809428284855, + "loss": 3.1748, + "theoretical_loss": 4.403010248829265, + "tokens_seen": 183435264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 114775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7343997955322266, + "objective/train/theoretical_loss": 4.4028108120020795, + "objective/train/tokens_used": 203960800, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047697091273821466, + "loss": 3.364, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047696088264794384, + "loss": 3.5379, + "theoretical_loss": 4.402611466325182, + "tokens_seen": 183566336 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476950852557673, + "loss": 3.1437, + "theoretical_loss": 4.4024122117243945, + "tokens_seen": 183631872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769408224674022, + "loss": 3.0713, + "theoretical_loss": 4.402213048125624, + "tokens_seen": 183697408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769307923771314, + "loss": 3.1808, + "theoretical_loss": 4.4020139754548655, + "tokens_seen": 183762944 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769207622868606, + "loss": 3.5212, + "theoretical_loss": 4.401814993638199, + "tokens_seen": 183828480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047691073219658975, + "loss": 3.1228, + "theoretical_loss": 4.4016161026017935, + "tokens_seen": 183894016 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476900702106319, + "loss": 3.2918, + "theoretical_loss": 4.401417302271902, + "tokens_seen": 183959552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768906720160481, + "loss": 3.3633, + "theoretical_loss": 4.401218592574865, + "tokens_seen": 184025088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047688064192577735, + "loss": 3.3739, + "theoretical_loss": 4.401019973437108, + "tokens_seen": 184090624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047687061183550653, + "loss": 3.3149, + "theoretical_loss": 4.400821444785143, + "tokens_seen": 184156160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768605817452357, + "loss": 3.3195, + "theoretical_loss": 4.400623006545567, + "tokens_seen": 184221696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768505516549649, + "loss": 3.2683, + "theoretical_loss": 4.400424658645065, + "tokens_seen": 184287232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047684052156469407, + "loss": 2.8183, + "theoretical_loss": 4.400226401010404, + "tokens_seen": 184352768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047683049147442325, + "loss": 3.4246, + "theoretical_loss": 4.40002823356844, + "tokens_seen": 184418304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768204613841525, + "loss": 3.3673, + "theoretical_loss": 4.39983015624611, + "tokens_seen": 184483840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768104312938816, + "loss": 3.029, + "theoretical_loss": 4.39963216897044, + "tokens_seen": 184549376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047680040120361085, + "loss": 3.2208, + "theoretical_loss": 4.3994342716685395, + "tokens_seen": 184614912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047679037111334003, + "loss": 3.184, + "theoretical_loss": 4.399236464267602, + "tokens_seen": 184680448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767803410230692, + "loss": 3.2012, + "theoretical_loss": 4.399038746694908, + "tokens_seen": 184745984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767703109327984, + "loss": 3.4012, + "theoretical_loss": 4.398841118877819, + "tokens_seen": 184811520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767602808425276, + "loss": 3.0397, + "theoretical_loss": 4.398643580743785, + "tokens_seen": 184877056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047675025075225676, + "loss": 3.3388, + "theoretical_loss": 4.398446132220338, + "tokens_seen": 184942592 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476740220661986, + "loss": 3.0237, + "theoretical_loss": 4.3982487732350934, + "tokens_seen": 185008128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767301905717151, + "loss": 3.1185, + "theoretical_loss": 4.398051503715753, + "tokens_seen": 185073664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 115939, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.676588296890259, + "objective/train/theoretical_loss": 4.397854323590102, + "objective/train/tokens_used": 205599200, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047672016048144435, + "loss": 3.2948, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047671013039117353, + "loss": 3.2664, + "theoretical_loss": 4.397657232786008, + "tokens_seen": 185204736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767001003009027, + "loss": 2.8907, + "theoretical_loss": 4.397460231231424, + "tokens_seen": 185270272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047669007021063195, + "loss": 3.2102, + "theoretical_loss": 4.397263318854384, + "tokens_seen": 185335808 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766800401203611, + "loss": 3.0437, + "theoretical_loss": 4.39706649558301, + "tokens_seen": 185401344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766700100300903, + "loss": 3.2762, + "theoretical_loss": 4.396869761345503, + "tokens_seen": 185466880 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047665997993981944, + "loss": 3.2785, + "theoretical_loss": 4.396673116070147, + "tokens_seen": 185532416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766499498495487, + "loss": 3.169, + "theoretical_loss": 4.396476559685315, + "tokens_seen": 185597952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047663991975927786, + "loss": 2.8861, + "theoretical_loss": 4.396280092119455, + "tokens_seen": 185663488 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047662988966900704, + "loss": 3.6408, + "theoretical_loss": 4.3960837133011035, + "tokens_seen": 185729024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766198595787362, + "loss": 3.29, + "theoretical_loss": 4.395887423158877, + "tokens_seen": 185794560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047660982948846545, + "loss": 3.2726, + "theoretical_loss": 4.395691221621476, + "tokens_seen": 185860096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765997993981946, + "loss": 3.2641, + "theoretical_loss": 4.395495108617682, + "tokens_seen": 185925632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765897693079238, + "loss": 3.3623, + "theoretical_loss": 4.39529908407636, + "tokens_seen": 185991168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047657973921765294, + "loss": 3.0954, + "theoretical_loss": 4.3951031479264575, + "tokens_seen": 186056704 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765697091273822, + "loss": 3.4236, + "theoretical_loss": 4.394907300097002, + "tokens_seen": 186122240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047655967903711136, + "loss": 3.328, + "theoretical_loss": 4.394711540517106, + "tokens_seen": 186187776 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047654964894684054, + "loss": 3.3462, + "theoretical_loss": 4.39451586911596, + "tokens_seen": 186253312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765396188565697, + "loss": 3.3489, + "theoretical_loss": 4.39432028582284, + "tokens_seen": 186318848 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765295887662989, + "loss": 3.2442, + "theoretical_loss": 4.394124790567101, + "tokens_seen": 186384384 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765195586760281, + "loss": 3.6337, + "theoretical_loss": 4.3939293832781825, + "tokens_seen": 186449920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765095285857573, + "loss": 3.2979, + "theoretical_loss": 4.393734063885599, + "tokens_seen": 186515456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047649949849548645, + "loss": 3.1169, + "theoretical_loss": 4.3935388323189555, + "tokens_seen": 186580992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764894684052157, + "loss": 3.2682, + "theoretical_loss": 4.39334368850793, + "tokens_seen": 186646528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047647943831494486, + "loss": 3.1131, + "theoretical_loss": 4.3931486323822835, + "tokens_seen": 186712064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 116562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.736578941345215, + "objective/train/theoretical_loss": 4.392953663871862, + "objective/train/tokens_used": 207237600, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047646940822467404, + "loss": 3.3584, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764593781344032, + "loss": 3.162, + "theoretical_loss": 4.392758782906586, + "tokens_seen": 186843136 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764493480441324, + "loss": 3.062, + "theoretical_loss": 4.392563989416462, + "tokens_seen": 186908672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764393179538616, + "loss": 3.478, + "theoretical_loss": 4.392369283331574, + "tokens_seen": 186974208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764292878635908, + "loss": 3.2393, + "theoretical_loss": 4.392174664582085, + "tokens_seen": 187039744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047641925777331995, + "loss": 3.1537, + "theoretical_loss": 4.391980133098244, + "tokens_seen": 187105280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764092276830492, + "loss": 3.2428, + "theoretical_loss": 4.391785688810373, + "tokens_seen": 187170816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763991975927783, + "loss": 3.2938, + "theoretical_loss": 4.391591331648879, + "tokens_seen": 187236352 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047638916750250755, + "loss": 3.4311, + "theoretical_loss": 4.391397061544247, + "tokens_seen": 187301888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047637913741223673, + "loss": 3.0581, + "theoretical_loss": 4.391202878427042, + "tokens_seen": 187367424 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763691073219659, + "loss": 3.5038, + "theoretical_loss": 4.3910087822279085, + "tokens_seen": 187432960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763590772316951, + "loss": 3.2592, + "theoretical_loss": 4.390814772877571, + "tokens_seen": 187498496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047634904714142427, + "loss": 3.0196, + "theoretical_loss": 4.390620850306832, + "tokens_seen": 187564032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047633901705115345, + "loss": 3.2439, + "theoretical_loss": 4.390427014446575, + "tokens_seen": 187629568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763289869608827, + "loss": 3.3844, + "theoretical_loss": 4.390233265227764, + "tokens_seen": 187695104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763189568706118, + "loss": 3.1343, + "theoretical_loss": 4.390039602581437, + "tokens_seen": 187760640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047630892678034105, + "loss": 2.9602, + "theoretical_loss": 4.389846026438715, + "tokens_seen": 187826176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047629889669007023, + "loss": 3.2549, + "theoretical_loss": 4.3896525367307975, + "tokens_seen": 187891712 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762888665997994, + "loss": 2.9531, + "theoretical_loss": 4.389459133388962, + "tokens_seen": 187957248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762788365095286, + "loss": 3.1851, + "theoretical_loss": 4.3892658163445635, + "tokens_seen": 188022784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762688064192578, + "loss": 3.2693, + "theoretical_loss": 4.389072585529037, + "tokens_seen": 188088320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047625877632898696, + "loss": 3.1727, + "theoretical_loss": 4.388879440873897, + "tokens_seen": 188153856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762487462387162, + "loss": 3.3616, + "theoretical_loss": 4.388686382310732, + "tokens_seen": 188219392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762387161484453, + "loss": 3.176, + "theoretical_loss": 4.388493409771213, + "tokens_seen": 188284928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047622868605817455, + "loss": 3.135, + "theoretical_loss": 4.388300523187087, + "tokens_seen": 188350464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 117858, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.447967290878296, + "objective/train/theoretical_loss": 4.3881077224901786, + "objective/train/tokens_used": 208876000, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762186559679037, + "loss": 3.4454, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762086258776329, + "loss": 3.1037, + "theoretical_loss": 4.38791500761239, + "tokens_seen": 188481536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761985957873621, + "loss": 3.3634, + "theoretical_loss": 4.387722378485703, + "tokens_seen": 188547072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761885656970913, + "loss": 3.1468, + "theoretical_loss": 4.3875298350421765, + "tokens_seen": 188612608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047617853560682046, + "loss": 3.3193, + "theoretical_loss": 4.387337377213943, + "tokens_seen": 188678144 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047616850551654964, + "loss": 3.2997, + "theoretical_loss": 4.387145004933218, + "tokens_seen": 188743680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761584754262788, + "loss": 3.0325, + "theoretical_loss": 4.38695271813229, + "tokens_seen": 188809216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047614844533600806, + "loss": 3.0921, + "theoretical_loss": 4.386760516743526, + "tokens_seen": 188874752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761384152457372, + "loss": 3.3926, + "theoretical_loss": 4.38656840069937, + "tokens_seen": 188940288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761283851554664, + "loss": 3.1398, + "theoretical_loss": 4.386376369932344, + "tokens_seen": 189005824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761183550651956, + "loss": 3.0158, + "theoretical_loss": 4.386184424375044, + "tokens_seen": 189071360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761083249749248, + "loss": 3.4574, + "theoretical_loss": 4.385992563960145, + "tokens_seen": 189136896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047609829488465396, + "loss": 3.1658, + "theoretical_loss": 4.385800788620397, + "tokens_seen": 189202432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047608826479438314, + "loss": 3.1425, + "theoretical_loss": 4.385609098288628, + "tokens_seen": 189267968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760782347041123, + "loss": 3.239, + "theoretical_loss": 4.385417492897741, + "tokens_seen": 189333504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047606820461384156, + "loss": 3.3172, + "theoretical_loss": 4.385225972380715, + "tokens_seen": 189399040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760581745235707, + "loss": 3.2791, + "theoretical_loss": 4.385034536670606, + "tokens_seen": 189464576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760481444332999, + "loss": 3.33, + "theoretical_loss": 4.384843185700544, + "tokens_seen": 189530112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047603811434302905, + "loss": 3.3761, + "theoretical_loss": 4.384651919403739, + "tokens_seen": 189595648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760280842527583, + "loss": 3.2921, + "theoretical_loss": 4.384460737713471, + "tokens_seen": 189661184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047601805416248746, + "loss": 3.4782, + "theoretical_loss": 4.384269640563101, + "tokens_seen": 189726720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047600802407221665, + "loss": 3.1232, + "theoretical_loss": 4.384078627886062, + "tokens_seen": 189792256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759979939819458, + "loss": 3.2606, + "theoretical_loss": 4.383887699615863, + "tokens_seen": 189857792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047598796389167506, + "loss": 3.0839, + "theoretical_loss": 4.38369685568609, + "tokens_seen": 189923328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759779338014042, + "loss": 2.9678, + "theoretical_loss": 4.383506096030401, + "tokens_seen": 189988864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 118609, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9445102214813232, + "objective/train/theoretical_loss": 4.383315420582533, + "objective/train/tokens_used": 210514400, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759679037111334, + "loss": 3.2935, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759578736208626, + "loss": 3.2186, + "theoretical_loss": 4.383124829276294, + "tokens_seen": 190119936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759478435305918, + "loss": 3.2593, + "theoretical_loss": 4.38293432204557, + "tokens_seen": 190185472 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475937813440321, + "loss": 3.1491, + "theoretical_loss": 4.382743898824321, + "tokens_seen": 190251008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047592778335005015, + "loss": 3.1876, + "theoretical_loss": 4.3825535595465785, + "tokens_seen": 190316544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759177532597794, + "loss": 3.3843, + "theoretical_loss": 4.382363304146453, + "tokens_seen": 190382080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759077231695085, + "loss": 3.2096, + "theoretical_loss": 4.382173132558126, + "tokens_seen": 190447616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047589769307923775, + "loss": 3.0433, + "theoretical_loss": 4.381983044715856, + "tokens_seen": 190513152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047588766298896693, + "loss": 3.2308, + "theoretical_loss": 4.381793040553973, + "tokens_seen": 190578688 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758776328986961, + "loss": 3.478, + "theoretical_loss": 4.381603120006883, + "tokens_seen": 190644224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758676028084253, + "loss": 3.4116, + "theoretical_loss": 4.381413283009065, + "tokens_seen": 190709760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047585757271815447, + "loss": 3.0949, + "theoretical_loss": 4.381223529495073, + "tokens_seen": 190775296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047584754262788365, + "loss": 3.1533, + "theoretical_loss": 4.381033859399532, + "tokens_seen": 190840832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758375125376129, + "loss": 3.2304, + "theoretical_loss": 4.380844272657145, + "tokens_seen": 190906368 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475827482447342, + "loss": 3.3131, + "theoretical_loss": 4.380654769202683, + "tokens_seen": 190971904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047581745235707125, + "loss": 2.9842, + "theoretical_loss": 4.380465348970995, + "tokens_seen": 191037440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047580742226680043, + "loss": 3.1313, + "theoretical_loss": 4.380276011897003, + "tokens_seen": 191102976 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757973921765296, + "loss": 3.2165, + "theoretical_loss": 4.380086757915698, + "tokens_seen": 191168512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757873620862588, + "loss": 3.2402, + "theoretical_loss": 4.379897586962148, + "tokens_seen": 191234048 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475777331995988, + "loss": 3.1014, + "theoretical_loss": 4.379708498971494, + "tokens_seen": 191299584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047576730190571716, + "loss": 3.2106, + "theoretical_loss": 4.379519493878948, + "tokens_seen": 191365120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757572718154464, + "loss": 3.2548, + "theoretical_loss": 4.379330571619795, + "tokens_seen": 191430656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757472417251755, + "loss": 3.3656, + "theoretical_loss": 4.379141732129394, + "tokens_seen": 191496192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047573721163490475, + "loss": 3.4178, + "theoretical_loss": 4.378952975343175, + "tokens_seen": 191561728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757271815446339, + "loss": 3.1327, + "theoretical_loss": 4.378764301196642, + "tokens_seen": 191627264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 119339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7788050174713135, + "objective/train/theoretical_loss": 4.37857570962537, + "objective/train/tokens_used": 212152800, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757171514543631, + "loss": 3.0324, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757071213640923, + "loss": 3.1974, + "theoretical_loss": 4.378387200565006, + "tokens_seen": 191758336 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756970912738215, + "loss": 3.4038, + "theoretical_loss": 4.378198773951272, + "tokens_seen": 191823872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047568706118355066, + "loss": 3.0394, + "theoretical_loss": 4.378010429719957, + "tokens_seen": 191889408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047567703109327984, + "loss": 2.9882, + "theoretical_loss": 4.377822167806928, + "tokens_seen": 191954944 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475667001003009, + "loss": 3.0199, + "theoretical_loss": 4.377633988148117, + "tokens_seen": 192020480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047565697091273826, + "loss": 3.2158, + "theoretical_loss": 4.377445890679534, + "tokens_seen": 192086016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756469408224674, + "loss": 3.0202, + "theoretical_loss": 4.377257875337257, + "tokens_seen": 192151552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756369107321966, + "loss": 3.1161, + "theoretical_loss": 4.377069942057436, + "tokens_seen": 192217088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756268806419258, + "loss": 3.2261, + "theoretical_loss": 4.376882090776293, + "tokens_seen": 192282624 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475616850551655, + "loss": 3.3294, + "theoretical_loss": 4.376694321430121, + "tokens_seen": 192348160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047560682046138416, + "loss": 3.2889, + "theoretical_loss": 4.376506633955286, + "tokens_seen": 192413696 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047559679037111334, + "loss": 3.3172, + "theoretical_loss": 4.376319028288219, + "tokens_seen": 192479232 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755867602808425, + "loss": 3.426, + "theoretical_loss": 4.37613150436543, + "tokens_seen": 192544768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047557673019057176, + "loss": 3.0088, + "theoretical_loss": 4.375944062123496, + "tokens_seen": 192610304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755667001003009, + "loss": 3.4675, + "theoretical_loss": 4.375756701499063, + "tokens_seen": 192675840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755566700100301, + "loss": 3.2118, + "theoretical_loss": 4.3755694224288515, + "tokens_seen": 192741376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047554663991975925, + "loss": 3.1241, + "theoretical_loss": 4.375382224849648, + "tokens_seen": 192806912 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755366098294885, + "loss": 3.2397, + "theoretical_loss": 4.375195108698316, + "tokens_seen": 192872448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047552657973921766, + "loss": 3.6427, + "theoretical_loss": 4.375008073911781, + "tokens_seen": 192937984 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047551654964894685, + "loss": 3.1663, + "theoretical_loss": 4.374821120427047, + "tokens_seen": 193003520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047550651955867603, + "loss": 3.0961, + "theoretical_loss": 4.374634248181182, + "tokens_seen": 193069056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047549648946840526, + "loss": 3.5514, + "theoretical_loss": 4.3744474571113265, + "tokens_seen": 193134592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754864593781344, + "loss": 3.0639, + "theoretical_loss": 4.374260747154692, + "tokens_seen": 193200128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754764292878636, + "loss": 3.178, + "theoretical_loss": 4.374074118248559, + "tokens_seen": 193265664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 120055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8682849407196045, + "objective/train/theoretical_loss": 4.373887570330275, + "objective/train/tokens_used": 213791200, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047546639919759275, + "loss": 3.3739, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475456369107322, + "loss": 3.3962, + "theoretical_loss": 4.373701103337263, + "tokens_seen": 193396736 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047544633901705117, + "loss": 3.1677, + "theoretical_loss": 4.373514717207009, + "tokens_seen": 193462272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047543630892678035, + "loss": 3.2994, + "theoretical_loss": 4.373328411877073, + "tokens_seen": 193527808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047542627883650953, + "loss": 3.3462, + "theoretical_loss": 4.373142187285083, + "tokens_seen": 193593344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754162487462387, + "loss": 3.3214, + "theoretical_loss": 4.372956043368736, + "tokens_seen": 193658880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754062186559679, + "loss": 3.333, + "theoretical_loss": 4.372769980065797, + "tokens_seen": 193724416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047539618856569713, + "loss": 3.3139, + "theoretical_loss": 4.372583997314104, + "tokens_seen": 193789952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047538615847542625, + "loss": 3.2031, + "theoretical_loss": 4.372398095051559, + "tokens_seen": 193855488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753761283851555, + "loss": 3.3055, + "theoretical_loss": 4.372212273216136, + "tokens_seen": 193921024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753660982948846, + "loss": 3.3132, + "theoretical_loss": 4.372026531745877, + "tokens_seen": 193986560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047535606820461385, + "loss": 3.2799, + "theoretical_loss": 4.371840870578891, + "tokens_seen": 194052096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047534603811434303, + "loss": 3.0888, + "theoretical_loss": 4.37165528965336, + "tokens_seen": 194117632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753360080240722, + "loss": 3.441, + "theoretical_loss": 4.371469788907529, + "tokens_seen": 194183168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753259779338014, + "loss": 3.1359, + "theoretical_loss": 4.371284368279714, + "tokens_seen": 194248704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047531594784353063, + "loss": 3.282, + "theoretical_loss": 4.3710990277083, + "tokens_seen": 194314240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047530591775325976, + "loss": 3.1125, + "theoretical_loss": 4.3709137671317375, + "tokens_seen": 194379776 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475295887662989, + "loss": 3.1613, + "theoretical_loss": 4.37072858648855, + "tokens_seen": 194445312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752858575727181, + "loss": 3.267, + "theoretical_loss": 4.370543485717322, + "tokens_seen": 194510848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047527582748244736, + "loss": 3.2022, + "theoretical_loss": 4.370358464756713, + "tokens_seen": 194576384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047526579739217654, + "loss": 3.2746, + "theoretical_loss": 4.370173523545443, + "tokens_seen": 194641920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752557673019057, + "loss": 3.3689, + "theoretical_loss": 4.3699886620223065, + "tokens_seen": 194707456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752457372116349, + "loss": 3.4246, + "theoretical_loss": 4.369803880126162, + "tokens_seen": 194772992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752357071213641, + "loss": 3.0546, + "theoretical_loss": 4.3696191777959354, + "tokens_seen": 194838528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047522567703109326, + "loss": 3.254, + "theoretical_loss": 4.369434554970621, + "tokens_seen": 194904064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 121265, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3207590579986572, + "objective/train/theoretical_loss": 4.369250011589279, + "objective/train/tokens_used": 215429600, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752156469408225, + "loss": 3.1958, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752056168505517, + "loss": 3.1648, + "theoretical_loss": 4.369065547591038, + "tokens_seen": 195035136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047519558676028086, + "loss": 3.2667, + "theoretical_loss": 4.368881162915095, + "tokens_seen": 195100672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047518555667001004, + "loss": 3.3682, + "theoretical_loss": 4.36869685750071, + "tokens_seen": 195166208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751755265797392, + "loss": 3.4655, + "theoretical_loss": 4.3685126312872145, + "tokens_seen": 195231744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047516549648946846, + "loss": 3.0632, + "theoretical_loss": 4.368328484214002, + "tokens_seen": 195297280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751554663991976, + "loss": 3.2254, + "theoretical_loss": 4.368144416220538, + "tokens_seen": 195362816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751454363089268, + "loss": 3.26, + "theoretical_loss": 4.3679604272463495, + "tokens_seen": 195428352 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475135406218656, + "loss": 3.19, + "theoretical_loss": 4.367776517231033, + "tokens_seen": 195493888 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751253761283852, + "loss": 3.282, + "theoretical_loss": 4.367592686114252, + "tokens_seen": 195559424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047511534603811436, + "loss": 3.1387, + "theoretical_loss": 4.367408933835733, + "tokens_seen": 195624960 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047510531594784354, + "loss": 3.3084, + "theoretical_loss": 4.367225260335272, + "tokens_seen": 195690496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750952858575727, + "loss": 3.409, + "theoretical_loss": 4.36704166555273, + "tokens_seen": 195756032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047508525576730196, + "loss": 3.3344, + "theoretical_loss": 4.366858149428032, + "tokens_seen": 195821568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750752256770311, + "loss": 3.0945, + "theoretical_loss": 4.366674711901173, + "tokens_seen": 195887104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750651955867603, + "loss": 3.1295, + "theoretical_loss": 4.366491352912211, + "tokens_seen": 195952640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047505516549648945, + "loss": 3.1857, + "theoretical_loss": 4.366308072401271, + "tokens_seen": 196018176 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750451354062187, + "loss": 3.2574, + "theoretical_loss": 4.366124870308541, + "tokens_seen": 196083712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047503510531594787, + "loss": 3.1826, + "theoretical_loss": 4.365941746574278, + "tokens_seen": 196149248 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047502507522567705, + "loss": 3.1245, + "theoretical_loss": 4.3657587011388035, + "tokens_seen": 196214784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047501504513540623, + "loss": 3.1448, + "theoretical_loss": 4.365575733942503, + "tokens_seen": 196280320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047500501504513546, + "loss": 3.1648, + "theoretical_loss": 4.365392844925829, + "tokens_seen": 196345856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749949849548646, + "loss": 3.2024, + "theoretical_loss": 4.365210034029298, + "tokens_seen": 196411392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749849548645938, + "loss": 3.0918, + "theoretical_loss": 4.365027301193491, + "tokens_seen": 196476928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047497492477432295, + "loss": 2.8404, + "theoretical_loss": 4.364844646359056, + "tokens_seen": 196542464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 121711, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7159526348114014, + "objective/train/theoretical_loss": 4.364662069466704, + "objective/train/tokens_used": 217068000, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749648946840522, + "loss": 3.3992, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047495486459378137, + "loss": 3.3754, + "theoretical_loss": 4.364479570457213, + "tokens_seen": 196673536 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047494483450351055, + "loss": 3.1078, + "theoretical_loss": 4.364297149271423, + "tokens_seen": 196739072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047493480441323973, + "loss": 3.2097, + "theoretical_loss": 4.3641148058502415, + "tokens_seen": 196804608 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749247743229689, + "loss": 3.0253, + "theoretical_loss": 4.363932540134638, + "tokens_seen": 196870144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749147442326981, + "loss": 3.4151, + "theoretical_loss": 4.363750352065647, + "tokens_seen": 196935680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047490471414242733, + "loss": 3.1998, + "theoretical_loss": 4.363568241584368, + "tokens_seen": 197001216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047489468405215646, + "loss": 3.2949, + "theoretical_loss": 4.363386208631966, + "tokens_seen": 197066752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748846539618857, + "loss": 3.153, + "theoretical_loss": 4.363204253149667, + "tokens_seen": 197132288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748746238716148, + "loss": 3.3637, + "theoretical_loss": 4.3630223750787644, + "tokens_seen": 197197824 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047486459378134405, + "loss": 3.0955, + "theoretical_loss": 4.362840574360612, + "tokens_seen": 197263360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047485456369107323, + "loss": 3.2066, + "theoretical_loss": 4.362658850936631, + "tokens_seen": 197328896 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748445336008024, + "loss": 3.065, + "theoretical_loss": 4.362477204748305, + "tokens_seen": 197394432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748345035105316, + "loss": 3.1457, + "theoretical_loss": 4.362295635737179, + "tokens_seen": 197459968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047482447342026083, + "loss": 3.2047, + "theoretical_loss": 4.362114143844867, + "tokens_seen": 197525504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047481444332998996, + "loss": 3.2647, + "theoretical_loss": 4.3619327290130405, + "tokens_seen": 197591040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748044132397192, + "loss": 3.3005, + "theoretical_loss": 4.3617513911834385, + "tokens_seen": 197656576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747943831494483, + "loss": 3.1384, + "theoretical_loss": 4.361570130297863, + "tokens_seen": 197722112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047478435305917756, + "loss": 3.3974, + "theoretical_loss": 4.3613889462981765, + "tokens_seen": 197787648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047477432296890674, + "loss": 3.1913, + "theoretical_loss": 4.361207839126308, + "tokens_seen": 197853184 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747642928786359, + "loss": 3.3772, + "theoretical_loss": 4.361026808724247, + "tokens_seen": 197918720 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747542627883651, + "loss": 3.3336, + "theoretical_loss": 4.360845855034049, + "tokens_seen": 197984256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747442326980943, + "loss": 3.1099, + "theoretical_loss": 4.360664977997828, + "tokens_seen": 198049792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047473420260782346, + "loss": 3.1798, + "theoretical_loss": 4.360484177557766, + "tokens_seen": 198115328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747241725175527, + "loss": 3.0723, + "theoretical_loss": 4.360303453656103, + "tokens_seen": 198180864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 122248, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.521057605743408, + "objective/train/theoretical_loss": 4.360122806235145, + "objective/train/tokens_used": 218706400, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747141424272818, + "loss": 3.3945, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047470411233701106, + "loss": 3.0591, + "theoretical_loss": 4.359942235237257, + "tokens_seen": 198311936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746940822467402, + "loss": 2.9216, + "theoretical_loss": 4.359761740604871, + "tokens_seen": 198377472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746840521564694, + "loss": 3.3207, + "theoretical_loss": 4.359581322280479, + "tokens_seen": 198443008 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746740220661986, + "loss": 3.3274, + "theoretical_loss": 4.359400980206634, + "tokens_seen": 198508544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746639919759278, + "loss": 3.3515, + "theoretical_loss": 4.359220714325954, + "tokens_seen": 198574080 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047465396188565696, + "loss": 3.272, + "theoretical_loss": 4.359040524581116, + "tokens_seen": 198639616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746439317953862, + "loss": 3.213, + "theoretical_loss": 4.358860410914861, + "tokens_seen": 198705152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746339017051153, + "loss": 3.1872, + "theoretical_loss": 4.358680373269993, + "tokens_seen": 198770688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047462387161484456, + "loss": 2.9749, + "theoretical_loss": 4.358500411589375, + "tokens_seen": 198836224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746138415245737, + "loss": 3.2143, + "theoretical_loss": 4.358320525815934, + "tokens_seen": 198901760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746038114343029, + "loss": 3.2556, + "theoretical_loss": 4.358140715892658, + "tokens_seen": 198967296 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745937813440321, + "loss": 3.2029, + "theoretical_loss": 4.357960981762595, + "tokens_seen": 199032832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745837512537613, + "loss": 3.1239, + "theoretical_loss": 4.357781323368857, + "tokens_seen": 199098368 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047457372116349047, + "loss": 2.9612, + "theoretical_loss": 4.357601740654617, + "tokens_seen": 199163904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047456369107321965, + "loss": 3.0499, + "theoretical_loss": 4.357422233563106, + "tokens_seen": 199229440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047455366098294883, + "loss": 3.0168, + "theoretical_loss": 4.357242802037623, + "tokens_seen": 199294976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047454363089267807, + "loss": 3.276, + "theoretical_loss": 4.35706344602152, + "tokens_seen": 199360512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745336008024072, + "loss": 3.2373, + "theoretical_loss": 4.356884165458217, + "tokens_seen": 199426048 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047452357071213643, + "loss": 3.1589, + "theoretical_loss": 4.356704960291191, + "tokens_seen": 199491584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047451354062186555, + "loss": 3.0203, + "theoretical_loss": 4.35652583046398, + "tokens_seen": 199557120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745035105315948, + "loss": 3.136, + "theoretical_loss": 4.356346775920185, + "tokens_seen": 199622656 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047449348044132397, + "loss": 3.2817, + "theoretical_loss": 4.356167796603467, + "tokens_seen": 199688192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047448345035105315, + "loss": 3.282, + "theoretical_loss": 4.355988892457546, + "tokens_seen": 199753728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744734202607824, + "loss": 3.1863, + "theoretical_loss": 4.355810063426204, + "tokens_seen": 199819264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 123553, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0212910175323486, + "objective/train/theoretical_loss": 4.355631309453283, + "objective/train/tokens_used": 220344800, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047446339017051157, + "loss": 3.171, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047445336008024075, + "loss": 3.0987, + "theoretical_loss": 4.355452630482685, + "tokens_seen": 199950336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047444332998996993, + "loss": 3.1614, + "theoretical_loss": 4.355274026458375, + "tokens_seen": 200015872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744332998996991, + "loss": 3.3795, + "theoretical_loss": 4.355095497324373, + "tokens_seen": 200081408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744232698094283, + "loss": 3.0216, + "theoretical_loss": 4.354917043024765, + "tokens_seen": 200146944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047441323971915753, + "loss": 3.3482, + "theoretical_loss": 4.354738663503692, + "tokens_seen": 200212480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047440320962888666, + "loss": 3.1463, + "theoretical_loss": 4.354560358705358, + "tokens_seen": 200278016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743931795386159, + "loss": 2.9351, + "theoretical_loss": 4.354382128574027, + "tokens_seen": 200343552 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474383149448345, + "loss": 3.3023, + "theoretical_loss": 4.35420397305402, + "tokens_seen": 200409088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047437311935807425, + "loss": 3.2039, + "theoretical_loss": 4.35402589208972, + "tokens_seen": 200474624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047436308926780343, + "loss": 3.1976, + "theoretical_loss": 4.353847885625571, + "tokens_seen": 200540160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743530591775326, + "loss": 3.0562, + "theoretical_loss": 4.353669953606072, + "tokens_seen": 200605696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743430290872618, + "loss": 3.4207, + "theoretical_loss": 4.353492095975787, + "tokens_seen": 200671232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047433299899699103, + "loss": 3.0709, + "theoretical_loss": 4.353314312679333, + "tokens_seen": 200736768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047432296890672016, + "loss": 3.1709, + "theoretical_loss": 4.353136603661392, + "tokens_seen": 200802304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743129388164494, + "loss": 3.2795, + "theoretical_loss": 4.352958968866704, + "tokens_seen": 200867840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743029087261785, + "loss": 3.3119, + "theoretical_loss": 4.352781408240065, + "tokens_seen": 200933376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047429287863590776, + "loss": 3.0623, + "theoretical_loss": 4.352603921726334, + "tokens_seen": 200998912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047428284854563694, + "loss": 3.4305, + "theoretical_loss": 4.352426509270425, + "tokens_seen": 201064448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742728184553661, + "loss": 3.1922, + "theoretical_loss": 4.352249170817315, + "tokens_seen": 201129984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742627883650953, + "loss": 3.3135, + "theoretical_loss": 4.352071906312037, + "tokens_seen": 201195520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742527582748245, + "loss": 3.1888, + "theoretical_loss": 4.351894715699684, + "tokens_seen": 201261056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047424272818455366, + "loss": 3.4327, + "theoretical_loss": 4.351717598925406, + "tokens_seen": 201326592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742326980942829, + "loss": 3.3805, + "theoretical_loss": 4.351540555934414, + "tokens_seen": 201392128 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474222668004012, + "loss": 3.5804, + "theoretical_loss": 4.351363586671976, + "tokens_seen": 201457664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 124253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.035176992416382, + "objective/train/theoretical_loss": 4.351186691083417, + "objective/train/tokens_used": 221983200, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047421263791374126, + "loss": 3.126, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742026078234704, + "loss": 3.1474, + "theoretical_loss": 4.351009869114124, + "tokens_seen": 201588736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741925777331996, + "loss": 3.2053, + "theoretical_loss": 4.350833120709539, + "tokens_seen": 201654272 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741825476429288, + "loss": 3.2585, + "theoretical_loss": 4.350656445815164, + "tokens_seen": 201719808 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474172517552658, + "loss": 3.1713, + "theoretical_loss": 4.350479844376557, + "tokens_seen": 201785344 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047416248746238716, + "loss": 3.3766, + "theoretical_loss": 4.350303316339337, + "tokens_seen": 201850880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741524573721164, + "loss": 3.2092, + "theoretical_loss": 4.350126861649178, + "tokens_seen": 201916416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741424272818455, + "loss": 3.1106, + "theoretical_loss": 4.349950480251813, + "tokens_seen": 201981952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047413239719157476, + "loss": 3.0031, + "theoretical_loss": 4.349774172093033, + "tokens_seen": 202047488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741223671013039, + "loss": 3.4637, + "theoretical_loss": 4.349597937118687, + "tokens_seen": 202113024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741123370110331, + "loss": 3.169, + "theoretical_loss": 4.3494217752746795, + "tokens_seen": 202178560 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741023069207623, + "loss": 3.2752, + "theoretical_loss": 4.349245686506976, + "tokens_seen": 202244096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740922768304915, + "loss": 3.3951, + "theoretical_loss": 4.349069670761597, + "tokens_seen": 202309632 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047408224674022067, + "loss": 3.12, + "theoretical_loss": 4.348893727984619, + "tokens_seen": 202375168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047407221664994985, + "loss": 3.0903, + "theoretical_loss": 4.348717858122178, + "tokens_seen": 202440704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047406218655967903, + "loss": 3.1313, + "theoretical_loss": 4.348542061120469, + "tokens_seen": 202506240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047405215646940827, + "loss": 3.2476, + "theoretical_loss": 4.348366336925739, + "tokens_seen": 202571776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740421263791374, + "loss": 3.0185, + "theoretical_loss": 4.3481906854842975, + "tokens_seen": 202637312 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047403209628886663, + "loss": 3.1723, + "theoretical_loss": 4.348015106742507, + "tokens_seen": 202702848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047402206619859575, + "loss": 3.1502, + "theoretical_loss": 4.347839600646786, + "tokens_seen": 202768384 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474012036108325, + "loss": 3.3803, + "theoretical_loss": 4.347664167143615, + "tokens_seen": 202833920 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047400200601805417, + "loss": 3.1256, + "theoretical_loss": 4.347488806179528, + "tokens_seen": 202899456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047399197592778335, + "loss": 3.2037, + "theoretical_loss": 4.347313517701114, + "tokens_seen": 202964992 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047398194583751253, + "loss": 3.1465, + "theoretical_loss": 4.347138301655021, + "tokens_seen": 203030528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047397191574724177, + "loss": 3.4567, + "theoretical_loss": 4.346963157987954, + "tokens_seen": 203096064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 125838, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.456173896789551, + "objective/train/theoretical_loss": 4.346788086646671, + "objective/train/tokens_used": 223621600, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739618856569709, + "loss": 3.2604, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047395185556670013, + "loss": 3.2752, + "theoretical_loss": 4.346613087577991, + "tokens_seen": 203227136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047394182547642926, + "loss": 3.2155, + "theoretical_loss": 4.346438160728785, + "tokens_seen": 203292672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739317953861585, + "loss": 3.2269, + "theoretical_loss": 4.346263306045983, + "tokens_seen": 203358208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739217652958877, + "loss": 3.1447, + "theoretical_loss": 4.346088523476569, + "tokens_seen": 203423744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047391173520561686, + "loss": 3.1881, + "theoretical_loss": 4.345913812967584, + "tokens_seen": 203489280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047390170511534604, + "loss": 3.148, + "theoretical_loss": 4.345739174466127, + "tokens_seen": 203554816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738916750250752, + "loss": 3.1253, + "theoretical_loss": 4.345564607919348, + "tokens_seen": 203620352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738816449348044, + "loss": 3.2427, + "theoretical_loss": 4.3453901132744575, + "tokens_seen": 203685888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047387161484453363, + "loss": 3.1132, + "theoretical_loss": 4.345215690478719, + "tokens_seen": 203751424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047386158475426276, + "loss": 3.0614, + "theoretical_loss": 4.345041339479453, + "tokens_seen": 203816960 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473851554663992, + "loss": 3.2345, + "theoretical_loss": 4.3448670602240345, + "tokens_seen": 203882496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738415245737211, + "loss": 2.9035, + "theoretical_loss": 4.344692852659895, + "tokens_seen": 203948032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047383149448345036, + "loss": 3.2493, + "theoretical_loss": 4.34451871673452, + "tokens_seen": 204013568 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047382146439317954, + "loss": 3.2788, + "theoretical_loss": 4.344344652395451, + "tokens_seen": 204079104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738114343029087, + "loss": 3.0973, + "theoretical_loss": 4.3441706595902865, + "tokens_seen": 204144640 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738014042126379, + "loss": 3.0732, + "theoretical_loss": 4.343996738266677, + "tokens_seen": 204210176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047379137412236714, + "loss": 3.2518, + "theoretical_loss": 4.343822888372331, + "tokens_seen": 204275712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047378134403209626, + "loss": 3.064, + "theoretical_loss": 4.343649109855009, + "tokens_seen": 204341248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737713139418255, + "loss": 3.3627, + "theoretical_loss": 4.343475402662529, + "tokens_seen": 204406784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737612838515546, + "loss": 3.0398, + "theoretical_loss": 4.343301766742763, + "tokens_seen": 204472320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047375125376128386, + "loss": 3.0176, + "theoretical_loss": 4.343128202043638, + "tokens_seen": 204537856 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047374122367101304, + "loss": 3.257, + "theoretical_loss": 4.342954708513136, + "tokens_seen": 204603392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737311935807422, + "loss": 2.8876, + "theoretical_loss": 4.342781286099291, + "tokens_seen": 204668928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047372116349047146, + "loss": 3.3289, + "theoretical_loss": 4.3426079347501965, + "tokens_seen": 204734464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 126453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.122795820236206, + "objective/train/theoretical_loss": 4.342434654413995, + "objective/train/tokens_used": 225260000, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737111334002006, + "loss": 3.1005, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737011033099298, + "loss": 3.2419, + "theoretical_loss": 4.342261445038888, + "tokens_seen": 204865536 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473691073219659, + "loss": 3.2928, + "theoretical_loss": 4.342088306573128, + "tokens_seen": 204931072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736810431293882, + "loss": 2.9942, + "theoretical_loss": 4.341915238965026, + "tokens_seen": 204996608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047367101303911736, + "loss": 3.3056, + "theoretical_loss": 4.34174224216294, + "tokens_seen": 205062144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736609829488466, + "loss": 3.3729, + "theoretical_loss": 4.34156931611529, + "tokens_seen": 205127680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736509528585757, + "loss": 3.1314, + "theoretical_loss": 4.341396460770547, + "tokens_seen": 205193216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047364092276830496, + "loss": 3.0286, + "theoretical_loss": 4.341223676077232, + "tokens_seen": 205258752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736308926780341, + "loss": 3.3591, + "theoretical_loss": 4.341050961983926, + "tokens_seen": 205324288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736208625877633, + "loss": 3.2768, + "theoretical_loss": 4.340878318439261, + "tokens_seen": 205389824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736108324974925, + "loss": 3.1598, + "theoretical_loss": 4.340705745391922, + "tokens_seen": 205455360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736008024072217, + "loss": 3.0908, + "theoretical_loss": 4.3405332427906504, + "tokens_seen": 205520896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047359077231695087, + "loss": 3.1934, + "theoretical_loss": 4.340360810584238, + "tokens_seen": 205586432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047358074222668005, + "loss": 3.1539, + "theoretical_loss": 4.340188448721532, + "tokens_seen": 205651968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047357071213640923, + "loss": 3.2624, + "theoretical_loss": 4.3400161571514335, + "tokens_seen": 205717504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047356068204613847, + "loss": 3.2158, + "theoretical_loss": 4.339843935822895, + "tokens_seen": 205783040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735506519558676, + "loss": 3.1277, + "theoretical_loss": 4.339671784684923, + "tokens_seen": 205848576 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047354062186559683, + "loss": 3.0358, + "theoretical_loss": 4.339499703686579, + "tokens_seen": 205914112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047353059177532595, + "loss": 3.2111, + "theoretical_loss": 4.339327692776977, + "tokens_seen": 205979648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735205616850552, + "loss": 3.1453, + "theoretical_loss": 4.339155751905282, + "tokens_seen": 206045184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047351053159478437, + "loss": 3.2403, + "theoretical_loss": 4.338983881020713, + "tokens_seen": 206110720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047350050150451355, + "loss": 3.3298, + "theoretical_loss": 4.338812080072545, + "tokens_seen": 206176256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047349047141424273, + "loss": 3.3233, + "theoretical_loss": 4.338640349010101, + "tokens_seen": 206241792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047348044132397197, + "loss": 3.2418, + "theoretical_loss": 4.3384686877827585, + "tokens_seen": 206307328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734704112337011, + "loss": 3.1538, + "theoretical_loss": 4.338297096339951, + "tokens_seen": 206372864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 127848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0600967407226562, + "objective/train/theoretical_loss": 4.33812557463116, + "objective/train/tokens_used": 226898400, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047346038114343033, + "loss": 3.0882, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047345035105315946, + "loss": 3.2587, + "theoretical_loss": 4.3379541226059235, + "tokens_seen": 206503936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734403209628887, + "loss": 3.1858, + "theoretical_loss": 4.337782740213827, + "tokens_seen": 206569472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734302908726179, + "loss": 3.1159, + "theoretical_loss": 4.337611427404514, + "tokens_seen": 206635008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047342026078234706, + "loss": 3.3164, + "theoretical_loss": 4.337440184127679, + "tokens_seen": 206700544 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047341023069207624, + "loss": 3.0572, + "theoretical_loss": 4.337269010333065, + "tokens_seen": 206766080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734002006018054, + "loss": 2.8251, + "theoretical_loss": 4.337097905970471, + "tokens_seen": 206831616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733901705115346, + "loss": 3.1861, + "theoretical_loss": 4.336926870989748, + "tokens_seen": 206897152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047338014042126383, + "loss": 3.3199, + "theoretical_loss": 4.336755905340797, + "tokens_seen": 206962688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047337011033099296, + "loss": 3.151, + "theoretical_loss": 4.336585008973573, + "tokens_seen": 207028224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733600802407222, + "loss": 3.0852, + "theoretical_loss": 4.336414181838082, + "tokens_seen": 207093760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733500501504513, + "loss": 3.1909, + "theoretical_loss": 4.336243423884382, + "tokens_seen": 207159296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047334002006018056, + "loss": 3.2911, + "theoretical_loss": 4.336072735062583, + "tokens_seen": 207224832 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047332998996990974, + "loss": 3.2248, + "theoretical_loss": 4.335902115322847, + "tokens_seen": 207290368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733199598796389, + "loss": 2.9551, + "theoretical_loss": 4.335731564615387, + "tokens_seen": 207355904 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733099297893681, + "loss": 3.3696, + "theoretical_loss": 4.335561082890468, + "tokens_seen": 207421440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047329989969909734, + "loss": 3.4282, + "theoretical_loss": 4.335390670098407, + "tokens_seen": 207486976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047328986960882646, + "loss": 3.2705, + "theoretical_loss": 4.335220326189571, + "tokens_seen": 207552512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732798395185557, + "loss": 3.205, + "theoretical_loss": 4.335050051114379, + "tokens_seen": 207618048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732698094282848, + "loss": 2.9493, + "theoretical_loss": 4.334879844823304, + "tokens_seen": 207683584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047325977933801406, + "loss": 3.0003, + "theoretical_loss": 4.334709707266865, + "tokens_seen": 207749120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047324974924774324, + "loss": 3.1429, + "theoretical_loss": 4.334539638395636, + "tokens_seen": 207814656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732397191574724, + "loss": 3.2752, + "theoretical_loss": 4.334369638160242, + "tokens_seen": 207880192 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732296890672016, + "loss": 3.1693, + "theoretical_loss": 4.334199706511358, + "tokens_seen": 207945728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732196589769308, + "loss": 3.402, + "theoretical_loss": 4.334029843399709, + "tokens_seen": 208011264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 128338, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.665210247039795, + "objective/train/theoretical_loss": 4.333860048776074, + "objective/train/tokens_used": 228536800, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047320962888665997, + "loss": 3.2236, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731995987963892, + "loss": 3.167, + "theoretical_loss": 4.33369032259128, + "tokens_seen": 208142336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047318956870611833, + "loss": 3.1083, + "theoretical_loss": 4.333520664796206, + "tokens_seen": 208207872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047317953861584757, + "loss": 3.1765, + "theoretical_loss": 4.33335107534178, + "tokens_seen": 208273408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047316950852557675, + "loss": 3.1977, + "theoretical_loss": 4.333181554178985, + "tokens_seen": 208338944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047315947843530593, + "loss": 2.9084, + "theoretical_loss": 4.3330121012588485, + "tokens_seen": 208404480 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731494483450351, + "loss": 3.4184, + "theoretical_loss": 4.332842716532454, + "tokens_seen": 208470016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731394182547643, + "loss": 3.2963, + "theoretical_loss": 4.332673399950932, + "tokens_seen": 208535552 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047312938816449347, + "loss": 3.052, + "theoretical_loss": 4.332504151465464, + "tokens_seen": 208601088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731193580742227, + "loss": 3.0663, + "theoretical_loss": 4.332334971027284, + "tokens_seen": 208666624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047310932798395183, + "loss": 3.0481, + "theoretical_loss": 4.332165858587672, + "tokens_seen": 208732160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047309929789368107, + "loss": 3.3365, + "theoretical_loss": 4.331996814097963, + "tokens_seen": 208797696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730892678034102, + "loss": 3.1267, + "theoretical_loss": 4.331827837509538, + "tokens_seen": 208863232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047307923771313943, + "loss": 3.3878, + "theoretical_loss": 4.331658928773831, + "tokens_seen": 208928768 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730692076228686, + "loss": 3.0899, + "theoretical_loss": 4.331490087842324, + "tokens_seen": 208994304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730591775325978, + "loss": 3.2895, + "theoretical_loss": 4.33132131466655, + "tokens_seen": 209059840 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473049147442327, + "loss": 3.2144, + "theoretical_loss": 4.3311526091980905, + "tokens_seen": 209125376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047303911735205615, + "loss": 2.7723, + "theoretical_loss": 4.330983971388578, + "tokens_seen": 209190912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047302908726178534, + "loss": 2.9338, + "theoretical_loss": 4.330815401189695, + "tokens_seen": 209256448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047301905717151457, + "loss": 3.1443, + "theoretical_loss": 4.330646898553173, + "tokens_seen": 209321984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730090270812437, + "loss": 3.2174, + "theoretical_loss": 4.330478463430792, + "tokens_seen": 209387520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047299899699097293, + "loss": 3.2128, + "theoretical_loss": 4.330310095774383, + "tokens_seen": 209453056 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729889669007021, + "loss": 3.1469, + "theoretical_loss": 4.330141795535828, + "tokens_seen": 209518592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729789368104313, + "loss": 3.2627, + "theoretical_loss": 4.329973562667053, + "tokens_seen": 209584128 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047296890672016053, + "loss": 3.3848, + "theoretical_loss": 4.3298053971200385, + "tokens_seen": 209649664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 129483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.797501802444458, + "objective/train/theoretical_loss": 4.329637298846812, + "objective/train/tokens_used": 230175200, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047295887662988966, + "loss": 3.3015, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729488465396189, + "loss": 3.2439, + "theoretical_loss": 4.329469267799451, + "tokens_seen": 209780736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729388164493481, + "loss": 3.2575, + "theoretical_loss": 4.32930130393008, + "tokens_seen": 209846272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047292878635907726, + "loss": 3.1579, + "theoretical_loss": 4.329133407190876, + "tokens_seen": 209911808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047291875626880644, + "loss": 3.204, + "theoretical_loss": 4.3289655775340625, + "tokens_seen": 209977344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729087261785356, + "loss": 3.1764, + "theoretical_loss": 4.328797814911912, + "tokens_seen": 210042880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728986960882648, + "loss": 3.5043, + "theoretical_loss": 4.328630119276747, + "tokens_seen": 210108416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047288866599799403, + "loss": 3.3447, + "theoretical_loss": 4.328462490580938, + "tokens_seen": 210173952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047287863590772316, + "loss": 3.1687, + "theoretical_loss": 4.328294928776903, + "tokens_seen": 210239488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728686058174524, + "loss": 3.3405, + "theoretical_loss": 4.328127433817112, + "tokens_seen": 210305024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728585757271815, + "loss": 3.0517, + "theoretical_loss": 4.327960005654081, + "tokens_seen": 210370560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047284854563691076, + "loss": 3.3842, + "theoretical_loss": 4.327792644240374, + "tokens_seen": 210436096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047283851554663994, + "loss": 3.2548, + "theoretical_loss": 4.327625349528605, + "tokens_seen": 210501632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728284854563691, + "loss": 3.3037, + "theoretical_loss": 4.327458121471436, + "tokens_seen": 210567168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728184553660983, + "loss": 3.1623, + "theoretical_loss": 4.3272909600215765, + "tokens_seen": 210632704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047280842527582754, + "loss": 2.8898, + "theoretical_loss": 4.327123865131786, + "tokens_seen": 210698240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047279839518555666, + "loss": 3.2277, + "theoretical_loss": 4.326956836754871, + "tokens_seen": 210763776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727883650952859, + "loss": 3.2198, + "theoretical_loss": 4.326789874843685, + "tokens_seen": 210829312 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472778335005015, + "loss": 3.577, + "theoretical_loss": 4.326622979351132, + "tokens_seen": 210894848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047276830491474426, + "loss": 3.1072, + "theoretical_loss": 4.326456150230163, + "tokens_seen": 210960384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047275827482447344, + "loss": 3.3812, + "theoretical_loss": 4.326289387433776, + "tokens_seen": 211025920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727482447342026, + "loss": 3.3247, + "theoretical_loss": 4.326122690915017, + "tokens_seen": 211091456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727382146439318, + "loss": 3.1334, + "theoretical_loss": 4.325956060626982, + "tokens_seen": 211156992 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472728184553661, + "loss": 3.2365, + "theoretical_loss": 4.325789496522812, + "tokens_seen": 211222528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047271815446339017, + "loss": 3.3295, + "theoretical_loss": 4.325622998555697, + "tokens_seen": 211288064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 130263, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.355026960372925, + "objective/train/theoretical_loss": 4.3254565666788745, + "objective/train/tokens_used": 231813600, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727081243731194, + "loss": 3.3001, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047269809428284853, + "loss": 3.2941, + "theoretical_loss": 4.325290200845629, + "tokens_seen": 211419136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047268806419257777, + "loss": 3.3354, + "theoretical_loss": 4.3251239010092934, + "tokens_seen": 211484672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047267803410230695, + "loss": 3.3392, + "theoretical_loss": 4.324957667123249, + "tokens_seen": 211550208 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047266800401203613, + "loss": 3.1309, + "theoretical_loss": 4.32479149914092, + "tokens_seen": 211615744 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726579739217653, + "loss": 3.2534, + "theoretical_loss": 4.324625397015783, + "tokens_seen": 211681280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726479438314945, + "loss": 3.0925, + "theoretical_loss": 4.3244593607013595, + "tokens_seen": 211746816 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047263791374122367, + "loss": 3.2174, + "theoretical_loss": 4.324293390151218, + "tokens_seen": 211812352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726278836509529, + "loss": 3.1953, + "theoretical_loss": 4.324127485318975, + "tokens_seen": 211877888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047261785356068203, + "loss": 3.0756, + "theoretical_loss": 4.323961646158294, + "tokens_seen": 211943424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047260782347041127, + "loss": 3.3479, + "theoretical_loss": 4.323795872622884, + "tokens_seen": 212008960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725977933801404, + "loss": 3.1166, + "theoretical_loss": 4.323630164666502, + "tokens_seen": 212074496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047258776328986963, + "loss": 3.3872, + "theoretical_loss": 4.323464522242954, + "tokens_seen": 212140032 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725777331995988, + "loss": 3.0378, + "theoretical_loss": 4.323298945306089, + "tokens_seen": 212205568 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472567703109328, + "loss": 2.9616, + "theoretical_loss": 4.3231334338098035, + "tokens_seen": 212271104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725576730190572, + "loss": 3.1819, + "theoretical_loss": 4.322967987708043, + "tokens_seen": 212336640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047254764292878636, + "loss": 3.2994, + "theoretical_loss": 4.322802606954798, + "tokens_seen": 212402176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047253761283851554, + "loss": 3.2778, + "theoretical_loss": 4.322637291504106, + "tokens_seen": 212467712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047252758274824477, + "loss": 3.3196, + "theoretical_loss": 4.32247204131005, + "tokens_seen": 212533248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725175526579739, + "loss": 3.2477, + "theoretical_loss": 4.322306856326761, + "tokens_seen": 212598784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047250752256770313, + "loss": 3.4815, + "theoretical_loss": 4.322141736508415, + "tokens_seen": 212664320 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724974924774323, + "loss": 3.3946, + "theoretical_loss": 4.321976681809236, + "tokens_seen": 212729856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724874623871615, + "loss": 3.2502, + "theoretical_loss": 4.321811692183491, + "tokens_seen": 212795392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724774322968907, + "loss": 3.2951, + "theoretical_loss": 4.321646767585497, + "tokens_seen": 212860928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047246740220661986, + "loss": 3.0522, + "theoretical_loss": 4.3214819079696145, + "tokens_seen": 212926464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 131630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6950013637542725, + "objective/train/theoretical_loss": 4.321317113290252, + "objective/train/tokens_used": 233452000, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047245737211634904, + "loss": 3.3024, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724473420260783, + "loss": 3.0431, + "theoretical_loss": 4.321152383501863, + "tokens_seen": 213057536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724373119358074, + "loss": 2.8804, + "theoretical_loss": 4.320987718558945, + "tokens_seen": 213123072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047242728184553664, + "loss": 3.1601, + "theoretical_loss": 4.320823118416046, + "tokens_seen": 213188608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047241725175526576, + "loss": 3.385, + "theoretical_loss": 4.320658583027755, + "tokens_seen": 213254144 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472407221664995, + "loss": 3.2701, + "theoretical_loss": 4.32049411234871, + "tokens_seen": 213319680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723971915747242, + "loss": 3.1385, + "theoretical_loss": 4.3203297063335935, + "tokens_seen": 213385216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047238716148445336, + "loss": 3.1332, + "theoretical_loss": 4.320165364937134, + "tokens_seen": 213450752 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047237713139418254, + "loss": 3.3959, + "theoretical_loss": 4.320001088114105, + "tokens_seen": 213516288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723671013039117, + "loss": 3.5302, + "theoretical_loss": 4.319836875819325, + "tokens_seen": 213581824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723570712136409, + "loss": 3.3552, + "theoretical_loss": 4.31967272800766, + "tokens_seen": 213647360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047234704112337014, + "loss": 3.2224, + "theoretical_loss": 4.319508644634021, + "tokens_seen": 213712896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047233701103309927, + "loss": 3.1956, + "theoretical_loss": 4.319344625653361, + "tokens_seen": 213778432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723269809428285, + "loss": 3.0912, + "theoretical_loss": 4.319180671020684, + "tokens_seen": 213843968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723169508525577, + "loss": 3.22, + "theoretical_loss": 4.319016780691033, + "tokens_seen": 213909504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047230692076228686, + "loss": 3.2483, + "theoretical_loss": 4.318852954619501, + "tokens_seen": 213975040 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047229689067201605, + "loss": 3.19, + "theoretical_loss": 4.318689192761225, + "tokens_seen": 214040576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722868605817452, + "loss": 3.0916, + "theoretical_loss": 4.318525495071385, + "tokens_seen": 214106112 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722768304914744, + "loss": 3.5612, + "theoretical_loss": 4.318361861505207, + "tokens_seen": 214171648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047226680040120364, + "loss": 3.2789, + "theoretical_loss": 4.318198292017964, + "tokens_seen": 214237184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047225677031093277, + "loss": 3.219, + "theoretical_loss": 4.318034786564971, + "tokens_seen": 214302720 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472246740220662, + "loss": 3.3485, + "theoretical_loss": 4.31787134510159, + "tokens_seen": 214368256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047223671013039113, + "loss": 3.2924, + "theoretical_loss": 4.3177079675832255, + "tokens_seen": 214433792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047222668004012037, + "loss": 3.2976, + "theoretical_loss": 4.317544653965329, + "tokens_seen": 214499328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004722166499498496, + "loss": 3.2069, + "theoretical_loss": 4.3173814042033944, + "tokens_seen": 214564864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 132228, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7650396823883057, + "objective/train/theoretical_loss": 4.317218218252963, + "objective/train/tokens_used": 235090400, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047220661985957873, + "loss": 3.3818, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047219658976930797, + "loss": 3.1204, + "theoretical_loss": 4.317055096069618, + "tokens_seen": 214695936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047218655967903715, + "loss": 3.2258, + "theoretical_loss": 4.316892037608987, + "tokens_seen": 214761472 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047217652958876633, + "loss": 2.9512, + "theoretical_loss": 4.316729042826745, + "tokens_seen": 214827008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721664994984955, + "loss": 3.3851, + "theoretical_loss": 4.316566111678609, + "tokens_seen": 214892544 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721564694082247, + "loss": 3.307, + "theoretical_loss": 4.316403244120339, + "tokens_seen": 214958080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047214643931795387, + "loss": 3.0003, + "theoretical_loss": 4.3162404401077445, + "tokens_seen": 215023616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721364092276831, + "loss": 3.4895, + "theoretical_loss": 4.316077699596671, + "tokens_seen": 215089152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047212637913741223, + "loss": 3.1155, + "theoretical_loss": 4.315915022543016, + "tokens_seen": 215154688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047211634904714147, + "loss": 3.0629, + "theoretical_loss": 4.315752408902716, + "tokens_seen": 215220224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721063189568706, + "loss": 3.2205, + "theoretical_loss": 4.315589858631755, + "tokens_seen": 215285760 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047209628886659983, + "loss": 3.2874, + "theoretical_loss": 4.315427371686157, + "tokens_seen": 215351296 + }, + { + "epoch": 0.07, + "learning_rate": 0.000472086258776329, + "loss": 3.3042, + "theoretical_loss": 4.315264948021994, + "tokens_seen": 215416832 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720762286860582, + "loss": 3.3027, + "theoretical_loss": 4.315102587595379, + "tokens_seen": 215482368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720661985957874, + "loss": 3.1877, + "theoretical_loss": 4.31494029036247, + "tokens_seen": 215547904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047205616850551656, + "loss": 3.2381, + "theoretical_loss": 4.314778056279468, + "tokens_seen": 215613440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047204613841524574, + "loss": 3.0961, + "theoretical_loss": 4.314615885302619, + "tokens_seen": 215678976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047203610832497497, + "loss": 3.0676, + "theoretical_loss": 4.314453777388209, + "tokens_seen": 215744512 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720260782347041, + "loss": 3.2471, + "theoretical_loss": 4.314291732492573, + "tokens_seen": 215810048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047201604814443333, + "loss": 3.2031, + "theoretical_loss": 4.314129750572087, + "tokens_seen": 215875584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720060180541625, + "loss": 2.8863, + "theoretical_loss": 4.3139678315831675, + "tokens_seen": 215941120 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719959879638917, + "loss": 3.0952, + "theoretical_loss": 4.313805975482278, + "tokens_seen": 216006656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719859578736209, + "loss": 3.196, + "theoretical_loss": 4.313644182225926, + "tokens_seen": 216072192 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047197592778335006, + "loss": 2.9889, + "theoretical_loss": 4.313482451770659, + "tokens_seen": 216137728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047196589769307924, + "loss": 3.1137, + "theoretical_loss": 4.313320784073069, + "tokens_seen": 216203264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 133529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.224602460861206, + "objective/train/theoretical_loss": 4.3131591790897925, + "objective/train/tokens_used": 236728800, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719558676028085, + "loss": 3.3034, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719458375125376, + "loss": 3.0825, + "theoretical_loss": 4.3129976367775065, + "tokens_seen": 216334336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047193580742226684, + "loss": 3.293, + "theoretical_loss": 4.312836157092934, + "tokens_seen": 216399872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047192577733199596, + "loss": 3.172, + "theoretical_loss": 4.312674739992839, + "tokens_seen": 216465408 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719157472417252, + "loss": 3.128, + "theoretical_loss": 4.31251338543403, + "tokens_seen": 216530944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719057171514544, + "loss": 3.3622, + "theoretical_loss": 4.312352093373354, + "tokens_seen": 216596480 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047189568706118356, + "loss": 3.3549, + "theoretical_loss": 4.312190863767708, + "tokens_seen": 216662016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047188565697091274, + "loss": 3.1291, + "theoretical_loss": 4.312029696574027, + "tokens_seen": 216727552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718756268806419, + "loss": 2.8499, + "theoretical_loss": 4.311868591749287, + "tokens_seen": 216793088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718655967903711, + "loss": 3.0329, + "theoretical_loss": 4.311707549250514, + "tokens_seen": 216858624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047185556670010034, + "loss": 3.5888, + "theoretical_loss": 4.311546569034767, + "tokens_seen": 216924160 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047184553660982947, + "loss": 3.4006, + "theoretical_loss": 4.311385651059155, + "tokens_seen": 216989696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718355065195587, + "loss": 3.2021, + "theoretical_loss": 4.311224795280825, + "tokens_seen": 217055232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718254764292879, + "loss": 3.2574, + "theoretical_loss": 4.3110640016569715, + "tokens_seen": 217120768 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047181544633901706, + "loss": 3.4922, + "theoretical_loss": 4.310903270144825, + "tokens_seen": 217186304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047180541624874625, + "loss": 3.4122, + "theoretical_loss": 4.310742600701664, + "tokens_seen": 217251840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717953861584754, + "loss": 2.913, + "theoretical_loss": 4.310581993284805, + "tokens_seen": 217317376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717853560682046, + "loss": 3.0789, + "theoretical_loss": 4.310421447851609, + "tokens_seen": 217382912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047177532597793384, + "loss": 3.1302, + "theoretical_loss": 4.310260964359479, + "tokens_seen": 217448448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047176529588766297, + "loss": 3.0638, + "theoretical_loss": 4.310100542765858, + "tokens_seen": 217513984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717552657973922, + "loss": 3.1546, + "theoretical_loss": 4.309940183028236, + "tokens_seen": 217579520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047174523570712133, + "loss": 3.3793, + "theoretical_loss": 4.309779885104139, + "tokens_seen": 217645056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047173520561685057, + "loss": 3.2103, + "theoretical_loss": 4.309619648951139, + "tokens_seen": 217710592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047172517552657975, + "loss": 3.3218, + "theoretical_loss": 4.3094594745268475, + "tokens_seen": 217776128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047171514543630893, + "loss": 3.0378, + "theoretical_loss": 4.30929936178892, + "tokens_seen": 217841664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 134228, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2407047748565674, + "objective/train/theoretical_loss": 4.309139310695053, + "objective/train/tokens_used": 238367200, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717051153460381, + "loss": 3.2923, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047169508525576735, + "loss": 3.2057, + "theoretical_loss": 4.308979321202983, + "tokens_seen": 217972736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716850551654965, + "loss": 3.2015, + "theoretical_loss": 4.308819393270491, + "tokens_seen": 218038272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716750250752257, + "loss": 3.064, + "theoretical_loss": 4.308659526855396, + "tokens_seen": 218103808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047166499498495484, + "loss": 3.0323, + "theoretical_loss": 4.308499721915563, + "tokens_seen": 218169344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047165496489468407, + "loss": 3.2744, + "theoretical_loss": 4.308339978408897, + "tokens_seen": 218234880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047164493480441325, + "loss": 3.1236, + "theoretical_loss": 4.308180296293341, + "tokens_seen": 218300416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047163490471414243, + "loss": 3.117, + "theoretical_loss": 4.308020675526883, + "tokens_seen": 218365952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716248746238716, + "loss": 3.1281, + "theoretical_loss": 4.307861116067554, + "tokens_seen": 218431488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716148445336008, + "loss": 2.9074, + "theoretical_loss": 4.30770161787342, + "tokens_seen": 218497024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047160481444333, + "loss": 3.2527, + "theoretical_loss": 4.307542180902594, + "tokens_seen": 218562560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715947843530592, + "loss": 3.0822, + "theoretical_loss": 4.307382805113228, + "tokens_seen": 218628096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047158475426278834, + "loss": 3.3932, + "theoretical_loss": 4.307223490463516, + "tokens_seen": 218693632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715747241725176, + "loss": 3.1793, + "theoretical_loss": 4.307064236911692, + "tokens_seen": 218759168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715646940822467, + "loss": 3.3369, + "theoretical_loss": 4.30690504441603, + "tokens_seen": 218824704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047155466399197594, + "loss": 3.1318, + "theoretical_loss": 4.306745912934849, + "tokens_seen": 218890240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715446339017051, + "loss": 3.0594, + "theoretical_loss": 4.306586842426504, + "tokens_seen": 218955776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715346038114343, + "loss": 3.3079, + "theoretical_loss": 4.306427832849394, + "tokens_seen": 219021312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715245737211635, + "loss": 3.19, + "theoretical_loss": 4.306268884161959, + "tokens_seen": 219086848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715145436308927, + "loss": 3.3318, + "theoretical_loss": 4.306109996322679, + "tokens_seen": 219152384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047150451354062184, + "loss": 3.0093, + "theoretical_loss": 4.305951169290073, + "tokens_seen": 219217920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714944834503511, + "loss": 3.3021, + "theoretical_loss": 4.305792403022703, + "tokens_seen": 219283456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714844533600802, + "loss": 3.0608, + "theoretical_loss": 4.305633697479171, + "tokens_seen": 219348992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047147442326980944, + "loss": 2.8014, + "theoretical_loss": 4.305475052618119, + "tokens_seen": 219414528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714643931795387, + "loss": 3.3019, + "theoretical_loss": 4.30531646839823, + "tokens_seen": 219480064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 135676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.066269874572754, + "objective/train/theoretical_loss": 4.305157944778228, + "objective/train/tokens_used": 240005600, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714543630892678, + "loss": 3.0659, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047144433299899704, + "loss": 3.251, + "theoretical_loss": 4.304999481716876, + "tokens_seen": 219611136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047143430290872616, + "loss": 3.4636, + "theoretical_loss": 4.304841079172979, + "tokens_seen": 219676672 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714242728184554, + "loss": 3.2243, + "theoretical_loss": 4.30468273710538, + "tokens_seen": 219742208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714142427281846, + "loss": 3.2012, + "theoretical_loss": 4.304524455472965, + "tokens_seen": 219807744 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047140421263791376, + "loss": 2.9528, + "theoretical_loss": 4.304366234234659, + "tokens_seen": 219873280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047139418254764294, + "loss": 3.214, + "theoretical_loss": 4.304208073349426, + "tokens_seen": 219938816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713841524573721, + "loss": 3.3243, + "theoretical_loss": 4.304049972776271, + "tokens_seen": 220004352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713741223671013, + "loss": 3.169, + "theoretical_loss": 4.30389193247424, + "tokens_seen": 220069888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047136409227683054, + "loss": 3.2084, + "theoretical_loss": 4.303733952402419, + "tokens_seen": 220135424 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047135406218655967, + "loss": 3.2908, + "theoretical_loss": 4.303576032519931, + "tokens_seen": 220200960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713440320962889, + "loss": 3.415, + "theoretical_loss": 4.303418172785943, + "tokens_seen": 220266496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713340020060181, + "loss": 3.3773, + "theoretical_loss": 4.303260373159659, + "tokens_seen": 220332032 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047132397191574726, + "loss": 3.2734, + "theoretical_loss": 4.303102633600322, + "tokens_seen": 220397568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047131394182547645, + "loss": 2.8756, + "theoretical_loss": 4.30294495406722, + "tokens_seen": 220463104 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047130391173520563, + "loss": 3.1967, + "theoretical_loss": 4.3027873345196745, + "tokens_seen": 220528640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712938816449348, + "loss": 3.4082, + "theoretical_loss": 4.302629774917049, + "tokens_seen": 220594176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047128385155466404, + "loss": 3.1634, + "theoretical_loss": 4.302472275218748, + "tokens_seen": 220659712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047127382146439317, + "loss": 3.3901, + "theoretical_loss": 4.302314835384214, + "tokens_seen": 220725248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712637913741224, + "loss": 3.3387, + "theoretical_loss": 4.30215745537293, + "tokens_seen": 220790784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047125376128385153, + "loss": 3.2651, + "theoretical_loss": 4.302000135144416, + "tokens_seen": 220856320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047124373119358077, + "loss": 3.2082, + "theoretical_loss": 4.301842874658235, + "tokens_seen": 220921856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047123370110330995, + "loss": 3.2934, + "theoretical_loss": 4.301685673873987, + "tokens_seen": 220987392 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047122367101303913, + "loss": 3.1771, + "theoretical_loss": 4.301528532751312, + "tokens_seen": 221052928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712136409227683, + "loss": 3.3551, + "theoretical_loss": 4.301371451249888, + "tokens_seen": 221118464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 136371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.877000093460083, + "objective/train/theoretical_loss": 4.301214429329433, + "objective/train/tokens_used": 241644000, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047120361083249755, + "loss": 3.4952, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711935807422267, + "loss": 3.1406, + "theoretical_loss": 4.301057466949707, + "tokens_seen": 221249536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711835506519559, + "loss": 3.1892, + "theoretical_loss": 4.300900564070504, + "tokens_seen": 221315072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047117352056168504, + "loss": 2.9775, + "theoretical_loss": 4.30074372065166, + "tokens_seen": 221380608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047116349047141427, + "loss": 3.3187, + "theoretical_loss": 4.300586936653049, + "tokens_seen": 221446144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047115346038114345, + "loss": 3.148, + "theoretical_loss": 4.300430212034587, + "tokens_seen": 221511680 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047114343029087263, + "loss": 2.8728, + "theoretical_loss": 4.300273546756223, + "tokens_seen": 221577216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711334002006018, + "loss": 3.1421, + "theoretical_loss": 4.300116940777951, + "tokens_seen": 221642752 + }, + { + "epoch": 0.07, + "learning_rate": 0.000471123370110331, + "loss": 3.1462, + "theoretical_loss": 4.299960394059799, + "tokens_seen": 221708288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711133400200602, + "loss": 2.9714, + "theoretical_loss": 4.299803906561835, + "tokens_seen": 221773824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711033099297894, + "loss": 3.278, + "theoretical_loss": 4.29964747824417, + "tokens_seen": 221839360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047109327983951854, + "loss": 3.1696, + "theoretical_loss": 4.299491109066947, + "tokens_seen": 221904896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710832497492478, + "loss": 3.1112, + "theoretical_loss": 4.299334798990351, + "tokens_seen": 221970432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710732196589769, + "loss": 3.1501, + "theoretical_loss": 4.2991785479746065, + "tokens_seen": 222035968 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047106318956870614, + "loss": 2.7828, + "theoretical_loss": 4.299022355979974, + "tokens_seen": 222101504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710531594784353, + "loss": 3.3247, + "theoretical_loss": 4.298866222966755, + "tokens_seen": 222167040 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710431293881645, + "loss": 3.1744, + "theoretical_loss": 4.298710148895286, + "tokens_seen": 222232576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710330992978937, + "loss": 3.4915, + "theoretical_loss": 4.298554133725946, + "tokens_seen": 222298112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710230692076229, + "loss": 3.1641, + "theoretical_loss": 4.298398177419149, + "tokens_seen": 222363648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047101303911735204, + "loss": 3.2386, + "theoretical_loss": 4.298242279935349, + "tokens_seen": 222429184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710030090270813, + "loss": 3.305, + "theoretical_loss": 4.2980864412350375, + "tokens_seen": 222494720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709929789368104, + "loss": 3.1451, + "theoretical_loss": 4.297930661278745, + "tokens_seen": 222560256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047098294884653964, + "loss": 3.031, + "theoretical_loss": 4.297774940027038, + "tokens_seen": 222625792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709729187562688, + "loss": 3.2461, + "theoretical_loss": 4.297619277440523, + "tokens_seen": 222691328 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470962888665998, + "loss": 3.1183, + "theoretical_loss": 4.297463673479846, + "tokens_seen": 222756864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 137650, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3066697120666504, + "objective/train/theoretical_loss": 4.297308128105687, + "objective/train/tokens_used": 243282400, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709528585757272, + "loss": 3.3301, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047094282848545636, + "loss": 3.2327, + "theoretical_loss": 4.297152641278767, + "tokens_seen": 222887936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047093279839518555, + "loss": 3.0477, + "theoretical_loss": 4.296997212959842, + "tokens_seen": 222953472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709227683049148, + "loss": 3.4307, + "theoretical_loss": 4.296841843109711, + "tokens_seen": 223019008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709127382146439, + "loss": 3.1878, + "theoretical_loss": 4.296686531689204, + "tokens_seen": 223084544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047090270812437314, + "loss": 3.29, + "theoretical_loss": 4.296531278659193, + "tokens_seen": 223150080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047089267803410227, + "loss": 3.0173, + "theoretical_loss": 4.296376083980589, + "tokens_seen": 223215616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708826479438315, + "loss": 3.3376, + "theoretical_loss": 4.296220947614337, + "tokens_seen": 223281152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708726178535607, + "loss": 2.9857, + "theoretical_loss": 4.296065869521421, + "tokens_seen": 223346688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047086258776328987, + "loss": 3.5665, + "theoretical_loss": 4.295910849662862, + "tokens_seen": 223412224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047085255767301905, + "loss": 3.2103, + "theoretical_loss": 4.2957558879997215, + "tokens_seen": 223477760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708425275827483, + "loss": 3.1934, + "theoretical_loss": 4.295600984493093, + "tokens_seen": 223543296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708324974924774, + "loss": 2.9751, + "theoretical_loss": 4.295446139104112, + "tokens_seen": 223608832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047082246740220665, + "loss": 3.18, + "theoretical_loss": 4.295291351793951, + "tokens_seen": 223674368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708124373119358, + "loss": 3.1672, + "theoretical_loss": 4.295136622523817, + "tokens_seen": 223739904 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470802407221665, + "loss": 3.001, + "theoretical_loss": 4.294981951254956, + "tokens_seen": 223805440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707923771313942, + "loss": 3.1423, + "theoretical_loss": 4.294827337948651, + "tokens_seen": 223870976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047078234704112337, + "loss": 3.0418, + "theoretical_loss": 4.294672782566224, + "tokens_seen": 223936512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047077231695085255, + "loss": 3.1785, + "theoretical_loss": 4.29451828506903, + "tokens_seen": 224002048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047076228686058173, + "loss": 3.2824, + "theoretical_loss": 4.294363845418465, + "tokens_seen": 224067584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707522567703109, + "loss": 3.099, + "theoretical_loss": 4.29420946357596, + "tokens_seen": 224133120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047074222668004015, + "loss": 3.1006, + "theoretical_loss": 4.294055139502985, + "tokens_seen": 224198656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707321965897693, + "loss": 3.2409, + "theoretical_loss": 4.293900873161043, + "tokens_seen": 224264192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707221664994985, + "loss": 3.0344, + "theoretical_loss": 4.293746664511678, + "tokens_seen": 224329728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047071213640922775, + "loss": 2.9979, + "theoretical_loss": 4.293592513516469, + "tokens_seen": 224395264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 138458, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.281918525695801, + "objective/train/theoretical_loss": 4.293438420137031, + "objective/train/tokens_used": 244920800, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707021063189569, + "loss": 3.1837, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706920762286861, + "loss": 3.0455, + "theoretical_loss": 4.293284384335017, + "tokens_seen": 224526336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047068204613841524, + "loss": 3.2542, + "theoretical_loss": 4.293130406072118, + "tokens_seen": 224591872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047067201604814447, + "loss": 3.2776, + "theoretical_loss": 4.292976485310057, + "tokens_seen": 224657408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047066198595787365, + "loss": 3.1208, + "theoretical_loss": 4.2928226220106005, + "tokens_seen": 224722944 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047065195586760283, + "loss": 3.1642, + "theoretical_loss": 4.292668816135545, + "tokens_seen": 224788480 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470641925777332, + "loss": 3.3477, + "theoretical_loss": 4.292515067646727, + "tokens_seen": 224854016 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706318956870612, + "loss": 3.1525, + "theoretical_loss": 4.29236137650602, + "tokens_seen": 224919552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706218655967904, + "loss": 3.203, + "theoretical_loss": 4.2922077426753305, + "tokens_seen": 224985088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706118355065196, + "loss": 3.304, + "theoretical_loss": 4.292054166116605, + "tokens_seen": 225050624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047060180541624874, + "loss": 3.099, + "theoretical_loss": 4.291900646791825, + "tokens_seen": 225116160 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470591775325978, + "loss": 3.4831, + "theoretical_loss": 4.2917471846630075, + "tokens_seen": 225181696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705817452357071, + "loss": 3.3273, + "theoretical_loss": 4.291593779692207, + "tokens_seen": 225247232 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047057171514543634, + "loss": 3.0792, + "theoretical_loss": 4.291440431841513, + "tokens_seen": 225312768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705616850551655, + "loss": 3.1787, + "theoretical_loss": 4.291287141073053, + "tokens_seen": 225378304 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705516549648947, + "loss": 3.0933, + "theoretical_loss": 4.291133907348989, + "tokens_seen": 225443840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705416248746239, + "loss": 3.2376, + "theoretical_loss": 4.29098073063152, + "tokens_seen": 225509376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705315947843531, + "loss": 3.3523, + "theoretical_loss": 4.29082761088288, + "tokens_seen": 225574912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047052156469408224, + "loss": 3.032, + "theoretical_loss": 4.290674548065338, + "tokens_seen": 225640448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705115346038115, + "loss": 3.0997, + "theoretical_loss": 4.290521542141203, + "tokens_seen": 225705984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705015045135406, + "loss": 3.1698, + "theoretical_loss": 4.290368593072817, + "tokens_seen": 225771520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047049147442326984, + "loss": 3.2039, + "theoretical_loss": 4.290215700822556, + "tokens_seen": 225837056 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470481444332999, + "loss": 3.1309, + "theoretical_loss": 4.290062865352837, + "tokens_seen": 225902592 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704714142427282, + "loss": 3.1847, + "theoretical_loss": 4.289910086626108, + "tokens_seen": 225968128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704613841524574, + "loss": 2.9217, + "theoretical_loss": 4.289757364604855, + "tokens_seen": 226033664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 139641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6932528018951416, + "objective/train/theoretical_loss": 4.2896046992515995, + "objective/train/tokens_used": 246559200, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047045135406218656, + "loss": 3.5236, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047044132397191575, + "loss": 3.1005, + "theoretical_loss": 4.289452090528897, + "tokens_seen": 226164736 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470431293881645, + "loss": 3.1343, + "theoretical_loss": 4.289299538399341, + "tokens_seen": 226230272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704212637913741, + "loss": 3.3457, + "theoretical_loss": 4.28914704282556, + "tokens_seen": 226295808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047041123370110334, + "loss": 3.4157, + "theoretical_loss": 4.288994603770215, + "tokens_seen": 226361344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047040120361083247, + "loss": 3.2237, + "theoretical_loss": 4.288842221196007, + "tokens_seen": 226426880 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703911735205617, + "loss": 3.2044, + "theoretical_loss": 4.28868989506567, + "tokens_seen": 226492416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703811434302909, + "loss": 3.2856, + "theoretical_loss": 4.288537625341974, + "tokens_seen": 226557952 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047037111334002007, + "loss": 3.0679, + "theoretical_loss": 4.288385411987722, + "tokens_seen": 226623488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047036108324974925, + "loss": 3.4104, + "theoretical_loss": 4.288233254965755, + "tokens_seen": 226689024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703510531594785, + "loss": 3.3151, + "theoretical_loss": 4.2880811542389505, + "tokens_seen": 226754560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703410230692076, + "loss": 3.3593, + "theoretical_loss": 4.287929109770217, + "tokens_seen": 226820096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047033099297893685, + "loss": 2.8766, + "theoretical_loss": 4.287777121522501, + "tokens_seen": 226885632 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470320962888666, + "loss": 3.209, + "theoretical_loss": 4.287625189458781, + "tokens_seen": 226951168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703109327983952, + "loss": 3.1, + "theoretical_loss": 4.287473313542077, + "tokens_seen": 227016704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703009027081244, + "loss": 3.3124, + "theoretical_loss": 4.287321493735438, + "tokens_seen": 227082240 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047029087261785357, + "loss": 3.1342, + "theoretical_loss": 4.287169730001949, + "tokens_seen": 227147776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047028084252758275, + "loss": 3.1754, + "theoretical_loss": 4.287018022304733, + "tokens_seen": 227213312 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047027081243731193, + "loss": 3.2797, + "theoretical_loss": 4.286866370606943, + "tokens_seen": 227278848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702607823470411, + "loss": 3.3938, + "theoretical_loss": 4.286714774871772, + "tokens_seen": 227344384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047025075225677035, + "loss": 2.8743, + "theoretical_loss": 4.286563235062444, + "tokens_seen": 227409920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702407221664995, + "loss": 3.2675, + "theoretical_loss": 4.28641175114222, + "tokens_seen": 227475456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702306920762287, + "loss": 3.5072, + "theoretical_loss": 4.286260323074394, + "tokens_seen": 227540992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047022066198595784, + "loss": 3.385, + "theoretical_loss": 4.286108950822296, + "tokens_seen": 227606528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702106318956871, + "loss": 3.2283, + "theoretical_loss": 4.285957634349289, + "tokens_seen": 227672064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 140376, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.265362501144409, + "objective/train/theoretical_loss": 4.285806373618774, + "objective/train/tokens_used": 248197600, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047020060180541626, + "loss": 2.999, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047019057171514544, + "loss": 3.1608, + "theoretical_loss": 4.285655168594182, + "tokens_seen": 227803136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701805416248746, + "loss": 3.2796, + "theoretical_loss": 4.285504019238982, + "tokens_seen": 227868672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047017051153460385, + "loss": 3.4479, + "theoretical_loss": 4.285352925516676, + "tokens_seen": 227934208 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470160481444333, + "loss": 3.3548, + "theoretical_loss": 4.2852018873908, + "tokens_seen": 227999744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701504513540622, + "loss": 3.2998, + "theoretical_loss": 4.285050904824925, + "tokens_seen": 228065280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047014042126379134, + "loss": 3.1287, + "theoretical_loss": 4.284899977782658, + "tokens_seen": 228130816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701303911735206, + "loss": 3.2708, + "theoretical_loss": 4.284749106227636, + "tokens_seen": 228196352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047012036108324976, + "loss": 3.1895, + "theoretical_loss": 4.284598290123535, + "tokens_seen": 228261888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047011033099297894, + "loss": 3.128, + "theoretical_loss": 4.284447529434061, + "tokens_seen": 228327424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701003009027081, + "loss": 3.2905, + "theoretical_loss": 4.284296824122959, + "tokens_seen": 228392960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700902708124373, + "loss": 3.152, + "theoretical_loss": 4.284146174154003, + "tokens_seen": 228458496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700802407221665, + "loss": 3.4164, + "theoretical_loss": 4.283995579491004, + "tokens_seen": 228524032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700702106318957, + "loss": 2.9542, + "theoretical_loss": 4.283845040097807, + "tokens_seen": 228589568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047006018054162484, + "loss": 3.1201, + "theoretical_loss": 4.28369455593829, + "tokens_seen": 228655104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700501504513541, + "loss": 3.3299, + "theoretical_loss": 4.2835441269763646, + "tokens_seen": 228720640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047004012036108326, + "loss": 2.9468, + "theoretical_loss": 4.283393753175979, + "tokens_seen": 228786176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047003009027081244, + "loss": 3.3356, + "theoretical_loss": 4.283243434501112, + "tokens_seen": 228851712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700200601805416, + "loss": 3.0321, + "theoretical_loss": 4.283093170915778, + "tokens_seen": 228917248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700100300902708, + "loss": 3.143, + "theoretical_loss": 4.282942962384023, + "tokens_seen": 228982784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047, + "loss": 3.1754, + "theoretical_loss": 4.282792808869932, + "tokens_seen": 229048320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699899699097292, + "loss": 3.1478, + "theoretical_loss": 4.282642710337618, + "tokens_seen": 229113856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046997993981945835, + "loss": 3.336, + "theoretical_loss": 4.28249266675123, + "tokens_seen": 229179392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699699097291876, + "loss": 3.2775, + "theoretical_loss": 4.282342678074951, + "tokens_seen": 229244928 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046995987963891676, + "loss": 3.2018, + "theoretical_loss": 4.2821927442729955, + "tokens_seen": 229310464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 141693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1491169929504395, + "objective/train/theoretical_loss": 4.282042865309616, + "objective/train/tokens_used": 249836000, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046994984954864595, + "loss": 3.1859, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699398194583752, + "loss": 3.3997, + "theoretical_loss": 4.281893041149093, + "tokens_seen": 229441536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699297893681043, + "loss": 3.1647, + "theoretical_loss": 4.2817432717557455, + "tokens_seen": 229507072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046991975927783354, + "loss": 3.2627, + "theoretical_loss": 4.28159355709392, + "tokens_seen": 229572608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046990972918756267, + "loss": 3.1173, + "theoretical_loss": 4.281443897128004, + "tokens_seen": 229638144 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698996990972919, + "loss": 3.3119, + "theoretical_loss": 4.2812942918224115, + "tokens_seen": 229703680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698896690070211, + "loss": 2.8788, + "theoretical_loss": 4.281144741141593, + "tokens_seen": 229769216 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046987963891675027, + "loss": 3.2099, + "theoretical_loss": 4.280995245050032, + "tokens_seen": 229834752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046986960882647945, + "loss": 3.1133, + "theoretical_loss": 4.2808458035122445, + "tokens_seen": 229900288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698595787362087, + "loss": 3.3331, + "theoretical_loss": 4.2806964164927805, + "tokens_seen": 229965824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698495486459378, + "loss": 3.0517, + "theoretical_loss": 4.280547083956224, + "tokens_seen": 230031360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046983951855566705, + "loss": 3.1513, + "theoretical_loss": 4.280397805867188, + "tokens_seen": 230096896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698294884653962, + "loss": 3.1371, + "theoretical_loss": 4.280248582190324, + "tokens_seen": 230162432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698194583751254, + "loss": 3.1632, + "theoretical_loss": 4.280099412890312, + "tokens_seen": 230227968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698094282848546, + "loss": 3.1009, + "theoretical_loss": 4.279950297931869, + "tokens_seen": 230293504 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046979939819458377, + "loss": 3.414, + "theoretical_loss": 4.27980123727974, + "tokens_seen": 230359040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046978936810431295, + "loss": 3.1625, + "theoretical_loss": 4.279652230898709, + "tokens_seen": 230424576 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046977933801404213, + "loss": 3.2313, + "theoretical_loss": 4.279503278753586, + "tokens_seen": 230490112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697693079237713, + "loss": 3.3858, + "theoretical_loss": 4.27935438080922, + "tokens_seen": 230555648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046975927783350055, + "loss": 3.011, + "theoretical_loss": 4.27920553703049, + "tokens_seen": 230621184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697492477432297, + "loss": 2.9844, + "theoretical_loss": 4.279056747382306, + "tokens_seen": 230686720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697392176529589, + "loss": 3.197, + "theoretical_loss": 4.278908011829613, + "tokens_seen": 230752256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046972918756268804, + "loss": 3.1954, + "theoretical_loss": 4.27875933033739, + "tokens_seen": 230817792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697191574724173, + "loss": 3.0489, + "theoretical_loss": 4.278610702870646, + "tokens_seen": 230883328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046970912738214646, + "loss": 3.2719, + "theoretical_loss": 4.278462129394423, + "tokens_seen": 230948864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 142407, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.702136993408203, + "objective/train/theoretical_loss": 4.278313609873795, + "objective/train/tokens_used": 251474400, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046969909729187564, + "loss": 3.3355, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696890672016048, + "loss": 2.9092, + "theoretical_loss": 4.278165144273871, + "tokens_seen": 231079936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046967903711133405, + "loss": 3.2064, + "theoretical_loss": 4.27801673255979, + "tokens_seen": 231145472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696690070210632, + "loss": 3.2198, + "theoretical_loss": 4.277868374696725, + "tokens_seen": 231211008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696589769307924, + "loss": 3.1205, + "theoretical_loss": 4.277720070649879, + "tokens_seen": 231276544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046964894684052154, + "loss": 3.1119, + "theoretical_loss": 4.277571820384491, + "tokens_seen": 231342080 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696389167502508, + "loss": 3.2283, + "theoretical_loss": 4.277423623865829, + "tokens_seen": 231407616 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046962888665997996, + "loss": 3.036, + "theoretical_loss": 4.277275481059195, + "tokens_seen": 231473152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046961885656970914, + "loss": 3.0813, + "theoretical_loss": 4.2771273919299215, + "tokens_seen": 231538688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696088264794383, + "loss": 3.2151, + "theoretical_loss": 4.276979356443377, + "tokens_seen": 231604224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695987963891675, + "loss": 2.992, + "theoretical_loss": 4.276831374564957, + "tokens_seen": 231669760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695887662988967, + "loss": 3.1676, + "theoretical_loss": 4.276683446260093, + "tokens_seen": 231735296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695787362086259, + "loss": 3.0428, + "theoretical_loss": 4.276535571494247, + "tokens_seen": 231800832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046956870611835505, + "loss": 2.979, + "theoretical_loss": 4.276387750232913, + "tokens_seen": 231866368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695586760280843, + "loss": 3.0738, + "theoretical_loss": 4.276239982441617, + "tokens_seen": 231931904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046954864593781346, + "loss": 3.2279, + "theoretical_loss": 4.276092268085918, + "tokens_seen": 231997440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046953861584754264, + "loss": 3.1162, + "theoretical_loss": 4.275944607131406, + "tokens_seen": 232062976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695285857572718, + "loss": 3.0761, + "theoretical_loss": 4.275796999543703, + "tokens_seen": 232128512 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469518555667001, + "loss": 3.0807, + "theoretical_loss": 4.275649445288461, + "tokens_seen": 232194048 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695085255767302, + "loss": 3.1171, + "theoretical_loss": 4.275501944331367, + "tokens_seen": 232259584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694984954864594, + "loss": 3.273, + "theoretical_loss": 4.275354496638139, + "tokens_seen": 232325120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046948846539618855, + "loss": 3.2311, + "theoretical_loss": 4.275207102174525, + "tokens_seen": 232390656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694784353059178, + "loss": 3.3267, + "theoretical_loss": 4.275059760906305, + "tokens_seen": 232456192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694684052156469, + "loss": 3.1457, + "theoretical_loss": 4.2749124727992935, + "tokens_seen": 232521728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046945837512537615, + "loss": 3.4693, + "theoretical_loss": 4.274765237819333, + "tokens_seen": 232587264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 143093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.266288995742798, + "objective/train/theoretical_loss": 4.274618055932298, + "objective/train/tokens_used": 253112800, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694483450351053, + "loss": 3.1205, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694383149448345, + "loss": 3.3176, + "theoretical_loss": 4.2744709271040975, + "tokens_seen": 232718336 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694282848545637, + "loss": 3.1205, + "theoretical_loss": 4.27432385130067, + "tokens_seen": 232783872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046941825476429287, + "loss": 3.1899, + "theoretical_loss": 4.274176828487984, + "tokens_seen": 232849408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046940822467402205, + "loss": 3.2353, + "theoretical_loss": 4.2740298586320415, + "tokens_seen": 232914944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693981945837513, + "loss": 3.1096, + "theoretical_loss": 4.273882941698876, + "tokens_seen": 232980480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693881644934804, + "loss": 3.0017, + "theoretical_loss": 4.27373607765455, + "tokens_seen": 233046016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046937813440320965, + "loss": 3.0144, + "theoretical_loss": 4.2735892664651605, + "tokens_seen": 233111552 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046936810431293883, + "loss": 3.2733, + "theoretical_loss": 4.273442508096833, + "tokens_seen": 233177088 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469358074222668, + "loss": 3.1154, + "theoretical_loss": 4.273295802515726, + "tokens_seen": 233242624 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693480441323972, + "loss": 3.1081, + "theoretical_loss": 4.273149149688028, + "tokens_seen": 233308160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693380140421264, + "loss": 3.0391, + "theoretical_loss": 4.27300254957996, + "tokens_seen": 233373696 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046932798395185555, + "loss": 3.0994, + "theoretical_loss": 4.272856002157772, + "tokens_seen": 233439232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693179538615848, + "loss": 3.0702, + "theoretical_loss": 4.272709507387748, + "tokens_seen": 233504768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693079237713139, + "loss": 3.1512, + "theoretical_loss": 4.2725630652362, + "tokens_seen": 233570304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046929789368104315, + "loss": 2.9167, + "theoretical_loss": 4.272416675669473, + "tokens_seen": 233635840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692878635907723, + "loss": 3.353, + "theoretical_loss": 4.272270338653942, + "tokens_seen": 233701376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692778335005015, + "loss": 3.0815, + "theoretical_loss": 4.272124054156014, + "tokens_seen": 233766912 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692678034102307, + "loss": 3.1558, + "theoretical_loss": 4.271977822142125, + "tokens_seen": 233832448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692577733199599, + "loss": 3.4225, + "theoretical_loss": 4.271831642578745, + "tokens_seen": 233897984 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046924774322968906, + "loss": 3.0911, + "theoretical_loss": 4.27168551543237, + "tokens_seen": 233963520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046923771313941824, + "loss": 3.1895, + "theoretical_loss": 4.271539440669532, + "tokens_seen": 234029056 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692276830491474, + "loss": 3.223, + "theoretical_loss": 4.27139341825679, + "tokens_seen": 234094592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046921765295887666, + "loss": 3.4205, + "theoretical_loss": 4.271247448160736, + "tokens_seen": 234160128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046920762286860584, + "loss": 3.1894, + "theoretical_loss": 4.27110153034799, + "tokens_seen": 234225664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 144323, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5741994380950928, + "objective/train/theoretical_loss": 4.270955664785207, + "objective/train/tokens_used": 254751200, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469197592778335, + "loss": 3.2851, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046918756268806425, + "loss": 3.207, + "theoretical_loss": 4.2708098514390676, + "tokens_seen": 234356736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691775325977934, + "loss": 3.0235, + "theoretical_loss": 4.270664090276286, + "tokens_seen": 234422272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691675025075226, + "loss": 3.2393, + "theoretical_loss": 4.2705183812636065, + "tokens_seen": 234487808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046915747241725174, + "loss": 3.1636, + "theoretical_loss": 4.270372724367803, + "tokens_seen": 234553344 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469147442326981, + "loss": 3.073, + "theoretical_loss": 4.270227119555681, + "tokens_seen": 234618880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046913741223671016, + "loss": 3.199, + "theoretical_loss": 4.270081566794076, + "tokens_seen": 234684416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046912738214643934, + "loss": 3.0461, + "theoretical_loss": 4.269936066049852, + "tokens_seen": 234749952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691173520561685, + "loss": 3.0538, + "theoretical_loss": 4.269790617289907, + "tokens_seen": 234815488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691073219658977, + "loss": 3.0901, + "theoretical_loss": 4.269645220481166, + "tokens_seen": 234881024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690972918756269, + "loss": 3.1677, + "theoretical_loss": 4.269499875590587, + "tokens_seen": 234946560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690872617853561, + "loss": 3.1924, + "theoretical_loss": 4.269354582585156, + "tokens_seen": 235012096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046907723169508525, + "loss": 2.9523, + "theoretical_loss": 4.269209341431889, + "tokens_seen": 235077632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690672016048145, + "loss": 2.9465, + "theoretical_loss": 4.269064152097835, + "tokens_seen": 235143168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046905717151454366, + "loss": 3.2225, + "theoretical_loss": 4.26891901455007, + "tokens_seen": 235208704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046904714142427284, + "loss": 3.3442, + "theoretical_loss": 4.268773928755701, + "tokens_seen": 235274240 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469037111334002, + "loss": 2.9913, + "theoretical_loss": 4.268628894681868, + "tokens_seen": 235339776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690270812437312, + "loss": 3.1332, + "theoretical_loss": 4.268483912295735, + "tokens_seen": 235405312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690170511534604, + "loss": 3.1891, + "theoretical_loss": 4.268338981564502, + "tokens_seen": 235470848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690070210631896, + "loss": 3.0551, + "theoretical_loss": 4.268194102455395, + "tokens_seen": 235536384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046899699097291875, + "loss": 3.1104, + "theoretical_loss": 4.26804927493567, + "tokens_seen": 235601920 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468986960882648, + "loss": 3.1895, + "theoretical_loss": 4.267904498972618, + "tokens_seen": 235667456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689769307923771, + "loss": 3.115, + "theoretical_loss": 4.267759774533552, + "tokens_seen": 235732992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046896690070210635, + "loss": 3.2606, + "theoretical_loss": 4.267615101585821, + "tokens_seen": 235798528 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046895687061183553, + "loss": 3.1975, + "theoretical_loss": 4.267470480096801, + "tokens_seen": 235864064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 145084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.02962589263916, + "objective/train/theoretical_loss": 4.267325910033897, + "objective/train/tokens_used": 256389600, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689468405215647, + "loss": 3.2423, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689368104312939, + "loss": 3.0465, + "theoretical_loss": 4.267181391364547, + "tokens_seen": 235995136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046892678034102307, + "loss": 3.0515, + "theoretical_loss": 4.267036924056215, + "tokens_seen": 236060672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046891675025075225, + "loss": 3.1894, + "theoretical_loss": 4.266892508076397, + "tokens_seen": 236126208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689067201604815, + "loss": 3.3846, + "theoretical_loss": 4.266748143392617, + "tokens_seen": 236191744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688966900702106, + "loss": 2.8453, + "theoretical_loss": 4.26660382997243, + "tokens_seen": 236257280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046888665997993985, + "loss": 3.2638, + "theoretical_loss": 4.26645956778342, + "tokens_seen": 236322816 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046887662988966903, + "loss": 3.2298, + "theoretical_loss": 4.2663153567932, + "tokens_seen": 236388352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688665997993982, + "loss": 3.4072, + "theoretical_loss": 4.266171196969412, + "tokens_seen": 236453888 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688565697091274, + "loss": 3.3305, + "theoretical_loss": 4.2660270882797295, + "tokens_seen": 236519424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688465396188566, + "loss": 3.1997, + "theoretical_loss": 4.265883030691853, + "tokens_seen": 236584960 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046883650952858575, + "loss": 3.0699, + "theoretical_loss": 4.265739024173515, + "tokens_seen": 236650496 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468826479438315, + "loss": 3.1803, + "theoretical_loss": 4.265595068692473, + "tokens_seen": 236716032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688164493480441, + "loss": 3.254, + "theoretical_loss": 4.26545116421652, + "tokens_seen": 236781568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046880641925777335, + "loss": 2.9222, + "theoretical_loss": 4.265307310713471, + "tokens_seen": 236847104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687963891675025, + "loss": 3.0569, + "theoretical_loss": 4.2651635081511765, + "tokens_seen": 236912640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687863590772317, + "loss": 3.1851, + "theoretical_loss": 4.265019756497512, + "tokens_seen": 236978176 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687763289869609, + "loss": 3.0481, + "theoretical_loss": 4.264876055720386, + "tokens_seen": 237043712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687662988966901, + "loss": 3.2352, + "theoretical_loss": 4.264732405787731, + "tokens_seen": 237109248 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046875626880641926, + "loss": 2.9571, + "theoretical_loss": 4.264588806667513, + "tokens_seen": 237174784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046874623871614844, + "loss": 3.07, + "theoretical_loss": 4.264445258327724, + "tokens_seen": 237240320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687362086258776, + "loss": 3.1563, + "theoretical_loss": 4.264301760736389, + "tokens_seen": 237305856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046872617853560686, + "loss": 3.3152, + "theoretical_loss": 4.264158313861557, + "tokens_seen": 237371392 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468716148445336, + "loss": 3.1966, + "theoretical_loss": 4.264014917671309, + "tokens_seen": 237436928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687061183550652, + "loss": 3.0981, + "theoretical_loss": 4.2638715721337554, + "tokens_seen": 237502464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 146518, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.969031572341919, + "objective/train/theoretical_loss": 4.263728277217032, + "objective/train/tokens_used": 258028000, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686960882647944, + "loss": 2.995, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686860581745236, + "loss": 2.7096, + "theoretical_loss": 4.263585032889306, + "tokens_seen": 237633536 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046867602808425276, + "loss": 3.2779, + "theoretical_loss": 4.263441839118776, + "tokens_seen": 237699072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046866599799398194, + "loss": 3.237, + "theoretical_loss": 4.2632986958736625, + "tokens_seen": 237764608 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686559679037111, + "loss": 3.1594, + "theoretical_loss": 4.263155603122221, + "tokens_seen": 237830144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046864593781344036, + "loss": 3.1277, + "theoretical_loss": 4.263012560832733, + "tokens_seen": 237895680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686359077231695, + "loss": 3.4282, + "theoretical_loss": 4.262869568973508, + "tokens_seen": 237961216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686258776328987, + "loss": 3.1725, + "theoretical_loss": 4.262726627512886, + "tokens_seen": 238026752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046861584754262785, + "loss": 3.1387, + "theoretical_loss": 4.262583736419234, + "tokens_seen": 238092288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686058174523571, + "loss": 3.2668, + "theoretical_loss": 4.26244089566095, + "tokens_seen": 238157824 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046859578736208626, + "loss": 3.023, + "theoretical_loss": 4.262298105206456, + "tokens_seen": 238223360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046858575727181545, + "loss": 3.4639, + "theoretical_loss": 4.262155365024207, + "tokens_seen": 238288896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685757271815446, + "loss": 3.1628, + "theoretical_loss": 4.262012675082685, + "tokens_seen": 238354432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685656970912738, + "loss": 3.0802, + "theoretical_loss": 4.261870035350399, + "tokens_seen": 238419968 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468555667001003, + "loss": 2.9962, + "theoretical_loss": 4.261727445795888, + "tokens_seen": 238485504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685456369107322, + "loss": 3.2431, + "theoretical_loss": 4.26158490638772, + "tokens_seen": 238551040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046853560682046135, + "loss": 3.1662, + "theoretical_loss": 4.261442417094488, + "tokens_seen": 238616576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685255767301906, + "loss": 3.1555, + "theoretical_loss": 4.261299977884816, + "tokens_seen": 238682112 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046851554663991977, + "loss": 2.8705, + "theoretical_loss": 4.2611575887273565, + "tokens_seen": 238747648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046850551654964895, + "loss": 2.9495, + "theoretical_loss": 4.261015249590789, + "tokens_seen": 238813184 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046849548645937813, + "loss": 3.0884, + "theoretical_loss": 4.260872960443822, + "tokens_seen": 238878720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684854563691073, + "loss": 3.192, + "theoretical_loss": 4.260730721255191, + "tokens_seen": 238944256 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684754262788365, + "loss": 3.1981, + "theoretical_loss": 4.260588531993662, + "tokens_seen": 239009792 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046846539618856573, + "loss": 2.9505, + "theoretical_loss": 4.260446392628026, + "tokens_seen": 239075328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684553660982949, + "loss": 3.3252, + "theoretical_loss": 4.2603043031271035, + "tokens_seen": 239140864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 147039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.313131332397461, + "objective/train/theoretical_loss": 4.260162263459744, + "objective/train/tokens_used": 259666400, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684453360080241, + "loss": 3.0978, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046843530591775327, + "loss": 2.9198, + "theoretical_loss": 4.260020273594824, + "tokens_seen": 239271936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046842527582748245, + "loss": 3.025, + "theoretical_loss": 4.259878333501247, + "tokens_seen": 239337472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684152457372117, + "loss": 3.0117, + "theoretical_loss": 4.259736443147946, + "tokens_seen": 239403008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684052156469408, + "loss": 3.3059, + "theoretical_loss": 4.259594602503881, + "tokens_seen": 239468544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046839518555667005, + "loss": 3.4276, + "theoretical_loss": 4.259452811538041, + "tokens_seen": 239534080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046838515546639923, + "loss": 3.3361, + "theoretical_loss": 4.259311070219441, + "tokens_seen": 239599616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683751253761284, + "loss": 3.3203, + "theoretical_loss": 4.259169378517125, + "tokens_seen": 239665152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683650952858576, + "loss": 3.2315, + "theoretical_loss": 4.259027736400165, + "tokens_seen": 239730688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683550651955868, + "loss": 3.014, + "theoretical_loss": 4.258886143837661, + "tokens_seen": 239796224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046834503510531595, + "loss": 3.2752, + "theoretical_loss": 4.258744600798739, + "tokens_seen": 239861760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683350050150452, + "loss": 3.4321, + "theoretical_loss": 4.2586031072525525, + "tokens_seen": 239927296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683249749247743, + "loss": 2.9211, + "theoretical_loss": 4.258461663168285, + "tokens_seen": 239992832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046831494483450355, + "loss": 3.0035, + "theoretical_loss": 4.258320268515147, + "tokens_seen": 240058368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683049147442327, + "loss": 2.9131, + "theoretical_loss": 4.258178923262376, + "tokens_seen": 240123904 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682948846539619, + "loss": 3.3793, + "theoretical_loss": 4.258037627379235, + "tokens_seen": 240189440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682848545636911, + "loss": 3.182, + "theoretical_loss": 4.257896380835018, + "tokens_seen": 240254976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682748244734203, + "loss": 3.0911, + "theoretical_loss": 4.257755183599045, + "tokens_seen": 240320512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046826479438314946, + "loss": 3.1443, + "theoretical_loss": 4.257614035640662, + "tokens_seen": 240386048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046825476429287864, + "loss": 3.3931, + "theoretical_loss": 4.257472936929246, + "tokens_seen": 240451584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682447342026078, + "loss": 3.2777, + "theoretical_loss": 4.257331887434198, + "tokens_seen": 240517120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046823470411233706, + "loss": 3.3189, + "theoretical_loss": 4.257190887124946, + "tokens_seen": 240582656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682246740220662, + "loss": 3.3554, + "theoretical_loss": 4.25704993597095, + "tokens_seen": 240648192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682146439317954, + "loss": 3.1868, + "theoretical_loss": 4.256909033941691, + "tokens_seen": 240713728 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682046138415246, + "loss": 3.3277, + "theoretical_loss": 4.256768181006683, + "tokens_seen": 240779264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 148118, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0634944438934326, + "objective/train/theoretical_loss": 4.2566273771354615, + "objective/train/tokens_used": 261304800, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681945837512538, + "loss": 3.1663, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046818455366098296, + "loss": 3.3725, + "theoretical_loss": 4.256486622297595, + "tokens_seen": 240910336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046817452357071214, + "loss": 3.4629, + "theoretical_loss": 4.256345916462674, + "tokens_seen": 240975872 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681644934804413, + "loss": 3.216, + "theoretical_loss": 4.256205259600321, + "tokens_seen": 241041408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046815446339017056, + "loss": 3.2409, + "theoretical_loss": 4.256064651680182, + "tokens_seen": 241106944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681444332998997, + "loss": 3.3066, + "theoretical_loss": 4.255924092671931, + "tokens_seen": 241172480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681344032096289, + "loss": 3.2428, + "theoretical_loss": 4.255783582545269, + "tokens_seen": 241238016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046812437311935805, + "loss": 3.2391, + "theoretical_loss": 4.255643121269924, + "tokens_seen": 241303552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681143430290873, + "loss": 3.1302, + "theoretical_loss": 4.255502708815651, + "tokens_seen": 241369088 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046810431293881646, + "loss": 3.1376, + "theoretical_loss": 4.255362345152234, + "tokens_seen": 241434624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046809428284854565, + "loss": 3.2276, + "theoretical_loss": 4.255222030249479, + "tokens_seen": 241500160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680842527582748, + "loss": 3.1871, + "theoretical_loss": 4.255081764077224, + "tokens_seen": 241565696 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468074222668004, + "loss": 3.02, + "theoretical_loss": 4.25494154660533, + "tokens_seen": 241631232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680641925777332, + "loss": 3.2228, + "theoretical_loss": 4.254801377803689, + "tokens_seen": 241696768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680541624874624, + "loss": 3.198, + "theoretical_loss": 4.254661257642215, + "tokens_seen": 241762304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046804413239719155, + "loss": 3.2133, + "theoretical_loss": 4.254521186090852, + "tokens_seen": 241827840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680341023069208, + "loss": 3.2564, + "theoretical_loss": 4.254381163119568, + "tokens_seen": 241893376 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046802407221664997, + "loss": 3.1465, + "theoretical_loss": 4.254241188698361, + "tokens_seen": 241958912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046801404212637915, + "loss": 3.3137, + "theoretical_loss": 4.2541012627972545, + "tokens_seen": 242024448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046800401203610833, + "loss": 3.148, + "theoretical_loss": 4.2539613853862965, + "tokens_seen": 242089984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679939819458375, + "loss": 3.1473, + "theoretical_loss": 4.253821556435565, + "tokens_seen": 242155520 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679839518555667, + "loss": 2.9928, + "theoretical_loss": 4.253681775915161, + "tokens_seen": 242221056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046797392176529593, + "loss": 3.2938, + "theoretical_loss": 4.253542043795215, + "tokens_seen": 242286592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046796389167502505, + "loss": 3.1909, + "theoretical_loss": 4.253402360045882, + "tokens_seen": 242352128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679538615847543, + "loss": 3.3872, + "theoretical_loss": 4.253262724637346, + "tokens_seen": 242417664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 148826, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4665045738220215, + "objective/train/theoretical_loss": 4.253123137539814, + "objective/train/tokens_used": 262943200, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679438314944834, + "loss": 3.1621, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046793380140421265, + "loss": 2.9992, + "theoretical_loss": 4.252983598723521, + "tokens_seen": 242548736 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046792377131394183, + "loss": 3.2264, + "theoretical_loss": 4.25284410815873, + "tokens_seen": 242614272 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467913741223671, + "loss": 3.0202, + "theoretical_loss": 4.2527046658157275, + "tokens_seen": 242679808 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679037111334002, + "loss": 3.2755, + "theoretical_loss": 4.252565271664828, + "tokens_seen": 242745344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046789368104312943, + "loss": 3.2883, + "theoretical_loss": 4.252425925676373, + "tokens_seen": 242810880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046788365095285856, + "loss": 2.821, + "theoretical_loss": 4.252286627820727, + "tokens_seen": 242876416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678736208625878, + "loss": 3.4768, + "theoretical_loss": 4.252147378068285, + "tokens_seen": 242941952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678635907723169, + "loss": 3.2938, + "theoretical_loss": 4.252008176389465, + "tokens_seen": 243007488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046785356068204616, + "loss": 3.0992, + "theoretical_loss": 4.251869022754712, + "tokens_seen": 243073024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046784353059177534, + "loss": 3.1465, + "theoretical_loss": 4.251729917134498, + "tokens_seen": 243138560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678335005015045, + "loss": 3.3141, + "theoretical_loss": 4.251590859499322, + "tokens_seen": 243204096 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678234704112337, + "loss": 3.0572, + "theoretical_loss": 4.251451849819704, + "tokens_seen": 243269632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678134403209629, + "loss": 3.1641, + "theoretical_loss": 4.251312888066197, + "tokens_seen": 243335168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046780341023069206, + "loss": 2.8932, + "theoretical_loss": 4.251173974209375, + "tokens_seen": 243400704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677933801404213, + "loss": 2.9821, + "theoretical_loss": 4.251035108219839, + "tokens_seen": 243466240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677833500501504, + "loss": 3.248, + "theoretical_loss": 4.250896290068218, + "tokens_seen": 243531776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046777331995987966, + "loss": 3.1376, + "theoretical_loss": 4.250757519725165, + "tokens_seen": 243597312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677632898696088, + "loss": 3.236, + "theoretical_loss": 4.25061879716136, + "tokens_seen": 243662848 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467753259779338, + "loss": 2.9136, + "theoretical_loss": 4.250480122347507, + "tokens_seen": 243728384 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677432296890672, + "loss": 2.9435, + "theoretical_loss": 4.250341495254337, + "tokens_seen": 243793920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677331995987964, + "loss": 2.9687, + "theoretical_loss": 4.250202915852608, + "tokens_seen": 243859456 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046772316950852556, + "loss": 3.1614, + "theoretical_loss": 4.250064384113102, + "tokens_seen": 243924992 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677131394182548, + "loss": 2.939, + "theoretical_loss": 4.249925900006627, + "tokens_seen": 243990528 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467703109327984, + "loss": 3.036, + "theoretical_loss": 4.249787463504019, + "tokens_seen": 244056064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 150000, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0193285942077637, + "objective/train/theoretical_loss": 4.249649074576134, + "objective/train/tokens_used": 264581600, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046769307923771316, + "loss": 3.1174, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046768304914744234, + "loss": 3.278, + "theoretical_loss": 4.249510733193862, + "tokens_seen": 244187136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676730190571715, + "loss": 2.9943, + "theoretical_loss": 4.249372439328111, + "tokens_seen": 244252672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046766298896690076, + "loss": 3.3433, + "theoretical_loss": 4.249234192949818, + "tokens_seen": 244318208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676529588766299, + "loss": 3.2905, + "theoretical_loss": 4.249095994029947, + "tokens_seen": 244383744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676429287863591, + "loss": 3.0341, + "theoretical_loss": 4.248957842539484, + "tokens_seen": 244449280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046763289869608825, + "loss": 3.3165, + "theoretical_loss": 4.248819738449442, + "tokens_seen": 244514816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676228686058175, + "loss": 3.0861, + "theoretical_loss": 4.2486816817308615, + "tokens_seen": 244580352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046761283851554666, + "loss": 3.2345, + "theoretical_loss": 4.248543672354805, + "tokens_seen": 244645888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046760280842527585, + "loss": 3.2232, + "theoretical_loss": 4.248405710292364, + "tokens_seen": 244711424 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467592778335005, + "loss": 3.1468, + "theoretical_loss": 4.248267795514652, + "tokens_seen": 244776960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675827482447342, + "loss": 3.122, + "theoretical_loss": 4.248129927992808, + "tokens_seen": 244842496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675727181544634, + "loss": 3.1999, + "theoretical_loss": 4.247992107698002, + "tokens_seen": 244908032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675626880641926, + "loss": 3.1818, + "theoretical_loss": 4.247854334601421, + "tokens_seen": 244973568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046755265797392175, + "loss": 3.2209, + "theoretical_loss": 4.247716608674283, + "tokens_seen": 245039104 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467542627883651, + "loss": 3.1481, + "theoretical_loss": 4.247578929887829, + "tokens_seen": 245104640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046753259779338017, + "loss": 3.0031, + "theoretical_loss": 4.247441298213326, + "tokens_seen": 245170176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046752256770310935, + "loss": 2.998, + "theoretical_loss": 4.247303713622067, + "tokens_seen": 245235712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046751253761283853, + "loss": 3.1993, + "theoretical_loss": 4.247166176085367, + "tokens_seen": 245301248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675025075225677, + "loss": 3.1806, + "theoretical_loss": 4.247028685574569, + "tokens_seen": 245366784 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674924774322969, + "loss": 3.3216, + "theoretical_loss": 4.246891242061041, + "tokens_seen": 245432320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046748244734202613, + "loss": 3.0241, + "theoretical_loss": 4.246753845516174, + "tokens_seen": 245497856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046747241725175525, + "loss": 3.3789, + "theoretical_loss": 4.246616495911388, + "tokens_seen": 245563392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674623871614845, + "loss": 3.1574, + "theoretical_loss": 4.246479193218123, + "tokens_seen": 245628928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674523570712136, + "loss": 3.1504, + "theoretical_loss": 4.246341937407848, + "tokens_seen": 245694464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 150657, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3028979301452637, + "objective/train/theoretical_loss": 4.246204728452055, + "objective/train/tokens_used": 266220000, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046744232698094285, + "loss": 3.3196, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046743229689067203, + "loss": 3.2711, + "theoretical_loss": 4.246067566322259, + "tokens_seen": 245825536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674222668004012, + "loss": 3.3492, + "theoretical_loss": 4.245930450990007, + "tokens_seen": 245891072 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674122367101304, + "loss": 3.2023, + "theoretical_loss": 4.245793382426861, + "tokens_seen": 245956608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046740220661985963, + "loss": 3.0581, + "theoretical_loss": 4.245656360604417, + "tokens_seen": 246022144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046739217652958876, + "loss": 3.2156, + "theoretical_loss": 4.24551938549429, + "tokens_seen": 246087680 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467382146439318, + "loss": 3.1543, + "theoretical_loss": 4.2453824570681205, + "tokens_seen": 246153216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673721163490471, + "loss": 2.9836, + "theoretical_loss": 4.245245575297577, + "tokens_seen": 246218752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046736208625877636, + "loss": 3.0194, + "theoretical_loss": 4.2451087401543495, + "tokens_seen": 246284288 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046735205616850554, + "loss": 3.1133, + "theoretical_loss": 4.244971951610154, + "tokens_seen": 246349824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673420260782347, + "loss": 3.1389, + "theoretical_loss": 4.24483520963673, + "tokens_seen": 246415360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673319959879639, + "loss": 3.3422, + "theoretical_loss": 4.244698514205844, + "tokens_seen": 246480896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673219658976931, + "loss": 3.0803, + "theoretical_loss": 4.244561865289285, + "tokens_seen": 246546432 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046731193580742226, + "loss": 3.1255, + "theoretical_loss": 4.244425262858867, + "tokens_seen": 246611968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673019057171515, + "loss": 3.152, + "theoretical_loss": 4.2442887068864295, + "tokens_seen": 246677504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672918756268806, + "loss": 3.2079, + "theoretical_loss": 4.244152197343835, + "tokens_seen": 246743040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046728184553660986, + "loss": 3.4062, + "theoretical_loss": 4.244015734202973, + "tokens_seen": 246808576 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467271815446339, + "loss": 3.1603, + "theoretical_loss": 4.243879317435755, + "tokens_seen": 246874112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672617853560682, + "loss": 3.1124, + "theoretical_loss": 4.243742947014117, + "tokens_seen": 246939648 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672517552657974, + "loss": 2.9212, + "theoretical_loss": 4.243606622910021, + "tokens_seen": 247005184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672417251755266, + "loss": 3.074, + "theoretical_loss": 4.243470345095453, + "tokens_seen": 247070720 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046723169508525576, + "loss": 3.1613, + "theoretical_loss": 4.2433341135424225, + "tokens_seen": 247136256 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467221664994985, + "loss": 3.1584, + "theoretical_loss": 4.243197928222964, + "tokens_seen": 247201792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672116349047141, + "loss": 3.2257, + "theoretical_loss": 4.243061789109136, + "tokens_seen": 247267328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046720160481444336, + "loss": 3.1837, + "theoretical_loss": 4.242925696173021, + "tokens_seen": 247332864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 151862, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.255047082901001, + "objective/train/theoretical_loss": 4.2427896493867285, + "objective/train/tokens_used": 267858400, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671915747241725, + "loss": 2.9149, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671815446339017, + "loss": 2.8987, + "theoretical_loss": 4.242653648722387, + "tokens_seen": 247463936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671715145436309, + "loss": 2.9504, + "theoretical_loss": 4.242517694152154, + "tokens_seen": 247529472 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671614844533601, + "loss": 3.1881, + "theoretical_loss": 4.24238178564821, + "tokens_seen": 247595008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046715145436308927, + "loss": 2.9594, + "theoretical_loss": 4.242245923182756, + "tokens_seen": 247660544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046714142427281845, + "loss": 3.1327, + "theoretical_loss": 4.242110106728022, + "tokens_seen": 247726080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046713139418254763, + "loss": 3.0429, + "theoretical_loss": 4.241974336256261, + "tokens_seen": 247791616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046712136409227686, + "loss": 3.1163, + "theoretical_loss": 4.241838611739748, + "tokens_seen": 247857152 + }, + { + "epoch": 0.08, + "learning_rate": 0.000467111334002006, + "loss": 3.2067, + "theoretical_loss": 4.241702933150783, + "tokens_seen": 247922688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671013039117352, + "loss": 3.3482, + "theoretical_loss": 4.241567300461693, + "tokens_seen": 247988224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046709127382146435, + "loss": 3.3143, + "theoretical_loss": 4.241431713644823, + "tokens_seen": 248053760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670812437311936, + "loss": 3.2176, + "theoretical_loss": 4.241296172672547, + "tokens_seen": 248119296 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046707121364092277, + "loss": 3.1752, + "theoretical_loss": 4.24116067751726, + "tokens_seen": 248184832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046706118355065195, + "loss": 3.2537, + "theoretical_loss": 4.241025228151383, + "tokens_seen": 248250368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046705115346038113, + "loss": 3.4286, + "theoretical_loss": 4.24088982454736, + "tokens_seen": 248315904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046704112337011037, + "loss": 3.248, + "theoretical_loss": 4.240754466677659, + "tokens_seen": 248381440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670310932798395, + "loss": 2.9887, + "theoretical_loss": 4.240619154514771, + "tokens_seen": 248446976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046702106318956873, + "loss": 3.2964, + "theoretical_loss": 4.240483888031212, + "tokens_seen": 248512512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046701103309929786, + "loss": 2.9216, + "theoretical_loss": 4.240348667199521, + "tokens_seen": 248578048 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670010030090271, + "loss": 2.9462, + "theoretical_loss": 4.240213491992261, + "tokens_seen": 248643584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669909729187563, + "loss": 3.2691, + "theoretical_loss": 4.240078362382019, + "tokens_seen": 248709120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046698094282848545, + "loss": 2.8617, + "theoretical_loss": 4.239943278341404, + "tokens_seen": 248774656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046697091273821464, + "loss": 2.9596, + "theoretical_loss": 4.239808239843052, + "tokens_seen": 248840192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669608826479438, + "loss": 2.9612, + "theoretical_loss": 4.239673246859619, + "tokens_seen": 248905728 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046695085255767305, + "loss": 2.9767, + "theoretical_loss": 4.239538299363788, + "tokens_seen": 248971264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 152583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3142855167388916, + "objective/train/theoretical_loss": 4.239403397328261, + "objective/train/tokens_used": 269496800, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046694082246740223, + "loss": 3.3554, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669307923771314, + "loss": 3.1367, + "theoretical_loss": 4.239268540725769, + "tokens_seen": 249102336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669207622868606, + "loss": 3.0328, + "theoretical_loss": 4.239133729529064, + "tokens_seen": 249167872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046691073219658983, + "loss": 3.1881, + "theoretical_loss": 4.2389989637109196, + "tokens_seen": 249233408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046690070210631896, + "loss": 3.1403, + "theoretical_loss": 4.2388642432441355, + "tokens_seen": 249298944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668906720160482, + "loss": 3.099, + "theoretical_loss": 4.238729568101535, + "tokens_seen": 249364480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668806419257773, + "loss": 3.2886, + "theoretical_loss": 4.238594938255963, + "tokens_seen": 249430016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046687061183550656, + "loss": 2.9976, + "theoretical_loss": 4.2384603536802885, + "tokens_seen": 249495552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046686058174523574, + "loss": 3.1615, + "theoretical_loss": 4.238325814347404, + "tokens_seen": 249561088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668505516549649, + "loss": 3.1596, + "theoretical_loss": 4.238191320230227, + "tokens_seen": 249626624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668405215646941, + "loss": 3.2094, + "theoretical_loss": 4.238056871301695, + "tokens_seen": 249692160 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668304914744233, + "loss": 3.0536, + "theoretical_loss": 4.237922467534771, + "tokens_seen": 249757696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046682046138415246, + "loss": 2.9296, + "theoretical_loss": 4.237788108902441, + "tokens_seen": 249823232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668104312938817, + "loss": 3.278, + "theoretical_loss": 4.237653795377714, + "tokens_seen": 249888768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668004012036108, + "loss": 3.0784, + "theoretical_loss": 4.237519526933622, + "tokens_seen": 249954304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046679037111334006, + "loss": 3.3284, + "theoretical_loss": 4.2373853035432205, + "tokens_seen": 250019840 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667803410230692, + "loss": 3.1476, + "theoretical_loss": 4.237251125179588, + "tokens_seen": 250085376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667703109327984, + "loss": 3.4263, + "theoretical_loss": 4.237116991815826, + "tokens_seen": 250150912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667602808425276, + "loss": 3.3711, + "theoretical_loss": 4.23698290342506, + "tokens_seen": 250216448 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667502507522568, + "loss": 3.1352, + "theoretical_loss": 4.236848859980437, + "tokens_seen": 250281984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046674022066198596, + "loss": 3.1904, + "theoretical_loss": 4.23671486145513, + "tokens_seen": 250347520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667301905717152, + "loss": 3.1398, + "theoretical_loss": 4.236580907822331, + "tokens_seen": 250413056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667201604814443, + "loss": 3.2989, + "theoretical_loss": 4.236446999055257, + "tokens_seen": 250478592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046671013039117356, + "loss": 3.215, + "theoretical_loss": 4.2363131351271495, + "tokens_seen": 250544128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667001003009027, + "loss": 3.0471, + "theoretical_loss": 4.2361793160112695, + "tokens_seen": 250609664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 153242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2777795791625977, + "objective/train/theoretical_loss": 4.236045541680905, + "objective/train/tokens_used": 271135200, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666900702106319, + "loss": 3.1679, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666800401203611, + "loss": 3.0121, + "theoretical_loss": 4.235911812109363, + "tokens_seen": 250740736 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666700100300903, + "loss": 3.1682, + "theoretical_loss": 4.235778127269976, + "tokens_seen": 250806272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046665997993981947, + "loss": 3.236, + "theoretical_loss": 4.235644487136098, + "tokens_seen": 250871808 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046664994984954865, + "loss": 3.1782, + "theoretical_loss": 4.235510891681108, + "tokens_seen": 250937344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046663991975927783, + "loss": 2.8634, + "theoretical_loss": 4.235377340878404, + "tokens_seen": 251002880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046662988966900706, + "loss": 3.3324, + "theoretical_loss": 4.23524383470141, + "tokens_seen": 251068416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666198595787362, + "loss": 3.346, + "theoretical_loss": 4.235110373123572, + "tokens_seen": 251133952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046660982948846543, + "loss": 3.2462, + "theoretical_loss": 4.2349769561183574, + "tokens_seen": 251199488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046659979939819455, + "loss": 3.1068, + "theoretical_loss": 4.2348435836592575, + "tokens_seen": 251265024 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665897693079238, + "loss": 3.1695, + "theoretical_loss": 4.234710255719786, + "tokens_seen": 251330560 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046657973921765297, + "loss": 3.1471, + "theoretical_loss": 4.234576972273481, + "tokens_seen": 251396096 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046656970912738215, + "loss": 3.0517, + "theoretical_loss": 4.234443733293899, + "tokens_seen": 251461632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046655967903711133, + "loss": 3.1296, + "theoretical_loss": 4.234310538754624, + "tokens_seen": 251527168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046654964894684057, + "loss": 3.2316, + "theoretical_loss": 4.2341773886292575, + "tokens_seen": 251592704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665396188565697, + "loss": 3.0754, + "theoretical_loss": 4.234044282891429, + "tokens_seen": 251658240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046652958876629893, + "loss": 3.4376, + "theoretical_loss": 4.233911221514787, + "tokens_seen": 251723776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046651955867602806, + "loss": 3.1209, + "theoretical_loss": 4.233778204473002, + "tokens_seen": 251789312 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665095285857573, + "loss": 3.054, + "theoretical_loss": 4.23364523173977, + "tokens_seen": 251854848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664994984954865, + "loss": 3.1497, + "theoretical_loss": 4.233512303288807, + "tokens_seen": 251920384 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046648946840521565, + "loss": 3.243, + "theoretical_loss": 4.233379419093851, + "tokens_seen": 251985920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046647943831494484, + "loss": 2.9777, + "theoretical_loss": 4.233246579128666, + "tokens_seen": 252051456 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466469408224674, + "loss": 3.0244, + "theoretical_loss": 4.233113783367033, + "tokens_seen": 252116992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664593781344032, + "loss": 2.8659, + "theoretical_loss": 4.232981031782761, + "tokens_seen": 252182528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046644934804413243, + "loss": 3.0599, + "theoretical_loss": 4.232848324349677, + "tokens_seen": 252248064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 154565, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2938530445098877, + "objective/train/theoretical_loss": 4.232715661041632, + "objective/train/tokens_used": 272773600, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046643931795386156, + "loss": 3.1579, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664292878635908, + "loss": 2.8977, + "theoretical_loss": 4.232583041832499, + "tokens_seen": 252379136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664192577733199, + "loss": 2.9725, + "theoretical_loss": 4.232450466696174, + "tokens_seen": 252444672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046640922768304916, + "loss": 3.247, + "theoretical_loss": 4.2323179356065745, + "tokens_seen": 252510208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046639919759277834, + "loss": 3.0137, + "theoretical_loss": 4.23218544853764, + "tokens_seen": 252575744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663891675025075, + "loss": 3.1024, + "theoretical_loss": 4.232053005463333, + "tokens_seen": 252641280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663791374122367, + "loss": 3.1656, + "theoretical_loss": 4.231920606357638, + "tokens_seen": 252706816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046636910732196594, + "loss": 3.2556, + "theoretical_loss": 4.231788251194559, + "tokens_seen": 252772352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046635907723169506, + "loss": 3.2828, + "theoretical_loss": 4.231655939948127, + "tokens_seen": 252837888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663490471414243, + "loss": 3.0292, + "theoretical_loss": 4.231523672592392, + "tokens_seen": 252903424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663390170511534, + "loss": 3.2209, + "theoretical_loss": 4.231391449101425, + "tokens_seen": 252968960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046632898696088266, + "loss": 3.3026, + "theoretical_loss": 4.231259269449322, + "tokens_seen": 253034496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046631895687061184, + "loss": 3.2373, + "theoretical_loss": 4.231127133610198, + "tokens_seen": 253100032 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466308926780341, + "loss": 3.1833, + "theoretical_loss": 4.230995041558194, + "tokens_seen": 253165568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662988966900702, + "loss": 3.0795, + "theoretical_loss": 4.230862993267468, + "tokens_seen": 253231104 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662888665997994, + "loss": 3.1932, + "theoretical_loss": 4.230730988712205, + "tokens_seen": 253296640 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046627883650952857, + "loss": 3.2634, + "theoretical_loss": 4.230599027866606, + "tokens_seen": 253362176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662688064192578, + "loss": 3.3007, + "theoretical_loss": 4.2304671107048994, + "tokens_seen": 253427712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046625877632898693, + "loss": 3.1446, + "theoretical_loss": 4.2303352372013325, + "tokens_seen": 253493248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046624874623871616, + "loss": 3.2354, + "theoretical_loss": 4.230203407330176, + "tokens_seen": 253558784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046623871614844535, + "loss": 3.2521, + "theoretical_loss": 4.230071621065721, + "tokens_seen": 253624320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662286860581745, + "loss": 3.3151, + "theoretical_loss": 4.2299398783822815, + "tokens_seen": 253689856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662186559679037, + "loss": 3.0526, + "theoretical_loss": 4.229808179254192, + "tokens_seen": 253755392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662086258776329, + "loss": 3.1744, + "theoretical_loss": 4.22967652365581, + "tokens_seen": 253820928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661985957873621, + "loss": 3.2694, + "theoretical_loss": 4.229544911561513, + "tokens_seen": 253886464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 156106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7020857334136963, + "objective/train/theoretical_loss": 4.229413342945703, + "objective/train/tokens_used": 274412000, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661885656970913, + "loss": 3.1877, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661785356068205, + "loss": 3.2587, + "theoretical_loss": 4.229281817782801, + "tokens_seen": 254017536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046616850551654967, + "loss": 3.2171, + "theoretical_loss": 4.229150336047251, + "tokens_seen": 254083072 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046615847542627885, + "loss": 2.7179, + "theoretical_loss": 4.229018897713519, + "tokens_seen": 254148608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046614844533600803, + "loss": 3.0186, + "theoretical_loss": 4.22888750275609, + "tokens_seen": 254214144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046613841524573727, + "loss": 3.1104, + "theoretical_loss": 4.228756151149475, + "tokens_seen": 254279680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661283851554664, + "loss": 3.2379, + "theoretical_loss": 4.228624842868202, + "tokens_seen": 254345216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046611835506519563, + "loss": 3.1056, + "theoretical_loss": 4.228493577886824, + "tokens_seen": 254410752 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046610832497492475, + "loss": 3.2572, + "theoretical_loss": 4.228362356179913, + "tokens_seen": 254476288 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466098294884654, + "loss": 3.019, + "theoretical_loss": 4.228231177722063, + "tokens_seen": 254541824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046608826479438317, + "loss": 3.1823, + "theoretical_loss": 4.228100042487892, + "tokens_seen": 254607360 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046607823470411235, + "loss": 3.0422, + "theoretical_loss": 4.227968950452035, + "tokens_seen": 254672896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046606820461384153, + "loss": 3.0621, + "theoretical_loss": 4.227837901589153, + "tokens_seen": 254738432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046605817452357077, + "loss": 2.9792, + "theoretical_loss": 4.227706895873924, + "tokens_seen": 254803968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660481444332999, + "loss": 3.2174, + "theoretical_loss": 4.227575933281051, + "tokens_seen": 254869504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046603811434302913, + "loss": 3.2159, + "theoretical_loss": 4.227445013785257, + "tokens_seen": 254935040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046602808425275826, + "loss": 3.2537, + "theoretical_loss": 4.227314137361285, + "tokens_seen": 255000576 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660180541624875, + "loss": 3.2481, + "theoretical_loss": 4.227183303983901, + "tokens_seen": 255066112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660080240722167, + "loss": 3.1691, + "theoretical_loss": 4.227052513627893, + "tokens_seen": 255131648 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046599799398194586, + "loss": 3.1694, + "theoretical_loss": 4.226921766268067, + "tokens_seen": 255197184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046598796389167504, + "loss": 3.1596, + "theoretical_loss": 4.226791061879253, + "tokens_seen": 255262720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659779338014042, + "loss": 3.0473, + "theoretical_loss": 4.226660400436302, + "tokens_seen": 255328256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659679037111334, + "loss": 3.1521, + "theoretical_loss": 4.226529781914084, + "tokens_seen": 255393792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046595787362086263, + "loss": 3.0574, + "theoretical_loss": 4.226399206287493, + "tokens_seen": 255459328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046594784353059176, + "loss": 3.1197, + "theoretical_loss": 4.226268673531442, + "tokens_seen": 255524864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 156630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.661715507507324, + "objective/train/theoretical_loss": 4.226138183620867, + "objective/train/tokens_used": 276050400, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465937813440321, + "loss": 3.0664, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659277833500501, + "loss": 3.0895, + "theoretical_loss": 4.226007736530723, + "tokens_seen": 255655936 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046591775325977936, + "loss": 2.9574, + "theoretical_loss": 4.225877332235987, + "tokens_seen": 255721472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046590772316950854, + "loss": 3.1183, + "theoretical_loss": 4.225746970711657, + "tokens_seen": 255787008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658976930792377, + "loss": 2.9636, + "theoretical_loss": 4.225616651932753, + "tokens_seen": 255852544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658876629889669, + "loss": 3.2771, + "theoretical_loss": 4.225486375874315, + "tokens_seen": 255918080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046587763289869614, + "loss": 2.8867, + "theoretical_loss": 4.225356142511402, + "tokens_seen": 255983616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046586760280842526, + "loss": 3.2734, + "theoretical_loss": 4.225225951819099, + "tokens_seen": 256049152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658575727181545, + "loss": 3.1851, + "theoretical_loss": 4.225095803772507, + "tokens_seen": 256114688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658475426278836, + "loss": 3.0544, + "theoretical_loss": 4.22496569834675, + "tokens_seen": 256180224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046583751253761286, + "loss": 3.1337, + "theoretical_loss": 4.224835635516973, + "tokens_seen": 256245760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046582748244734204, + "loss": 2.9687, + "theoretical_loss": 4.224705615258341, + "tokens_seen": 256311296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658174523570712, + "loss": 3.1143, + "theoretical_loss": 4.224575637546041, + "tokens_seen": 256376832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658074222668004, + "loss": 3.1526, + "theoretical_loss": 4.224445702355279, + "tokens_seen": 256442368 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657973921765296, + "loss": 3.3006, + "theoretical_loss": 4.2243158096612845, + "tokens_seen": 256507904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046578736208625877, + "loss": 3.0952, + "theoretical_loss": 4.224185959439305, + "tokens_seen": 256573440 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465777331995988, + "loss": 2.8397, + "theoretical_loss": 4.22405615166461, + "tokens_seen": 256638976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046576730190571713, + "loss": 3.1773, + "theoretical_loss": 4.22392638631249, + "tokens_seen": 256704512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046575727181544636, + "loss": 3.0894, + "theoretical_loss": 4.223796663358255, + "tokens_seen": 256770048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046574724172517555, + "loss": 3.044, + "theoretical_loss": 4.223666982777237, + "tokens_seen": 256835584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657372116349047, + "loss": 3.1196, + "theoretical_loss": 4.223537344544788, + "tokens_seen": 256901120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657271815446339, + "loss": 3.3258, + "theoretical_loss": 4.223407748636282, + "tokens_seen": 256966656 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657171514543631, + "loss": 3.1867, + "theoretical_loss": 4.22327819502711, + "tokens_seen": 257032192 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046570712136409227, + "loss": 3.0392, + "theoretical_loss": 4.223148683692687, + "tokens_seen": 257097728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656970912738215, + "loss": 3.1276, + "theoretical_loss": 4.223019214608446, + "tokens_seen": 257163264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 157258, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.008852005004883, + "objective/train/theoretical_loss": 4.222889787749845, + "objective/train/tokens_used": 277688800, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046568706118355063, + "loss": 3.0922, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046567703109327987, + "loss": 2.8066, + "theoretical_loss": 4.222760403092358, + "tokens_seen": 257294336 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465667001003009, + "loss": 3.0272, + "theoretical_loss": 4.22263106061148, + "tokens_seen": 257359872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046565697091273823, + "loss": 3.0383, + "theoretical_loss": 4.222501760282729, + "tokens_seen": 257425408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656469408224674, + "loss": 2.9278, + "theoretical_loss": 4.22237250208164, + "tokens_seen": 257490944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656369107321966, + "loss": 3.0506, + "theoretical_loss": 4.222243285983772, + "tokens_seen": 257556480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656268806419258, + "loss": 2.754, + "theoretical_loss": 4.222114111964703, + "tokens_seen": 257622016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046561685055165495, + "loss": 2.9813, + "theoretical_loss": 4.221984980000029, + "tokens_seen": 257687552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046560682046138414, + "loss": 2.94, + "theoretical_loss": 4.2218558900653695, + "tokens_seen": 257753088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046559679037111337, + "loss": 2.7958, + "theoretical_loss": 4.221726842136364, + "tokens_seen": 257818624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655867602808425, + "loss": 3.1393, + "theoretical_loss": 4.2215978361886695, + "tokens_seen": 257884160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046557673019057173, + "loss": 3.1812, + "theoretical_loss": 4.221468872197967, + "tokens_seen": 257949696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655667001003009, + "loss": 3.066, + "theoretical_loss": 4.221339950139956, + "tokens_seen": 258015232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655566700100301, + "loss": 2.9906, + "theoretical_loss": 4.221211069990357, + "tokens_seen": 258080768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655466399197593, + "loss": 3.1732, + "theoretical_loss": 4.221082231724908, + "tokens_seen": 258146304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046553660982948846, + "loss": 3.0643, + "theoretical_loss": 4.22095343531937, + "tokens_seen": 258211840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046552657973921764, + "loss": 3.1475, + "theoretical_loss": 4.220824680749525, + "tokens_seen": 258277376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655165496489469, + "loss": 3.227, + "theoretical_loss": 4.220695967991171, + "tokens_seen": 258342912 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465506519558676, + "loss": 3.1797, + "theoretical_loss": 4.220567297020131, + "tokens_seen": 258408448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046549648946840524, + "loss": 3.0443, + "theoretical_loss": 4.220438667812244, + "tokens_seen": 258473984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046548645937813436, + "loss": 3.0716, + "theoretical_loss": 4.220310080343373, + "tokens_seen": 258539520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654764292878636, + "loss": 3.2806, + "theoretical_loss": 4.220181534589398, + "tokens_seen": 258605056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654663991975928, + "loss": 3.0484, + "theoretical_loss": 4.22005303052622, + "tokens_seen": 258670592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046545636910732196, + "loss": 3.2093, + "theoretical_loss": 4.219924568129759, + "tokens_seen": 258736128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654463390170512, + "loss": 2.9654, + "theoretical_loss": 4.219796147375957, + "tokens_seen": 258801664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 158510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9906694889068604, + "objective/train/theoretical_loss": 4.219667768240775, + "objective/train/tokens_used": 279327200, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654363089267803, + "loss": 3.068, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046542627883650956, + "loss": 3.2658, + "theoretical_loss": 4.219539430700195, + "tokens_seen": 258932736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046541624874623874, + "loss": 3.0762, + "theoretical_loss": 4.2194111347302155, + "tokens_seen": 258998272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654062186559679, + "loss": 3.2659, + "theoretical_loss": 4.219282880306859, + "tokens_seen": 259063808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653961885656971, + "loss": 3.0452, + "theoretical_loss": 4.219154667406166, + "tokens_seen": 259129344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046538615847542634, + "loss": 3.0682, + "theoretical_loss": 4.219026496004198, + "tokens_seen": 259194880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046537612838515546, + "loss": 3.0652, + "theoretical_loss": 4.218898366077035, + "tokens_seen": 259260416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653660982948847, + "loss": 3.2532, + "theoretical_loss": 4.218770277600775, + "tokens_seen": 259325952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653560682046138, + "loss": 2.9702, + "theoretical_loss": 4.218642230551541, + "tokens_seen": 259391488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046534603811434306, + "loss": 2.987, + "theoretical_loss": 4.218514224905472, + "tokens_seen": 259457024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046533600802407224, + "loss": 3.1381, + "theoretical_loss": 4.218386260638727, + "tokens_seen": 259522560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653259779338014, + "loss": 2.8499, + "theoretical_loss": 4.2182583377274865, + "tokens_seen": 259588096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653159478435306, + "loss": 3.0011, + "theoretical_loss": 4.218130456147948, + "tokens_seen": 259653632 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653059177532598, + "loss": 3.0006, + "theoretical_loss": 4.218002615876332, + "tokens_seen": 259719168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046529588766298897, + "loss": 2.8484, + "theoretical_loss": 4.217874816888877, + "tokens_seen": 259784704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652858575727182, + "loss": 3.0114, + "theoretical_loss": 4.217747059161839, + "tokens_seen": 259850240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046527582748244733, + "loss": 3.1379, + "theoretical_loss": 4.217619342671498, + "tokens_seen": 259915776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046526579739217656, + "loss": 3.1485, + "theoretical_loss": 4.2174916673941505, + "tokens_seen": 259981312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046525576730190575, + "loss": 3.1037, + "theoretical_loss": 4.217364033306113, + "tokens_seen": 260046848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652457372116349, + "loss": 3.1131, + "theoretical_loss": 4.217236440383724, + "tokens_seen": 260112384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652357071213641, + "loss": 3.1678, + "theoretical_loss": 4.217108888603337, + "tokens_seen": 260177920 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652256770310933, + "loss": 3.1065, + "theoretical_loss": 4.21698137794133, + "tokens_seen": 260243456 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046521564694082247, + "loss": 2.9596, + "theoretical_loss": 4.216853908374097, + "tokens_seen": 260308992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652056168505517, + "loss": 3.008, + "theoretical_loss": 4.216726479878052, + "tokens_seen": 260374528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046519558676028083, + "loss": 3.0454, + "theoretical_loss": 4.216599092429631, + "tokens_seen": 260440064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 159098, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7527639865875244, + "objective/train/theoretical_loss": 4.216471746005286, + "objective/train/tokens_used": 280965600, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046518555667001007, + "loss": 3.0212, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651755265797392, + "loss": 3.0566, + "theoretical_loss": 4.216344440581491, + "tokens_seen": 260571136 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046516549648946843, + "loss": 2.9021, + "theoretical_loss": 4.2162171761347365, + "tokens_seen": 260636672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651554663991976, + "loss": 2.9991, + "theoretical_loss": 4.2160899526415365, + "tokens_seen": 260702208 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651454363089268, + "loss": 3.0791, + "theoretical_loss": 4.215962770078422, + "tokens_seen": 260767744 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465135406218656, + "loss": 3.1413, + "theoretical_loss": 4.215835628421942, + "tokens_seen": 260833280 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046512537612838515, + "loss": 2.9224, + "theoretical_loss": 4.215708527648667, + "tokens_seen": 260898816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046511534603811434, + "loss": 3.189, + "theoretical_loss": 4.215581467735187, + "tokens_seen": 260964352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046510531594784357, + "loss": 3.3999, + "theoretical_loss": 4.215454448658109, + "tokens_seen": 261029888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650952858575727, + "loss": 2.9878, + "theoretical_loss": 4.215327470394062, + "tokens_seen": 261095424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046508525576730193, + "loss": 3.3008, + "theoretical_loss": 4.215200532919691, + "tokens_seen": 261160960 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650752256770311, + "loss": 3.126, + "theoretical_loss": 4.215073636211664, + "tokens_seen": 261226496 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650651955867603, + "loss": 3.0601, + "theoretical_loss": 4.214946780246666, + "tokens_seen": 261292032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650551654964895, + "loss": 3.2414, + "theoretical_loss": 4.214819965001401, + "tokens_seen": 261357568 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046504513540621866, + "loss": 3.3807, + "theoretical_loss": 4.214693190452593, + "tokens_seen": 261423104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046503510531594784, + "loss": 3.2496, + "theoretical_loss": 4.214566456576984, + "tokens_seen": 261488640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650250752256771, + "loss": 3.0059, + "theoretical_loss": 4.214439763351336, + "tokens_seen": 261554176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650150451354062, + "loss": 3.3606, + "theoretical_loss": 4.214313110752431, + "tokens_seen": 261619712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046500501504513544, + "loss": 3.2016, + "theoretical_loss": 4.214186498757069, + "tokens_seen": 261685248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046499498495486456, + "loss": 3.238, + "theoretical_loss": 4.214059927342068, + "tokens_seen": 261750784 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649849548645938, + "loss": 3.0229, + "theoretical_loss": 4.213933396484267, + "tokens_seen": 261816320 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464974924774323, + "loss": 3.3204, + "theoretical_loss": 4.213806906160523, + "tokens_seen": 261881856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046496489468405216, + "loss": 3.1548, + "theoretical_loss": 4.213680456347712, + "tokens_seen": 261947392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046495486459378134, + "loss": 3.0314, + "theoretical_loss": 4.213554047022729, + "tokens_seen": 262012928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649448345035105, + "loss": 3.1052, + "theoretical_loss": 4.213427678162489, + "tokens_seen": 262078464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 160106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2078521251678467, + "objective/train/theoretical_loss": 4.213301349743924, + "objective/train/tokens_used": 282604000, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649348044132397, + "loss": 3.285, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046492477432296894, + "loss": 3.2845, + "theoretical_loss": 4.2131750617439865, + "tokens_seen": 262209536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046491474423269807, + "loss": 3.0031, + "theoretical_loss": 4.213048814139647, + "tokens_seen": 262275072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649047141424273, + "loss": 3.0222, + "theoretical_loss": 4.212922606907895, + "tokens_seen": 262340608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648946840521565, + "loss": 3.0202, + "theoretical_loss": 4.21279644002574, + "tokens_seen": 262406144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046488465396188566, + "loss": 2.9464, + "theoretical_loss": 4.212670313470209, + "tokens_seen": 262471680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046487462387161485, + "loss": 3.3016, + "theoretical_loss": 4.212544227218347, + "tokens_seen": 262537216 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464864593781344, + "loss": 3.0782, + "theoretical_loss": 4.21241818124722, + "tokens_seen": 262602752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648545636910732, + "loss": 3.1333, + "theoretical_loss": 4.212292175533912, + "tokens_seen": 262668288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046484453360080244, + "loss": 2.8875, + "theoretical_loss": 4.212166210055526, + "tokens_seen": 262733824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046483450351053157, + "loss": 2.84, + "theoretical_loss": 4.212040284789181, + "tokens_seen": 262799360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648244734202608, + "loss": 3.2869, + "theoretical_loss": 4.211914399712019, + "tokens_seen": 262864896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046481444332998993, + "loss": 3.0755, + "theoretical_loss": 4.211788554801198, + "tokens_seen": 262930432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046480441323971917, + "loss": 3.207, + "theoretical_loss": 4.211662750033895, + "tokens_seen": 262995968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046479438314944835, + "loss": 2.9361, + "theoretical_loss": 4.211536985387307, + "tokens_seen": 263061504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046478435305917753, + "loss": 3.1135, + "theoretical_loss": 4.211411260838647, + "tokens_seen": 263127040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647743229689067, + "loss": 3.003, + "theoretical_loss": 4.2112855763651496, + "tokens_seen": 263192576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046476429287863595, + "loss": 3.252, + "theoretical_loss": 4.211159931944065, + "tokens_seen": 263258112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647542627883651, + "loss": 3.0456, + "theoretical_loss": 4.211034327552666, + "tokens_seen": 263323648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647442326980943, + "loss": 3.2239, + "theoretical_loss": 4.210908763168239, + "tokens_seen": 263389184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046473420260782344, + "loss": 3.3088, + "theoretical_loss": 4.210783238768093, + "tokens_seen": 263454720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046472417251755267, + "loss": 3.1186, + "theoretical_loss": 4.210657754329553, + "tokens_seen": 263520256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647141424272819, + "loss": 3.2965, + "theoretical_loss": 4.210532309829965, + "tokens_seen": 263585792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046470411233701103, + "loss": 3.2105, + "theoretical_loss": 4.21040690524669, + "tokens_seen": 263651328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046469408224674027, + "loss": 2.8521, + "theoretical_loss": 4.21028154055711, + "tokens_seen": 263716864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 160649, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5538294315338135, + "objective/train/theoretical_loss": 4.2101562157386265, + "objective/train/tokens_used": 284242400, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646840521564694, + "loss": 2.7667, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046467402206619863, + "loss": 2.8686, + "theoretical_loss": 4.210030930768655, + "tokens_seen": 263847936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646639919759278, + "loss": 3.1047, + "theoretical_loss": 4.2099056856246335, + "tokens_seen": 263913472 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464653961885657, + "loss": 3.2001, + "theoretical_loss": 4.209780480284017, + "tokens_seen": 263979008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646439317953862, + "loss": 3.1007, + "theoretical_loss": 4.209655314724279, + "tokens_seen": 264044544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046463390170511535, + "loss": 2.9901, + "theoretical_loss": 4.209530188922911, + "tokens_seen": 264110080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046462387161484454, + "loss": 2.8877, + "theoretical_loss": 4.209405102857422, + "tokens_seen": 264175616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046461384152457377, + "loss": 3.2396, + "theoretical_loss": 4.209280056505342, + "tokens_seen": 264241152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646038114343029, + "loss": 2.9324, + "theoretical_loss": 4.209155049844217, + "tokens_seen": 264306688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046459378134403213, + "loss": 2.7333, + "theoretical_loss": 4.209030082851612, + "tokens_seen": 264372224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645837512537613, + "loss": 2.7765, + "theoretical_loss": 4.208905155505109, + "tokens_seen": 264437760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645737211634905, + "loss": 3.0924, + "theoretical_loss": 4.20878026778231, + "tokens_seen": 264503296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645636910732197, + "loss": 3.1207, + "theoretical_loss": 4.208655419660834, + "tokens_seen": 264568832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046455366098294886, + "loss": 3.1763, + "theoretical_loss": 4.208530611118321, + "tokens_seen": 264634368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046454363089267804, + "loss": 3.0411, + "theoretical_loss": 4.208405842132423, + "tokens_seen": 264699904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645336008024073, + "loss": 3.1361, + "theoretical_loss": 4.208281112680817, + "tokens_seen": 264765440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645235707121364, + "loss": 3.1252, + "theoretical_loss": 4.208156422741195, + "tokens_seen": 264830976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046451354062186564, + "loss": 3.0564, + "theoretical_loss": 4.208031772291265, + "tokens_seen": 264896512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046450351053159476, + "loss": 3.4202, + "theoretical_loss": 4.207907161308757, + "tokens_seen": 264962048 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464493480441324, + "loss": 3.3518, + "theoretical_loss": 4.2077825897714165, + "tokens_seen": 265027584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644834503510532, + "loss": 2.9007, + "theoretical_loss": 4.207658057657008, + "tokens_seen": 265093120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046447342026078236, + "loss": 3.2147, + "theoretical_loss": 4.207533564943316, + "tokens_seen": 265158656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046446339017051154, + "loss": 3.1103, + "theoretical_loss": 4.207409111608138, + "tokens_seen": 265224192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644533600802407, + "loss": 3.1886, + "theoretical_loss": 4.2072846976292935, + "tokens_seen": 265289728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644433299899699, + "loss": 3.018, + "theoretical_loss": 4.2071603229846195, + "tokens_seen": 265355264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 161970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.322939157485962, + "objective/train/theoretical_loss": 4.20703598765197, + "objective/train/tokens_used": 285880800, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046443329989969914, + "loss": 3.2719, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046442326980942827, + "loss": 3.2149, + "theoretical_loss": 4.206911691609217, + "tokens_seen": 265486336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644132397191575, + "loss": 3.2664, + "theoretical_loss": 4.206787434834251, + "tokens_seen": 265551872 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644032096288867, + "loss": 2.9888, + "theoretical_loss": 4.20666321730498, + "tokens_seen": 265617408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046439317953861586, + "loss": 3.0063, + "theoretical_loss": 4.206539038999329, + "tokens_seen": 265682944 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046438314944834505, + "loss": 3.1335, + "theoretical_loss": 4.206414899895244, + "tokens_seen": 265748480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643731193580742, + "loss": 2.8322, + "theoretical_loss": 4.206290799970685, + "tokens_seen": 265814016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643630892678034, + "loss": 3.3554, + "theoretical_loss": 4.206166739203632, + "tokens_seen": 265879552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046435305917753264, + "loss": 2.9797, + "theoretical_loss": 4.206042717572082, + "tokens_seen": 265945088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046434302908726177, + "loss": 3.1981, + "theoretical_loss": 4.20591873505405, + "tokens_seen": 266010624 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464332998996991, + "loss": 3.1016, + "theoretical_loss": 4.20579479162757, + "tokens_seen": 266076160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046432296890672013, + "loss": 3.2206, + "theoretical_loss": 4.205670887270691, + "tokens_seen": 266141696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046431293881644937, + "loss": 2.916, + "theoretical_loss": 4.205547021961482, + "tokens_seen": 266207232 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046430290872617855, + "loss": 3.171, + "theoretical_loss": 4.205423195678029, + "tokens_seen": 266272768 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046429287863590773, + "loss": 3.2186, + "theoretical_loss": 4.205299408398435, + "tokens_seen": 266338304 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642828485456369, + "loss": 2.7666, + "theoretical_loss": 4.2051756601008226, + "tokens_seen": 266403840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046427281845536615, + "loss": 2.9658, + "theoretical_loss": 4.20505195076333, + "tokens_seen": 266469376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642627883650953, + "loss": 3.3264, + "theoretical_loss": 4.204928280364115, + "tokens_seen": 266534912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642527582748245, + "loss": 3.4252, + "theoretical_loss": 4.20480464888135, + "tokens_seen": 266600448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046424272818455364, + "loss": 2.9859, + "theoretical_loss": 4.204681056293228, + "tokens_seen": 266665984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046423269809428287, + "loss": 3.0057, + "theoretical_loss": 4.204557502577957, + "tokens_seen": 266731520 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046422266800401205, + "loss": 3.0726, + "theoretical_loss": 4.204433987713767, + "tokens_seen": 266797056 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046421263791374123, + "loss": 3.2009, + "theoretical_loss": 4.2043105116789, + "tokens_seen": 266862592 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642026078234704, + "loss": 2.9325, + "theoretical_loss": 4.204187074451617, + "tokens_seen": 266928128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641925777331996, + "loss": 3.144, + "theoretical_loss": 4.204063676010202, + "tokens_seen": 266993664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 162606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0582354068756104, + "objective/train/theoretical_loss": 4.203940316332948, + "objective/train/tokens_used": 287519200, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641825476429288, + "loss": 3.183, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464172517552658, + "loss": 3.2655, + "theoretical_loss": 4.203816995398171, + "tokens_seen": 267124736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046416248746238714, + "loss": 3.1179, + "theoretical_loss": 4.203693713184203, + "tokens_seen": 267190272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641524573721164, + "loss": 3.0399, + "theoretical_loss": 4.203570469669392, + "tokens_seen": 267255808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641424272818455, + "loss": 2.9932, + "theoretical_loss": 4.203447264832107, + "tokens_seen": 267321344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046413239719157474, + "loss": 3.1497, + "theoretical_loss": 4.203324098650731, + "tokens_seen": 267386880 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641223671013039, + "loss": 3.0141, + "theoretical_loss": 4.203200971103666, + "tokens_seen": 267452416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641123370110331, + "loss": 3.1592, + "theoretical_loss": 4.20307788216933, + "tokens_seen": 267517952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641023069207623, + "loss": 3.0264, + "theoretical_loss": 4.202954831826159, + "tokens_seen": 267583488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640922768304915, + "loss": 3.1223, + "theoretical_loss": 4.202831820052609, + "tokens_seen": 267649024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046408224674022064, + "loss": 3.1909, + "theoretical_loss": 4.202708846827148, + "tokens_seen": 267714560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640722166499499, + "loss": 3.2318, + "theoretical_loss": 4.202585912128266, + "tokens_seen": 267780096 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464062186559679, + "loss": 2.8608, + "theoretical_loss": 4.202463015934468, + "tokens_seen": 267845632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046405215646940824, + "loss": 3.2008, + "theoretical_loss": 4.202340158224277, + "tokens_seen": 267911168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640421263791374, + "loss": 3.1812, + "theoretical_loss": 4.202217338976231, + "tokens_seen": 267976704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640320962888666, + "loss": 2.8798, + "theoretical_loss": 4.2020945581688895, + "tokens_seen": 268042240 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640220661985958, + "loss": 3.1127, + "theoretical_loss": 4.201971815780826, + "tokens_seen": 268107776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046401203610832496, + "loss": 3.0995, + "theoretical_loss": 4.201849111790631, + "tokens_seen": 268173312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046400200601805414, + "loss": 3.0434, + "theoretical_loss": 4.201726446176915, + "tokens_seen": 268238848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639919759277834, + "loss": 3.05, + "theoretical_loss": 4.201603818918302, + "tokens_seen": 268304384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639819458375125, + "loss": 3.2322, + "theoretical_loss": 4.201481229993435, + "tokens_seen": 268369920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046397191574724174, + "loss": 3.2241, + "theoretical_loss": 4.201358679380976, + "tokens_seen": 268435456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639618856569709, + "loss": 3.1414, + "theoretical_loss": 4.201236167059601, + "tokens_seen": 268500992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639518555667001, + "loss": 3.2403, + "theoretical_loss": 4.201113693008002, + "tokens_seen": 268566528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046394182547642934, + "loss": 3.0643, + "theoretical_loss": 4.200991257204894, + "tokens_seen": 268632064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 163909, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.577915668487549, + "objective/train/theoretical_loss": 4.2008688596290025, + "objective/train/tokens_used": 289157600, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046393179538615847, + "loss": 2.8287, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639217652958877, + "loss": 3.1737, + "theoretical_loss": 4.200746500259073, + "tokens_seen": 268763136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639117352056169, + "loss": 2.6194, + "theoretical_loss": 4.200624179073869, + "tokens_seen": 268828672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046390170511534606, + "loss": 3.2024, + "theoretical_loss": 4.2005018960521685, + "tokens_seen": 268894208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046389167502507525, + "loss": 3.3453, + "theoretical_loss": 4.200379651172769, + "tokens_seen": 268959744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638816449348044, + "loss": 2.8859, + "theoretical_loss": 4.200257444414483, + "tokens_seen": 269025280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638716148445336, + "loss": 3.1556, + "theoretical_loss": 4.200135275756139, + "tokens_seen": 269090816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046386158475426284, + "loss": 3.0459, + "theoretical_loss": 4.200013145176587, + "tokens_seen": 269156352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046385155466399197, + "loss": 3.088, + "theoretical_loss": 4.199891052654689, + "tokens_seen": 269221888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638415245737212, + "loss": 3.0612, + "theoretical_loss": 4.199768998169326, + "tokens_seen": 269287424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046383149448345033, + "loss": 3.0789, + "theoretical_loss": 4.199646981699395, + "tokens_seen": 269352960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046382146439317957, + "loss": 3.1749, + "theoretical_loss": 4.199525003223812, + "tokens_seen": 269418496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046381143430290875, + "loss": 3.162, + "theoretical_loss": 4.199403062721506, + "tokens_seen": 269484032 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046380140421263793, + "loss": 3.1445, + "theoretical_loss": 4.199281160171427, + "tokens_seen": 269549568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637913741223671, + "loss": 3.0702, + "theoretical_loss": 4.1991592955525405, + "tokens_seen": 269615104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046378134403209635, + "loss": 3.0764, + "theoretical_loss": 4.199037468843825, + "tokens_seen": 269680640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637713139418255, + "loss": 3.1614, + "theoretical_loss": 4.198915680024282, + "tokens_seen": 269746176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637612838515547, + "loss": 2.84, + "theoretical_loss": 4.198793929072925, + "tokens_seen": 269811712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046375125376128384, + "loss": 3.1708, + "theoretical_loss": 4.198672215968785, + "tokens_seen": 269877248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046374122367101307, + "loss": 3.0235, + "theoretical_loss": 4.198550540690912, + "tokens_seen": 269942784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046373119358074225, + "loss": 3.393, + "theoretical_loss": 4.198428903218371, + "tokens_seen": 270008320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046372116349047143, + "loss": 3.1363, + "theoretical_loss": 4.198307303530243, + "tokens_seen": 270073856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637111334002006, + "loss": 3.0941, + "theoretical_loss": 4.198185741605628, + "tokens_seen": 270139392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637011033099298, + "loss": 3.0913, + "theoretical_loss": 4.19806421742364, + "tokens_seen": 270204928 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463691073219659, + "loss": 3.2218, + "theoretical_loss": 4.197942730963412, + "tokens_seen": 270270464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 164570, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8737075328826904, + "objective/train/theoretical_loss": 4.19782128220409, + "objective/train/tokens_used": 290796000, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636810431293882, + "loss": 2.9242, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046367101303911734, + "loss": 2.8725, + "theoretical_loss": 4.19769987112484, + "tokens_seen": 270401536 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636609829488466, + "loss": 3.3748, + "theoretical_loss": 4.1975784977048445, + "tokens_seen": 270467072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636509528585757, + "loss": 3.0672, + "theoretical_loss": 4.1974571619233, + "tokens_seen": 270532608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046364092276830494, + "loss": 3.1778, + "theoretical_loss": 4.197335863759422, + "tokens_seen": 270598144 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636308926780341, + "loss": 3.2507, + "theoretical_loss": 4.1972146031924416, + "tokens_seen": 270663680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636208625877633, + "loss": 2.9583, + "theoretical_loss": 4.197093380201606, + "tokens_seen": 270729216 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636108324974925, + "loss": 3.1339, + "theoretical_loss": 4.196972194766179, + "tokens_seen": 270794752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636008024072217, + "loss": 3.2131, + "theoretical_loss": 4.196851046865442, + "tokens_seen": 270860288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046359077231695084, + "loss": 3.0693, + "theoretical_loss": 4.1967299364786905, + "tokens_seen": 270925824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635807422266801, + "loss": 3.1201, + "theoretical_loss": 4.196608863585239, + "tokens_seen": 270991360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635707121364092, + "loss": 3.0753, + "theoretical_loss": 4.1964878281644165, + "tokens_seen": 271056896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046356068204613844, + "loss": 3.1767, + "theoretical_loss": 4.19636683019557, + "tokens_seen": 271122432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635506519558676, + "loss": 3.1568, + "theoretical_loss": 4.196245869658061, + "tokens_seen": 271187968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635406218655968, + "loss": 3.1067, + "theoretical_loss": 4.1961249465312696, + "tokens_seen": 271253504 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463530591775326, + "loss": 3.1049, + "theoretical_loss": 4.196004060794589, + "tokens_seen": 271319040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046352056168505516, + "loss": 3.1558, + "theoretical_loss": 4.195883212427433, + "tokens_seen": 271384576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046351053159478434, + "loss": 3.104, + "theoretical_loss": 4.195762401409229, + "tokens_seen": 271450112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635005015045136, + "loss": 2.9589, + "theoretical_loss": 4.19564162771942, + "tokens_seen": 271515648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634904714142427, + "loss": 3.0468, + "theoretical_loss": 4.195520891337466, + "tokens_seen": 271581184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046348044132397194, + "loss": 2.9809, + "theoretical_loss": 4.195400192242845, + "tokens_seen": 271646720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046347041123370107, + "loss": 3.2978, + "theoretical_loss": 4.19527953041505, + "tokens_seen": 271712256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634603811434303, + "loss": 3.4054, + "theoretical_loss": 4.19515890583359, + "tokens_seen": 271777792 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634503510531595, + "loss": 3.1847, + "theoretical_loss": 4.195038318477989, + "tokens_seen": 271843328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046344032096288867, + "loss": 3.1394, + "theoretical_loss": 4.194917768327789, + "tokens_seen": 271908864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 165961, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7331738471984863, + "objective/train/theoretical_loss": 4.194797255362549, + "objective/train/tokens_used": 292434400, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046343029087261785, + "loss": 2.8674, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634202607823471, + "loss": 3.05, + "theoretical_loss": 4.194676779561841, + "tokens_seen": 272039936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634102306920762, + "loss": 3.0387, + "theoretical_loss": 4.194556340905256, + "tokens_seen": 272105472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046340020060180545, + "loss": 3.0849, + "theoretical_loss": 4.194435939372401, + "tokens_seen": 272171008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046339017051153457, + "loss": 3.1541, + "theoretical_loss": 4.194315574942896, + "tokens_seen": 272236544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633801404212638, + "loss": 3.2083, + "theoretical_loss": 4.194195247596381, + "tokens_seen": 272302080 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463370110330993, + "loss": 3.2071, + "theoretical_loss": 4.19407495731251, + "tokens_seen": 272367616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046336008024072217, + "loss": 3.2115, + "theoretical_loss": 4.193954704070952, + "tokens_seen": 272433152 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046335005015045135, + "loss": 3.1099, + "theoretical_loss": 4.193834487851396, + "tokens_seen": 272498688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046334002006018053, + "loss": 2.9455, + "theoretical_loss": 4.193714308633542, + "tokens_seen": 272564224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633299899699097, + "loss": 3.2173, + "theoretical_loss": 4.1935941663971095, + "tokens_seen": 272629760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046331995987963895, + "loss": 3.0527, + "theoretical_loss": 4.193474061121833, + "tokens_seen": 272695296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633099297893681, + "loss": 3.0777, + "theoretical_loss": 4.193353992787463, + "tokens_seen": 272760832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632998996990973, + "loss": 3.1036, + "theoretical_loss": 4.193233961373766, + "tokens_seen": 272826368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046328986960882644, + "loss": 2.9951, + "theoretical_loss": 4.1931139668605235, + "tokens_seen": 272891904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632798395185557, + "loss": 3.0116, + "theoretical_loss": 4.192994009227535, + "tokens_seen": 272957440 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046326980942828485, + "loss": 2.8518, + "theoretical_loss": 4.192874088454613, + "tokens_seen": 273022976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046325977933801404, + "loss": 2.8654, + "theoretical_loss": 4.19275420452159, + "tokens_seen": 273088512 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632497492477432, + "loss": 3.0445, + "theoretical_loss": 4.192634357408309, + "tokens_seen": 273154048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046323971915747245, + "loss": 2.9804, + "theoretical_loss": 4.192514547094634, + "tokens_seen": 273219584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632296890672016, + "loss": 3.0997, + "theoretical_loss": 4.192394773560441, + "tokens_seen": 273285120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632196589769308, + "loss": 3.0125, + "theoretical_loss": 4.192275036785625, + "tokens_seen": 273350656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046320962888666, + "loss": 2.8637, + "theoretical_loss": 4.192155336750094, + "tokens_seen": 273416192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631995987963892, + "loss": 3.1459, + "theoretical_loss": 4.192035673433773, + "tokens_seen": 273481728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631895687061184, + "loss": 3.0333, + "theoretical_loss": 4.191916046816605, + "tokens_seen": 273547264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 166474, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.229163885116577, + "objective/train/theoretical_loss": 4.191796456878544, + "objective/train/tokens_used": 294072800, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046317953861584754, + "loss": 3.3893, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631695085255768, + "loss": 3.2798, + "theoretical_loss": 4.191676903599563, + "tokens_seen": 273678336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631594784353059, + "loss": 3.2739, + "theoretical_loss": 4.191557386959651, + "tokens_seen": 273743872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046314944834503514, + "loss": 2.8998, + "theoretical_loss": 4.191437906938811, + "tokens_seen": 273809408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631394182547643, + "loss": 3.1002, + "theoretical_loss": 4.191318463517062, + "tokens_seen": 273874944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631293881644935, + "loss": 3.052, + "theoretical_loss": 4.19119905667444, + "tokens_seen": 273940480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631193580742227, + "loss": 3.0649, + "theoretical_loss": 4.191079686390996, + "tokens_seen": 274006016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631093279839519, + "loss": 3.1153, + "theoretical_loss": 4.190960352646796, + "tokens_seen": 274071552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046309929789368104, + "loss": 3.0196, + "theoretical_loss": 4.190841055421921, + "tokens_seen": 274137088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630892678034103, + "loss": 3.0416, + "theoretical_loss": 4.19072179469647, + "tokens_seen": 274202624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630792377131394, + "loss": 3.0144, + "theoretical_loss": 4.190602570450556, + "tokens_seen": 274268160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046306920762286864, + "loss": 3.1422, + "theoretical_loss": 4.190483382664308, + "tokens_seen": 274333696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630591775325978, + "loss": 3.1745, + "theoretical_loss": 4.19036423131787, + "tokens_seen": 274399232 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463049147442327, + "loss": 3.0466, + "theoretical_loss": 4.190245116391403, + "tokens_seen": 274464768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630391173520562, + "loss": 2.9194, + "theoretical_loss": 4.190126037865082, + "tokens_seen": 274530304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046302908726178536, + "loss": 3.2119, + "theoretical_loss": 4.190006995719098, + "tokens_seen": 274595840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046301905717151455, + "loss": 3.1011, + "theoretical_loss": 4.1898879899336565, + "tokens_seen": 274661376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630090270812438, + "loss": 3.0468, + "theoretical_loss": 4.189769020488981, + "tokens_seen": 274726912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629989969909729, + "loss": 3.368, + "theoretical_loss": 4.189650087365309, + "tokens_seen": 274792448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046298896690070214, + "loss": 3.0634, + "theoretical_loss": 4.189531190542893, + "tokens_seen": 274857984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046297893681043127, + "loss": 3.0413, + "theoretical_loss": 4.189412330002001, + "tokens_seen": 274923520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629689067201605, + "loss": 3.3153, + "theoretical_loss": 4.189293505722918, + "tokens_seen": 274989056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629588766298897, + "loss": 2.9856, + "theoretical_loss": 4.189174717685942, + "tokens_seen": 275054592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046294884653961887, + "loss": 3.1424, + "theoretical_loss": 4.189055965871389, + "tokens_seen": 275120128 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046293881644934805, + "loss": 3.3162, + "theoretical_loss": 4.188937250259587, + "tokens_seen": 275185664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 167916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9187827110290527, + "objective/train/theoretical_loss": 4.188818570830883, + "objective/train/tokens_used": 295711200, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629287863590773, + "loss": 3.233, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629187562688064, + "loss": 3.1334, + "theoretical_loss": 4.188699927565638, + "tokens_seen": 275316736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046290872617853565, + "loss": 3.2221, + "theoretical_loss": 4.188581320444228, + "tokens_seen": 275382272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046289869608826477, + "loss": 3.4486, + "theoretical_loss": 4.1884627494470426, + "tokens_seen": 275447808 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462888665997994, + "loss": 2.7931, + "theoretical_loss": 4.1883442145544905, + "tokens_seen": 275513344 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628786359077232, + "loss": 3.002, + "theoretical_loss": 4.188225715746992, + "tokens_seen": 275578880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046286860581745237, + "loss": 3.2486, + "theoretical_loss": 4.188107253004986, + "tokens_seen": 275644416 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046285857572718155, + "loss": 3.2086, + "theoretical_loss": 4.187988826308925, + "tokens_seen": 275709952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046284854563691073, + "loss": 2.9635, + "theoretical_loss": 4.187870435639275, + "tokens_seen": 275775488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628385155466399, + "loss": 2.9808, + "theoretical_loss": 4.18775208097652, + "tokens_seen": 275841024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046282848545636915, + "loss": 3.2022, + "theoretical_loss": 4.187633762301159, + "tokens_seen": 275906560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628184553660983, + "loss": 3.0849, + "theoretical_loss": 4.187515479593704, + "tokens_seen": 275972096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628084252758275, + "loss": 3.0014, + "theoretical_loss": 4.187397232834683, + "tokens_seen": 276037632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046279839518555664, + "loss": 3.1524, + "theoretical_loss": 4.187279022004642, + "tokens_seen": 276103168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627883650952859, + "loss": 3.1932, + "theoretical_loss": 4.1871608470841375, + "tokens_seen": 276168704 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046277833500501505, + "loss": 2.89, + "theoretical_loss": 4.1870427080537445, + "tokens_seen": 276234240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046276830491474424, + "loss": 3.2589, + "theoretical_loss": 4.1869246048940525, + "tokens_seen": 276299776 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627582748244734, + "loss": 3.1626, + "theoretical_loss": 4.186806537585666, + "tokens_seen": 276365312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046274824473420265, + "loss": 3.1922, + "theoretical_loss": 4.186688506109202, + "tokens_seen": 276430848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627382146439318, + "loss": 2.9536, + "theoretical_loss": 4.186570510445296, + "tokens_seen": 276496384 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462728184553661, + "loss": 3.3615, + "theoretical_loss": 4.186452550574599, + "tokens_seen": 276561920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046271815446339014, + "loss": 3.2877, + "theoretical_loss": 4.186334626477774, + "tokens_seen": 276627456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627081243731194, + "loss": 3.2169, + "theoretical_loss": 4.186216738135501, + "tokens_seen": 276692992 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046269809428284856, + "loss": 3.0765, + "theoretical_loss": 4.186098885528473, + "tokens_seen": 276758528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046268806419257774, + "loss": 3.1812, + "theoretical_loss": 4.185981068637401, + "tokens_seen": 276824064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 168545, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.318019390106201, + "objective/train/theoretical_loss": 4.185863287443008, + "objective/train/tokens_used": 297349600, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626780341023069, + "loss": 3.1149, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626680040120361, + "loss": 3.1434, + "theoretical_loss": 4.185745541926035, + "tokens_seen": 276955136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626579739217653, + "loss": 3.0085, + "theoretical_loss": 4.185627832067237, + "tokens_seen": 277020672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626479438314945, + "loss": 3.1581, + "theoretical_loss": 4.1855101578473795, + "tokens_seen": 277086208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046263791374122364, + "loss": 3.2237, + "theoretical_loss": 4.18539251924725, + "tokens_seen": 277151744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626278836509529, + "loss": 3.0262, + "theoretical_loss": 4.185274916247646, + "tokens_seen": 277217280 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462617853560682, + "loss": 3.3536, + "theoretical_loss": 4.185157348829383, + "tokens_seen": 277282816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046260782347041124, + "loss": 3.1312, + "theoretical_loss": 4.185039816973289, + "tokens_seen": 277348352 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625977933801404, + "loss": 3.1212, + "theoretical_loss": 4.184922320660207, + "tokens_seen": 277413888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625877632898696, + "loss": 3.0527, + "theoretical_loss": 4.184804859870997, + "tokens_seen": 277479424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625777331995988, + "loss": 3.1812, + "theoretical_loss": 4.184687434586531, + "tokens_seen": 277544960 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462567703109328, + "loss": 2.9691, + "theoretical_loss": 4.184570044787698, + "tokens_seen": 277610496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046255767301905715, + "loss": 3.073, + "theoretical_loss": 4.1844526904554, + "tokens_seen": 277676032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625476429287864, + "loss": 3.0977, + "theoretical_loss": 4.184335371570556, + "tokens_seen": 277741568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625376128385155, + "loss": 3.0928, + "theoretical_loss": 4.184218088114097, + "tokens_seen": 277807104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046252758274824475, + "loss": 3.1863, + "theoretical_loss": 4.1841008400669715, + "tokens_seen": 277872640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625175526579739, + "loss": 3.3092, + "theoretical_loss": 4.183983627410142, + "tokens_seen": 277938176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625075225677031, + "loss": 2.8105, + "theoretical_loss": 4.183866450124584, + "tokens_seen": 278003712 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624974924774323, + "loss": 3.1829, + "theoretical_loss": 4.18374930819129, + "tokens_seen": 278069248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046248746238716147, + "loss": 3.2848, + "theoretical_loss": 4.183632201591264, + "tokens_seen": 278134784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046247743229689065, + "loss": 2.9075, + "theoretical_loss": 4.18351513030553, + "tokens_seen": 278200320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624674022066199, + "loss": 2.9081, + "theoretical_loss": 4.1833980943151206, + "tokens_seen": 278265856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046245737211634907, + "loss": 3.117, + "theoretical_loss": 4.183281093601087, + "tokens_seen": 278331392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046244734202607825, + "loss": 3.1446, + "theoretical_loss": 4.183164128144495, + "tokens_seen": 278396928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624373119358075, + "loss": 3.1212, + "theoretical_loss": 4.183047197926422, + "tokens_seen": 278462464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 169743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.211092233657837, + "objective/train/theoretical_loss": 4.182930302927963, + "objective/train/tokens_used": 298988000, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624272818455366, + "loss": 2.9243, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046241725175526585, + "loss": 3.1498, + "theoretical_loss": 4.182813443130227, + "tokens_seen": 278593536 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462407221664995, + "loss": 3.1976, + "theoretical_loss": 4.182696618514337, + "tokens_seen": 278659072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623971915747242, + "loss": 2.9944, + "theoretical_loss": 4.18257982906143, + "tokens_seen": 278724608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623871614844534, + "loss": 3.0975, + "theoretical_loss": 4.1824630747526585, + "tokens_seen": 278790144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046237713139418257, + "loss": 3.1447, + "theoretical_loss": 4.182346355569189, + "tokens_seen": 278855680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046236710130391175, + "loss": 3.0155, + "theoretical_loss": 4.182229671492204, + "tokens_seen": 278921216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046235707121364093, + "loss": 3.1009, + "theoretical_loss": 4.1821130225028975, + "tokens_seen": 278986752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623470411233701, + "loss": 3.1349, + "theoretical_loss": 4.1819964085824815, + "tokens_seen": 279052288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046233701103309935, + "loss": 3.2392, + "theoretical_loss": 4.181879829712178, + "tokens_seen": 279117824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623269809428285, + "loss": 2.8839, + "theoretical_loss": 4.181763285873231, + "tokens_seen": 279183360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623169508525577, + "loss": 3.2192, + "theoretical_loss": 4.181646777046889, + "tokens_seen": 279248896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046230692076228684, + "loss": 3.1266, + "theoretical_loss": 4.181530303214423, + "tokens_seen": 279314432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622968906720161, + "loss": 3.2385, + "theoretical_loss": 4.181413864357115, + "tokens_seen": 279379968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046228686058174525, + "loss": 3.0699, + "theoretical_loss": 4.181297460456262, + "tokens_seen": 279445504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046227683049147444, + "loss": 3.2547, + "theoretical_loss": 4.181181091493174, + "tokens_seen": 279511040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622668004012036, + "loss": 3.0276, + "theoretical_loss": 4.181064757449178, + "tokens_seen": 279576576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046225677031093285, + "loss": 3.1922, + "theoretical_loss": 4.180948458305615, + "tokens_seen": 279642112 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462246740220662, + "loss": 3.0343, + "theoretical_loss": 4.180832194043836, + "tokens_seen": 279707648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622367101303912, + "loss": 3.1996, + "theoretical_loss": 4.180715964645213, + "tokens_seen": 279773184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046222668004012034, + "loss": 3.3317, + "theoretical_loss": 4.180599770091126, + "tokens_seen": 279838720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622166499498496, + "loss": 2.9687, + "theoretical_loss": 4.180483610362975, + "tokens_seen": 279904256 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046220661985957876, + "loss": 3.158, + "theoretical_loss": 4.18036748544217, + "tokens_seen": 279969792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046219658976930794, + "loss": 3.0762, + "theoretical_loss": 4.180251395310137, + "tokens_seen": 280035328 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621865596790371, + "loss": 3.1545, + "theoretical_loss": 4.1801353399483165, + "tokens_seen": 280100864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 170343, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5833492279052734, + "objective/train/theoretical_loss": 4.180019319338163, + "objective/train/tokens_used": 300626400, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621765295887663, + "loss": 3.2849, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621664994984955, + "loss": 2.8302, + "theoretical_loss": 4.179903333461144, + "tokens_seen": 280231936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621564694082247, + "loss": 2.8809, + "theoretical_loss": 4.179787382298744, + "tokens_seen": 280297472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046214643931795384, + "loss": 3.2571, + "theoretical_loss": 4.179671465832458, + "tokens_seen": 280363008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621364092276831, + "loss": 2.9144, + "theoretical_loss": 4.179555584043799, + "tokens_seen": 280428544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621263791374122, + "loss": 3.1233, + "theoretical_loss": 4.17943973691429, + "tokens_seen": 280494080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046211634904714144, + "loss": 3.0764, + "theoretical_loss": 4.179323924425472, + "tokens_seen": 280559616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004621063189568706, + "loss": 3.1949, + "theoretical_loss": 4.179208146558899, + "tokens_seen": 280625152 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620962888665998, + "loss": 3.1481, + "theoretical_loss": 4.1790924032961385, + "tokens_seen": 280690688 + }, + { + "epoch": 0.09, + "learning_rate": 0.000462086258776329, + "loss": 2.9628, + "theoretical_loss": 4.178976694618772, + "tokens_seen": 280756224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620762286860582, + "loss": 3.1512, + "theoretical_loss": 4.178861020508395, + "tokens_seen": 280821760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046206619859578735, + "loss": 2.9415, + "theoretical_loss": 4.178745380946619, + "tokens_seen": 280887296 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620561685055166, + "loss": 3.1536, + "theoretical_loss": 4.178629775915066, + "tokens_seen": 280952832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620461384152457, + "loss": 3.0706, + "theoretical_loss": 4.178514205395376, + "tokens_seen": 281018368 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046203610832497495, + "loss": 3.0371, + "theoretical_loss": 4.178398669369201, + "tokens_seen": 281083904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620260782347041, + "loss": 2.8429, + "theoretical_loss": 4.178283167818206, + "tokens_seen": 281149440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620160481444333, + "loss": 3.1415, + "theoretical_loss": 4.178167700724073, + "tokens_seen": 281214976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620060180541625, + "loss": 3.0451, + "theoretical_loss": 4.178052268068494, + "tokens_seen": 281280512 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046199598796389167, + "loss": 3.0419, + "theoretical_loss": 4.177936869833179, + "tokens_seen": 281346048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046198595787362085, + "loss": 2.8053, + "theoretical_loss": 4.17782150599985, + "tokens_seen": 281411584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619759277833501, + "loss": 2.7688, + "theoretical_loss": 4.1777061765502435, + "tokens_seen": 281477120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619658976930792, + "loss": 3.022, + "theoretical_loss": 4.1775908814661085, + "tokens_seen": 281542656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046195586760280845, + "loss": 3.3437, + "theoretical_loss": 4.17747562072921, + "tokens_seen": 281608192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046194583751253763, + "loss": 2.9559, + "theoretical_loss": 4.177360394321325, + "tokens_seen": 281673728 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619358074222668, + "loss": 2.7982, + "theoretical_loss": 4.177245202224246, + "tokens_seen": 281739264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 171008, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.037534236907959, + "objective/train/theoretical_loss": 4.17713004441978, + "objective/train/tokens_used": 302264800, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461925777331996, + "loss": 3.1612, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619157472417252, + "loss": 3.2436, + "theoretical_loss": 4.177014920889745, + "tokens_seen": 281870336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046190571715145435, + "loss": 3.0578, + "theoretical_loss": 4.176899831615974, + "tokens_seen": 281935872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618956870611836, + "loss": 3.1397, + "theoretical_loss": 4.176784776580316, + "tokens_seen": 282001408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618856569709127, + "loss": 3.2312, + "theoretical_loss": 4.176669755764632, + "tokens_seen": 282066944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046187562688064195, + "loss": 3.0992, + "theoretical_loss": 4.176554769150796, + "tokens_seen": 282132480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618655967903711, + "loss": 3.1607, + "theoretical_loss": 4.176439816720697, + "tokens_seen": 282198016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618555667001003, + "loss": 2.9453, + "theoretical_loss": 4.1763248984562376, + "tokens_seen": 282263552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618455366098295, + "loss": 3.2318, + "theoretical_loss": 4.176210014339335, + "tokens_seen": 282329088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618355065195587, + "loss": 3.171, + "theoretical_loss": 4.17609516435192, + "tokens_seen": 282394624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046182547642928786, + "loss": 3.0733, + "theoretical_loss": 4.1759803484759335, + "tokens_seen": 282460160 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046181544633901704, + "loss": 3.0515, + "theoretical_loss": 4.175865566693336, + "tokens_seen": 282525696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618054162487462, + "loss": 3.2269, + "theoretical_loss": 4.175750818986098, + "tokens_seen": 282591232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046179538615847545, + "loss": 3.2102, + "theoretical_loss": 4.1756361053362046, + "tokens_seen": 282656768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617853560682046, + "loss": 3.2833, + "theoretical_loss": 4.1755214257256545, + "tokens_seen": 282722304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617753259779338, + "loss": 2.9001, + "theoretical_loss": 4.17540678013646, + "tokens_seen": 282787840 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461765295887663, + "loss": 2.989, + "theoretical_loss": 4.175292168550648, + "tokens_seen": 282853376 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617552657973922, + "loss": 3.2841, + "theoretical_loss": 4.175177590950257, + "tokens_seen": 282918912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046174523570712136, + "loss": 2.9466, + "theoretical_loss": 4.175063047317342, + "tokens_seen": 282984448 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046173520561685054, + "loss": 2.8851, + "theoretical_loss": 4.174948537633968, + "tokens_seen": 283049984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617251755265797, + "loss": 3.0796, + "theoretical_loss": 4.174834061882218, + "tokens_seen": 283115520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046171514543630896, + "loss": 3.0094, + "theoretical_loss": 4.1747196200441845, + "tokens_seen": 283181056 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046170511534603814, + "loss": 3.125, + "theoretical_loss": 4.174605212101977, + "tokens_seen": 283246592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616950852557673, + "loss": 3.1404, + "theoretical_loss": 4.174490838037716, + "tokens_seen": 283312128 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616850551654965, + "loss": 3.1135, + "theoretical_loss": 4.174376497833537, + "tokens_seen": 283377664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 172282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8163797855377197, + "objective/train/theoretical_loss": 4.174262191471587, + "objective/train/tokens_used": 303903200, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616750250752257, + "loss": 2.8791, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616649949849549, + "loss": 2.9439, + "theoretical_loss": 4.17414791893403, + "tokens_seen": 283508736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046165496489468404, + "loss": 2.934, + "theoretical_loss": 4.17403368020304, + "tokens_seen": 283574272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616449348044133, + "loss": 3.2267, + "theoretical_loss": 4.173919475260808, + "tokens_seen": 283639808 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616349047141424, + "loss": 3.2523, + "theoretical_loss": 4.173805304089536, + "tokens_seen": 283705344 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046162487462387164, + "loss": 3.0485, + "theoretical_loss": 4.173691166671439, + "tokens_seen": 283770880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616148445336008, + "loss": 3.3109, + "theoretical_loss": 4.173577062988748, + "tokens_seen": 283836416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046160481444333, + "loss": 3.2738, + "theoretical_loss": 4.173462993023706, + "tokens_seen": 283901952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615947843530592, + "loss": 2.992, + "theoretical_loss": 4.173348956758568, + "tokens_seen": 283967488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615847542627884, + "loss": 2.9936, + "theoretical_loss": 4.173234954175605, + "tokens_seen": 284033024 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046157472417251755, + "loss": 3.3821, + "theoretical_loss": 4.173120985257102, + "tokens_seen": 284098560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615646940822468, + "loss": 2.9413, + "theoretical_loss": 4.173007049985352, + "tokens_seen": 284164096 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615546639919759, + "loss": 2.9063, + "theoretical_loss": 4.172893148342667, + "tokens_seen": 284229632 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046154463390170515, + "loss": 3.2355, + "theoretical_loss": 4.172779280311372, + "tokens_seen": 284295168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615346038114343, + "loss": 3.1892, + "theoretical_loss": 4.172665445873801, + "tokens_seen": 284360704 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615245737211635, + "loss": 3.0414, + "theoretical_loss": 4.172551645012307, + "tokens_seen": 284426240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615145436308927, + "loss": 2.9639, + "theoretical_loss": 4.1724378777092515, + "tokens_seen": 284491776 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046150451354062187, + "loss": 3.2605, + "theoretical_loss": 4.172324143947012, + "tokens_seen": 284557312 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046149448345035105, + "loss": 3.0332, + "theoretical_loss": 4.172210443707979, + "tokens_seen": 284622848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614844533600803, + "loss": 3.0535, + "theoretical_loss": 4.1720967769745565, + "tokens_seen": 284688384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614744232698094, + "loss": 2.8846, + "theoretical_loss": 4.171983143729159, + "tokens_seen": 284753920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046146439317953865, + "loss": 3.3499, + "theoretical_loss": 4.1718695439542195, + "tokens_seen": 284819456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046145436308926783, + "loss": 3.0726, + "theoretical_loss": 4.17175597763218, + "tokens_seen": 284884992 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461444332998997, + "loss": 2.7706, + "theoretical_loss": 4.171642444745497, + "tokens_seen": 284950528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614343029087262, + "loss": 3.0167, + "theoretical_loss": 4.1715289452766395, + "tokens_seen": 285016064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 172985, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.503798246383667, + "objective/train/theoretical_loss": 4.1714154792080915, + "objective/train/tokens_used": 305541600, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614242728184554, + "loss": 3.1825, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046141424272818455, + "loss": 2.9299, + "theoretical_loss": 4.171302046522349, + "tokens_seen": 285147136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614042126379138, + "loss": 2.9893, + "theoretical_loss": 4.171188647201921, + "tokens_seen": 285212672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613941825476429, + "loss": 3.2218, + "theoretical_loss": 4.1710752812293315, + "tokens_seen": 285278208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046138415245737215, + "loss": 3.0444, + "theoretical_loss": 4.170961948587115, + "tokens_seen": 285343744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613741223671013, + "loss": 3.0153, + "theoretical_loss": 4.17084864925782, + "tokens_seen": 285409280 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613640922768305, + "loss": 3.0765, + "theoretical_loss": 4.1707353832240095, + "tokens_seen": 285474816 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613540621865597, + "loss": 3.1904, + "theoretical_loss": 4.170622150468258, + "tokens_seen": 285540352 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613440320962889, + "loss": 3.2306, + "theoretical_loss": 4.170508950973154, + "tokens_seen": 285605888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046133400200601806, + "loss": 3.3067, + "theoretical_loss": 4.1703957847213, + "tokens_seen": 285671424 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046132397191574724, + "loss": 3.0881, + "theoretical_loss": 4.170282651695308, + "tokens_seen": 285736960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613139418254764, + "loss": 2.9596, + "theoretical_loss": 4.170169551877808, + "tokens_seen": 285802496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046130391173520566, + "loss": 3.2447, + "theoretical_loss": 4.170056485251439, + "tokens_seen": 285868032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612938816449348, + "loss": 3.2723, + "theoretical_loss": 4.169943451798856, + "tokens_seen": 285933568 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461283851554664, + "loss": 3.0463, + "theoretical_loss": 4.169830451502724, + "tokens_seen": 285999104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612738214643932, + "loss": 2.924, + "theoretical_loss": 4.169717484345725, + "tokens_seen": 286064640 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612637913741224, + "loss": 3.0269, + "theoretical_loss": 4.1696045503105506, + "tokens_seen": 286130176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046125376128385156, + "loss": 3.1089, + "theoretical_loss": 4.169491649379905, + "tokens_seen": 286195712 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046124373119358074, + "loss": 3.0891, + "theoretical_loss": 4.169378781536509, + "tokens_seen": 286261248 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612337011033099, + "loss": 3.0611, + "theoretical_loss": 4.169265946763095, + "tokens_seen": 286326784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046122367101303916, + "loss": 3.0257, + "theoretical_loss": 4.169153145042405, + "tokens_seen": 286392320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612136409227683, + "loss": 3.1311, + "theoretical_loss": 4.169040376357199, + "tokens_seen": 286457856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612036108324975, + "loss": 3.2128, + "theoretical_loss": 4.168927640690246, + "tokens_seen": 286523392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046119358074222665, + "loss": 2.8733, + "theoretical_loss": 4.16881493802433, + "tokens_seen": 286588928 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611835506519559, + "loss": 2.9867, + "theoretical_loss": 4.168702268342248, + "tokens_seen": 286654464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 174327, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1925644874572754, + "objective/train/theoretical_loss": 4.168589631626808, + "objective/train/tokens_used": 307180000, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046117352056168506, + "loss": 3.043, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046116349047141425, + "loss": 2.8574, + "theoretical_loss": 4.168477027860833, + "tokens_seen": 286785536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611534603811434, + "loss": 3.0897, + "theoretical_loss": 4.168364457027158, + "tokens_seen": 286851072 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611434302908726, + "loss": 3.0685, + "theoretical_loss": 4.168251919108632, + "tokens_seen": 286916608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611334002006018, + "loss": 3.356, + "theoretical_loss": 4.168139414088113, + "tokens_seen": 286982144 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461123370110331, + "loss": 3.1332, + "theoretical_loss": 4.168026941948478, + "tokens_seen": 287047680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046111334002006015, + "loss": 3.1427, + "theoretical_loss": 4.167914502672611, + "tokens_seen": 287113216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611033099297894, + "loss": 3.263, + "theoretical_loss": 4.1678020962434115, + "tokens_seen": 287178752 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046109327983951857, + "loss": 2.847, + "theoretical_loss": 4.167689722643792, + "tokens_seen": 287244288 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046108324974924775, + "loss": 3.1922, + "theoretical_loss": 4.1675773818566775, + "tokens_seen": 287309824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046107321965897693, + "loss": 3.224, + "theoretical_loss": 4.167465073865006, + "tokens_seen": 287375360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610631895687061, + "loss": 3.2108, + "theoretical_loss": 4.167352798651726, + "tokens_seen": 287440896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610531594784353, + "loss": 2.9368, + "theoretical_loss": 4.167240556199802, + "tokens_seen": 287506432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610431293881645, + "loss": 3.025, + "theoretical_loss": 4.167128346492211, + "tokens_seen": 287571968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046103309929789365, + "loss": 3.1749, + "theoretical_loss": 4.16701616951194, + "tokens_seen": 287637504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610230692076229, + "loss": 3.2276, + "theoretical_loss": 4.1669040252419896, + "tokens_seen": 287703040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461013039117352, + "loss": 3.1166, + "theoretical_loss": 4.166791913665375, + "tokens_seen": 287768576 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046100300902708125, + "loss": 3.2476, + "theoretical_loss": 4.166679834765123, + "tokens_seen": 287834112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046099297893681043, + "loss": 3.2406, + "theoretical_loss": 4.166567788524272, + "tokens_seen": 287899648 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609829488465396, + "loss": 3.1236, + "theoretical_loss": 4.166455774925875, + "tokens_seen": 287965184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609729187562688, + "loss": 2.6893, + "theoretical_loss": 4.166343793952995, + "tokens_seen": 288030720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046096288866599803, + "loss": 2.9429, + "theoretical_loss": 4.166231845588712, + "tokens_seen": 288096256 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609528585757272, + "loss": 3.1807, + "theoretical_loss": 4.166119929816113, + "tokens_seen": 288161792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609428284854564, + "loss": 3.054, + "theoretical_loss": 4.166008046618303, + "tokens_seen": 288227328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609327983951856, + "loss": 3.1462, + "theoretical_loss": 4.1658961959783944, + "tokens_seen": 288292864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 174865, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1643199920654297, + "objective/train/theoretical_loss": 4.165784377879517, + "objective/train/tokens_used": 308818400, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046092276830491475, + "loss": 3.2366, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460912738214644, + "loss": 2.9021, + "theoretical_loss": 4.165672592304811, + "tokens_seen": 288423936 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609027081243731, + "loss": 3.0573, + "theoretical_loss": 4.165560839237429, + "tokens_seen": 288489472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046089267803410235, + "loss": 3.0862, + "theoretical_loss": 4.165449118660536, + "tokens_seen": 288555008 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608826479438315, + "loss": 3.149, + "theoretical_loss": 4.16533743055731, + "tokens_seen": 288620544 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608726178535607, + "loss": 3.21, + "theoretical_loss": 4.165225774910941, + "tokens_seen": 288686080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608625877632899, + "loss": 2.8823, + "theoretical_loss": 4.165114151704634, + "tokens_seen": 288751616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608525576730191, + "loss": 3.0306, + "theoretical_loss": 4.165002560921601, + "tokens_seen": 288817152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046084252758274826, + "loss": 3.2771, + "theoretical_loss": 4.164891002545073, + "tokens_seen": 288882688 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046083249749247744, + "loss": 2.7558, + "theoretical_loss": 4.16477947655829, + "tokens_seen": 288948224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608224674022066, + "loss": 3.0144, + "theoretical_loss": 4.164667982944504, + "tokens_seen": 289013760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046081243731193586, + "loss": 2.9461, + "theoretical_loss": 4.164556521686981, + "tokens_seen": 289079296 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460802407221665, + "loss": 3.1734, + "theoretical_loss": 4.1644450927689975, + "tokens_seen": 289144832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607923771313942, + "loss": 3.2232, + "theoretical_loss": 4.164333696173846, + "tokens_seen": 289210368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607823470411234, + "loss": 3.0528, + "theoretical_loss": 4.164222331884827, + "tokens_seen": 289275904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607723169508526, + "loss": 3.0869, + "theoretical_loss": 4.164110999885256, + "tokens_seen": 289341440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046076228686058176, + "loss": 3.0237, + "theoretical_loss": 4.163999700158462, + "tokens_seen": 289406976 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046075225677031094, + "loss": 3.0699, + "theoretical_loss": 4.163888432687784, + "tokens_seen": 289472512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607422266800401, + "loss": 2.7908, + "theoretical_loss": 4.163777197456573, + "tokens_seen": 289538048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046073219658976936, + "loss": 3.0935, + "theoretical_loss": 4.163665994448197, + "tokens_seen": 289603584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607221664994985, + "loss": 2.9748, + "theoretical_loss": 4.163554823646027, + "tokens_seen": 289669120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607121364092277, + "loss": 3.0475, + "theoretical_loss": 4.163443685033458, + "tokens_seen": 289734656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046070210631895685, + "loss": 2.9678, + "theoretical_loss": 4.163332578593889, + "tokens_seen": 289800192 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606920762286861, + "loss": 3.0884, + "theoretical_loss": 4.163221504310734, + "tokens_seen": 289865728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046068204613841526, + "loss": 3.1743, + "theoretical_loss": 4.1631104621674195, + "tokens_seen": 289931264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 176137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.816805124282837, + "objective/train/theoretical_loss": 4.162999452147384, + "objective/train/tokens_used": 310456800, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046067201604814445, + "loss": 2.9639, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606619859578736, + "loss": 2.9778, + "theoretical_loss": 4.1628884742340775, + "tokens_seen": 290062336 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606519558676028, + "loss": 3.0018, + "theoretical_loss": 4.162777528410963, + "tokens_seen": 290127872 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460641925777332, + "loss": 2.9703, + "theoretical_loss": 4.162666614661518, + "tokens_seen": 290193408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606318956870612, + "loss": 3.2376, + "theoretical_loss": 4.162555732969227, + "tokens_seen": 290258944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046062186559679035, + "loss": 3.0843, + "theoretical_loss": 4.162444883317591, + "tokens_seen": 290324480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606118355065196, + "loss": 3.1161, + "theoretical_loss": 4.162334065690123, + "tokens_seen": 290390016 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046060180541624877, + "loss": 3.0975, + "theoretical_loss": 4.162223280070345, + "tokens_seen": 290455552 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046059177532597795, + "loss": 3.0539, + "theoretical_loss": 4.1621125264417955, + "tokens_seen": 290521088 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046058174523570713, + "loss": 3.2598, + "theoretical_loss": 4.162001804788021, + "tokens_seen": 290586624 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605717151454363, + "loss": 3.1444, + "theoretical_loss": 4.161891115092583, + "tokens_seen": 290652160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605616850551655, + "loss": 3.164, + "theoretical_loss": 4.161780457339055, + "tokens_seen": 290717696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605516549648947, + "loss": 3.1421, + "theoretical_loss": 4.161669831511022, + "tokens_seen": 290783232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046054162487462385, + "loss": 3.167, + "theoretical_loss": 4.16155923759208, + "tokens_seen": 290848768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605315947843531, + "loss": 3.0247, + "theoretical_loss": 4.161448675565838, + "tokens_seen": 290914304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605215646940822, + "loss": 3.2994, + "theoretical_loss": 4.161338145415918, + "tokens_seen": 290979840 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046051153460381145, + "loss": 2.9411, + "theoretical_loss": 4.161227647125955, + "tokens_seen": 291045376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046050150451354063, + "loss": 2.9685, + "theoretical_loss": 4.161117180679591, + "tokens_seen": 291110912 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604914744232698, + "loss": 2.8291, + "theoretical_loss": 4.161006746060488, + "tokens_seen": 291176448 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460481444332999, + "loss": 3.0402, + "theoretical_loss": 4.160896343252311, + "tokens_seen": 291241984 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046047141424272823, + "loss": 3.0417, + "theoretical_loss": 4.160785972238745, + "tokens_seen": 291307520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046046138415245736, + "loss": 3.3467, + "theoretical_loss": 4.160675633003484, + "tokens_seen": 291373056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604513540621866, + "loss": 3.4501, + "theoretical_loss": 4.16056532553023, + "tokens_seen": 291438592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604413239719157, + "loss": 2.9161, + "theoretical_loss": 4.160455049802706, + "tokens_seen": 291504128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046043129388164495, + "loss": 3.0043, + "theoretical_loss": 4.1603448058046375, + "tokens_seen": 291569664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 176478, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.099493980407715, + "objective/train/theoretical_loss": 4.160234593519768, + "objective/train/tokens_used": 312095200, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046042126379137414, + "loss": 3.4061, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604112337011033, + "loss": 2.9333, + "theoretical_loss": 4.160124412931852, + "tokens_seen": 291700736 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604012036108325, + "loss": 3.2184, + "theoretical_loss": 4.160014264024654, + "tokens_seen": 291766272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603911735205617, + "loss": 3.0181, + "theoretical_loss": 4.159904146781952, + "tokens_seen": 291831808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046038114343029086, + "loss": 2.9699, + "theoretical_loss": 4.159794061187536, + "tokens_seen": 291897344 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603711133400201, + "loss": 3.0901, + "theoretical_loss": 4.1596840072252075, + "tokens_seen": 291962880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603610832497492, + "loss": 3.0625, + "theoretical_loss": 4.159573984878779, + "tokens_seen": 292028416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046035105315947846, + "loss": 3.0933, + "theoretical_loss": 4.159463994132079, + "tokens_seen": 292093952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603410230692076, + "loss": 3.1878, + "theoretical_loss": 4.15935403496894, + "tokens_seen": 292159488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603309929789368, + "loss": 3.2494, + "theoretical_loss": 4.159244107373215, + "tokens_seen": 292225024 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460320962888666, + "loss": 3.1238, + "theoretical_loss": 4.159134211328765, + "tokens_seen": 292290560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603109327983952, + "loss": 3.2343, + "theoretical_loss": 4.159024346819461, + "tokens_seen": 292356096 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046030090270812436, + "loss": 2.9651, + "theoretical_loss": 4.158914513829189, + "tokens_seen": 292421632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602908726178536, + "loss": 3.1782, + "theoretical_loss": 4.158804712341845, + "tokens_seen": 292487168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602808425275827, + "loss": 3.2679, + "theoretical_loss": 4.158694942341338, + "tokens_seen": 292552704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046027081243731196, + "loss": 3.0873, + "theoretical_loss": 4.1585852038115885, + "tokens_seen": 292618240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602607823470411, + "loss": 3.0467, + "theoretical_loss": 4.1584754967365285, + "tokens_seen": 292683776 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602507522567703, + "loss": 3.2751, + "theoretical_loss": 4.1583658211001016, + "tokens_seen": 292749312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602407221664995, + "loss": 3.3058, + "theoretical_loss": 4.158256176886264, + "tokens_seen": 292814848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602306920762287, + "loss": 3.0265, + "theoretical_loss": 4.158146564078982, + "tokens_seen": 292880384 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046022066198595787, + "loss": 3.0768, + "theoretical_loss": 4.158036982662237, + "tokens_seen": 292945920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046021063189568705, + "loss": 2.8461, + "theoretical_loss": 4.157927432620018, + "tokens_seen": 293011456 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602006018054163, + "loss": 3.1769, + "theoretical_loss": 4.157817913936329, + "tokens_seen": 293076992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046019057171514546, + "loss": 2.9548, + "theoretical_loss": 4.157708426595184, + "tokens_seen": 293142528 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046018054162487465, + "loss": 2.9129, + "theoretical_loss": 4.157598970580608, + "tokens_seen": 293208064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 177825, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.218212842941284, + "objective/train/theoretical_loss": 4.157489545876642, + "objective/train/tokens_used": 313733600, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601705115346038, + "loss": 3.1663, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460160481444333, + "loss": 3.228, + "theoretical_loss": 4.157380152467333, + "tokens_seen": 293339136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601504513540622, + "loss": 3.1855, + "theoretical_loss": 4.157270790336742, + "tokens_seen": 293404672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601404212637914, + "loss": 3.1031, + "theoretical_loss": 4.157161459468944, + "tokens_seen": 293470208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046013039117352055, + "loss": 3.11, + "theoretical_loss": 4.157052159848023, + "tokens_seen": 293535744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601203610832498, + "loss": 2.8797, + "theoretical_loss": 4.156942891458074, + "tokens_seen": 293601280 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046011033099297897, + "loss": 3.1523, + "theoretical_loss": 4.156833654283207, + "tokens_seen": 293666816 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046010030090270815, + "loss": 2.9046, + "theoretical_loss": 4.15672444830754, + "tokens_seen": 293732352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046009027081243733, + "loss": 3.1712, + "theoretical_loss": 4.156615273515205, + "tokens_seen": 293797888 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600802407221665, + "loss": 3.1373, + "theoretical_loss": 4.156506129890344, + "tokens_seen": 293863424 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600702106318957, + "loss": 3.2344, + "theoretical_loss": 4.156397017417111, + "tokens_seen": 293928960 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046006018054162493, + "loss": 3.2112, + "theoretical_loss": 4.156287936079675, + "tokens_seen": 293994496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046005015045135405, + "loss": 3.186, + "theoretical_loss": 4.156178885862209, + "tokens_seen": 294060032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600401203610833, + "loss": 2.9581, + "theoretical_loss": 4.156069866748906, + "tokens_seen": 294125568 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600300902708124, + "loss": 3.0633, + "theoretical_loss": 4.155960878723965, + "tokens_seen": 294191104 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046002006018054165, + "loss": 2.9769, + "theoretical_loss": 4.155851921771598, + "tokens_seen": 294256640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046001003009027083, + "loss": 3.154, + "theoretical_loss": 4.155742995876029, + "tokens_seen": 294322176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046, + "loss": 3.2509, + "theoretical_loss": 4.155634101021494, + "tokens_seen": 294387712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599899699097292, + "loss": 2.9986, + "theoretical_loss": 4.155525237192238, + "tokens_seen": 294453248 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045997993981945843, + "loss": 3.0416, + "theoretical_loss": 4.155416404372522, + "tokens_seen": 294518784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045996990972918756, + "loss": 3.113, + "theoretical_loss": 4.155307602546614, + "tokens_seen": 294584320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599598796389168, + "loss": 3.1698, + "theoretical_loss": 4.155198831698795, + "tokens_seen": 294649856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599498495486459, + "loss": 3.1415, + "theoretical_loss": 4.155090091813358, + "tokens_seen": 294715392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045993981945837515, + "loss": 2.9568, + "theoretical_loss": 4.154981382874608, + "tokens_seen": 294780928 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045992978936810434, + "loss": 3.0327, + "theoretical_loss": 4.154872704866859, + "tokens_seen": 294846464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 179027, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5512094497680664, + "objective/train/theoretical_loss": 4.15476405777444, + "objective/train/tokens_used": 315372000, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599197592778335, + "loss": 2.9603, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599097291875627, + "loss": 3.1664, + "theoretical_loss": 4.154655441581687, + "tokens_seen": 294977536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598996990972919, + "loss": 2.91, + "theoretical_loss": 4.154546856272952, + "tokens_seen": 295043072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045988966900702106, + "loss": 2.9691, + "theoretical_loss": 4.154438301832596, + "tokens_seen": 295108608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598796389167503, + "loss": 3.0933, + "theoretical_loss": 4.154329778244991, + "tokens_seen": 295174144 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598696088264794, + "loss": 2.9358, + "theoretical_loss": 4.154221285494521, + "tokens_seen": 295239680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045985957873620866, + "loss": 3.2587, + "theoretical_loss": 4.154112823565582, + "tokens_seen": 295305216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598495486459378, + "loss": 3.3291, + "theoretical_loss": 4.15400439244258, + "tokens_seen": 295370752 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459839518555667, + "loss": 3.3911, + "theoretical_loss": 4.153895992109935, + "tokens_seen": 295436288 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598294884653962, + "loss": 2.9428, + "theoretical_loss": 4.153787622552073, + "tokens_seen": 295501824 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598194583751254, + "loss": 3.1373, + "theoretical_loss": 4.153679283753439, + "tokens_seen": 295567360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045980942828485456, + "loss": 3.1252, + "theoretical_loss": 4.15357097569848, + "tokens_seen": 295632896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597993981945838, + "loss": 3.1504, + "theoretical_loss": 4.153462698371665, + "tokens_seen": 295698432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597893681043129, + "loss": 3.1088, + "theoretical_loss": 4.1533544517574645, + "tokens_seen": 295763968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045977933801404216, + "loss": 2.9716, + "theoretical_loss": 4.153246235840367, + "tokens_seen": 295829504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597693079237713, + "loss": 3.2071, + "theoretical_loss": 4.153138050604868, + "tokens_seen": 295895040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597592778335005, + "loss": 3.2361, + "theoretical_loss": 4.153029896035476, + "tokens_seen": 295960576 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597492477432297, + "loss": 2.9966, + "theoretical_loss": 4.152921772116712, + "tokens_seen": 296026112 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597392176529589, + "loss": 3.1588, + "theoretical_loss": 4.152813678833106, + "tokens_seen": 296091648 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045972918756268807, + "loss": 3.2001, + "theoretical_loss": 4.152705616169202, + "tokens_seen": 296157184 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045971915747241725, + "loss": 2.8042, + "theoretical_loss": 4.15259758410955, + "tokens_seen": 296222720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045970912738214643, + "loss": 3.135, + "theoretical_loss": 4.152489582638719, + "tokens_seen": 296288256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045969909729187566, + "loss": 3.3503, + "theoretical_loss": 4.152381611741281, + "tokens_seen": 296353792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596890672016048, + "loss": 3.0074, + "theoretical_loss": 4.152273671401824, + "tokens_seen": 296419328 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459679037111334, + "loss": 3.1205, + "theoretical_loss": 4.152165761604948, + "tokens_seen": 296484864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 179648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.256779432296753, + "objective/train/theoretical_loss": 4.152057882335261, + "objective/train/tokens_used": 317010400, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045966900702106315, + "loss": 3.1838, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596589769307924, + "loss": 3.178, + "theoretical_loss": 4.151950033577383, + "tokens_seen": 296615936 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045964894684052157, + "loss": 3.1013, + "theoretical_loss": 4.151842215315947, + "tokens_seen": 296681472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045963891675025075, + "loss": 3.056, + "theoretical_loss": 4.151734427535594, + "tokens_seen": 296747008 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045962888665997993, + "loss": 3.1372, + "theoretical_loss": 4.151626670220979, + "tokens_seen": 296812544 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045961885656970917, + "loss": 3.0696, + "theoretical_loss": 4.151518943356768, + "tokens_seen": 296878080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596088264794383, + "loss": 3.0229, + "theoretical_loss": 4.151411246927636, + "tokens_seen": 296943616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045959879638916753, + "loss": 2.9934, + "theoretical_loss": 4.15130358091827, + "tokens_seen": 297009152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045958876629889666, + "loss": 3.1986, + "theoretical_loss": 4.151195945313369, + "tokens_seen": 297074688 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595787362086259, + "loss": 3.0798, + "theoretical_loss": 4.151088340097642, + "tokens_seen": 297140224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595687061183551, + "loss": 3.0475, + "theoretical_loss": 4.15098076525581, + "tokens_seen": 297205760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045955867602808425, + "loss": 3.1249, + "theoretical_loss": 4.150873220772604, + "tokens_seen": 297271296 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045954864593781344, + "loss": 3.3091, + "theoretical_loss": 4.150765706632766, + "tokens_seen": 297336832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595386158475426, + "loss": 3.2366, + "theoretical_loss": 4.1506582228210505, + "tokens_seen": 297402368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595285857572718, + "loss": 2.9709, + "theoretical_loss": 4.150550769322221, + "tokens_seen": 297467904 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045951855566700103, + "loss": 3.1059, + "theoretical_loss": 4.150443346121054, + "tokens_seen": 297533440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045950852557673016, + "loss": 2.9407, + "theoretical_loss": 4.150335953202336, + "tokens_seen": 297598976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594984954864594, + "loss": 3.0818, + "theoretical_loss": 4.150228590550864, + "tokens_seen": 297664512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594884653961885, + "loss": 3.2076, + "theoretical_loss": 4.150121258151447, + "tokens_seen": 297730048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045947843530591776, + "loss": 2.9442, + "theoretical_loss": 4.150013955988905, + "tokens_seen": 297795584 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045946840521564694, + "loss": 3.1901, + "theoretical_loss": 4.149906684048068, + "tokens_seen": 297861120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594583751253761, + "loss": 2.6943, + "theoretical_loss": 4.1497994423137765, + "tokens_seen": 297926656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045944834503510536, + "loss": 2.9976, + "theoretical_loss": 4.149692230770884, + "tokens_seen": 297992192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045943831494483454, + "loss": 3.1648, + "theoretical_loss": 4.149585049404253, + "tokens_seen": 298057728 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594282848545637, + "loss": 3.1697, + "theoretical_loss": 4.149477898198759, + "tokens_seen": 298123264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 180307, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1973071098327637, + "objective/train/theoretical_loss": 4.149370777139286, + "objective/train/tokens_used": 318648800, + "theoretical_loss": 4.149370777139286, + "tokens_seen": 298188800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594182547642929, + "loss": 3.0318, + "theoretical_loss": 4.149370777139286, + "tokens_seen": 298188800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594082246740221, + "loss": 3.1242, + "theoretical_loss": 4.14926368621073, + "tokens_seen": 298254336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045939819458375126, + "loss": 3.0613, + "theoretical_loss": 4.149156625397998, + "tokens_seen": 298319872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593881644934805, + "loss": 2.8588, + "theoretical_loss": 4.149049594686008, + "tokens_seen": 298385408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593781344032096, + "loss": 2.7734, + "theoretical_loss": 4.1489425940596885, + "tokens_seen": 298450944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045936810431293886, + "loss": 2.7681, + "theoretical_loss": 4.148835623503978, + "tokens_seen": 298516480 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459358074222668, + "loss": 3.1072, + "theoretical_loss": 4.148728683003829, + "tokens_seen": 298582016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593480441323972, + "loss": 3.1108, + "theoretical_loss": 4.1486217725442005, + "tokens_seen": 298647552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593380140421264, + "loss": 2.91, + "theoretical_loss": 4.148514892110065, + "tokens_seen": 298713088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593279839518556, + "loss": 3.2009, + "theoretical_loss": 4.148408041686406, + "tokens_seen": 298778624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045931795386158476, + "loss": 3.0153, + "theoretical_loss": 4.148301221258217, + "tokens_seen": 298844160 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459307923771314, + "loss": 3.2481, + "theoretical_loss": 4.148194430810502, + "tokens_seen": 298909696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592978936810431, + "loss": 3.3113, + "theoretical_loss": 4.148087670328276, + "tokens_seen": 298975232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045928786359077236, + "loss": 3.2193, + "theoretical_loss": 4.147980939796565, + "tokens_seen": 299040768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592778335005015, + "loss": 3.0393, + "theoretical_loss": 4.147874239200405, + "tokens_seen": 299106304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592678034102307, + "loss": 2.8427, + "theoretical_loss": 4.147767568524845, + "tokens_seen": 299171840 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592577733199599, + "loss": 3.1985, + "theoretical_loss": 4.147660927754942, + "tokens_seen": 299237376 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592477432296891, + "loss": 2.9435, + "theoretical_loss": 4.147554316875766, + "tokens_seen": 299302912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045923771313941827, + "loss": 3.0539, + "theoretical_loss": 4.147447735872396, + "tokens_seen": 299368448 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045922768304914745, + "loss": 2.9897, + "theoretical_loss": 4.147341184729921, + "tokens_seen": 299433984 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045921765295887663, + "loss": 2.9482, + "theoretical_loss": 4.147234663433444, + "tokens_seen": 299499520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045920762286860586, + "loss": 2.8237, + "theoretical_loss": 4.147128171968077, + "tokens_seen": 299565056 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459197592778335, + "loss": 3.1106, + "theoretical_loss": 4.14702171031894, + "tokens_seen": 299630592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591875626880642, + "loss": 2.9476, + "theoretical_loss": 4.146915278471169, + "tokens_seen": 299696128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045917753259779335, + "loss": 2.968, + "theoretical_loss": 4.146808876409906, + "tokens_seen": 299761664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 181680, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3923745155334473, + "objective/train/theoretical_loss": 4.146702504120305, + "objective/train/tokens_used": 320287200, + "theoretical_loss": 4.146702504120305, + "tokens_seen": 299827200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591675025075226, + "loss": 2.9731, + "theoretical_loss": 4.146702504120305, + "tokens_seen": 299827200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045915747241725177, + "loss": 2.7663, + "theoretical_loss": 4.146596161587532, + "tokens_seen": 299892736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045914744232698095, + "loss": 2.5711, + "theoretical_loss": 4.146489848796763, + "tokens_seen": 299958272 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045913741223671013, + "loss": 3.2189, + "theoretical_loss": 4.146383565733184, + "tokens_seen": 300023808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045912738214643937, + "loss": 3.0249, + "theoretical_loss": 4.146277312381991, + "tokens_seen": 300089344 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591173520561685, + "loss": 2.903, + "theoretical_loss": 4.1461710887283925, + "tokens_seen": 300154880 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045910732196589773, + "loss": 3.1524, + "theoretical_loss": 4.146064894757606, + "tokens_seen": 300220416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045909729187562686, + "loss": 2.9013, + "theoretical_loss": 4.145958730454861, + "tokens_seen": 300285952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590872617853561, + "loss": 2.9616, + "theoretical_loss": 4.145852595805396, + "tokens_seen": 300351488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590772316950853, + "loss": 3.0813, + "theoretical_loss": 4.145746490794461, + "tokens_seen": 300417024 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045906720160481445, + "loss": 3.1084, + "theoretical_loss": 4.145640415407317, + "tokens_seen": 300482560 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045905717151454364, + "loss": 2.9712, + "theoretical_loss": 4.145534369629234, + "tokens_seen": 300548096 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590471414242728, + "loss": 2.9661, + "theoretical_loss": 4.145428353445494, + "tokens_seen": 300613632 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459037111334002, + "loss": 3.211, + "theoretical_loss": 4.145322366841389, + "tokens_seen": 300679168 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045902708124373123, + "loss": 2.9262, + "theoretical_loss": 4.145216409802221, + "tokens_seen": 300744704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045901705115346036, + "loss": 2.9333, + "theoretical_loss": 4.145110482313304, + "tokens_seen": 300810240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590070210631896, + "loss": 3.0263, + "theoretical_loss": 4.1450045843599606, + "tokens_seen": 300875776 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589969909729187, + "loss": 2.906, + "theoretical_loss": 4.144898715927525, + "tokens_seen": 300941312 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045898696088264796, + "loss": 2.8445, + "theoretical_loss": 4.144792877001342, + "tokens_seen": 301006848 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045897693079237714, + "loss": 2.984, + "theoretical_loss": 4.144687067566765, + "tokens_seen": 301072384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589669007021063, + "loss": 2.9933, + "theoretical_loss": 4.144581287609161, + "tokens_seen": 301137920 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589568706118355, + "loss": 2.9471, + "theoretical_loss": 4.144475537113905, + "tokens_seen": 301203456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045894684052156474, + "loss": 3.0003, + "theoretical_loss": 4.144369816066385, + "tokens_seen": 301268992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045893681043129386, + "loss": 3.1696, + "theoretical_loss": 4.144264124451995, + "tokens_seen": 301334528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589267803410231, + "loss": 3.1541, + "theoretical_loss": 4.1441584622561445, + "tokens_seen": 301400064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 182385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2453272342681885, + "objective/train/theoretical_loss": 4.144052829464249, + "objective/train/tokens_used": 321925600, + "theoretical_loss": 4.144052829464249, + "tokens_seen": 301465600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589167502507522, + "loss": 3.212, + "theoretical_loss": 4.144052829464249, + "tokens_seen": 301465600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045890672016048146, + "loss": 3.221, + "theoretical_loss": 4.143947226061737, + "tokens_seen": 301531136 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045889669007021064, + "loss": 3.1414, + "theoretical_loss": 4.143841652034048, + "tokens_seen": 301596672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588866599799398, + "loss": 2.8214, + "theoretical_loss": 4.143736107366629, + "tokens_seen": 301662208 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458876629889669, + "loss": 2.9196, + "theoretical_loss": 4.14363059204494, + "tokens_seen": 301727744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588665997993982, + "loss": 2.8951, + "theoretical_loss": 4.14352510605445, + "tokens_seen": 301793280 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045885656970912737, + "loss": 3.2871, + "theoretical_loss": 4.143419649380639, + "tokens_seen": 301858816 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588465396188566, + "loss": 3.0498, + "theoretical_loss": 4.143314222008997, + "tokens_seen": 301924352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045883650952858573, + "loss": 3.2756, + "theoretical_loss": 4.143208823925024, + "tokens_seen": 301989888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045882647943831496, + "loss": 2.7953, + "theoretical_loss": 4.143103455114231, + "tokens_seen": 302055424 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045881644934804415, + "loss": 2.9904, + "theoretical_loss": 4.142998115562139, + "tokens_seen": 302120960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588064192577733, + "loss": 2.8618, + "theoretical_loss": 4.14289280525428, + "tokens_seen": 302186496 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587963891675025, + "loss": 2.8056, + "theoretical_loss": 4.142787524176194, + "tokens_seen": 302252032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587863590772317, + "loss": 2.9704, + "theoretical_loss": 4.142682272313435, + "tokens_seen": 302317568 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045877632898696087, + "loss": 3.2137, + "theoretical_loss": 4.142577049651563, + "tokens_seen": 302383104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587662988966901, + "loss": 3.2904, + "theoretical_loss": 4.142471856176152, + "tokens_seen": 302448640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045875626880641923, + "loss": 2.9358, + "theoretical_loss": 4.142366691872784, + "tokens_seen": 302514176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045874623871614847, + "loss": 3.1388, + "theoretical_loss": 4.142261556727052, + "tokens_seen": 302579712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587362086258776, + "loss": 2.8735, + "theoretical_loss": 4.14215645072456, + "tokens_seen": 302645248 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045872617853560683, + "loss": 2.9464, + "theoretical_loss": 4.14205137385092, + "tokens_seen": 302710784 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458716148445336, + "loss": 3.0534, + "theoretical_loss": 4.141946326091756, + "tokens_seen": 302776320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587061183550652, + "loss": 3.0235, + "theoretical_loss": 4.141841307432703, + "tokens_seen": 302841856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586960882647944, + "loss": 2.9913, + "theoretical_loss": 4.1417363178594035, + "tokens_seen": 302907392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045868605817452355, + "loss": 3.1955, + "theoretical_loss": 4.141631357357513, + "tokens_seen": 302972928 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586760280842528, + "loss": 3.1505, + "theoretical_loss": 4.141526425912694, + "tokens_seen": 303038464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 183646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1540284156799316, + "objective/train/theoretical_loss": 4.141421523510623, + "objective/train/tokens_used": 323564000, + "theoretical_loss": 4.141421523510623, + "tokens_seen": 303104000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045866599799398197, + "loss": 2.9251, + "theoretical_loss": 4.141421523510623, + "tokens_seen": 303104000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045865596790371115, + "loss": 3.11, + "theoretical_loss": 4.141316650136983, + "tokens_seen": 303169536 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045864593781344033, + "loss": 2.9084, + "theoretical_loss": 4.14121180577747, + "tokens_seen": 303235072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045863590772316957, + "loss": 3.2785, + "theoretical_loss": 4.141106990417789, + "tokens_seen": 303300608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586258776328987, + "loss": 2.9644, + "theoretical_loss": 4.141002204043654, + "tokens_seen": 303366144 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045861584754262793, + "loss": 3.293, + "theoretical_loss": 4.140897446640793, + "tokens_seen": 303431680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045860581745235706, + "loss": 2.9656, + "theoretical_loss": 4.1407927181949375, + "tokens_seen": 303497216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585957873620863, + "loss": 2.9074, + "theoretical_loss": 4.140688018691835, + "tokens_seen": 303562752 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585857572718155, + "loss": 2.884, + "theoretical_loss": 4.140583348117241, + "tokens_seen": 303628288 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045857572718154465, + "loss": 3.0019, + "theoretical_loss": 4.140478706456921, + "tokens_seen": 303693824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045856569709127384, + "loss": 3.0583, + "theoretical_loss": 4.140374093696651, + "tokens_seen": 303759360 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458555667001003, + "loss": 3.1779, + "theoretical_loss": 4.1402695098222155, + "tokens_seen": 303824896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585456369107322, + "loss": 2.8, + "theoretical_loss": 4.140164954819412, + "tokens_seen": 303890432 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045853560682046143, + "loss": 3.1757, + "theoretical_loss": 4.140060428674046, + "tokens_seen": 303955968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045852557673019056, + "loss": 3.1018, + "theoretical_loss": 4.139955931371932, + "tokens_seen": 304021504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585155466399198, + "loss": 2.8922, + "theoretical_loss": 4.139851462898897, + "tokens_seen": 304087040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585055165496489, + "loss": 3.0495, + "theoretical_loss": 4.139747023240777, + "tokens_seen": 304152576 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045849548645937816, + "loss": 3.159, + "theoretical_loss": 4.139642612383418, + "tokens_seen": 304218112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045848545636910734, + "loss": 3.3414, + "theoretical_loss": 4.1395382303126755, + "tokens_seen": 304283648 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584754262788365, + "loss": 3.0128, + "theoretical_loss": 4.139433877014415, + "tokens_seen": 304349184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584653961885657, + "loss": 3.2096, + "theoretical_loss": 4.139329552474514, + "tokens_seen": 304414720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045845536609829494, + "loss": 2.9645, + "theoretical_loss": 4.139225256678857, + "tokens_seen": 304480256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045844533600802406, + "loss": 3.2456, + "theoretical_loss": 4.139120989613341, + "tokens_seen": 304545792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584353059177533, + "loss": 3.2019, + "theoretical_loss": 4.1390167512638705, + "tokens_seen": 304611328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584252758274824, + "loss": 3.2112, + "theoretical_loss": 4.138912541616363, + "tokens_seen": 304676864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 184075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.958650827407837, + "objective/train/theoretical_loss": 4.138808360656742, + "objective/train/tokens_used": 325202400, + "theoretical_loss": 4.138808360656742, + "tokens_seen": 304742400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045841524573721166, + "loss": 2.9379, + "theoretical_loss": 4.138808360656742, + "tokens_seen": 304742400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045840521564694084, + "loss": 3.0492, + "theoretical_loss": 4.138704208370944, + "tokens_seen": 304807936 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045839518555667, + "loss": 2.9333, + "theoretical_loss": 4.138600084744915, + "tokens_seen": 304873472 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583851554663992, + "loss": 3.0129, + "theoretical_loss": 4.1384959897646105, + "tokens_seen": 304939008 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583751253761284, + "loss": 3.1081, + "theoretical_loss": 4.138391923415996, + "tokens_seen": 305004544 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045836509528585757, + "loss": 3.0953, + "theoretical_loss": 4.138287885685045, + "tokens_seen": 305070080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583550651955868, + "loss": 3.0357, + "theoretical_loss": 4.138183876557745, + "tokens_seen": 305135616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045834503510531593, + "loss": 3.2149, + "theoretical_loss": 4.1380798960200895, + "tokens_seen": 305201152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045833500501504516, + "loss": 3.0338, + "theoretical_loss": 4.137975944058083, + "tokens_seen": 305266688 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045832497492477435, + "loss": 3.0291, + "theoretical_loss": 4.137872020657742, + "tokens_seen": 305332224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583149448345035, + "loss": 3.1508, + "theoretical_loss": 4.1377681258050885, + "tokens_seen": 305397760 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583049147442327, + "loss": 2.9683, + "theoretical_loss": 4.13766425948616, + "tokens_seen": 305463296 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582948846539619, + "loss": 3.0612, + "theoretical_loss": 4.137560421686998, + "tokens_seen": 305528832 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045828485456369107, + "loss": 2.8031, + "theoretical_loss": 4.137456612393658, + "tokens_seen": 305594368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582748244734203, + "loss": 3.2981, + "theoretical_loss": 4.137352831592203, + "tokens_seen": 305659904 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045826479438314943, + "loss": 3.1212, + "theoretical_loss": 4.137249079268707, + "tokens_seen": 305725440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045825476429287867, + "loss": 3.1303, + "theoretical_loss": 4.137145355409253, + "tokens_seen": 305790976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582447342026078, + "loss": 3.0405, + "theoretical_loss": 4.137041659999936, + "tokens_seen": 305856512 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045823470411233703, + "loss": 3.0228, + "theoretical_loss": 4.136937993026857, + "tokens_seen": 305922048 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582246740220662, + "loss": 3.0352, + "theoretical_loss": 4.136834354476129, + "tokens_seen": 305987584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582146439317954, + "loss": 3.0844, + "theoretical_loss": 4.1367307443338746, + "tokens_seen": 306053120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045820461384152457, + "loss": 3.0252, + "theoretical_loss": 4.136627162586226, + "tokens_seen": 306118656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045819458375125375, + "loss": 2.9533, + "theoretical_loss": 4.136523609219327, + "tokens_seen": 306184192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045818455366098294, + "loss": 3.0824, + "theoretical_loss": 4.136420084219327, + "tokens_seen": 306249728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045817452357071217, + "loss": 2.8966, + "theoretical_loss": 4.136316587572388, + "tokens_seen": 306315264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 185377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6815130710601807, + "objective/train/theoretical_loss": 4.136213119264681, + "objective/train/tokens_used": 326840800, + "theoretical_loss": 4.136213119264681, + "tokens_seen": 306380800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581644934804413, + "loss": 3.2741, + "theoretical_loss": 4.136213119264681, + "tokens_seen": 306380800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045815446339017053, + "loss": 3.0248, + "theoretical_loss": 4.136109679282388, + "tokens_seen": 306446336 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581444332998997, + "loss": 3.086, + "theoretical_loss": 4.136006267611697, + "tokens_seen": 306511872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581344032096289, + "loss": 3.1913, + "theoretical_loss": 4.135902884238812, + "tokens_seen": 306577408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581243731193581, + "loss": 3.2079, + "theoretical_loss": 4.135799529149939, + "tokens_seen": 306642944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045811434302908726, + "loss": 3.0883, + "theoretical_loss": 4.1356962023313, + "tokens_seen": 306708480 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045810431293881644, + "loss": 3.0245, + "theoretical_loss": 4.135592903769124, + "tokens_seen": 306774016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580942828485457, + "loss": 2.9307, + "theoretical_loss": 4.135489633449649, + "tokens_seen": 306839552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580842527582748, + "loss": 2.8345, + "theoretical_loss": 4.135386391359123, + "tokens_seen": 306905088 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045807422266800404, + "loss": 3.2493, + "theoretical_loss": 4.135283177483807, + "tokens_seen": 306970624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045806419257773316, + "loss": 3.0784, + "theoretical_loss": 4.135179991809965, + "tokens_seen": 307036160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580541624874624, + "loss": 2.9564, + "theoretical_loss": 4.135076834323876, + "tokens_seen": 307101696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580441323971916, + "loss": 3.2303, + "theoretical_loss": 4.134973705011828, + "tokens_seen": 307167232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045803410230692076, + "loss": 2.8665, + "theoretical_loss": 4.134870603860117, + "tokens_seen": 307232768 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045802407221664994, + "loss": 3.1065, + "theoretical_loss": 4.134767530855047, + "tokens_seen": 307298304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580140421263791, + "loss": 3.1849, + "theoretical_loss": 4.1346644859829365, + "tokens_seen": 307363840 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580040120361083, + "loss": 3.1406, + "theoretical_loss": 4.1345614692301105, + "tokens_seen": 307429376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045799398194583754, + "loss": 3.0077, + "theoretical_loss": 4.134458480582902, + "tokens_seen": 307494912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045798395185556667, + "loss": 2.9305, + "theoretical_loss": 4.134355520027657, + "tokens_seen": 307560448 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579739217652959, + "loss": 3.3013, + "theoretical_loss": 4.134252587550728, + "tokens_seen": 307625984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579638916750251, + "loss": 2.8856, + "theoretical_loss": 4.134149683138481, + "tokens_seen": 307691520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045795386158475426, + "loss": 2.9128, + "theoretical_loss": 4.134046806777286, + "tokens_seen": 307757056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579438314944835, + "loss": 3.0849, + "theoretical_loss": 4.133943958453528, + "tokens_seen": 307822592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579338014042126, + "loss": 2.9414, + "theoretical_loss": 4.133841138153597, + "tokens_seen": 307888128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045792377131394186, + "loss": 3.2586, + "theoretical_loss": 4.133738345863896, + "tokens_seen": 307953664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 186163, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.693579912185669, + "objective/train/theoretical_loss": 4.133635581570836, + "objective/train/tokens_used": 328479200, + "theoretical_loss": 4.133635581570836, + "tokens_seen": 308019200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045791374122367104, + "loss": 3.0399, + "theoretical_loss": 4.133635581570836, + "tokens_seen": 308019200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579037111334002, + "loss": 3.0709, + "theoretical_loss": 4.133532845260836, + "tokens_seen": 308084736 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578936810431294, + "loss": 3.0057, + "theoretical_loss": 4.133430136920327, + "tokens_seen": 308150272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578836509528586, + "loss": 3.1167, + "theoretical_loss": 4.133327456535749, + "tokens_seen": 308215808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045787362086258777, + "loss": 2.8566, + "theoretical_loss": 4.13322480409355, + "tokens_seen": 308281344 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457863590772317, + "loss": 2.9801, + "theoretical_loss": 4.133122179580189, + "tokens_seen": 308346880 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045785356068204613, + "loss": 2.9788, + "theoretical_loss": 4.133019582982134, + "tokens_seen": 308412416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045784353059177536, + "loss": 3.0067, + "theoretical_loss": 4.1329170142858604, + "tokens_seen": 308477952 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045783350050150455, + "loss": 3.0007, + "theoretical_loss": 4.132814473477857, + "tokens_seen": 308543488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578234704112337, + "loss": 3.2335, + "theoretical_loss": 4.1327119605446185, + "tokens_seen": 308609024 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578134403209629, + "loss": 2.903, + "theoretical_loss": 4.132609475472651, + "tokens_seen": 308674560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578034102306921, + "loss": 3.0954, + "theoretical_loss": 4.132507018248469, + "tokens_seen": 308740096 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045779338014042127, + "loss": 3.0897, + "theoretical_loss": 4.132404588858597, + "tokens_seen": 308805632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577833500501505, + "loss": 3.0264, + "theoretical_loss": 4.132302187289568, + "tokens_seen": 308871168 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045777331995987963, + "loss": 3.3141, + "theoretical_loss": 4.132199813527926, + "tokens_seen": 308936704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045776328986960887, + "loss": 3.1779, + "theoretical_loss": 4.132097467560223, + "tokens_seen": 309002240 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457753259779338, + "loss": 3.2129, + "theoretical_loss": 4.1319951493730205, + "tokens_seen": 309067776 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045774322968906723, + "loss": 3.3985, + "theoretical_loss": 4.131892858952889, + "tokens_seen": 309133312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577331995987964, + "loss": 3.0588, + "theoretical_loss": 4.131790596286409, + "tokens_seen": 309198848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577231695085256, + "loss": 2.8701, + "theoretical_loss": 4.1316883613601725, + "tokens_seen": 309264384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577131394182548, + "loss": 3.0681, + "theoretical_loss": 4.131586154160775, + "tokens_seen": 309329920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045770310932798395, + "loss": 3.0305, + "theoretical_loss": 4.131483974674827, + "tokens_seen": 309395456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045769307923771314, + "loss": 2.9912, + "theoretical_loss": 4.131381822888946, + "tokens_seen": 309460992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045768304914744237, + "loss": 2.9485, + "theoretical_loss": 4.131279698789759, + "tokens_seen": 309526528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576730190571715, + "loss": 3.1318, + "theoretical_loss": 4.1311776023639, + "tokens_seen": 309592064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 187126, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0981154441833496, + "objective/train/theoretical_loss": 4.131075533598018, + "objective/train/tokens_used": 330117600, + "theoretical_loss": 4.131075533598018, + "tokens_seen": 309657600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045766298896690073, + "loss": 3.0551, + "theoretical_loss": 4.131075533598018, + "tokens_seen": 309657600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576529588766299, + "loss": 3.3009, + "theoretical_loss": 4.130973492478766, + "tokens_seen": 309723136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576429287863591, + "loss": 3.293, + "theoretical_loss": 4.130871478992807, + "tokens_seen": 309788672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576328986960883, + "loss": 3.2951, + "theoretical_loss": 4.130769493126817, + "tokens_seen": 309854208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045762286860581746, + "loss": 3.1751, + "theoretical_loss": 4.130667534867476, + "tokens_seen": 309919744 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045761283851554664, + "loss": 3.2966, + "theoretical_loss": 4.130565604201477, + "tokens_seen": 309985280 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576028084252759, + "loss": 2.9467, + "theoretical_loss": 4.130463701115521, + "tokens_seen": 310050816 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457592778335005, + "loss": 2.7801, + "theoretical_loss": 4.130361825596317, + "tokens_seen": 310116352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045758274824473424, + "loss": 3.1128, + "theoretical_loss": 4.130259977630586, + "tokens_seen": 310181888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045757271815446336, + "loss": 3.0139, + "theoretical_loss": 4.130158157205056, + "tokens_seen": 310247424 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575626880641926, + "loss": 3.1759, + "theoretical_loss": 4.130056364306465, + "tokens_seen": 310312960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575526579739218, + "loss": 3.0092, + "theoretical_loss": 4.129954598921559, + "tokens_seen": 310378496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045754262788365096, + "loss": 2.7612, + "theoretical_loss": 4.1298528610370955, + "tokens_seen": 310444032 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045753259779338014, + "loss": 3.0608, + "theoretical_loss": 4.12975115063984, + "tokens_seen": 310509568 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575225677031093, + "loss": 2.6856, + "theoretical_loss": 4.129649467716565, + "tokens_seen": 310575104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575125376128385, + "loss": 3.2868, + "theoretical_loss": 4.1295478122540565, + "tokens_seen": 310640640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045750250752256774, + "loss": 3.2094, + "theoretical_loss": 4.1294461842391055, + "tokens_seen": 310706176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045749247743229687, + "loss": 2.9271, + "theoretical_loss": 4.129344583658516, + "tokens_seen": 310771712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574824473420261, + "loss": 3.1971, + "theoretical_loss": 4.1292430104990965, + "tokens_seen": 310837248 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574724172517553, + "loss": 2.8271, + "theoretical_loss": 4.1291414647476685, + "tokens_seen": 310902784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045746238716148446, + "loss": 2.9373, + "theoretical_loss": 4.129039946391062, + "tokens_seen": 310968320 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045745235707121364, + "loss": 3.2863, + "theoretical_loss": 4.128938455416115, + "tokens_seen": 311033856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574423269809428, + "loss": 3.0255, + "theoretical_loss": 4.128836991809674, + "tokens_seen": 311099392 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457432296890672, + "loss": 3.1146, + "theoretical_loss": 4.128735555558597, + "tokens_seen": 311164928 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045742226680040124, + "loss": 2.9283, + "theoretical_loss": 4.128634146649748, + "tokens_seen": 311230464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 187875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4823386669158936, + "objective/train/theoretical_loss": 4.128532765070004, + "objective/train/tokens_used": 331756000, + "theoretical_loss": 4.128532765070004, + "tokens_seen": 311296000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045741223671013037, + "loss": 3.1208, + "theoretical_loss": 4.128532765070004, + "tokens_seen": 311296000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574022066198596, + "loss": 2.94, + "theoretical_loss": 4.128431410806247, + "tokens_seen": 311361536 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045739217652958873, + "loss": 2.9579, + "theoretical_loss": 4.12833008384537, + "tokens_seen": 311427072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045738214643931797, + "loss": 3.2517, + "theoretical_loss": 4.128228784174275, + "tokens_seen": 311492608 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045737211634904715, + "loss": 3.2745, + "theoretical_loss": 4.128127511779873, + "tokens_seen": 311558144 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045736208625877633, + "loss": 2.9142, + "theoretical_loss": 4.128026266649085, + "tokens_seen": 311623680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004573520561685055, + "loss": 3.03, + "theoretical_loss": 4.127925048768839, + "tokens_seen": 311689216 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045734202607823475, + "loss": 3.068, + "theoretical_loss": 4.127823858126073, + "tokens_seen": 311754752 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045733199598796387, + "loss": 3.1706, + "theoretical_loss": 4.1277226947077335, + "tokens_seen": 311820288 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004573219658976931, + "loss": 2.9845, + "theoretical_loss": 4.127621558500778, + "tokens_seen": 311885824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045731193580742223, + "loss": 2.9558, + "theoretical_loss": 4.12752044949217, + "tokens_seen": 311951360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045730190571715147, + "loss": 3.0432, + "theoretical_loss": 4.127419367668884, + "tokens_seen": 312016896 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045729187562688065, + "loss": 2.9326, + "theoretical_loss": 4.127318313017904, + "tokens_seen": 312082432 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045728184553660983, + "loss": 3.057, + "theoretical_loss": 4.12721728552622, + "tokens_seen": 312147968 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457271815446339, + "loss": 2.9781, + "theoretical_loss": 4.1271162851808345, + "tokens_seen": 312213504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572617853560682, + "loss": 3.2397, + "theoretical_loss": 4.127015311968757, + "tokens_seen": 312279040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572517552657974, + "loss": 3.0724, + "theoretical_loss": 4.126914365877004, + "tokens_seen": 312344576 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572417251755266, + "loss": 3.1184, + "theoretical_loss": 4.126813446892607, + "tokens_seen": 312410112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045723169508525574, + "loss": 3.0199, + "theoretical_loss": 4.1267125550026, + "tokens_seen": 312475648 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457221664994985, + "loss": 3.1512, + "theoretical_loss": 4.1266116901940295, + "tokens_seen": 312541184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572116349047141, + "loss": 2.9637, + "theoretical_loss": 4.126510852453949, + "tokens_seen": 312606720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045720160481444334, + "loss": 3.1577, + "theoretical_loss": 4.126410041769423, + "tokens_seen": 312672256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045719157472417257, + "loss": 2.8805, + "theoretical_loss": 4.126309258127524, + "tokens_seen": 312737792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571815446339017, + "loss": 3.2546, + "theoretical_loss": 4.126208501515331, + "tokens_seen": 312803328 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045717151454363093, + "loss": 3.0705, + "theoretical_loss": 4.126107771919935, + "tokens_seen": 312868864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 188927, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9257431030273438, + "objective/train/theoretical_loss": 4.126007069328436, + "objective/train/tokens_used": 333394400, + "theoretical_loss": 4.126007069328436, + "tokens_seen": 312934400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571614844533601, + "loss": 3.1474, + "theoretical_loss": 4.126007069328436, + "tokens_seen": 312934400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571514543630893, + "loss": 3.0964, + "theoretical_loss": 4.125906393727941, + "tokens_seen": 312999936 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571414242728185, + "loss": 3.2524, + "theoretical_loss": 4.125805745105566, + "tokens_seen": 313065472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045713139418254766, + "loss": 3.0474, + "theoretical_loss": 4.125705123448437, + "tokens_seen": 313131008 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045712136409227684, + "loss": 3.2728, + "theoretical_loss": 4.125604528743689, + "tokens_seen": 313196544 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571113340020061, + "loss": 3.2944, + "theoretical_loss": 4.125503960978464, + "tokens_seen": 313262080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571013039117352, + "loss": 3.0976, + "theoretical_loss": 4.1254034201399135, + "tokens_seen": 313327616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045709127382146444, + "loss": 3.0754, + "theoretical_loss": 4.125302906215199, + "tokens_seen": 313393152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045708124373119356, + "loss": 3.1683, + "theoretical_loss": 4.12520241919149, + "tokens_seen": 313458688 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570712136409228, + "loss": 3.1685, + "theoretical_loss": 4.125101959055965, + "tokens_seen": 313524224 + }, + { + "epoch": 0.1, + "learning_rate": 0.000457061183550652, + "loss": 3.3666, + "theoretical_loss": 4.125001525795811, + "tokens_seen": 313589760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045705115346038116, + "loss": 3.0122, + "theoretical_loss": 4.124901119398222, + "tokens_seen": 313655296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045704112337011034, + "loss": 3.522, + "theoretical_loss": 4.124800739850406, + "tokens_seen": 313720832 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570310932798395, + "loss": 2.9341, + "theoretical_loss": 4.124700387139574, + "tokens_seen": 313786368 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570210631895687, + "loss": 3.1314, + "theoretical_loss": 4.12460006125295, + "tokens_seen": 313851904 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045701103309929794, + "loss": 3.1766, + "theoretical_loss": 4.124499762177764, + "tokens_seen": 313917440 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045700100300902707, + "loss": 3.2033, + "theoretical_loss": 4.124399489901254, + "tokens_seen": 313982976 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569909729187563, + "loss": 3.1851, + "theoretical_loss": 4.124299244410672, + "tokens_seen": 314048512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569809428284855, + "loss": 3.3205, + "theoretical_loss": 4.124199025693272, + "tokens_seen": 314114048 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045697091273821466, + "loss": 2.9844, + "theoretical_loss": 4.124098833736321, + "tokens_seen": 314179584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045696088264794384, + "loss": 3.2222, + "theoretical_loss": 4.123998668527094, + "tokens_seen": 314245120 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456950852557673, + "loss": 3.1528, + "theoretical_loss": 4.123898530052874, + "tokens_seen": 314310656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569408224674022, + "loss": 3.2758, + "theoretical_loss": 4.123798418300953, + "tokens_seen": 314376192 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045693079237713144, + "loss": 3.1248, + "theoretical_loss": 4.123698333258631, + "tokens_seen": 314441728 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045692076228686057, + "loss": 3.2705, + "theoretical_loss": 4.123598274913219, + "tokens_seen": 314507264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 189547, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8021438121795654, + "objective/train/theoretical_loss": 4.123498243252032, + "objective/train/tokens_used": 335032800, + "theoretical_loss": 4.123498243252032, + "tokens_seen": 314572800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569107321965898, + "loss": 2.9524, + "theoretical_loss": 4.123498243252032, + "tokens_seen": 314572800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045690070210631893, + "loss": 2.9818, + "theoretical_loss": 4.1233982382624, + "tokens_seen": 314638336 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045689067201604817, + "loss": 3.1294, + "theoretical_loss": 4.123298259931657, + "tokens_seen": 314703872 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045688064192577735, + "loss": 3.2011, + "theoretical_loss": 4.123198308247146, + "tokens_seen": 314769408 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045687061183550653, + "loss": 3.2102, + "theoretical_loss": 4.123098383196222, + "tokens_seen": 314834944 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004568605817452357, + "loss": 3.0984, + "theoretical_loss": 4.122998484766244, + "tokens_seen": 314900480 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045685055165496495, + "loss": 3.0736, + "theoretical_loss": 4.122898612944582, + "tokens_seen": 314966016 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045684052156469407, + "loss": 3.4115, + "theoretical_loss": 4.122798767718616, + "tokens_seen": 315031552 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004568304914744233, + "loss": 3.2703, + "theoretical_loss": 4.122698949075732, + "tokens_seen": 315097088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045682046138415243, + "loss": 3.3034, + "theoretical_loss": 4.122599157003327, + "tokens_seen": 315162624 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045681043129388167, + "loss": 3.0698, + "theoretical_loss": 4.1224993914888035, + "tokens_seen": 315228160 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045680040120361085, + "loss": 3.2926, + "theoretical_loss": 4.122399652519576, + "tokens_seen": 315293696 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045679037111334003, + "loss": 3.0107, + "theoretical_loss": 4.122299940083065, + "tokens_seen": 315359232 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567803410230692, + "loss": 3.1404, + "theoretical_loss": 4.1222002541667, + "tokens_seen": 315424768 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567703109327984, + "loss": 3.2998, + "theoretical_loss": 4.122100594757921, + "tokens_seen": 315490304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567602808425276, + "loss": 3.2533, + "theoretical_loss": 4.122000961844175, + "tokens_seen": 315555840 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567502507522568, + "loss": 3.2643, + "theoretical_loss": 4.121901355412917, + "tokens_seen": 315621376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045674022066198594, + "loss": 3.0887, + "theoretical_loss": 4.121801775451612, + "tokens_seen": 315686912 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567301905717152, + "loss": 3.0258, + "theoretical_loss": 4.121702221947732, + "tokens_seen": 315752448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567201604814443, + "loss": 3.0984, + "theoretical_loss": 4.121602694888759, + "tokens_seen": 315817984 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045671013039117354, + "loss": 3.0961, + "theoretical_loss": 4.121503194262183, + "tokens_seen": 315883520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567001003009027, + "loss": 3.4246, + "theoretical_loss": 4.121403720055502, + "tokens_seen": 315949056 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566900702106319, + "loss": 3.1232, + "theoretical_loss": 4.121304272256222, + "tokens_seen": 316014592 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566800401203611, + "loss": 3.2672, + "theoretical_loss": 4.121204850851861, + "tokens_seen": 316080128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566700100300903, + "loss": 3.2468, + "theoretical_loss": 4.121105455829939, + "tokens_seen": 316145664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 190733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.28241229057312, + "objective/train/theoretical_loss": 4.121006087177992, + "objective/train/tokens_used": 336671200, + "theoretical_loss": 4.121006087177992, + "tokens_seen": 316211200 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045665997993981944, + "loss": 3.0047, + "theoretical_loss": 4.121006087177992, + "tokens_seen": 316211200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566499498495487, + "loss": 3.0604, + "theoretical_loss": 4.120906744883559, + "tokens_seen": 316276736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566399197592778, + "loss": 3.1629, + "theoretical_loss": 4.120807428934189, + "tokens_seen": 316342272 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045662988966900704, + "loss": 3.0635, + "theoretical_loss": 4.120708139317441, + "tokens_seen": 316407808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566198595787362, + "loss": 3.2192, + "theoretical_loss": 4.12060887602088, + "tokens_seen": 316473344 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566098294884654, + "loss": 3.1621, + "theoretical_loss": 4.120509639032081, + "tokens_seen": 316538880 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565997993981946, + "loss": 2.9827, + "theoretical_loss": 4.120410428338628, + "tokens_seen": 316604416 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045658976930792376, + "loss": 2.9799, + "theoretical_loss": 4.120311243928111, + "tokens_seen": 316669952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045657973921765294, + "loss": 3.195, + "theoretical_loss": 4.120212085788131, + "tokens_seen": 316735488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565697091273822, + "loss": 3.0131, + "theoretical_loss": 4.120112953906296, + "tokens_seen": 316801024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565596790371113, + "loss": 3.2145, + "theoretical_loss": 4.120013848270222, + "tokens_seen": 316866560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045654964894684054, + "loss": 3.1744, + "theoretical_loss": 4.119914768867536, + "tokens_seen": 316932096 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045653961885656967, + "loss": 2.9482, + "theoretical_loss": 4.11981571568587, + "tokens_seen": 316997632 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565295887662989, + "loss": 3.1156, + "theoretical_loss": 4.119716688712866, + "tokens_seen": 317063168 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565195586760281, + "loss": 3.0127, + "theoretical_loss": 4.119617687936175, + "tokens_seen": 317128704 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045650952858575727, + "loss": 2.9893, + "theoretical_loss": 4.119518713343455, + "tokens_seen": 317194240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045649949849548645, + "loss": 2.9135, + "theoretical_loss": 4.119419764922374, + "tokens_seen": 317259776 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564894684052157, + "loss": 2.9428, + "theoretical_loss": 4.119320842660606, + "tokens_seen": 317325312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564794383149448, + "loss": 3.0893, + "theoretical_loss": 4.119221946545836, + "tokens_seen": 317390848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045646940822467405, + "loss": 2.954, + "theoretical_loss": 4.119123076565755, + "tokens_seen": 317456384 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045645937813440317, + "loss": 3.1031, + "theoretical_loss": 4.119024232708064, + "tokens_seen": 317521920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564493480441324, + "loss": 3.1224, + "theoretical_loss": 4.118925414960472, + "tokens_seen": 317587456 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045643931795386164, + "loss": 3.1016, + "theoretical_loss": 4.118826623310696, + "tokens_seen": 317652992 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045642928786359077, + "loss": 3.0232, + "theoretical_loss": 4.11872785774646, + "tokens_seen": 317718528 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045641925777332, + "loss": 3.211, + "theoretical_loss": 4.1186291182555, + "tokens_seen": 317784064 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 191330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.303915023803711, + "objective/train/theoretical_loss": 4.118530404825556, + "objective/train/tokens_used": 338309600, + "theoretical_loss": 4.118530404825556, + "tokens_seen": 317849600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045640922768304913, + "loss": 3.1621, + "theoretical_loss": 4.118530404825556, + "tokens_seen": 317849600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045639919759277837, + "loss": 3.2408, + "theoretical_loss": 4.11843171744438, + "tokens_seen": 317915136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045638916750250755, + "loss": 3.0122, + "theoretical_loss": 4.118333056099728, + "tokens_seen": 317980672 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045637913741223673, + "loss": 3.3026, + "theoretical_loss": 4.11823442077937, + "tokens_seen": 318046208 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004563691073219659, + "loss": 3.1502, + "theoretical_loss": 4.1181358114710775, + "tokens_seen": 318111744 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045635907723169515, + "loss": 3.1201, + "theoretical_loss": 4.1180372281626365, + "tokens_seen": 318177280 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045634904714142427, + "loss": 2.9375, + "theoretical_loss": 4.117938670841838, + "tokens_seen": 318242816 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004563390170511535, + "loss": 3.2836, + "theoretical_loss": 4.117840139496482, + "tokens_seen": 318308352 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045632898696088263, + "loss": 3.2809, + "theoretical_loss": 4.117741634114376, + "tokens_seen": 318373888 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045631895687061187, + "loss": 3.3275, + "theoretical_loss": 4.1176431546833365, + "tokens_seen": 318439424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045630892678034105, + "loss": 3.2046, + "theoretical_loss": 4.117544701191187, + "tokens_seen": 318504960 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045629889669007023, + "loss": 2.7935, + "theoretical_loss": 4.117446273625763, + "tokens_seen": 318570496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562888665997994, + "loss": 3.1972, + "theoretical_loss": 4.117347871974903, + "tokens_seen": 318636032 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562788365095286, + "loss": 3.283, + "theoretical_loss": 4.1172494962264565, + "tokens_seen": 318701568 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562688064192578, + "loss": 3.0075, + "theoretical_loss": 4.117151146368282, + "tokens_seen": 318767104 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456258776328987, + "loss": 3.2145, + "theoretical_loss": 4.117052822388243, + "tokens_seen": 318832640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045624874623871614, + "loss": 2.6977, + "theoretical_loss": 4.116954524274216, + "tokens_seen": 318898176 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562387161484454, + "loss": 3.2573, + "theoretical_loss": 4.11685625201408, + "tokens_seen": 318963712 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562286860581745, + "loss": 2.9499, + "theoretical_loss": 4.116758005595727, + "tokens_seen": 319029248 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045621865596790374, + "loss": 2.9781, + "theoretical_loss": 4.116659785007055, + "tokens_seen": 319094784 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562086258776329, + "loss": 3.1879, + "theoretical_loss": 4.116561590235969, + "tokens_seen": 319160320 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561985957873621, + "loss": 3.1697, + "theoretical_loss": 4.116463421270385, + "tokens_seen": 319225856 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561885656970913, + "loss": 2.9933, + "theoretical_loss": 4.116365278098225, + "tokens_seen": 319291392 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561785356068205, + "loss": 3.1033, + "theoretical_loss": 4.116267160707421, + "tokens_seen": 319356928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045616850551654964, + "loss": 2.9044, + "theoretical_loss": 4.11616906908591, + "tokens_seen": 319422464 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 192764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9580471515655518, + "objective/train/theoretical_loss": 4.11607100322164, + "objective/train/tokens_used": 339948000, + "theoretical_loss": 4.11607100322164, + "tokens_seen": 319488000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561584754262789, + "loss": 2.9887, + "theoretical_loss": 4.11607100322164, + "tokens_seen": 319488000 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456148445336008, + "loss": 3.0573, + "theoretical_loss": 4.115972963102565, + "tokens_seen": 319553536 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045613841524573724, + "loss": 3.2024, + "theoretical_loss": 4.11587494871665, + "tokens_seen": 319619072 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561283851554664, + "loss": 3.2376, + "theoretical_loss": 4.115776960051864, + "tokens_seen": 319684608 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561183550651956, + "loss": 3.0396, + "theoretical_loss": 4.11567899709619, + "tokens_seen": 319750144 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561083249749248, + "loss": 2.7815, + "theoretical_loss": 4.115581059837612, + "tokens_seen": 319815680 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045609829488465396, + "loss": 3.1572, + "theoretical_loss": 4.115483148264127, + "tokens_seen": 319881216 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045608826479438314, + "loss": 3.0449, + "theoretical_loss": 4.115385262363739, + "tokens_seen": 319946752 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560782347041124, + "loss": 3.0447, + "theoretical_loss": 4.1152874021244585, + "tokens_seen": 320012288 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560682046138415, + "loss": 2.905, + "theoretical_loss": 4.115189567534307, + "tokens_seen": 320077824 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045605817452357074, + "loss": 3.0628, + "theoretical_loss": 4.115091758581309, + "tokens_seen": 320143360 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045604814443329987, + "loss": 2.9603, + "theoretical_loss": 4.114993975253505, + "tokens_seen": 320208896 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560381143430291, + "loss": 2.9837, + "theoretical_loss": 4.114896217538935, + "tokens_seen": 320274432 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560280842527583, + "loss": 2.8237, + "theoretical_loss": 4.114798485425652, + "tokens_seen": 320339968 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045601805416248747, + "loss": 3.0052, + "theoretical_loss": 4.114700778901717, + "tokens_seen": 320405504 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045600802407221665, + "loss": 3.0487, + "theoretical_loss": 4.114603097955197, + "tokens_seen": 320471040 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559979939819459, + "loss": 3.2513, + "theoretical_loss": 4.114505442574167, + "tokens_seen": 320536576 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455987963891675, + "loss": 3.2844, + "theoretical_loss": 4.1144078127467125, + "tokens_seen": 320602112 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045597793380140425, + "loss": 2.7771, + "theoretical_loss": 4.114310208460924, + "tokens_seen": 320667648 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045596790371113337, + "loss": 3.1613, + "theoretical_loss": 4.114212629704902, + "tokens_seen": 320733184 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559578736208626, + "loss": 2.9396, + "theoretical_loss": 4.114115076466755, + "tokens_seen": 320798720 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559478435305918, + "loss": 2.9128, + "theoretical_loss": 4.114017548734598, + "tokens_seen": 320864256 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045593781344032097, + "loss": 2.9857, + "theoretical_loss": 4.113920046496554, + "tokens_seen": 320929792 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045592778335005015, + "loss": 3.1028, + "theoretical_loss": 4.113822569740757, + "tokens_seen": 320995328 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045591775325977933, + "loss": 3.5144, + "theoretical_loss": 4.113725118455344, + "tokens_seen": 321060864 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 193449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.532770872116089, + "objective/train/theoretical_loss": 4.113627692628464, + "objective/train/tokens_used": 341586400, + "theoretical_loss": 4.113627692628464, + "tokens_seen": 321126400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559077231695085, + "loss": 3.0643, + "theoretical_loss": 4.113627692628464, + "tokens_seen": 321126400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045589769307923775, + "loss": 3.1924, + "theoretical_loss": 4.113530292248273, + "tokens_seen": 321191936 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558876629889669, + "loss": 2.9817, + "theoretical_loss": 4.113432917302934, + "tokens_seen": 321257472 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558776328986961, + "loss": 2.954, + "theoretical_loss": 4.113335567780618, + "tokens_seen": 321323008 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045586760280842524, + "loss": 2.9283, + "theoretical_loss": 4.113238243669504, + "tokens_seen": 321388544 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558575727181545, + "loss": 3.0588, + "theoretical_loss": 4.113140944957781, + "tokens_seen": 321454080 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045584754262788365, + "loss": 2.9978, + "theoretical_loss": 4.113043671633641, + "tokens_seen": 321519616 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045583751253761284, + "loss": 3.1526, + "theoretical_loss": 4.11294642368529, + "tokens_seen": 321585152 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455827482447342, + "loss": 2.856, + "theoretical_loss": 4.112849201100938, + "tokens_seen": 321650688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045581745235707125, + "loss": 3.0283, + "theoretical_loss": 4.1127520038688035, + "tokens_seen": 321716224 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558074222668004, + "loss": 3.1706, + "theoretical_loss": 4.112654831977112, + "tokens_seen": 321781760 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557973921765296, + "loss": 3.1208, + "theoretical_loss": 4.1125576854141, + "tokens_seen": 321847296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045578736208625874, + "loss": 3.2345, + "theoretical_loss": 4.112460564168009, + "tokens_seen": 321912832 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455777331995988, + "loss": 3.0428, + "theoretical_loss": 4.112363468227088, + "tokens_seen": 321978368 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045576730190571716, + "loss": 3.3738, + "theoretical_loss": 4.112266397579598, + "tokens_seen": 322043904 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045575727181544634, + "loss": 3.0012, + "theoretical_loss": 4.112169352213801, + "tokens_seen": 322109440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557472417251755, + "loss": 2.7209, + "theoretical_loss": 4.1120723321179735, + "tokens_seen": 322174976 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557372116349047, + "loss": 3.0685, + "theoretical_loss": 4.111975337280397, + "tokens_seen": 322240512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557271815446339, + "loss": 3.2726, + "theoretical_loss": 4.111878367689359, + "tokens_seen": 322306048 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557171514543631, + "loss": 2.9605, + "theoretical_loss": 4.1117814233331575, + "tokens_seen": 322371584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045570712136409224, + "loss": 3.0389, + "theoretical_loss": 4.111684504200099, + "tokens_seen": 322437120 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556970912738215, + "loss": 3.0147, + "theoretical_loss": 4.111587610278494, + "tokens_seen": 322502656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556870611835507, + "loss": 3.126, + "theoretical_loss": 4.111490741556663, + "tokens_seen": 322568192 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045567703109327984, + "loss": 3.2523, + "theoretical_loss": 4.1113938980229365, + "tokens_seen": 322633728 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556670010030091, + "loss": 2.9724, + "theoretical_loss": 4.11129707966565, + "tokens_seen": 322699264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 194727, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2888762950897217, + "objective/train/theoretical_loss": 4.111200286473145, + "objective/train/tokens_used": 343224800, + "theoretical_loss": 4.111200286473145, + "tokens_seen": 322764800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556569709127382, + "loss": 3.1618, + "theoretical_loss": 4.111200286473145, + "tokens_seen": 322764800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045564694082246744, + "loss": 2.8571, + "theoretical_loss": 4.111103518433776, + "tokens_seen": 322830336 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556369107321966, + "loss": 3.1171, + "theoretical_loss": 4.111006775535901, + "tokens_seen": 322895872 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556268806419258, + "loss": 3.0964, + "theoretical_loss": 4.110910057767887, + "tokens_seen": 322961408 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455616850551655, + "loss": 3.2273, + "theoretical_loss": 4.110813365118109, + "tokens_seen": 323026944 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045560682046138416, + "loss": 3.0086, + "theoretical_loss": 4.110716697574951, + "tokens_seen": 323092480 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045559679037111334, + "loss": 2.9121, + "theoretical_loss": 4.110620055126802, + "tokens_seen": 323158016 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555867602808426, + "loss": 3.1999, + "theoretical_loss": 4.110523437762059, + "tokens_seen": 323223552 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555767301905717, + "loss": 3.254, + "theoretical_loss": 4.11042684546913, + "tokens_seen": 323289088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045556670010030094, + "loss": 3.0221, + "theoretical_loss": 4.110330278236427, + "tokens_seen": 323354624 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045555667001003007, + "loss": 2.9684, + "theoretical_loss": 4.110233736052372, + "tokens_seen": 323420160 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555466399197593, + "loss": 3.2605, + "theoretical_loss": 4.110137218905393, + "tokens_seen": 323485696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555366098294885, + "loss": 3.1783, + "theoretical_loss": 4.110040726783927, + "tokens_seen": 323551232 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045552657973921767, + "loss": 3.0009, + "theoretical_loss": 4.109944259676419, + "tokens_seen": 323616768 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045551654964894685, + "loss": 2.9394, + "theoretical_loss": 4.109847817571319, + "tokens_seen": 323682304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555065195586761, + "loss": 3.0756, + "theoretical_loss": 4.109751400457089, + "tokens_seen": 323747840 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554964894684052, + "loss": 3.1542, + "theoretical_loss": 4.109655008322195, + "tokens_seen": 323813376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045548645937813445, + "loss": 3.0225, + "theoretical_loss": 4.109558641155112, + "tokens_seen": 323878912 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045547642928786357, + "loss": 3.1003, + "theoretical_loss": 4.109462298944322, + "tokens_seen": 323944448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554663991975928, + "loss": 3.1359, + "theoretical_loss": 4.109365981678316, + "tokens_seen": 324009984 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455456369107322, + "loss": 3.0616, + "theoretical_loss": 4.109269689345592, + "tokens_seen": 324075520 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045544633901705117, + "loss": 3.0413, + "theoretical_loss": 4.109173421934654, + "tokens_seen": 324141056 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045543630892678035, + "loss": 3.1208, + "theoretical_loss": 4.109077179434016, + "tokens_seen": 324206592 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045542627883650953, + "loss": 3.1749, + "theoretical_loss": 4.1089809618321995, + "tokens_seen": 324272128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554162487462387, + "loss": 2.6465, + "theoretical_loss": 4.108884769117731, + "tokens_seen": 324337664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 195463, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8933980464935303, + "objective/train/theoretical_loss": 4.108788601279149, + "objective/train/tokens_used": 344863200, + "theoretical_loss": 4.108788601279149, + "tokens_seen": 324403200 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045540621865596795, + "loss": 3.0332, + "theoretical_loss": 4.108788601279149, + "tokens_seen": 324403200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553961885656971, + "loss": 2.9037, + "theoretical_loss": 4.108692458304994, + "tokens_seen": 324468736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553861584754263, + "loss": 3.2058, + "theoretical_loss": 4.108596340183819, + "tokens_seen": 324534272 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045537612838515544, + "loss": 3.1413, + "theoretical_loss": 4.108500246904184, + "tokens_seen": 324599808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553660982948847, + "loss": 3.1149, + "theoretical_loss": 4.108404178454651, + "tokens_seen": 324665344 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045535606820461385, + "loss": 2.7925, + "theoretical_loss": 4.1083081348237975, + "tokens_seen": 324730880 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045534603811434304, + "loss": 3.0427, + "theoretical_loss": 4.108212116000203, + "tokens_seen": 324796416 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553360080240722, + "loss": 2.9514, + "theoretical_loss": 4.108116121972457, + "tokens_seen": 324861952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045532597793380145, + "loss": 3.0382, + "theoretical_loss": 4.108020152729157, + "tokens_seen": 324927488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553159478435306, + "loss": 3.0567, + "theoretical_loss": 4.107924208258905, + "tokens_seen": 324993024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553059177532598, + "loss": 3.0865, + "theoretical_loss": 4.107828288550314, + "tokens_seen": 325058560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045529588766298894, + "loss": 3.1368, + "theoretical_loss": 4.107732393592003, + "tokens_seen": 325124096 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552858575727182, + "loss": 2.9862, + "theoretical_loss": 4.107636523372598, + "tokens_seen": 325189632 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045527582748244736, + "loss": 3.0282, + "theoretical_loss": 4.107540677880734, + "tokens_seen": 325255168 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045526579739217654, + "loss": 3.1162, + "theoretical_loss": 4.107444857105052, + "tokens_seen": 325320704 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552557673019057, + "loss": 3.1163, + "theoretical_loss": 4.107349061034201, + "tokens_seen": 325386240 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552457372116349, + "loss": 2.8527, + "theoretical_loss": 4.107253289656838, + "tokens_seen": 325451776 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552357071213641, + "loss": 3.0606, + "theoretical_loss": 4.107157542961628, + "tokens_seen": 325517312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552256770310933, + "loss": 3.0527, + "theoretical_loss": 4.10706182093724, + "tokens_seen": 325582848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045521564694082244, + "loss": 3.4321, + "theoretical_loss": 4.106966123572356, + "tokens_seen": 325648384 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552056168505517, + "loss": 2.9409, + "theoretical_loss": 4.106870450855661, + "tokens_seen": 325713920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551955867602808, + "loss": 3.1752, + "theoretical_loss": 4.106774802775849, + "tokens_seen": 325779456 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045518555667001004, + "loss": 3.0561, + "theoretical_loss": 4.106679179321622, + "tokens_seen": 325844992 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551755265797392, + "loss": 3.091, + "theoretical_loss": 4.106583580481689, + "tokens_seen": 325910528 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551654964894684, + "loss": 3.0422, + "theoretical_loss": 4.106488006244767, + "tokens_seen": 325976064 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 196731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.885859727859497, + "objective/train/theoretical_loss": 4.106392456599577, + "objective/train/tokens_used": 346501600, + "theoretical_loss": 4.106392456599577, + "tokens_seen": 326041600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551554663991976, + "loss": 2.9643, + "theoretical_loss": 4.106392456599577, + "tokens_seen": 326041600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551454363089268, + "loss": 3.112, + "theoretical_loss": 4.106296931534854, + "tokens_seen": 326107136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045513540621865595, + "loss": 2.9576, + "theoretical_loss": 4.106201431039335, + "tokens_seen": 326172672 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551253761283852, + "loss": 2.6294, + "theoretical_loss": 4.106105955101766, + "tokens_seen": 326238208 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551153460381143, + "loss": 3.1334, + "theoretical_loss": 4.1060105037109, + "tokens_seen": 326303744 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045510531594784354, + "loss": 2.9134, + "theoretical_loss": 4.105915076855499, + "tokens_seen": 326369280 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550952858575727, + "loss": 3.0419, + "theoretical_loss": 4.105819674524332, + "tokens_seen": 326434816 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550852557673019, + "loss": 3.009, + "theoretical_loss": 4.105724296706172, + "tokens_seen": 326500352 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550752256770311, + "loss": 2.9717, + "theoretical_loss": 4.105628943389805, + "tokens_seen": 326565888 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045506519558676027, + "loss": 2.9638, + "theoretical_loss": 4.1055336145640196, + "tokens_seen": 326631424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045505516549648945, + "loss": 3.3072, + "theoretical_loss": 4.105438310217615, + "tokens_seen": 326696960 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550451354062187, + "loss": 3.0272, + "theoretical_loss": 4.105343030339395, + "tokens_seen": 326762496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550351053159478, + "loss": 3.0128, + "theoretical_loss": 4.1052477749181735, + "tokens_seen": 326828032 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045502507522567705, + "loss": 3.0124, + "theoretical_loss": 4.10515254394277, + "tokens_seen": 326893568 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045501504513540623, + "loss": 2.8748, + "theoretical_loss": 4.1050573374020125, + "tokens_seen": 326959104 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550050150451354, + "loss": 3.0619, + "theoretical_loss": 4.104962155284734, + "tokens_seen": 327024640 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549949849548646, + "loss": 2.8882, + "theoretical_loss": 4.104866997579778, + "tokens_seen": 327090176 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045498495486459377, + "loss": 3.0027, + "theoretical_loss": 4.104771864275993, + "tokens_seen": 327155712 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045497492477432295, + "loss": 3.1058, + "theoretical_loss": 4.104676755362237, + "tokens_seen": 327221248 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549648946840522, + "loss": 3.0508, + "theoretical_loss": 4.104581670827372, + "tokens_seen": 327286784 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045495486459378137, + "loss": 2.8934, + "theoretical_loss": 4.10448661066027, + "tokens_seen": 327352320 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045494483450351055, + "loss": 3.1344, + "theoretical_loss": 4.104391574849812, + "tokens_seen": 327417856 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045493480441323973, + "loss": 3.0873, + "theoretical_loss": 4.10429656338488, + "tokens_seen": 327483392 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549247743229689, + "loss": 3.0143, + "theoretical_loss": 4.104201576254369, + "tokens_seen": 327548928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045491474423269815, + "loss": 2.9043, + "theoretical_loss": 4.10410661344718, + "tokens_seen": 327614464 + }, + { + "debugging/Self-BLEU-5": 0.3841532625917073, + "debugging/distinct-1-grams": 0.7832216589580511, + "debugging/distinct-2-grams": 0.9618292669346826, + "debugging/entropy-1-grams": 5.637375639920272, + "debugging/entropy-2-grams": 6.413549369763027, + "debugging/length": 486.6666666666667, + "debugging/num_segments": 9, + "epoch": 0.1, + "objective/train/docs_used": 197513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.052098035812378, + "objective/train/theoretical_loss": 4.10401167495222, + "objective/train/tokens_used": 348140000, + "theoretical_loss": 4.10401167495222, + "tokens_seen": 327680000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549047141424273, + "loss": 3.1254, + "theoretical_loss": 4.10401167495222, + "tokens_seen": 327680000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548946840521565, + "loss": 3.2898, + "theoretical_loss": 4.103916760758405, + "tokens_seen": 327745536 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045488465396188564, + "loss": 3.0752, + "theoretical_loss": 4.103821870854656, + "tokens_seen": 327811072 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548746238716149, + "loss": 3.0744, + "theoretical_loss": 4.103727005229903, + "tokens_seen": 327876608 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045486459378134405, + "loss": 3.0116, + "theoretical_loss": 4.103632163873083, + "tokens_seen": 327942144 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045485456369107324, + "loss": 3.1184, + "theoretical_loss": 4.10353734677314, + "tokens_seen": 328007680 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548445336008024, + "loss": 3.1614, + "theoretical_loss": 4.103442553919026, + "tokens_seen": 328073216 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045483450351053165, + "loss": 2.9932, + "theoretical_loss": 4.1033477852996985, + "tokens_seen": 328138752 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548244734202608, + "loss": 3.1685, + "theoretical_loss": 4.103253040904124, + "tokens_seen": 328204288 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045481444332999, + "loss": 2.8453, + "theoretical_loss": 4.103158320721276, + "tokens_seen": 328269824 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045480441323971914, + "loss": 3.1451, + "theoretical_loss": 4.103063624740133, + "tokens_seen": 328335360 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547943831494484, + "loss": 3.0218, + "theoretical_loss": 4.102968952949684, + "tokens_seen": 328400896 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045478435305917756, + "loss": 2.9797, + "theoretical_loss": 4.102874305338923, + "tokens_seen": 328466432 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045477432296890674, + "loss": 3.0406, + "theoretical_loss": 4.102779681896852, + "tokens_seen": 328531968 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547642928786359, + "loss": 2.8546, + "theoretical_loss": 4.10268508261248, + "tokens_seen": 328597504 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547542627883651, + "loss": 3.0949, + "theoretical_loss": 4.102590507474824, + "tokens_seen": 328663040 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547442326980943, + "loss": 3.256, + "theoretical_loss": 4.1024959564729055, + "tokens_seen": 328728576 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547342026078235, + "loss": 3.0471, + "theoretical_loss": 4.102401429595758, + "tokens_seen": 328794112 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045472417251755264, + "loss": 2.9156, + "theoretical_loss": 4.102306926832417, + "tokens_seen": 328859648 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547141424272819, + "loss": 3.2931, + "theoretical_loss": 4.102212448171928, + "tokens_seen": 328925184 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454704112337011, + "loss": 2.982, + "theoretical_loss": 4.1021179936033425, + "tokens_seen": 328990720 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045469408224674024, + "loss": 3.0106, + "theoretical_loss": 4.102023563115721, + "tokens_seen": 329056256 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546840521564694, + "loss": 3.0727, + "theoretical_loss": 4.10192915669813, + "tokens_seen": 329121792 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546740220661986, + "loss": 3.1747, + "theoretical_loss": 4.1018347743396415, + "tokens_seen": 329187328 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546639919759278, + "loss": 3.1083, + "theoretical_loss": 4.101740416029338, + "tokens_seen": 329252864 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 198138, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2101240158081055, + "objective/train/theoretical_loss": 4.101646081756305, + "objective/train/tokens_used": 349778400, + "theoretical_loss": 4.101646081756305, + "tokens_seen": 329318400 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454653961885657, + "loss": 3.0282, + "theoretical_loss": 4.101646081756305, + "tokens_seen": 329318400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045464393179538615, + "loss": 2.993, + "theoretical_loss": 4.101551771509641, + "tokens_seen": 329383936 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546339017051154, + "loss": 3.2896, + "theoretical_loss": 4.101457485278444, + "tokens_seen": 329449472 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546238716148445, + "loss": 3.0261, + "theoretical_loss": 4.101363223051826, + "tokens_seen": 329515008 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045461384152457374, + "loss": 3.0682, + "theoretical_loss": 4.101268984818901, + "tokens_seen": 329580544 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546038114343029, + "loss": 3.1339, + "theoretical_loss": 4.101174770568795, + "tokens_seen": 329646080 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545937813440321, + "loss": 3.0888, + "theoretical_loss": 4.1010805802906365, + "tokens_seen": 329711616 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545837512537613, + "loss": 3.0345, + "theoretical_loss": 4.100986413973564, + "tokens_seen": 329777152 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045457372116349047, + "loss": 3.1313, + "theoretical_loss": 4.100892271606721, + "tokens_seen": 329842688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045456369107321965, + "loss": 2.8485, + "theoretical_loss": 4.1007981531792606, + "tokens_seen": 329908224 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545536609829489, + "loss": 3.19, + "theoretical_loss": 4.100704058680341, + "tokens_seen": 329973760 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454543630892678, + "loss": 3.017, + "theoretical_loss": 4.1006099880991265, + "tokens_seen": 330039296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045453360080240725, + "loss": 2.9604, + "theoretical_loss": 4.100515941424792, + "tokens_seen": 330104832 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045452357071213643, + "loss": 2.9596, + "theoretical_loss": 4.100421918646517, + "tokens_seen": 330170368 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545135406218656, + "loss": 3.1628, + "theoretical_loss": 4.1003279197534885, + "tokens_seen": 330235904 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545035105315948, + "loss": 2.9544, + "theoretical_loss": 4.100233944734899, + "tokens_seen": 330301440 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045449348044132397, + "loss": 3.1729, + "theoretical_loss": 4.100139993579952, + "tokens_seen": 330366976 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045448345035105315, + "loss": 2.7687, + "theoretical_loss": 4.100046066277853, + "tokens_seen": 330432512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544734202607824, + "loss": 3.2236, + "theoretical_loss": 4.09995216281782, + "tokens_seen": 330498048 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544633901705115, + "loss": 3.1535, + "theoretical_loss": 4.0998582831890715, + "tokens_seen": 330563584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045445336008024075, + "loss": 3.11, + "theoretical_loss": 4.0997644273808405, + "tokens_seen": 330629120 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544433299899699, + "loss": 2.8457, + "theoretical_loss": 4.09967059538236, + "tokens_seen": 330694656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544332998996991, + "loss": 2.9948, + "theoretical_loss": 4.099576787182874, + "tokens_seen": 330760192 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544232698094283, + "loss": 3.1013, + "theoretical_loss": 4.099483002771633, + "tokens_seen": 330825728 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544132397191575, + "loss": 2.9936, + "theoretical_loss": 4.099389242137894, + "tokens_seen": 330891264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 199416, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0712082386016846, + "objective/train/theoretical_loss": 4.099295505270921, + "objective/train/tokens_used": 351416800, + "theoretical_loss": 4.099295505270921, + "tokens_seen": 330956800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045440320962888666, + "loss": 3.0119, + "theoretical_loss": 4.099295505270921, + "tokens_seen": 330956800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045439317953861584, + "loss": 2.9404, + "theoretical_loss": 4.099201792159985, + "tokens_seen": 331022336 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454383149448345, + "loss": 2.9773, + "theoretical_loss": 4.099108102794363, + "tokens_seen": 331087872 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045437311935807425, + "loss": 3.1837, + "theoretical_loss": 4.099014437163342, + "tokens_seen": 331153408 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543630892678034, + "loss": 2.9231, + "theoretical_loss": 4.098920795256213, + "tokens_seen": 331218944 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543530591775326, + "loss": 2.9996, + "theoretical_loss": 4.098827177062273, + "tokens_seen": 331284480 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543430290872618, + "loss": 2.9186, + "theoretical_loss": 4.098733582570831, + "tokens_seen": 331350016 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454332998996991, + "loss": 3.0911, + "theoretical_loss": 4.098640011771198, + "tokens_seen": 331415552 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045432296890672016, + "loss": 2.9677, + "theoretical_loss": 4.098546464652693, + "tokens_seen": 331481088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045431293881644934, + "loss": 2.8865, + "theoretical_loss": 4.098452941204643, + "tokens_seen": 331546624 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543029087261785, + "loss": 2.9476, + "theoretical_loss": 4.098359441416383, + "tokens_seen": 331612160 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045429287863590776, + "loss": 2.9588, + "theoretical_loss": 4.0982659652772515, + "tokens_seen": 331677696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542828485456369, + "loss": 3.1995, + "theoretical_loss": 4.098172512776597, + "tokens_seen": 331743232 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542728184553661, + "loss": 2.9885, + "theoretical_loss": 4.098079083903773, + "tokens_seen": 331808768 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045426278836509525, + "loss": 3.1498, + "theoretical_loss": 4.097985678648142, + "tokens_seen": 331874304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542527582748245, + "loss": 2.948, + "theoretical_loss": 4.09789229699907, + "tokens_seen": 331939840 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045424272818455366, + "loss": 3.1034, + "theoretical_loss": 4.097798938945933, + "tokens_seen": 332005376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045423269809428284, + "loss": 3.143, + "theoretical_loss": 4.097705604478112, + "tokens_seen": 332070912 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454222668004012, + "loss": 2.8918, + "theoretical_loss": 4.097612293584998, + "tokens_seen": 332136448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542126379137412, + "loss": 2.817, + "theoretical_loss": 4.0975190062559825, + "tokens_seen": 332201984 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045420260782347044, + "loss": 2.9891, + "theoretical_loss": 4.097425742480472, + "tokens_seen": 332267520 + } + ], + "max_steps": 50354, + "num_train_epochs": 9223372036854775807, + "total_flos": 1.6956807708672e+17, + "trial_name": null, + "trial_params": null +}