{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999812503515559, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.2e-06, "loss": 12.5503, "step": 1 }, { "epoch": 0.0, "learning_rate": 0.0003, "loss": 2.6746, "step": 250 }, { "epoch": 0.01, "learning_rate": 0.0006, "loss": 1.1432, "step": 500 }, { "epoch": 0.01, "learning_rate": 0.0005999622383021625, "loss": 1.0557, "step": 750 }, { "epoch": 0.02, "learning_rate": 0.0005998489627149555, "loss": 1.0155, "step": 1000 }, { "epoch": 0.02, "learning_rate": 0.0005996602017549024, "loss": 0.9903, "step": 1250 }, { "epoch": 0.03, "learning_rate": 0.0005993960029415653, "loss": 0.9722, "step": 1500 }, { "epoch": 0.03, "learning_rate": 0.0005990564327855827, "loss": 0.9576, "step": 1750 }, { "epoch": 0.04, "learning_rate": 0.0005986415767719254, "loss": 0.9452, "step": 2000 }, { "epoch": 0.04, "learning_rate": 0.0005981515393383762, "loss": 0.9357, "step": 2250 }, { "epoch": 0.05, "learning_rate": 0.0005975864438492385, "loss": 0.9271, "step": 2500 }, { "epoch": 0.05, "eval_loss": 0.870555579662323, "eval_runtime": 728.785, "eval_samples_per_second": 140.508, "eval_steps_per_second": 4.391, "step": 2500 }, { "epoch": 0.05, "learning_rate": 0.0005969464325642798, "loss": 0.9193, "step": 2750 }, { "epoch": 0.06, "learning_rate": 0.0005962316666029183, "loss": 0.9106, "step": 3000 }, { "epoch": 0.06, "learning_rate": 0.0005954423259036624, "loss": 0.9044, "step": 3250 }, { "epoch": 0.07, "learning_rate": 0.0005945786091788119, "loss": 0.8981, "step": 3500 }, { "epoch": 0.07, "learning_rate": 0.0005936407338644336, "loss": 0.8929, "step": 3750 }, { "epoch": 0.08, "learning_rate": 0.0005926289360656221, "loss": 0.8887, "step": 4000 }, { "epoch": 0.08, "learning_rate": 0.0005915434704970625, "loss": 0.8837, "step": 4250 }, { "epoch": 0.09, "learning_rate": 0.0005903846104189068, "loss": 0.8784, "step": 4500 }, { "epoch": 0.09, "learning_rate": 0.0005891526475679825, "loss": 0.875, "step": 4750 }, { "epoch": 0.1, "learning_rate": 0.0005878478920843492, "loss": 0.8708, "step": 5000 }, { "epoch": 0.1, "eval_loss": 0.8258238434791565, "eval_runtime": 729.8354, "eval_samples_per_second": 140.306, "eval_steps_per_second": 4.385, "step": 5000 }, { "epoch": 0.1, "learning_rate": 0.0005864706724332221, "loss": 0.8672, "step": 5250 }, { "epoch": 0.11, "learning_rate": 0.0005850213353222835, "loss": 0.8633, "step": 5500 }, { "epoch": 0.11, "learning_rate": 0.0005835002456144005, "loss": 0.861, "step": 5750 }, { "epoch": 0.12, "learning_rate": 0.0005819077862357724, "loss": 0.8572, "step": 6000 }, { "epoch": 0.12, "learning_rate": 0.000580244358079532, "loss": 0.8546, "step": 6250 }, { "epoch": 0.13, "learning_rate": 0.0005785103799048218, "loss": 0.8513, "step": 6500 }, { "epoch": 0.13, "learning_rate": 0.0005767062882313744, "loss": 0.8479, "step": 6750 }, { "epoch": 0.14, "learning_rate": 0.0005748325372296208, "loss": 0.846, "step": 7000 }, { "epoch": 0.14, "learning_rate": 0.0005728895986063555, "loss": 0.8439, "step": 7250 }, { "epoch": 0.15, "learning_rate": 0.0005708779614859863, "loss": 0.8404, "step": 7500 }, { "epoch": 0.15, "eval_loss": 0.8027751445770264, "eval_runtime": 730.3109, "eval_samples_per_second": 140.214, "eval_steps_per_second": 4.382, "step": 7500 }, { "epoch": 0.15, "learning_rate": 0.0005687981322874007, "loss": 0.8386, "step": 7750 }, { "epoch": 0.16, "learning_rate": 0.000566650634596477, "loss": 0.8367, "step": 8000 }, { "epoch": 0.16, "learning_rate": 0.0005644360090342746, "loss": 0.8343, "step": 8250 }, { "epoch": 0.17, "learning_rate": 0.0005621548131209354, "loss": 0.8325, "step": 8500 }, { "epoch": 0.17, "learning_rate": 0.0005598076211353316, "loss": 0.83, "step": 8750 }, { "epoch": 0.18, "learning_rate": 0.000557395023970493, "loss": 0.8281, "step": 9000 }, { "epoch": 0.18, "learning_rate": 0.0005549176289848543, "loss": 0.8266, "step": 9250 }, { "epoch": 0.19, "learning_rate": 0.0005523760598493544, "loss": 0.8253, "step": 9500 }, { "epoch": 0.19, "learning_rate": 0.0005497709563904314, "loss": 0.8237, "step": 9750 }, { "epoch": 0.2, "learning_rate": 0.0005471029744289498, "loss": 0.822, "step": 10000 }, { "epoch": 0.2, "eval_loss": 0.7851784229278564, "eval_runtime": 729.0178, "eval_samples_per_second": 140.463, "eval_steps_per_second": 4.389, "step": 10000 }, { "epoch": 0.2, "learning_rate": 0.0005443727856151006, "loss": 0.8202, "step": 10250 }, { "epoch": 0.21, "learning_rate": 0.0005415810772593175, "loss": 0.8188, "step": 10500 }, { "epoch": 0.21, "learning_rate": 0.0005387285521592496, "loss": 0.8174, "step": 10750 }, { "epoch": 0.22, "learning_rate": 0.0005358159284228363, "loss": 0.816, "step": 11000 }, { "epoch": 0.22, "learning_rate": 0.000532843939287527, "loss": 0.8142, "step": 11250 }, { "epoch": 0.23, "learning_rate": 0.0005298133329356933, "loss": 0.8125, "step": 11500 }, { "epoch": 0.23, "learning_rate": 0.0005267248723062775, "loss": 0.8116, "step": 11750 }, { "epoch": 0.24, "learning_rate": 0.0005235793349027264, "loss": 0.8094, "step": 12000 }, { "epoch": 0.24, "learning_rate": 0.0005203775125972599, "loss": 0.809, "step": 12250 }, { "epoch": 0.25, "learning_rate": 0.000517120211431521, "loss": 0.8076, "step": 12500 }, { "epoch": 0.25, "eval_loss": 0.7733312845230103, "eval_runtime": 729.3665, "eval_samples_per_second": 140.396, "eval_steps_per_second": 4.387, "step": 12500 }, { "epoch": 0.25, "learning_rate": 0.0005138082514136589, "loss": 0.8061, "step": 12750 }, { "epoch": 0.26, "learning_rate": 0.0005104424663118964, "loss": 0.8054, "step": 13000 }, { "epoch": 0.26, "learning_rate": 0.0005070237034446336, "loss": 0.8043, "step": 13250 }, { "epoch": 0.27, "learning_rate": 0.0005035528234671396, "loss": 0.8035, "step": 13500 }, { "epoch": 0.27, "learning_rate": 0.0005000307001548875, "loss": 0.8024, "step": 13750 }, { "epoch": 0.28, "learning_rate": 0.0004964582201835855, "loss": 0.8009, "step": 14000 }, { "epoch": 0.28, "learning_rate": 0.0004928362829059618, "loss": 0.8, "step": 14250 }, { "epoch": 0.29, "learning_rate": 0.0004891658001253567, "loss": 0.7986, "step": 14500 }, { "epoch": 0.29, "learning_rate": 0.00048544769586618153, "loss": 0.7972, "step": 14750 }, { "epoch": 0.3, "learning_rate": 0.00048168290614129995, "loss": 0.7975, "step": 15000 }, { "epoch": 0.3, "eval_loss": 0.7640220522880554, "eval_runtime": 728.9343, "eval_samples_per_second": 140.479, "eval_steps_per_second": 4.39, "step": 15000 }, { "epoch": 0.3, "learning_rate": 0.00047787237871639213, "loss": 0.7964, "step": 15250 }, { "epoch": 0.31, "learning_rate": 0.0004740170728713594, "loss": 0.7948, "step": 15500 }, { "epoch": 0.31, "learning_rate": 0.0004701179591588311, "loss": 0.7947, "step": 15750 }, { "epoch": 0.32, "learning_rate": 0.00046617601915983307, "loss": 0.7932, "step": 16000 }, { "epoch": 0.32, "learning_rate": 0.00046219224523667927, "loss": 0.7929, "step": 16250 }, { "epoch": 0.33, "learning_rate": 0.00045816764028315066, "loss": 0.7924, "step": 16500 }, { "epoch": 0.33, "learning_rate": 0.0004541032174720219, "loss": 0.7911, "step": 16750 }, { "epoch": 0.34, "learning_rate": 0.00045, "loss": 0.7901, "step": 17000 }, { "epoch": 0.34, "learning_rate": 0.00044585902083014057, "loss": 0.7893, "step": 17250 }, { "epoch": 0.35, "learning_rate": 0.0004416813224318048, "loss": 0.7882, "step": 17500 }, { "epoch": 0.35, "eval_loss": 0.7565080523490906, "eval_runtime": 729.2933, "eval_samples_per_second": 140.41, "eval_steps_per_second": 4.388, "step": 17500 }, { "epoch": 0.35, "learning_rate": 0.00043746795651822306, "loss": 0.788, "step": 17750 }, { "epoch": 0.36, "learning_rate": 0.0004332199837817322, "loss": 0.7869, "step": 18000 }, { "epoch": 0.36, "learning_rate": 0.0004289384736267515, "loss": 0.7864, "step": 18250 }, { "epoch": 0.37, "learning_rate": 0.00042462450390056593, "loss": 0.786, "step": 18500 }, { "epoch": 0.37, "learning_rate": 0.0004202791606219841, "loss": 0.7848, "step": 18750 }, { "epoch": 0.38, "learning_rate": 0.0004159035377079385, "loss": 0.7844, "step": 19000 }, { "epoch": 0.38, "learning_rate": 0.0004114987366980982, "loss": 0.7838, "step": 19250 }, { "epoch": 0.39, "learning_rate": 0.0004070658664775615, "loss": 0.7836, "step": 19500 }, { "epoch": 0.39, "learning_rate": 0.00040260604299770063, "loss": 0.7819, "step": 19750 }, { "epoch": 0.4, "learning_rate": 0.0003981203889952265, "loss": 0.7815, "step": 20000 }, { "epoch": 0.4, "eval_loss": 0.7509350776672363, "eval_runtime": 729.0652, "eval_samples_per_second": 140.454, "eval_steps_per_second": 4.389, "step": 20000 }, { "epoch": 0.4, "learning_rate": 0.0003936100337095461, "loss": 0.7807, "step": 20250 }, { "epoch": 0.41, "learning_rate": 0.0003890761125984825, "loss": 0.7797, "step": 20500 }, { "epoch": 0.41, "learning_rate": 0.0003845197670524289, "loss": 0.7796, "step": 20750 }, { "epoch": 0.42, "learning_rate": 0.0003799421441070104, "loss": 0.7786, "step": 21000 }, { "epoch": 0.42, "learning_rate": 0.0003753443961543237, "loss": 0.7789, "step": 21250 }, { "epoch": 0.43, "learning_rate": 0.0003707276806528282, "loss": 0.7779, "step": 21500 }, { "epoch": 0.43, "learning_rate": 0.0003660931598359622, "loss": 0.7771, "step": 21750 }, { "epoch": 0.44, "learning_rate": 0.0003614420004195572, "loss": 0.7768, "step": 22000 }, { "epoch": 0.44, "learning_rate": 0.000356775373308123, "loss": 0.7766, "step": 22250 }, { "epoch": 0.45, "learning_rate": 0.0003520944533000791, "loss": 0.7759, "step": 22500 }, { "epoch": 0.45, "eval_loss": 0.7447416186332703, "eval_runtime": 729.6789, "eval_samples_per_second": 140.336, "eval_steps_per_second": 4.385, "step": 22500 }, { "epoch": 0.45, "learning_rate": 0.00034740041879200497, "loss": 0.7742, "step": 22750 }, { "epoch": 0.46, "learning_rate": 0.00034269445148198553, "loss": 0.7742, "step": 23000 }, { "epoch": 0.46, "learning_rate": 0.00033797773607212474, "loss": 0.7734, "step": 23250 }, { "epoch": 0.47, "learning_rate": 0.0003332514599703033, "loss": 0.7733, "step": 23500 }, { "epoch": 0.47, "learning_rate": 0.0003285168129912547, "loss": 0.7723, "step": 23750 }, { "epoch": 0.48, "learning_rate": 0.0003237749870570365, "loss": 0.7712, "step": 24000 }, { "epoch": 0.48, "learning_rate": 0.0003190271758969693, "loss": 0.771, "step": 24250 }, { "epoch": 0.49, "learning_rate": 0.00031427457474712274, "loss": 0.7721, "step": 24500 }, { "epoch": 0.49, "learning_rate": 0.0003095183800494203, "loss": 0.7707, "step": 24750 }, { "epoch": 0.5, "learning_rate": 0.00030475978915044235, "loss": 0.7694, "step": 25000 }, { "epoch": 0.5, "eval_loss": 0.7401933073997498, "eval_runtime": 729.0015, "eval_samples_per_second": 140.466, "eval_steps_per_second": 4.39, "step": 25000 }, { "epoch": 0.5, "learning_rate": 0.0003, "loss": 0.7699, "step": 25250 }, { "epoch": 0.51, "learning_rate": 0.0002952402108495576, "loss": 0.7691, "step": 25500 }, { "epoch": 0.51, "learning_rate": 0.00029048161995057974, "loss": 0.7693, "step": 25750 }, { "epoch": 0.52, "learning_rate": 0.0002857254252528773, "loss": 0.7684, "step": 26000 }, { "epoch": 0.52, "learning_rate": 0.00028097282410303066, "loss": 0.7683, "step": 26250 }, { "epoch": 0.53, "learning_rate": 0.0002762250129429634, "loss": 0.7667, "step": 26500 }, { "epoch": 0.53, "learning_rate": 0.00027148318700874523, "loss": 0.7666, "step": 26750 }, { "epoch": 0.54, "learning_rate": 0.0002667485400296967, "loss": 0.767, "step": 27000 }, { "epoch": 0.54, "learning_rate": 0.00026202226392787515, "loss": 0.7661, "step": 27250 }, { "epoch": 0.55, "learning_rate": 0.0002573055485180145, "loss": 0.7655, "step": 27500 }, { "epoch": 0.55, "eval_loss": 0.7358129620552063, "eval_runtime": 728.9506, "eval_samples_per_second": 140.476, "eval_steps_per_second": 4.39, "step": 27500 }, { "epoch": 0.55, "learning_rate": 0.000252599581207995, "loss": 0.7659, "step": 27750 }, { "epoch": 0.56, "learning_rate": 0.0002479055466999209, "loss": 0.7642, "step": 28000 }, { "epoch": 0.56, "learning_rate": 0.00024322462669187702, "loss": 0.7641, "step": 28250 }, { "epoch": 0.57, "learning_rate": 0.0002385579995804428, "loss": 0.7637, "step": 28500 }, { "epoch": 0.57, "learning_rate": 0.00023390684016403777, "loss": 0.7632, "step": 28750 }, { "epoch": 0.58, "learning_rate": 0.00022927231934717176, "loss": 0.7636, "step": 29000 }, { "epoch": 0.58, "learning_rate": 0.00022465560384567624, "loss": 0.7629, "step": 29250 }, { "epoch": 0.59, "learning_rate": 0.00022005785589298952, "loss": 0.7619, "step": 29500 }, { "epoch": 0.59, "learning_rate": 0.00021548023294757105, "loss": 0.762, "step": 29750 }, { "epoch": 0.6, "learning_rate": 0.00021092388740151762, "loss": 0.761, "step": 30000 }, { "epoch": 0.6, "eval_loss": 0.7323540449142456, "eval_runtime": 729.9414, "eval_samples_per_second": 140.285, "eval_steps_per_second": 4.384, "step": 30000 }, { "epoch": 0.6, "learning_rate": 0.00020638996629045387, "loss": 0.7611, "step": 30250 }, { "epoch": 0.61, "learning_rate": 0.0002018796110047735, "loss": 0.7614, "step": 30500 }, { "epoch": 0.61, "learning_rate": 0.00019739395700229937, "loss": 0.7604, "step": 30750 }, { "epoch": 0.62, "learning_rate": 0.00019293413352243846, "loss": 0.7606, "step": 31000 }, { "epoch": 0.62, "learning_rate": 0.00018850126330190176, "loss": 0.7605, "step": 31250 }, { "epoch": 0.63, "learning_rate": 0.00018409646229206137, "loss": 0.7591, "step": 31500 }, { "epoch": 0.63, "learning_rate": 0.00017972083937801593, "loss": 0.7592, "step": 31750 }, { "epoch": 0.64, "learning_rate": 0.0001753754960994341, "loss": 0.759, "step": 32000 }, { "epoch": 0.64, "learning_rate": 0.0001710615263732485, "loss": 0.7587, "step": 32250 }, { "epoch": 0.65, "learning_rate": 0.00016678001621826772, "loss": 0.7576, "step": 32500 }, { "epoch": 0.65, "eval_loss": 0.7288678884506226, "eval_runtime": 729.3087, "eval_samples_per_second": 140.407, "eval_steps_per_second": 4.388, "step": 32500 }, { "epoch": 0.65, "learning_rate": 0.00016253204348177686, "loss": 0.7579, "step": 32750 }, { "epoch": 0.66, "learning_rate": 0.00015831867756819522, "loss": 0.758, "step": 33000 }, { "epoch": 0.66, "learning_rate": 0.00015414097916985944, "loss": 0.7577, "step": 33250 }, { "epoch": 0.67, "learning_rate": 0.00015000000000000004, "loss": 0.7559, "step": 33500 }, { "epoch": 0.67, "learning_rate": 0.00014589678252797817, "loss": 0.7567, "step": 33750 }, { "epoch": 0.68, "learning_rate": 0.00014183235971684924, "loss": 0.7562, "step": 34000 }, { "epoch": 0.68, "learning_rate": 0.00013780775476332082, "loss": 0.7557, "step": 34250 }, { "epoch": 0.69, "learning_rate": 0.0001338239808401669, "loss": 0.7559, "step": 34500 }, { "epoch": 0.69, "learning_rate": 0.0001298820408411688, "loss": 0.7556, "step": 34750 }, { "epoch": 0.7, "learning_rate": 0.00012598292712864058, "loss": 0.7546, "step": 35000 }, { "epoch": 0.7, "eval_loss": 0.7261090874671936, "eval_runtime": 729.8771, "eval_samples_per_second": 140.298, "eval_steps_per_second": 4.384, "step": 35000 }, { "epoch": 0.7, "learning_rate": 0.0001221276212836079, "loss": 0.7551, "step": 35250 }, { "epoch": 0.71, "learning_rate": 0.00011831709385870004, "loss": 0.7545, "step": 35500 }, { "epoch": 0.71, "learning_rate": 0.0001145523041338184, "loss": 0.7543, "step": 35750 }, { "epoch": 0.72, "learning_rate": 0.00011083419987464334, "loss": 0.7537, "step": 36000 }, { "epoch": 0.72, "learning_rate": 0.00010716371709403818, "loss": 0.7542, "step": 36250 }, { "epoch": 0.73, "learning_rate": 0.00010354177981641449, "loss": 0.754, "step": 36500 }, { "epoch": 0.73, "learning_rate": 9.996929984511254e-05, "loss": 0.7534, "step": 36750 }, { "epoch": 0.74, "learning_rate": 9.644717653286037e-05, "loss": 0.7524, "step": 37000 }, { "epoch": 0.74, "learning_rate": 9.297629655536644e-05, "loss": 0.753, "step": 37250 }, { "epoch": 0.75, "learning_rate": 8.955753368810358e-05, "loss": 0.7526, "step": 37500 }, { "epoch": 0.75, "eval_loss": 0.7241286635398865, "eval_runtime": 729.0335, "eval_samples_per_second": 140.46, "eval_steps_per_second": 4.389, "step": 37500 }, { "epoch": 0.75, "learning_rate": 8.619174858634122e-05, "loss": 0.7525, "step": 37750 }, { "epoch": 0.76, "learning_rate": 8.287978856847894e-05, "loss": 0.753, "step": 38000 }, { "epoch": 0.76, "learning_rate": 7.962248740274003e-05, "loss": 0.7516, "step": 38250 }, { "epoch": 0.77, "learning_rate": 7.642066509727359e-05, "loss": 0.7515, "step": 38500 }, { "epoch": 0.77, "learning_rate": 7.327512769372254e-05, "loss": 0.7518, "step": 38750 }, { "epoch": 0.78, "learning_rate": 7.018666706430662e-05, "loss": 0.7512, "step": 39000 }, { "epoch": 0.78, "learning_rate": 6.715606071247291e-05, "loss": 0.7513, "step": 39250 }, { "epoch": 0.79, "learning_rate": 6.418407157716381e-05, "loss": 0.7516, "step": 39500 }, { "epoch": 0.79, "learning_rate": 6.127144784075033e-05, "loss": 0.7512, "step": 39750 }, { "epoch": 0.8, "learning_rate": 5.841892274068241e-05, "loss": 0.7503, "step": 40000 }, { "epoch": 0.8, "eval_loss": 0.7225306034088135, "eval_runtime": 729.2044, "eval_samples_per_second": 140.427, "eval_steps_per_second": 4.388, "step": 40000 }, { "epoch": 0.8, "learning_rate": 5.562721438489928e-05, "loss": 0.7509, "step": 40250 }, { "epoch": 0.81, "learning_rate": 5.2897025571050186e-05, "loss": 0.7503, "step": 40500 }, { "epoch": 0.81, "learning_rate": 5.022904360956861e-05, "loss": 0.7506, "step": 40750 }, { "epoch": 0.82, "learning_rate": 4.76239401506456e-05, "loss": 0.7504, "step": 41000 }, { "epoch": 0.82, "learning_rate": 4.5082371015145716e-05, "loss": 0.7502, "step": 41250 }, { "epoch": 0.83, "learning_rate": 4.260497602950688e-05, "loss": 0.7495, "step": 41500 }, { "epoch": 0.83, "learning_rate": 4.019237886466838e-05, "loss": 0.7499, "step": 41750 }, { "epoch": 0.84, "learning_rate": 3.784518687906452e-05, "loss": 0.7497, "step": 42000 }, { "epoch": 0.84, "learning_rate": 3.556399096572541e-05, "loss": 0.7496, "step": 42250 }, { "epoch": 0.85, "learning_rate": 3.3349365403522986e-05, "loss": 0.7496, "step": 42500 }, { "epoch": 0.85, "eval_loss": 0.7210782766342163, "eval_runtime": 729.8142, "eval_samples_per_second": 140.31, "eval_steps_per_second": 4.385, "step": 42500 }, { "epoch": 0.85, "learning_rate": 3.120186771259927e-05, "loss": 0.7493, "step": 42750 }, { "epoch": 0.86, "learning_rate": 2.9122038514013678e-05, "loss": 0.7492, "step": 43000 }, { "epoch": 0.86, "learning_rate": 2.7110401393644464e-05, "loss": 0.7492, "step": 43250 }, { "epoch": 0.87, "learning_rate": 2.516746277037912e-05, "loss": 0.7487, "step": 43500 }, { "epoch": 0.87, "learning_rate": 2.329371176862562e-05, "loss": 0.7485, "step": 43750 }, { "epoch": 0.88, "learning_rate": 2.148962009517823e-05, "loss": 0.7483, "step": 44000 }, { "epoch": 0.88, "learning_rate": 1.9755641920468003e-05, "loss": 0.7494, "step": 44250 }, { "epoch": 0.89, "learning_rate": 1.8092213764227503e-05, "loss": 0.7493, "step": 44500 }, { "epoch": 0.89, "learning_rate": 1.6499754385599462e-05, "loss": 0.7487, "step": 44750 }, { "epoch": 0.9, "learning_rate": 1.4978664677716402e-05, "loss": 0.7481, "step": 45000 }, { "epoch": 0.9, "eval_loss": 0.7202715277671814, "eval_runtime": 729.7085, "eval_samples_per_second": 140.33, "eval_steps_per_second": 4.385, "step": 45000 }, { "epoch": 0.9, "learning_rate": 1.3529327566777836e-05, "loss": 0.7483, "step": 45250 }, { "epoch": 0.91, "learning_rate": 1.2152107915650821e-05, "loss": 0.7493, "step": 45500 }, { "epoch": 0.91, "learning_rate": 1.0847352432017387e-05, "loss": 0.748, "step": 45750 }, { "epoch": 0.92, "learning_rate": 9.615389581093124e-06, "loss": 0.7478, "step": 46000 }, { "epoch": 0.92, "learning_rate": 8.456529502937504e-06, "loss": 0.748, "step": 46250 }, { "epoch": 0.93, "learning_rate": 7.371063934377885e-06, "loss": 0.7484, "step": 46500 }, { "epoch": 0.93, "learning_rate": 6.35926613556641e-06, "loss": 0.7478, "step": 46750 }, { "epoch": 0.94, "learning_rate": 5.421390821187988e-06, "loss": 0.7476, "step": 47000 }, { "epoch": 0.94, "learning_rate": 4.557674096337593e-06, "loss": 0.7473, "step": 47250 }, { "epoch": 0.95, "learning_rate": 3.768333397081713e-06, "loss": 0.7485, "step": 47500 }, { "epoch": 0.95, "eval_loss": 0.720069944858551, "eval_runtime": 728.4966, "eval_samples_per_second": 140.563, "eval_steps_per_second": 4.393, "step": 47500 }, { "epoch": 0.95, "learning_rate": 3.0535674357201944e-06, "loss": 0.7485, "step": 47750 }, { "epoch": 0.96, "learning_rate": 2.4135561507613975e-06, "loss": 0.7478, "step": 48000 }, { "epoch": 0.96, "learning_rate": 1.848460661623763e-06, "loss": 0.748, "step": 48250 }, { "epoch": 0.97, "learning_rate": 1.3584232280746231e-06, "loss": 0.7478, "step": 48500 }, { "epoch": 0.97, "learning_rate": 9.435672144173178e-07, "loss": 0.7476, "step": 48750 }, { "epoch": 0.98, "learning_rate": 6.03997058434702e-07, "loss": 0.7469, "step": 49000 }, { "epoch": 0.98, "learning_rate": 3.397982450976111e-07, "loss": 0.7475, "step": 49250 }, { "epoch": 0.99, "learning_rate": 1.5103728504447522e-07, "loss": 0.7477, "step": 49500 }, { "epoch": 0.99, "learning_rate": 3.776169783747951e-08, "loss": 0.7473, "step": 49750 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 0.7477, "step": 50000 }, { "epoch": 1.0, "eval_loss": 0.7199350595474243, "eval_runtime": 728.7415, "eval_samples_per_second": 140.516, "eval_steps_per_second": 4.391, "step": 50000 }, { "epoch": 1.0, "step": 50000, "total_flos": 2.180439447726719e+19, "train_loss": 0.801539086227417, "train_runtime": 516604.2528, "train_samples_per_second": 49.554, "train_steps_per_second": 0.097 } ], "max_steps": 50000, "num_train_epochs": 1, "total_flos": 2.180439447726719e+19, "trial_name": null, "trial_params": null }